List of usage examples for org.apache.hadoop.mapred.JobConf#setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass)
From source file:babel.prep.datedcorpus.DatedCorpusGenerator.java
License:Apache License
/** * Configures a map-only dataset generation job. */// w ww.j a v a 2 s.co m protected JobConf createJobConf(String crawlDir, String pagesSubDir) throws IOException { JobConf job = new JobConf(getConf()); job.setJobName("create dated dataset from " + pagesSubDir); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(DatedCorpusGenMapper.class); job.setReducerClass(DatedCorpusGenReducer.class); job.setMapOutputValueClass(PageVersion.class); job.setOutputFormat(DatedLangFilesOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir)); Path outDir = new Path(new Path(crawlDir, CORPUS_SUBDIR), "datedcorpus." + getCurTimeStamp()); m_fs.delete(outDir, true); FileOutputFormat.setOutputPath(job, outDir); setUniqueTempDir(job); return job; }
From source file:babel.prep.extract.NutchPageExtractor.java
License:Apache License
/**
 * Builds the job configuration for the page-extraction job.
 *
 * <p>All segment directories found under {@code crawlDir/SEGMENTS_SUBDIR} are used as
 * input. Which parts of each segment are read is controlled by the {@code m_co},
 * {@code m_fe}, {@code m_ge}, {@code m_pa}, {@code m_pd} and {@code m_pt} flags; the
 * same flags are also published to the job as {@code segment.reader.*} booleans.
 * Output goes to a time-stamped {@code pages.extract.*} directory under
 * {@code crawlDir/PAGES_SUBDIR}; any existing directory there is deleted recursively.
 *
 * @param crawlDir root crawl directory
 * @return the fully configured job
 * @throws IOException propagated from the file-system and helper calls made here
 */
protected JobConf createJobConf(String crawlDir) throws IOException {
    Path segmentsPath = new Path(crawlDir, SEGMENTS_SUBDIR);
    List<Path> segPaths = allSegmentDirs(segmentsPath);

    // Space-prefixed list of segment names, used only for the job name.
    StringBuilder segNames = new StringBuilder();
    for (Path seg : segPaths) {
        segNames.append(" ").append(seg.getName());
    }

    String timeStamp = getCurTimeStamp();

    JobConf job = new NutchJob(getConf());
    job.setJobName("read segments" + segNames);

    // Tell the tasks which kinds of segment data are being extracted.
    job.setBoolean("segment.reader.co", m_co);
    job.setBoolean("segment.reader.fe", m_fe);
    job.setBoolean("segment.reader.ge", m_ge);
    job.setBoolean("segment.reader.pa", m_pa);
    job.setBoolean("segment.reader.pd", m_pd);
    job.setBoolean("segment.reader.pt", m_pt);

    // Register the selected sub-directories of every segment as input paths.
    for (Path seg : segPaths) {
        if (m_ge) {
            FileInputFormat.addInputPath(job, new Path(seg, CrawlDatum.GENERATE_DIR_NAME));
        }
        if (m_fe) {
            FileInputFormat.addInputPath(job, new Path(seg, CrawlDatum.FETCH_DIR_NAME));
        }
        if (m_pa) {
            FileInputFormat.addInputPath(job, new Path(seg, CrawlDatum.PARSE_DIR_NAME));
        }
        if (m_co) {
            FileInputFormat.addInputPath(job, new Path(seg, Content.DIR_NAME));
        }
        if (m_pd) {
            FileInputFormat.addInputPath(job, new Path(seg, ParseData.DIR_NAME));
        }
        if (m_pt) {
            FileInputFormat.addInputPath(job, new Path(seg, ParseText.DIR_NAME));
        }
    }

    // Name of the segments directory, so that the mapper can recover segment info.
    job.set(JOB_PROP_SEGMENTS_DIR, segmentsPath.getName());
    // Start time/date of this job.
    job.set(JOB_PROP_JOB_TIMESTAMP, timeStamp);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(PageExtMapper.class);
    job.setReducerClass(PageExtReducer.class);
    job.setMapOutputValueClass(NutchChunk.class);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    Path pagesRoot = new Path(crawlDir, PAGES_SUBDIR);
    Path outDir = new Path(pagesRoot, "pages.extract." + timeStamp);
    // Remove whatever is already at the output location before pointing the job at it.
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);
    return job;
}
From source file:Brush.AdjustMateEdge.java
License:Apache License
public RunningJob run(String inputPath, String outputPath, long reads, long ctg_sum) throws Exception { sLogger.info("Tool name: AdjustMateEdge"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); //JobConf conf = new JobConf(Stats.class); JobConf conf = new JobConf(AdjustMateEdge.class); conf.setJobName("AdjustMateEdge " + inputPath); conf.setLong("READS", reads); conf.setLong("CTG_SUM", ctg_sum); BrushConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(AdjustMateEdgeMapper.class); conf.setReducerClass(AdjustMateEdgeReducer.class); //delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
From source file:Brush.Compressible.java
License:Apache License
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: Compressible"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); //JobConf conf = new JobConf(Stats.class); JobConf conf = new JobConf(Compressible.class); conf.setJobName("Compressible " + inputPath); BrushConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(CompressibleMapper.class); conf.setReducerClass(CompressibleReducer.class); //delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
From source file:Brush.CountBraid.java
License:Apache License
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: CountBraid"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); JobConf conf = new JobConf(CountBraid.class); conf.setJobName("CountBraid " + inputPath + " " + BrushConfig.K); //conf.setFloat("Error_Rate", ErrorRate); //conf.setFloat("Exp_Cov", Exp_Cov); BrushConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(CountBraidMapper.class); conf.setReducerClass(CountBraidReducer.class); //delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
From source file:Brush.CountKmer.java
License:Apache License
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: CountKmer"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); JobConf conf = new JobConf(CountKmer.class); conf.setJobName("CountKmer " + inputPath + " " + BrushConfig.K); BrushConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); //conf.setMapOutputValueClass(IntWritable.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); //conf.setBoolean("mapred.output.compress", true); conf.setMapperClass(CountKmerMapper.class); conf.setReducerClass(CountKmerReducer.class); //delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
From source file:Brush.CountReads.java
License:Apache License
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: CountReads"); sLogger.info(" - input: " + inputPath); JobConf conf = new JobConf(CountReads.class); conf.setJobName("CountReads " + inputPath + " " + BrushConfig.K); BrushConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(CountReadsMapper.class); //conf.setReducerClass(CountReadsReducer.class); //delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
From source file:Brush.CutChimericLinks.java
License:Apache License
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: CutChimericLinks"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); JobConf conf = new JobConf(CutChimericLinks.class); conf.setJobName("CutChimericLinks " + inputPath + " " + BrushConfig.K); //conf.setFloat("Error_Rate", ErrorRate); //conf.setFloat("Exp_Cov", Exp_Cov); BrushConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(CutChimericLinksMapper.class); conf.setReducerClass(CutChimericLinksReducer.class); //delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
From source file:Brush.CutRepeatBoundary.java
License:Apache License
public RunningJob run(String inputPath, String outputPath, long reads, long ctg_sum) throws Exception { sLogger.info("Tool name: CutRepeatBoundary"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); JobConf conf = new JobConf(CutRepeatBoundary.class); conf.setJobName("CutRepeatBoundary " + inputPath + " " + BrushConfig.K); //conf.setFloat("Error_Rate", ErrorRate); //conf.setFloat("Exp_Cov", Exp_Cov); conf.setLong("READS", reads); conf.setLong("CTG_SUM", ctg_sum); BrushConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(CutRepeatBoundaryMapper.class); conf.setReducerClass(CutRepeatBoundaryReducer.class); //delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
From source file:Brush.DefineConsensus.java
License:Apache License
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: DefineConsensus"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); JobConf conf = new JobConf(DefineConsensus.class); conf.setJobName("DefineConsensus " + inputPath + " " + BrushConfig.K); //conf.setFloat("Error_Rate", ErrorRate); //conf.setFloat("Exp_Cov", Exp_Cov); BrushConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(DefineConsensusMapper.class); conf.setReducerClass(DefineConsensusReducer.class); //delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }