List of usage examples for org.apache.hadoop.mapred.JobConf.setOutputKeyClass

public void setOutputKeyClass(Class<?> theClass)
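setOutputKeyClass declares the key type of the job's final output; unless setMapOutputKeyClass is called, the same type is also assumed for the map output keys. Before the examples, here is a minimal self-contained sketch of the call in context, an identity pass-through job (the class name IdentityJobSketch and the argument-supplied paths are placeholders, not taken from any example below):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class IdentityJobSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(IdentityJobSketch.class);
        conf.setJobName("identity-sketch");

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        // TextInputFormat produces (LongWritable offset, Text line) records, and
        // the identity mapper/reducer pass them through unchanged, so the
        // declared output key/value classes must match those types.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}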
From source file: azkaban.jobtype.examples.java.WordCount.java
License: Apache License

public void run() throws Exception {
    logger.info(String.format("Starting %s", getClass().getSimpleName()));

    // hadoop conf should be on the classpath
    JobConf jobconf = getJobConf();
    jobconf.setJarByClass(WordCount.class);

    jobconf.setOutputKeyClass(Text.class);
    jobconf.setOutputValueClass(IntWritable.class);

    jobconf.setMapperClass(Map.class);
    jobconf.setReducerClass(Reduce.class);

    jobconf.setInputFormat(TextInputFormat.class);
    jobconf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.addInputPath(jobconf, new Path(inputPath));
    FileOutputFormat.setOutputPath(jobconf, new Path(outputPath));

    if (forceOutputOverrite) {
        FileSystem fs = FileOutputFormat.getOutputPath(jobconf).getFileSystem(jobconf);
        fs.delete(FileOutputFormat.getOutputPath(jobconf), true);
    }

    super.run();
}
From source file: babel.prep.corpus.CorpusGenerator.java
License: Apache License

/**
 * Configures a map-only dataset generation job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, boolean xmlOut) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("create " + (xmlOut ? "xml formatted" : "") + " dataset from " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CorpusGenMapper.class);
    job.setOutputFormat(xmlOut ? MultipleXMLLangFileOutputFormat.class : MultipleLangFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, CORPUS_SUBDIR),
            "corpus." + (xmlOut ? PARAM_XML + "." : "") + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}
From source file: babel.prep.datedcorpus.DatedCorpusGenerator.java
License: Apache License

/**
 * Configures a dated dataset generation job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("create dated dataset from " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(DatedCorpusGenMapper.class);
    job.setReducerClass(DatedCorpusGenReducer.class);
    job.setMapOutputValueClass(PageVersion.class);
    job.setOutputFormat(DatedLangFilesOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, CORPUS_SUBDIR), "datedcorpus." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}
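Note the split between intermediate and final types in this example: setMapOutputValueClass(PageVersion.class) overrides the intermediate value type, while setOutputKeyClass and setOutputValueClass declare the (Text, Text) pairs the reducer finally emits. In the old mapred API, the output key/value classes double as the defaults for the map output types, so only the types that differ need a separate declaration.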
From source file: babel.prep.extract.NutchPageExtractor.java
License: Apache License

/**
 * Configures the extraction job.
 */
protected JobConf createJobConf(String crawlDir) throws IOException {
    Path segmentsPath = new Path(crawlDir, SEGMENTS_SUBDIR);
    List<Path> segPaths = allSegmentDirs(segmentsPath);

    StringBuilder allSegNames = new StringBuilder();
    for (int i = 0; i < segPaths.size(); i++) {
        allSegNames.append(" " + segPaths.get(i).getName());
    }

    String timeStamp = getCurTimeStamp();

    JobConf job = new NutchJob(getConf());
    job.setJobName("read segments" + allSegNames.toString());

    // Specify what info to extract
    job.setBoolean("segment.reader.co", m_co);
    job.setBoolean("segment.reader.fe", m_fe);
    job.setBoolean("segment.reader.ge", m_ge);
    job.setBoolean("segment.reader.pa", m_pa);
    job.setBoolean("segment.reader.pd", m_pd);
    job.setBoolean("segment.reader.pt", m_pt);

    // Specify the paths to extract from for each segment
    for (int i = 0; i < segPaths.size(); i++) {
        if (m_ge)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.GENERATE_DIR_NAME));
        if (m_fe)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.FETCH_DIR_NAME));
        if (m_pa)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.PARSE_DIR_NAME));
        if (m_co)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), Content.DIR_NAME));
        if (m_pd)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseData.DIR_NAME));
        if (m_pt)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseText.DIR_NAME));
    }

    // Specify the segments directory so that the mapper can recover segment info
    job.set(JOB_PROP_SEGMENTS_DIR, segmentsPath.getName());
    // Store the start time/date of this job
    job.set(JOB_PROP_JOB_TIMESTAMP, timeStamp);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(PageExtMapper.class);
    job.setReducerClass(PageExtReducer.class);

    job.setMapOutputValueClass(NutchChunk.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.extract." + timeStamp);
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}
From source file: babel.prep.langid.LangIdentifier.java
License: Apache License

/**
 * Configures a map-only language id job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, String referrer) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("identify languages for pages in " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LangIdMapper.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.langid." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    job.set(JOB_PROP_JOB_REFERRER, referrer);

    return job;
}
From source file: babel.prep.langidtime.LangAndTimeExtractor.java
License: Apache License

/**
 * Configures a language id and time extraction job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, String referrer) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("identify languages and collect time for pages in " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LangAndTimeMapper.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    // ANNI EDIT
    job.setNumMapTasks(2);
    job.setNumReduceTasks(2);
    // END ANNI EDIT

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.langidtime." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    job.set(JOB_PROP_JOB_REFERRER, referrer);

    return job;
}
From source file: babel.prep.merge.PageMerger.java
License: Apache License

/**
 * Configures a reduce-only page merge job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDirOne, String pagesSubDirTwo) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("merge pages in " + pagesSubDirOne + " and " + pagesSubDirTwo);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setReducerClass(PageMergeReducer.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDirOne));
    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDirTwo));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.merge." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}
From source file: boa.datagen.SeqSort.java
License: Apache License

/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 * job tracker.
 */
@Override
public int run(String[] args) throws Exception {
    System.out.println(inPath);

    JobConf jobConf = new JobConf(getConf(), SeqSort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(BytesWritable.class);

    SequenceFileOutputFormat.setCompressOutput(jobConf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(jobConf, SnappyCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(jobConf, CompressionType.BLOCK);

    FileInputFormat.setInputPaths(jobConf, inPath);
    FileOutputFormat.setOutputPath(jobConf, new Path(outPath));

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file: br.eti.kinoshita.hadoop.WordCount.java
License: Open Source License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJarByClass(WordCount.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    //FileInputFormat.setInputPaths(conf, new Path("hdfs://chuva:9000/test/leiseca."));
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
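A note on the combiner in this example: reusing Reduce as the combiner is safe only because word-count aggregation is associative and commutative, and because the reducer consumes and emits the same (Text, IntWritable) pairs declared via setOutputKeyClass/setOutputValueClass. A combiner whose output types differed from the map output types would break the job at runtime.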
From source file: br.ufrj.nce.recureco.distributedindex.indexer.IndexerMain.java
License: Open Source License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(IndexerMain.class);
    conf.setJobName("indexer");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IndexerMap.class);
    conf.setCombinerClass(IndexerReduce.class);
    conf.setReducerClass(IndexerReduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}