Usage examples for org.apache.hadoop.mapred.JobConf.setNumMapTasks
public void setNumMapTasks(int n)
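Before the project-specific examples below, here is a minimal, self-contained sketch of the typical call pattern. Note that setNumMapTasks(int) is only a hint to the framework: the actual number of map tasks is determined by the input splits the InputFormat computes. The class name, job name, and the use of args[0]/args[1] as input and output paths are illustrative placeholders, not taken from any example on this page.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class SetNumMapTasksExample {
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(SetNumMapTasksExample.class);
    conf.setJobName("SetNumMapTasksExample");

    // A hint only: the InputFormat's split computation ultimately
    // decides how many map tasks actually run.
    conf.setNumMapTasks(10);
    conf.setNumReduceTasks(1);

    // Hypothetical paths: args[0] = input, args[1] = output.
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // No mapper or reducer is set, so the identity implementations
    // run and the job is a simple pass-through.
    JobClient.runJob(conf);
  }
}

Every example below follows the same pattern: build a JobConf, call setNumMapTasks() alongside setNumReduceTasks(), configure input/output formats and the mapper and reducer classes, then submit with JobClient.runJob().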
From source file: edu.umd.cloud9.collection.clue.CountClueWarcRecords.java
License: Apache License
/** Runs this tool. */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(new Option(ORIGINAL_OPTION, "use original ClueWeb09 distribution"));
  options.addOption(new Option(REPACKED_OPTION, "use repacked SequenceFiles"));
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("path: base path for 'original', actual path for 'repacked'").create(PATH_OPTION));
  options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("DocnoMapping data path")
      .create(MAPPING_OPTION));
  options.addOption(OptionBuilder.withArgName("num").hasArg()
      .withDescription("segment number (required if 'original')").create(SEGMENT_OPTION));
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("output file to write the number of records").create(COUNT_OPTION));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  boolean repacked;
  if (cmdline.hasOption(REPACKED_OPTION)) {
    repacked = true;
  } else if (cmdline.hasOption(ORIGINAL_OPTION)) {
    repacked = false;
  } else {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    System.err.println("Expecting either -original or -repacked");
    return -1;
  }

  if (!cmdline.hasOption(PATH_OPTION) || !cmdline.hasOption(MAPPING_OPTION)
      || (!repacked && !cmdline.hasOption(SEGMENT_OPTION))) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String path = cmdline.getOptionValue(PATH_OPTION);
  String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);

  int segment = 1;
  if (!repacked) {
    segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT_OPTION));
  }

  LOG.info("Tool name: " + CountClueWarcRecords.class.getSimpleName());
  LOG.info(" - repacked: " + repacked);
  LOG.info(" - path: " + path);
  LOG.info(" - mapping file: " + mappingFile);
  if (!repacked) {
    LOG.info(" - segment number: " + segment);
  }

  FileSystem fs = FileSystem.get(getConf());

  int mapTasks = 10;
  JobConf conf = new JobConf(getConf(), CountClueWarcRecords.class);
  conf.setJobName(CountClueWarcRecords.class.getSimpleName()
      + (repacked ? ":" + path : ":segment" + segment));

  conf.setNumMapTasks(mapTasks);
  conf.setNumReduceTasks(0);

  if (repacked) {
    // Note: we have to add the files one by one; otherwise SequenceFileInputFormat
    // thinks it's a MapFile.
    for (FileStatus status : fs.listStatus(new Path(path))) {
      FileInputFormat.addInputPath(conf, status.getPath());
    }
  } else {
    ClueCollectionPathConstants.addEnglishCollectionPart(conf, path, segment);
  }

  DistributedCache.addCacheFile(new URI(mappingFile), conf);

  if (repacked) {
    conf.setInputFormat(SequenceFileInputFormat.class);
  } else {
    conf.setInputFormat(ClueWarcInputFormat.class);
  }

  conf.setOutputFormat(NullOutputFormat.class);
  conf.setMapperClass(MyMapper.class);

  RunningJob job = JobClient.runJob(conf);
  Counters counters = job.getCounters();
  int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();
  LOG.info("Read " + numDocs + " docs.");

  if (cmdline.hasOption(COUNT_OPTION)) {
    String f = cmdline.getOptionValue(COUNT_OPTION);
    FSDataOutputStream out = fs.create(new Path(f));
    out.write(Integer.toString(numDocs).getBytes());
    out.close();
  }

  return 0;
}
From source file: edu.umd.cloud9.collection.clue.RepackClueWarcRecords.java
License: Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
  if (args.length != 5) {
    printUsage();
    return -1;
  }

  String basePath = args[0];
  String outputPath = args[1];
  int segment = Integer.parseInt(args[2]);
  String data = args[3];
  String compressionType = args[4];

  if (!compressionType.equals("block") && !compressionType.equals("record")
      && !compressionType.equals("none")) {
    System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
    System.exit(-1);
  }

  // Default block size.
  int blocksize = 1000000;

  JobConf conf = new JobConf(RepackClueWarcRecords.class);
  conf.setJobName("RepackClueWarcRecords:segment" + segment);

  conf.set("DocnoMappingDataFile", data);

  LOG.info("Tool name: RepackClueWarcRecords");
  LOG.info(" - base path: " + basePath);
  LOG.info(" - output path: " + outputPath);
  LOG.info(" - segment number: " + segment);
  LOG.info(" - docno mapping data file: " + data);
  LOG.info(" - compression type: " + compressionType);

  if (compressionType.equals("block")) {
    LOG.info(" - block size: " + blocksize);
  }

  int mapTasks = 10;

  conf.setNumMapTasks(mapTasks);
  conf.setNumReduceTasks(0);

  ClueCollectionPathConstants.addEnglishCollectionPart(conf, basePath, segment);

  SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

  if (compressionType.equals("none")) {
    SequenceFileOutputFormat.setCompressOutput(conf, false);
  } else {
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    if (compressionType.equals("record")) {
      SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
    } else {
      SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
      conf.setInt("io.seqfile.compress.blocksize", blocksize);
    }
  }

  conf.setInputFormat(ClueWarcInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(ClueWarcRecord.class);
  conf.setMapperClass(MyMapper.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  JobClient.runJob(conf);

  return 0;
}
From source file: edu.umd.cloud9.collection.line.NumberTextDocuments.java
License: Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
  if (args.length != 4) {
    printUsage();
    return -1;
  }

  String inputPath = args[0];
  String outputPath = args[1];
  String outputFile = args[2];
  int mapTasks = Integer.parseInt(args[3]);

  sLogger.info("Tool: NumberTextDocuments");
  sLogger.info(" - Input path: " + inputPath);
  sLogger.info(" - Output path: " + outputPath);
  sLogger.info(" - Output file: " + outputFile);
  sLogger.info("Launching with " + mapTasks + " mappers...");

  JobConf conf = new JobConf(getConf(), NumberTextDocuments.class);
  conf.setJobName("NumberTextDocuments");

  conf.setNumMapTasks(mapTasks);
  conf.setNumReduceTasks(1);

  FileInputFormat.setInputPaths(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);

  conf.setInputFormat(TextDocumentInputFormat.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setMapperClass(MyMapper.class);
  conf.setReducerClass(MyReducer.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  JobClient.runJob(conf);

  String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "part-00000";
  TextDocnoMapping.writeDocnoData(input, outputFile, FileSystem.get(getConf()));

  return 0;
}
From source file: edu.umd.cloud9.collection.medline.NumberMedlineCitations.java
License: Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
  if (args.length != 4) {
    printUsage();
    return -1;
  }

  String inputPath = args[0];
  String outputPath = args[1];
  String outputFile = args[2];
  int mapTasks = Integer.parseInt(args[3]);

  sLogger.info("Tool name: NumberMedlineCitations");
  sLogger.info(" - Input path: " + inputPath);
  sLogger.info(" - Output path: " + outputPath);
  sLogger.info(" - Output file: " + outputFile);
  sLogger.info("Launching with " + mapTasks + " mappers...");

  JobConf conf = new JobConf(getConf(), NumberMedlineCitations.class);
  conf.setJobName("NumberMedlineCitations");

  conf.setNumMapTasks(mapTasks);
  conf.setNumReduceTasks(1);

  FileInputFormat.setInputPaths(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);

  conf.setInputFormat(MedlineCitationInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(IntWritable.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setMapperClass(MyMapper.class);
  conf.setReducerClass(MyReducer.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  RunningJob job = JobClient.runJob(conf);

  // Write out various properties.
  Counters counters = job.getCounters();
  Counter counter = counters.findCounter(
      "edu.umd.cloud9.collection.medline.NumberMedlineCitations$Citations", 0, "");
  int numdocs = (int) counter.getCounter();
  sLogger.info("total number of docs: " + numdocs);

  MedlineDocnoMapping.writeDocidData(outputPath + "/part-00000", outputFile);

  return 0;
}
From source file: edu.umd.cloud9.collection.trec.NumberTrecDocuments.java
License: Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
  if (args.length != 3) {
    printUsage();
    return -1;
  }

  String inputPath = args[0];
  String outputPath = args[1];
  String outputFile = args[2];

  LOG.info("Tool: " + NumberTrecDocuments.class.getCanonicalName());
  LOG.info(" - Input path: " + inputPath);
  LOG.info(" - Output path: " + outputPath);
  LOG.info(" - Output file: " + outputFile);

  JobConf conf = new JobConf(getConf(), NumberTrecDocuments.class);
  conf.setJobName(NumberTrecDocuments.class.getSimpleName());

  conf.setNumMapTasks(100); // Arbitrary number; doesn't matter.
  conf.setNumReduceTasks(1);

  FileInputFormat.setInputPaths(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);

  conf.setInputFormat(TrecDocumentInputFormat.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setMapperClass(MyMapper.class);
  conf.setReducerClass(MyReducer.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  JobClient.runJob(conf);

  String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "part-00000";
  TrecDocnoMapping.writeMappingData(new Path(input), new Path(outputFile), FileSystem.get(getConf()));

  return 0;
}
From source file: edu.umd.cloud9.collection.trecweb.NumberTrecWebDocuments.java
License: Apache License
public int run(String[] args) throws Exception {
  if (args.length != 4) {
    System.out.println("usage: [input] [output-dir] [output-file] [num-mappers]");
    System.exit(-1);
  }

  String inputPath = args[0];
  String outputPath = args[1];
  String outputFile = args[2];
  int mapTasks = Integer.parseInt(args[3]);

  LOG.info("Tool name: " + NumberTrecWebDocuments.class.getCanonicalName());
  LOG.info(" - input path: " + inputPath);
  LOG.info(" - output path: " + outputPath);
  LOG.info(" - output file: " + outputFile);
  LOG.info(" - number of mappers: " + mapTasks);

  JobConf conf = new JobConf(getConf(), NumberTrecWebDocuments.class);
  conf.setJobName(NumberTrecWebDocuments.class.getSimpleName());

  conf.setNumMapTasks(mapTasks);
  conf.setNumReduceTasks(1);

  FileInputFormat.setInputPaths(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setMapperClass(MyMapper.class);
  conf.setReducerClass(MyReducer.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  JobClient.runJob(conf);

  writeMappingData(new Path(outputPath + "/part-00000"), new Path(outputFile), FileSystem.get(conf));

  return 0;
}
From source file: edu.umd.cloud9.collection.trecweb.RepackGov2Documents.java
License: Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
  if (args.length != 3) {
    printUsage();
    return -1;
  }

  String basePath = args[0];
  String outputPath = args[1];
  String compressionType = args[2];

  if (!compressionType.equals("block") && !compressionType.equals("record")
      && !compressionType.equals("none")) {
    System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
    System.exit(-1);
  }

  // This is the default block size.
  int blocksize = 1000000;

  JobConf conf = new JobConf(RepackGov2Documents.class);
  conf.setJobName("RepackGov2Documents");

  sLogger.info("Tool name: RepackGov2Documents");
  sLogger.info(" - base path: " + basePath);
  sLogger.info(" - output path: " + outputPath);
  sLogger.info(" - compression type: " + compressionType);

  if (compressionType.equals("block")) {
    sLogger.info(" - block size: " + blocksize);
  }

  int mapTasks = 10;

  conf.setNumMapTasks(mapTasks);
  conf.setNumReduceTasks(500); // 272

  for (int i = 0; i <= 272; i++) {
    String path = basePath + "/GX";
    String indexNum = Integer.toString(i);
    if (indexNum.length() == 1) {
      path += "00";
    }
    if (indexNum.length() == 2) {
      path += "0";
    }
    path += indexNum;
    FileInputFormat.addInputPath(conf, new Path(path));
  }

  SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

  if (compressionType.equals("none")) {
    SequenceFileOutputFormat.setCompressOutput(conf, false);
  } else {
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    if (compressionType.equals("record")) {
      SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
    } else {
      SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
      conf.setInt("io.seqfile.compress.blocksize", blocksize);
    }
  }

  conf.setInputFormat(TrecWebDocumentInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(TrecWebDocument.class);
  conf.setMapperClass(MyMapper.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  JobClient.runJob(conf);

  return 0;
}
From source file: edu.umd.cloud9.collection.trecweb.RepackWt10gDocuments.java
License: Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
  if (args.length != 3) {
    printUsage();
    return -1;
  }

  String basePath = args[0];
  String outputPath = args[1];
  String compressionType = args[2];

  if (!compressionType.equals("block") && !compressionType.equals("record")
      && !compressionType.equals("none")) {
    System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
    System.exit(-1);
  }

  // This is the default block size.
  int blocksize = 1000000;

  JobConf conf = new JobConf(RepackWt10gDocuments.class);
  conf.setJobName("RepackWt10gDocuments");

  sLogger.info("Tool name: RepackWt10gDocuments");
  sLogger.info(" - base path: " + basePath);
  sLogger.info(" - output path: " + outputPath);
  sLogger.info(" - compression type: " + compressionType);

  if (compressionType.equals("block")) {
    sLogger.info(" - block size: " + blocksize);
  }

  int mapTasks = 10;

  conf.setNumMapTasks(mapTasks);
  conf.setNumReduceTasks(50);

  for (int i = 1; i <= 104; i++) {
    String path = basePath + "/WTX";
    String indexNum = Integer.toString(i);
    if (indexNum.length() == 1) {
      path += "00";
    }
    if (indexNum.length() == 2) {
      path += "0";
    }
    path += indexNum;
    FileInputFormat.addInputPath(conf, new Path(path));
  }

  SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

  if (compressionType.equals("none")) {
    SequenceFileOutputFormat.setCompressOutput(conf, false);
  } else {
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    if (compressionType.equals("record")) {
      SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
    } else {
      SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
      conf.setInt("io.seqfile.compress.blocksize", blocksize);
    }
  }

  conf.setInputFormat(TrecWebDocumentInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(TrecWebDocument.class);
  conf.setMapperClass(MyMapper.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  JobClient.runJob(conf);

  return 0;
}
From source file: edu.umd.cloud9.collection.wikipedia.DemoCountWikipediaPages.java
License: Apache License
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(
      OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(INPUT_OPTION)) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String inputPath = cmdline.getOptionValue(INPUT_OPTION);

  LOG.info("Tool name: " + this.getClass().getName());
  LOG.info(" - XML dump file: " + inputPath);

  JobConf conf = new JobConf(getConf(), DemoCountWikipediaPages.class);
  conf.setJobName(String.format("DemoCountWikipediaPages[%s: %s]", INPUT_OPTION, inputPath));

  conf.setNumMapTasks(10);
  conf.setNumReduceTasks(0);

  FileInputFormat.setInputPaths(conf, new Path(inputPath));

  conf.setInputFormat(WikipediaPageInputFormat.class);
  conf.setOutputFormat(NullOutputFormat.class);

  conf.setMapperClass(MyMapper.class);

  JobClient.runJob(conf);

  return 0;
}
From source file: edu.umd.cloud9.demo.DemoWordCondProbJSON.java
License: Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
  if (args.length != 4) {
    printUsage();
    return -1;
  }

  String inputPath = args[0];
  String outputPath = args[1];
  int mapTasks = Integer.parseInt(args[2]);
  int reduceTasks = Integer.parseInt(args[3]);

  sLogger.info("Tool: DemoWordCondProbJSON");
  sLogger.info(" - input path: " + inputPath);
  sLogger.info(" - output path: " + outputPath);
  sLogger.info(" - number of mappers: " + mapTasks);
  sLogger.info(" - number of reducers: " + reduceTasks);

  JobConf conf = new JobConf(DemoWordCondProbJSON.class);
  conf.setJobName("DemoWordCondProbJSON");

  conf.setNumMapTasks(mapTasks);
  conf.setNumReduceTasks(reduceTasks);

  FileInputFormat.setInputPaths(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);

  conf.setOutputKeyClass(MyTuple.class);
  conf.setOutputValueClass(FloatWritable.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setMapperClass(MyMapper.class);
  // This is a potential gotcha! We can't use the reduce class for the combiner
  // because we have not collected all the counts yet, so we can't divide through
  // to compute the conditional probabilities.
  conf.setCombinerClass(IdentityReducer.class);
  conf.setReducerClass(MyReducer.class);
  conf.setPartitionerClass(MyPartitioner.class);

  // Delete the output directory if it exists already.
  Path outputDir = new Path(outputPath);
  FileSystem.get(conf).delete(outputDir, true);

  long startTime = System.currentTimeMillis();
  JobClient.runJob(conf);
  sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  return 0;
}