List of usage examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#setCompressOutput
public static void setCompressOutput(Job job, boolean compress)
From source file:edu.isi.mavuno.score.ScoreContexts.java
License:Apache License
/**
 * Configures and runs the "ScoreContexts" MapReduce job.
 *
 * <p>Reads {@code Mavuno.ScoreContexts.InputPath}, {@code Mavuno.ScoreContexts.OutputPath},
 * {@code Mavuno.Scorer.Class} and {@code Mavuno.Scorer.Args} from the tool configuration via
 * {@code MavunoUtils.getRequiredParam}. The scorer class and args are only fetched and logged
 * here; presumably the mapper/reducer pick them up from the configuration themselves (verify
 * against {@code MyMapper}/{@code MyReducer}).
 *
 * <p>The job reads sequence files and writes block-compressed sequence files keyed by
 * {@code ContextPatternWritable} with {@code DoubleWritable} values.
 *
 * @return 0 if the job completed successfully, 1 if it failed
 * @throws ClassNotFoundException if thrown while running the job
 * @throws InterruptedException if thrown while waiting for the job
 * @throws IOException if thrown while setting up or running the job
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
  Configuration conf = getConf();

  String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.InputPath", conf);
  String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.OutputPath", conf);
  String contextScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
  String contextScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

  sLogger.info("Tool name: ScoreContexts");
  sLogger.info(" - Input path: " + inputPath);
  sLogger.info(" - Output path: " + outputPath);
  sLogger.info(" - Context scorer class: " + contextScorerClass);
  sLogger.info(" - Context scorer args: " + contextScorerArgs);

  Job job = new Job(conf);
  job.setJobName("ScoreContexts");

  FileInputFormat.addInputPath(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setInputFormatClass(SequenceFileInputFormat.class);

  job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
  job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);

  // Block-compressed sequence file output.
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileOutputFormat.setCompressOutput(job, true);
  SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

  job.setMapOutputKeyClass(ContextPatternWritable.class);
  job.setMapOutputValueClass(ScoreWritable.class);

  job.setOutputKeyClass(ContextPatternWritable.class);
  job.setOutputValueClass(DoubleWritable.class);

  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);

  // Propagate the job outcome instead of unconditionally reporting success.
  return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.score.ScorePatterns.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.InputPath", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.OutputPath", conf); String patternScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf); String patternScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf); sLogger.info("Tool name: ScorePatterns"); sLogger.info(" - Input path: " + inputPath); sLogger.info(" - Output path: " + outputPath); sLogger.info(" - Pattern scorer class: " + patternScorerClass); sLogger.info(" - Pattern scorer args: " + patternScorerArgs); Job job = new Job(conf); job.setJobName("ScorePatterns"); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class); job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setMapOutputValueClass(ScoreWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(DoubleWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true);//from ww w .j a v a 2 s. c o m return 0; }
From source file:edu.umd.cloud9.collection.aquaint2.Aquaint2DocnoMappingBuilder.java
License:Apache License
/** * Runs this tool.//from ww w. j a v a2 s.c o m */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; String outputFile = args[2]; LOG.info("Tool: " + Aquaint2DocnoMappingBuilder.class.getCanonicalName()); LOG.info(" - Input path: " + inputPath); LOG.info(" - Output path: " + outputPath); LOG.info(" - Output file: " + outputFile); Job job = new Job(getConf(), Aquaint2DocnoMappingBuilder.class.getSimpleName()); job.setJarByClass(Aquaint2DocnoMappingBuilder.class); job.setNumReduceTasks(1); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); FileOutputFormat.setCompressOutput(job, false); job.setInputFormatClass(Aquaint2DocumentInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true); job.waitForCompletion(true); String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "/part-r-00000"; Aquaint2DocnoMapping.writeDocnoData(new Path(input), new Path(outputFile), FileSystem.get(getConf())); return 0; }
From source file:edu.umd.cloud9.collection.aquaint2.NumberAquaint2Documents2.java
License:Apache License
/** * Runs this tool.//from ww w . j a va2s.com */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } Path inputDirPath = new Path(args[0]); String outputDirPathname = args[1]; Path outputDirPath = new Path(outputDirPathname); Path outputFilePath = new Path(args[2]); LOG.info("Tool: " + NumberAquaint2Documents2.class.getCanonicalName()); LOG.info(" - Input dir path: " + inputDirPath); LOG.info(" - Output dir path: " + outputDirPath); LOG.info(" - Output file path: " + outputFilePath); Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); Job job = new Job(conf, NumberAquaint2Documents2.class.getSimpleName()); job.setJarByClass(NumberAquaint2Documents2.class); job.setNumReduceTasks(1); FileInputFormat.setInputPaths(job, inputDirPath); FileOutputFormat.setOutputPath(job, outputDirPath); FileOutputFormat.setCompressOutput(job, false); job.setInputFormatClass(Aquaint2DocumentInputFormat2.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. fs.delete(outputDirPath, true); job.waitForCompletion(true); Path inputFilePath = new Path( outputDirPathname + (outputDirPathname.endsWith("/") ? "" : "/") + "/part-r-00000"); Aquaint2DocnoMapping.writeDocnoData(inputFilePath, outputFilePath, FileSystem.get(getConf())); return 0; }
From source file:edu.umd.cloud9.collection.medline.CountMedlineCitations.java
License:Apache License
@SuppressWarnings("static-access") public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path") .create(COLLECTION_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output path") .create(OUTPUT_OPTION));/*from w ww . j a v a 2 s .c o m*/ options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data") .create(MAPPING_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MAPPING_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(COLLECTION_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); String mappingFile = cmdline.getOptionValue(MAPPING_OPTION); LOG.info("Tool: " + CountMedlineCitations.class.getSimpleName()); LOG.info(" - input: " + inputPath); LOG.info(" - output dir: " + outputPath); LOG.info(" - docno mapping file: " + mappingFile); Job job = new Job(getConf(), CountMedlineCitations.class.getSimpleName() + ":" + inputPath); job.setJarByClass(CountMedlineCitations.class); job.setNumReduceTasks(0); // Pass in the class name as a String; this is makes the mapper general in being able to load // any collection of Indexable objects that has docid/docno mapping specified by a DocnoMapping // object. job.getConfiguration().set("DocnoMappingClass", MedlineDocnoMapping.class.getCanonicalName()); // Put the mapping file in the distributed cache so each map worker will have it. 
DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration()); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); FileOutputFormat.setCompressOutput(job, false); job.setInputFormatClass(MedlineCitationInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true); job.waitForCompletion(true); Counters counters = job.getCounters(); int numDocs = (int) counters.findCounter(Count.DOCS).getValue(); LOG.info("Read " + numDocs + " docs."); return numDocs; }
From source file:edu.umd.cloud9.collection.medline.DemoCountMedlineCitations2.java
License:Apache License
/** * Runs this tool./*from w w w . j a v a2s. com*/ */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; String mappingFile = args[2]; LOG.info("Tool: " + DemoCountMedlineCitations2.class.getCanonicalName()); LOG.info(" - input: " + inputPath); LOG.info(" - output dir: " + outputPath); LOG.info(" - docno mapping file: " + mappingFile); Job job = new Job(getConf(), DemoCountMedlineCitations2.class.getSimpleName()); job.setJarByClass(DemoCountMedlineCitations.class); job.setNumReduceTasks(0); // Pass in the class name as a String; this is makes the mapper general in being able to load // any collection of Indexable objects that has docid/docno mapping specified by a DocnoMapping // object. job.getConfiguration().set("DocnoMappingClass", MedlineDocnoMapping.class.getCanonicalName()); // Put the mapping file in the distributed cache so each map worker will have it. DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration()); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); FileOutputFormat.setCompressOutput(job, false); job.setInputFormatClass(MedlineCitationInputFormat2.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true); job.waitForCompletion(true); return 0; }
From source file:edu.umd.cloud9.collection.medline.MedlineDocnoMappingBuilder.java
License:Apache License
/** * Runs this tool./*from w ww. ja va 2 s. c om*/ */ public int run(String[] args) throws IOException { DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args); if (options == null) { return -1; } // Temp directory. String tmpDir = "tmp-" + MedlineDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000); LOG.info("Tool: " + MedlineDocnoMappingBuilder.class.getCanonicalName()); LOG.info(" - input path: " + options.collection); LOG.info(" - output file: " + options.docnoMapping); Job job = new Job(getConf(), MedlineDocnoMappingBuilder.class.getSimpleName() + ":" + options.collection); FileSystem fs = FileSystem.get(job.getConfiguration()); job.setJarByClass(MedlineDocnoMappingBuilder.class); job.setNumReduceTasks(1); FileInputFormat.setInputPaths(job, new Path(options.collection)); FileOutputFormat.setOutputPath(job, new Path(tmpDir)); FileOutputFormat.setCompressOutput(job, false); job.setInputFormatClass(MedlineCitationInputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IntWritable.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. fs.delete(new Path(tmpDir), true); try { job.waitForCompletion(true); } catch (Exception e) { throw new RuntimeException(e); } String input = tmpDir + (tmpDir.endsWith("/") ? "" : "/") + "/part-r-00000"; MedlineDocnoMapping.writeMappingData(new Path(input), new Path(options.docnoMapping), FileSystem.get(getConf())); fs.delete(new Path(tmpDir), true); return 0; }
From source file:edu.umd.cloud9.collection.medline.NumberMedlineCitations2.java
License:Apache License
/** * Runs this tool.//from w w w. j av a2 s . c om */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; String outputFile = args[2]; LOG.info("Tool: " + NumberMedlineCitations2.class.getCanonicalName()); LOG.info(" - Input path: " + inputPath); LOG.info(" - Output path: " + outputPath); LOG.info(" - Output file: " + outputFile); Job job = new Job(getConf(), NumberMedlineCitations2.class.getSimpleName()); job.setJarByClass(NumberMedlineCitations.class); job.setNumReduceTasks(1); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); FileOutputFormat.setCompressOutput(job, false); job.setInputFormatClass(MedlineCitationInputFormat2.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IntWritable.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true); job.waitForCompletion(true); String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "/part-r-00000"; MedlineDocnoMapping.writeMappingData(new Path(input), new Path(outputFile), FileSystem.get(getConf())); return 0; }
From source file:edu.umd.cloud9.collection.trec.BuildTrecForwardIndex2.java
License:Apache License
/**
 * Runs this tool: executes a single-reducer job over the collection, then turns the job's
 * text output into a binary forward-index file.
 *
 * <p>The index file starts with the {@code TrecForwardIndex} class name and the collection
 * path (both via {@code writeUTF}), followed by the document count and then one
 * {@code (long offset, int length)} pair per line of job output.
 *
 * @param args exactly four arguments: collection path, job output directory, index file, and
 *     docno mapping file
 * @return 0 on success, -1 if the argument count is wrong or the job failed
 * @throws RuntimeException if the number of lines read differs from the {@code Count.DOCS}
 *     counter
 * @throws Exception if job setup, job execution, or index writing fails
 */
public int run(String[] args) throws Exception {
  if (args.length != 4) {
    printUsage();
    return -1;
  }

  Job job = new Job(getConf(), BuildTrecForwardIndex2.class.getCanonicalName());
  job.setJarByClass(BuildTrecForwardIndex2.class);
  FileSystem fs = FileSystem.get(getConf());

  String collectionPath = args[0];
  String outputPath = args[1];
  String indexFile = args[2];
  String mappingFile = args[3];

  LOG.info("Tool name: " + BuildTrecForwardIndex2.class.getSimpleName());
  LOG.info(" - collection path: " + collectionPath);
  LOG.info(" - output path: " + outputPath);
  LOG.info(" - index file: " + indexFile);
  LOG.info(" - mapping file: " + mappingFile);

  job.getConfiguration().set("mapred.child.java.opts", "-Xmx1024m");
  job.setNumReduceTasks(1);

  // Constant-first comparison: the property may be unset, in which case get() returns null.
  if ("local".equals(job.getConfiguration().get("mapred.job.tracker"))) {
    job.getConfiguration().set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
  } else {
    DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());
  }

  FileInputFormat.setInputPaths(job, new Path(collectionPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));
  FileOutputFormat.setCompressOutput(job, false);

  job.setInputFormatClass(TrecDocumentInputFormat2.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(Text.class);

  job.setMapperClass(MyMapper.class);

  // delete the output directory if it exists already
  fs.delete(new Path(outputPath), true);

  // Do not try to read the part file of a job that did not succeed.
  if (!job.waitForCompletion(true)) {
    return -1;
  }

  Counters counters = job.getCounters();
  int numDocs = (int) counters.findCounter(Count.DOCS).getValue();

  String inputFile = outputPath + "/" + "part-r-00000";

  LOG.info("Writing " + numDocs + " doc offseta to " + indexFile);

  int cnt = 0;
  LineReader reader = new LineReader(fs.open(new Path(inputFile)));
  try {
    FSDataOutputStream writer = fs.create(new Path(indexFile), true);
    try {
      writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
      writer.writeUTF(collectionPath);
      writer.writeInt(numDocs);

      Text line = new Text();
      while (reader.readLine(line) > 0) {
        // Tab-separated fields; field 1 is the offset and field 2 the length
        // (field 0 is presumably the docno key -- confirm against MyMapper).
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);

        writer.writeLong(offset);
        writer.writeInt(len);

        cnt++;
        if (cnt % 100000 == 0) {
          LOG.info(cnt + " docs");
        }
      }
    } finally {
      // Release the output stream even if reading or parsing fails midway.
      writer.close();
    }
  } finally {
    reader.close();
  }

  LOG.info(cnt + " docs total. Done!");

  if (numDocs != cnt) {
    throw new RuntimeException("Unexpected number of documents in building forward index!");
  }

  return 0;
}
From source file:edu.umd.cloud9.collection.trec.CountTrecDocuments.java
License:Apache License
/** * Runs this tool.//from w w w. j ava 2s . com */ @SuppressWarnings("static-access") public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path") .create(COLLECTION_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output path") .create(OUTPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data") .create(MAPPING_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg() .withDescription("(optional) output file to write the number of records").create(COUNT_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MAPPING_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(COLLECTION_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); String mappingFile = cmdline.getOptionValue(MAPPING_OPTION); LOG.info("Tool: " + CountTrecDocuments.class.getSimpleName()); LOG.info(" - input: " + inputPath); LOG.info(" - output dir: " + outputPath); LOG.info(" - docno mapping file: " + mappingFile); Job job = new Job(getConf(), CountTrecDocuments.class.getSimpleName()); job.setJarByClass(CountTrecDocuments.class); job.setNumReduceTasks(0); // Pass in the class name as a String; this is makes the mapper general in being able to load // any collection of Indexable objects that has docid/docno mapping specified by a DocnoMapping // object. 
job.getConfiguration().set("DocnoMappingClass", TrecDocnoMapping.class.getCanonicalName()); // Put the mapping file in the distributed cache so each map worker will have it. DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration()); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); FileOutputFormat.setCompressOutput(job, false); job.setInputFormatClass(TrecDocumentInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true); job.waitForCompletion(true); Counters counters = job.getCounters(); int numDocs = (int) counters.findCounter(Count.DOCS).getValue(); LOG.info("Read " + numDocs + " docs."); if (cmdline.hasOption(COUNT_OPTION)) { String f = cmdline.getOptionValue(COUNT_OPTION); FileSystem fs = FileSystem.get(getConf()); FSDataOutputStream out = fs.create(new Path(f)); out.write(new Integer(numDocs).toString().getBytes()); out.close(); } return numDocs; }