Example usage of org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setCompressOutput

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setCompressOutput.

Prototype

public static void setCompressOutput(Job job, boolean compress)

Source Link

Documentation

Set whether the output of the job is compressed.

Usage

From source file:edu.isi.mavuno.score.ScoreContexts.java

License:Apache License

/**
 * Configures and runs the "ScoreContexts" MapReduce job.
 *
 * <p>The input path, output path, scorer class and scorer arguments are looked up in the tool's
 * configuration through {@code MavunoUtils.getRequiredParam}. The job reads sequence files and
 * writes block-compressed sequence files with {@code ContextPatternWritable} keys and
 * {@code DoubleWritable} values.
 *
 * @return 0 if the job completed successfully, 1 if it failed
 * @throws ClassNotFoundException if thrown while waiting for the job to complete
 * @throws InterruptedException if thrown while waiting for the job to complete
 * @throws IOException if thrown while configuring or running the job
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.OutputPath", conf);
    // The scorer settings are only logged here.
    // NOTE(review): presumably the mapper/reducer read them from the configuration -- confirm.
    String contextScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String contextScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

    sLogger.info("Tool name: ScoreContexts");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Context scorer class: " + contextScorerClass);
    sLogger.info(" - Context scorer args: " + contextScorerArgs);

    Job job = new Job(conf);
    job.setJobName("ScoreContexts");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // Write block-compressed sequence files.
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ScoreWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Report a failed job to the caller instead of unconditionally returning success.
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:edu.isi.mavuno.score.ScorePatterns.java

License:Apache License

/**
 * Configures and runs the "ScorePatterns" MapReduce job.
 *
 * <p>The input path, output path, scorer class and scorer arguments are looked up in the tool's
 * configuration through {@code MavunoUtils.getRequiredParam}. The job reads sequence files and
 * writes block-compressed sequence files with {@code ContextPatternWritable} keys and
 * {@code DoubleWritable} values.
 *
 * @return 0 if the job completed successfully, 1 if it failed
 * @throws ClassNotFoundException if thrown while waiting for the job to complete
 * @throws InterruptedException if thrown while waiting for the job to complete
 * @throws IOException if thrown while configuring or running the job
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.OutputPath", conf);
    // The scorer settings are only logged here.
    // NOTE(review): presumably the mapper/reducer read them from the configuration -- confirm.
    String patternScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String patternScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

    sLogger.info("Tool name: ScorePatterns");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Pattern scorer class: " + patternScorerClass);
    sLogger.info(" - Pattern scorer args: " + patternScorerArgs);

    Job job = new Job(conf);
    job.setJobName("ScorePatterns");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // Write block-compressed sequence files.
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ScoreWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Report a failed job to the caller instead of unconditionally returning success.
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:edu.umd.cloud9.collection.aquaint2.Aquaint2DocnoMappingBuilder.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Runs a single-reducer MapReduce job that writes uncompressed text output, then passes the
 * resulting {@code part-r-00000} file to {@code Aquaint2DocnoMapping.writeDocnoData} to produce
 * the docno mapping file.
 *
 * @param args exactly three arguments: input path, output path, output file
 * @return 0 on success, -1 if the arguments are invalid or the job fails
 * @throws Exception if configuring or running the job, or writing the mapping data, fails
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];

    LOG.info("Tool: " + Aquaint2DocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Output file: " + outputFile);

    Job job = new Job(getConf(), Aquaint2DocnoMappingBuilder.class.getSimpleName());
    job.setJarByClass(Aquaint2DocnoMappingBuilder.class);

    // A single reducer so that all output lands in one file (part-r-00000).
    job.setNumReduceTasks(1);

    Path outputDir = new Path(outputPath);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, outputDir);
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(Aquaint2DocumentInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(outputDir, true);

    // Do not post-process output of a failed job: the part file may be missing or incomplete.
    if (!job.waitForCompletion(true)) {
        return -1;
    }

    // Path(parent, child) inserts exactly one separator whether or not outputPath ends with '/'.
    Path input = new Path(outputPath, "part-r-00000");
    Aquaint2DocnoMapping.writeDocnoData(input, new Path(outputFile), FileSystem.get(getConf()));

    return 0;
}

From source file:edu.umd.cloud9.collection.aquaint2.NumberAquaint2Documents2.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Runs a single-reducer MapReduce job that writes uncompressed text output, then passes the
 * resulting {@code part-r-00000} file to {@code Aquaint2DocnoMapping.writeDocnoData} to produce
 * the output file.
 *
 * @param args exactly three arguments: input directory, output directory, output file
 * @return 0 on success, -1 if the arguments are invalid or the job fails
 * @throws Exception if configuring or running the job, or writing the mapping data, fails
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    Path inputDirPath = new Path(args[0]);
    Path outputDirPath = new Path(args[1]);
    Path outputFilePath = new Path(args[2]);

    LOG.info("Tool: " + NumberAquaint2Documents2.class.getCanonicalName());
    LOG.info(" - Input dir path: " + inputDirPath);
    LOG.info(" - Output dir path: " + outputDirPath);
    LOG.info(" - Output file path: " + outputFilePath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Job job = new Job(conf, NumberAquaint2Documents2.class.getSimpleName());
    job.setJarByClass(NumberAquaint2Documents2.class);

    // A single reducer so that all output lands in one file (part-r-00000).
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, inputDirPath);
    FileOutputFormat.setOutputPath(job, outputDirPath);
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(Aquaint2DocumentInputFormat2.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(outputDirPath, true);

    // Do not post-process output of a failed job: the part file may be missing or incomplete.
    if (!job.waitForCompletion(true)) {
        return -1;
    }

    // Path(parent, child) inserts exactly one separator regardless of a trailing '/'.
    Path inputFilePath = new Path(outputDirPath, "part-r-00000");
    // Reuse the FileSystem obtained above from the same configuration.
    Aquaint2DocnoMapping.writeDocnoData(inputFilePath, outputFilePath, fs);

    return 0;
}

From source file:edu.umd.cloud9.collection.medline.CountMedlineCitations.java

License:Apache License

/**
 * Parses the command line, runs a map-only job over the collection, and reports the value of
 * the {@code Count.DOCS} counter.
 *
 * @param args command-line arguments; the collection, output and mapping options are required
 * @return the value of the {@code Count.DOCS} counter cast to {@code int}, or -1 if the command
 *         line cannot be parsed or a required option is missing
 * @throws Exception if configuring or running the job fails
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path")
            .create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output path")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data")
            .create(MAPPING_OPTION));

    CommandLine commandLine;
    try {
        commandLine = new GnuParser().parse(options, args);
    } catch (ParseException e) {
        System.err.println("Error parsing command line: " + e.getMessage());
        return -1;
    }

    // All three options are mandatory; print usage and bail out if any is absent.
    boolean hasAllRequired = commandLine.hasOption(COLLECTION_OPTION)
            && commandLine.hasOption(OUTPUT_OPTION)
            && commandLine.hasOption(MAPPING_OPTION);
    if (!hasAllRequired) {
        new HelpFormatter().printHelp(getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = commandLine.getOptionValue(COLLECTION_OPTION);
    String outputPath = commandLine.getOptionValue(OUTPUT_OPTION);
    String mappingFile = commandLine.getOptionValue(MAPPING_OPTION);

    LOG.info("Tool: " + CountMedlineCitations.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);

    String jobName = CountMedlineCitations.class.getSimpleName() + ":" + inputPath;
    Job job = new Job(getConf(), jobName);
    job.setJarByClass(CountMedlineCitations.class);

    // Map-only job.
    job.setNumReduceTasks(0);

    Configuration jobConf = job.getConfiguration();

    // The mapping class is handed over by name so the mapper stays generic: it can load any
    // collection of Indexable objects whose docid/docno mapping is given by a DocnoMapping.
    jobConf.set("DocnoMappingClass", MedlineDocnoMapping.class.getCanonicalName());

    // Ship the mapping file through the distributed cache so every map worker has it.
    DistributedCache.addCacheFile(new URI(mappingFile), jobConf);

    Path outputDir = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, outputDir);
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(MedlineCitationInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);

    // Remove any pre-existing output directory.
    FileSystem.get(jobConf).delete(outputDir, true);

    job.waitForCompletion(true);

    int numDocs = (int) job.getCounters().findCounter(Count.DOCS).getValue();
    LOG.info("Read " + numDocs + " docs.");

    return numDocs;
}

From source file:edu.umd.cloud9.collection.medline.DemoCountMedlineCitations2.java

License:Apache License

/**
 * Runs this tool./*from  w w  w  . j a  v a2s.  com*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String mappingFile = args[2];

    LOG.info("Tool: " + DemoCountMedlineCitations2.class.getCanonicalName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);

    Job job = new Job(getConf(), DemoCountMedlineCitations2.class.getSimpleName());
    job.setJarByClass(DemoCountMedlineCitations.class);

    job.setNumReduceTasks(0);

    // Pass in the class name as a String; this is makes the mapper general in being able to load
    // any collection of Indexable objects that has docid/docno mapping specified by a DocnoMapping
    // object.
    job.getConfiguration().set("DocnoMappingClass", MedlineDocnoMapping.class.getCanonicalName());

    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(MedlineCitationInputFormat2.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.umd.cloud9.collection.medline.MedlineDocnoMappingBuilder.java

License:Apache License

/**
 * Runs this tool./*from w ww. ja va 2  s. c  om*/
 */
public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    if (options == null) {
        return -1;
    }

    // Temp directory.
    String tmpDir = "tmp-" + MedlineDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool: " + MedlineDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job = new Job(getConf(), MedlineDocnoMappingBuilder.class.getSimpleName() + ":" + options.collection);
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(MedlineDocnoMappingBuilder.class);

    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(options.collection));
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(MedlineCitationInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    String input = tmpDir + (tmpDir.endsWith("/") ? "" : "/") + "/part-r-00000";
    MedlineDocnoMapping.writeMappingData(new Path(input), new Path(options.docnoMapping),
            FileSystem.get(getConf()));

    fs.delete(new Path(tmpDir), true);

    return 0;
}

From source file:edu.umd.cloud9.collection.medline.NumberMedlineCitations2.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Runs a single-reducer MapReduce job that writes uncompressed text output, then passes the
 * resulting {@code part-r-00000} file to {@code MedlineDocnoMapping.writeMappingData} to produce
 * the output file.
 *
 * @param args exactly three arguments: input path, output path, output file
 * @return 0 on success, -1 if the arguments are invalid or the job fails
 * @throws Exception if configuring or running the job, or writing the mapping data, fails
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];

    LOG.info("Tool: " + NumberMedlineCitations2.class.getCanonicalName());
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Output file: " + outputFile);

    Job job = new Job(getConf(), NumberMedlineCitations2.class.getSimpleName());
    // Locate the job jar through this tool's own class; the original referenced
    // NumberMedlineCitations, which only works if both classes ship in the same jar.
    job.setJarByClass(NumberMedlineCitations2.class);

    // A single reducer so that all output lands in one file (part-r-00000).
    job.setNumReduceTasks(1);

    Path outputDir = new Path(outputPath);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, outputDir);
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(MedlineCitationInputFormat2.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(outputDir, true);

    // Do not post-process output of a failed job: the part file may be missing or incomplete.
    if (!job.waitForCompletion(true)) {
        return -1;
    }

    // Path(parent, child) inserts exactly one separator whether or not outputPath ends with '/'.
    Path input = new Path(outputPath, "part-r-00000");
    MedlineDocnoMapping.writeMappingData(input, new Path(outputFile), FileSystem.get(getConf()));

    return 0;
}

From source file:edu.umd.cloud9.collection.trec.BuildTrecForwardIndex2.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Runs a MapReduce job over the collection and then converts the job's text output file
 * ({@code part-r-00000}) into a binary index file. The index file holds, in order: the canonical
 * name of {@code TrecForwardIndex} and the collection path (both via {@code writeUTF}), the
 * document count as an int, then one (long offset, int length) pair per input line.
 *
 * @param args exactly four arguments: collection path, output path, index file, mapping file
 * @return 0 on success, -1 if the arguments are invalid or the job fails
 * @throws RuntimeException if the number of entries written differs from the {@code Count.DOCS}
 *         counter
 * @throws Exception if configuring or running the job, or reading/writing the files, fails
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    Job job = new Job(getConf(), BuildTrecForwardIndex2.class.getCanonicalName());
    job.setJarByClass(BuildTrecForwardIndex2.class);
    FileSystem fs = FileSystem.get(getConf());

    String collectionPath = args[0];
    String outputPath = args[1];
    String indexFile = args[2];
    String mappingFile = args[3];

    LOG.info("Tool name: " + BuildTrecForwardIndex2.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - mapping file: " + mappingFile);

    job.getConfiguration().set("mapred.child.java.opts", "-Xmx1024m");
    // A single reducer so that all output lands in one file (part-r-00000).
    job.setNumReduceTasks(1);

    // Constant-first equals: the property may be unset, in which case get() returns null and
    // the original receiver-first comparison threw a NullPointerException.
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker"))) {
        job.getConfiguration().set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());
    }

    Path outputDir = new Path(outputPath);

    FileInputFormat.setInputPaths(job, new Path(collectionPath));
    FileOutputFormat.setOutputPath(job, outputDir);
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TrecDocumentInputFormat2.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    fs.delete(outputDir, true);

    // Do not post-process output of a failed job: the part file may be missing or incomplete.
    if (!job.waitForCompletion(true)) {
        return -1;
    }

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();

    String inputFile = outputPath + "/" + "part-r-00000";

    LOG.info("Writing " + numDocs + " doc offseta to " + indexFile);

    int cnt = 0;
    // try/finally so that both streams are released even if parsing or writing fails midway.
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));
    try {
        FSDataOutputStream writer = fs.create(new Path(indexFile), true);
        try {
            // Header: index implementation class, collection location, number of entries.
            writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
            writer.writeUTF(collectionPath);
            writer.writeInt(numDocs);

            Text line = new Text();
            while (reader.readLine(line) > 0) {
                // Tab-separated line: field 1 is the byte offset, field 2 is the length.
                String[] arr = line.toString().split("\\t");
                long offset = Long.parseLong(arr[1]);
                int len = Integer.parseInt(arr[2]);

                writer.writeLong(offset);
                writer.writeInt(len);

                cnt++;
                if (cnt % 100000 == 0) {
                    LOG.info(cnt + " docs");
                }
            }
        } finally {
            writer.close();
        }
    } finally {
        reader.close();
    }
    LOG.info(cnt + " docs total. Done!");

    // The header already claims numDocs entries; a mismatch means the index is inconsistent.
    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }

    return 0;
}

From source file:edu.umd.cloud9.collection.trec.CountTrecDocuments.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Parses the command line, runs a map-only job over the collection, and reports the value of
 * the {@code Count.DOCS} counter. If the count option is given, the count is also written as
 * decimal text to that file.
 *
 * @param args command-line arguments; the collection, output and mapping options are required,
 *        the count-file option is optional
 * @return the value of the {@code Count.DOCS} counter cast to {@code int}, or -1 if the command
 *         line cannot be parsed or a required option is missing
 * @throws Exception if configuring or running the job, or writing the count file, fails
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path")
            .create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output path")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data")
            .create(MAPPING_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("(optional) output file to write the number of records").create(COUNT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);

    LOG.info("Tool: " + CountTrecDocuments.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);

    Job job = new Job(getConf(), CountTrecDocuments.class.getSimpleName());
    job.setJarByClass(CountTrecDocuments.class);

    // Map-only job.
    job.setNumReduceTasks(0);

    // Pass in the class name as a String; this makes the mapper general in being able to load
    // any collection of Indexable objects that has docid/docno mapping specified by a
    // DocnoMapping object.
    job.getConfiguration().set("DocnoMappingClass", TrecDocnoMapping.class.getCanonicalName());

    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TrecDocumentInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();
    LOG.info("Read " + numDocs + " docs.");

    if (cmdline.hasOption(COUNT_OPTION)) {
        String f = cmdline.getOptionValue(COUNT_OPTION);
        FileSystem fs = FileSystem.get(getConf());
        FSDataOutputStream out = fs.create(new Path(f));
        try {
            // Explicit charset instead of the platform default, and no boxing through the
            // deprecated Integer constructor.
            out.write(Integer.toString(numDocs).getBytes("UTF-8"));
        } finally {
            // Release the stream even if the write fails.
            out.close();
        }
    }

    return numDocs;
}