Example usage for org.apache.hadoop.mapred JobConf setOutputFormat

Introduction

This page lists usage examples for the setOutputFormat method of org.apache.hadoop.mapred.JobConf.

Prototype

public void setOutputFormat(Class<? extends OutputFormat> theClass) 

Document

Set the OutputFormat implementation for the map-reduce job.
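
Before the full project examples below, here is a minimal, self-contained sketch of a typical call site. It is not drawn from any of the listed projects; the class name, paths, and chosen formats are illustrative placeholders for the old mapred API.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class MinimalSetOutputFormatExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(MinimalSetOutputFormatExample.class);
        conf.setJobName("MinimalSetOutputFormatExample");

        // Placeholder paths; replace with real input/output locations.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/input"));
        FileOutputFormat.setOutputPath(conf, new Path("/tmp/output"));

        conf.setInputFormat(TextInputFormat.class);
        // The call documented on this page: select the OutputFormat implementation.
        conf.setOutputFormat(TextOutputFormat.class);

        // The output key/value classes must match what the job actually emits.
        // With the default identity mapper/reducer over TextInputFormat, that is
        // (LongWritable offset, Text line).
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        JobClient.runJob(conf);
    }
}

Note that setOutputFormat only selects the OutputFormat class; output paths and compression are configured separately through the format's static helpers (for example, FileOutputFormat.setOutputPath or SequenceFileOutputFormat.setCompressOutput), as the examples below illustrate.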

Usage

From source file: edu.umd.cloud9.collection.clue.RepackClueWarcRecords.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        printUsage();
        return -1;
    }

    String basePath = args[0];
    String outputPath = args[1];
    int segment = Integer.parseInt(args[2]);
    String data = args[3];
    String compressionType = args[4];

    if (!compressionType.equals("block") && !compressionType.equals("record")
            && !compressionType.equals("none")) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        System.exit(-1);
    }

    // Default block size.
    int blocksize = 1000000;

    JobConf conf = new JobConf(RepackClueWarcRecords.class);
    conf.setJobName("RepackClueWarcRecords:segment" + segment);

    conf.set("DocnoMappingDataFile", data);

    LOG.info("Tool name: RepackClueWarcRecords");
    LOG.info(" - base path: " + basePath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - segment number: " + segment);
    LOG.info(" - docno mapping data file: " + data);
    LOG.info(" - compression type: " + compressionType);

    if (compressionType.equals("block")) {
        LOG.info(" - block size: " + blocksize);
    }

    int mapTasks = 10;

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    ClueCollectionPathConstants.addEnglishCollectionPart(conf, basePath, segment);

    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if (compressionType.equals("none")) {
        SequenceFileOutputFormat.setCompressOutput(conf, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(conf, true);

        if (compressionType.equals("record")) {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
            conf.setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    conf.setInputFormat(ClueWarcInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ClueWarcRecord.class);

    conf.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}

From source file: edu.umd.cloud9.collection.line.DemoCountTextDocuments.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 1) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];

    sLogger.info("input: " + inputPath);

    JobConf conf = new JobConf(DemoCountTextDocuments.class);
    conf.setJobName("DemoCountTextDocuments");

    conf.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));

    conf.setInputFormat(TextDocumentInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);

    JobClient.runJob(conf);

    return 0;
}

From source file: edu.umd.cloud9.collection.line.NumberTextDocuments.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];
    int mapTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: NumberTextDocuments");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Output file: " + outputFile);
    sLogger.info("Launching with " + mapTasks + " mappers...");

    JobConf conf = new JobConf(getConf(), NumberTextDocuments.class);
    conf.setJobName("NumberTextDocuments");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(TextDocumentInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "part-00000";
    TextDocnoMapping.writeDocnoData(input, outputFile, FileSystem.get(getConf()));

    return 0;
}

From source file: edu.umd.cloud9.collection.medline.NumberMedlineCitations.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];
    int mapTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool name: NumberMedlineCitations");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Output file: " + outputFile);
    sLogger.info("Launching with " + mapTasks + " mappers...");

    JobConf conf = new JobConf(getConf(), NumberMedlineCitations.class);
    conf.setJobName("NumberMedlineCitations");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(MedlineCitationInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    RunningJob job = JobClient.runJob(conf);

    // write out various properties
    Counters counters = job.getCounters();
    Counter counter = counters.findCounter("edu.umd.cloud9.collection.medline.NumberMedlineCitations$Citations",
            0, "");

    int numdocs = (int) counter.getCounter();
    sLogger.info("total number of docs: " + numdocs);

    MedlineDocnoMapping.writeDocidData(outputPath + "/part-00000", outputFile);

    return 0;
}

From source file: edu.umd.cloud9.collection.trec.NumberTrecDocuments.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];

    LOG.info("Tool: " + NumberTrecDocuments.class.getCanonicalName());
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Output file: " + outputFile);

    JobConf conf = new JobConf(getConf(), NumberTrecDocuments.class);
    conf.setJobName(NumberTrecDocuments.class.getSimpleName());

    conf.setNumMapTasks(100); // Arbitrary number; doesn't matter.
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(TrecDocumentInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "part-00000";
    TrecDocnoMapping.writeMappingData(new Path(input), new Path(outputFile), FileSystem.get(getConf()));

    return 0;
}

From source file: edu.umd.cloud9.collection.trecweb.NumberTrecWebDocuments.java

License: Apache License

public int run(String[] args) throws Exception {
    if (args.length != 4) {
        System.out.println("usage: [input] [output-dir] [output-file] [num-mappers]");
        System.exit(-1);
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];
    int mapTasks = Integer.parseInt(args[3]);

    LOG.info("Tool name: " + NumberTrecWebDocuments.class.getCanonicalName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - output file: " + outputFile);
    LOG.info(" - number of mappers: " + mapTasks);

    JobConf conf = new JobConf(getConf(), NumberTrecWebDocuments.class);
    conf.setJobName(NumberTrecWebDocuments.class.getSimpleName());

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    writeMappingData(new Path(outputPath + "/part-00000"), new Path(outputFile), FileSystem.get(conf));

    return 0;
}

From source file: edu.umd.cloud9.collection.trecweb.RepackGov2Documents.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String basePath = args[0];
    String outputPath = args[1];
    String compressionType = args[2];

    if (!compressionType.equals("block") && !compressionType.equals("record")
            && !compressionType.equals("none")) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        System.exit(-1);
    }

    // this is the default block size
    int blocksize = 1000000;

    JobConf conf = new JobConf(RepackGov2Documents.class);
    conf.setJobName("RepackGov2Documents");

    sLogger.info("Tool name: RepackGov2Documents");
    sLogger.info(" - base path: " + basePath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - compression type: " + compressionType);

    if (compressionType.equals("block")) {
        sLogger.info(" - block size: " + blocksize);
    }

    int mapTasks = 10;

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(500);

    // The Gov2 collection is partitioned into directories GX000 through GX272.
    for (int i = 0; i <= 272; i++) {
        String path = basePath + "/GX";
        String indexNum = Integer.toString(i);
        if (indexNum.length() == 1) {
            path += "00";
        }
        if (indexNum.length() == 2) {
            path += "0";
        }
        path += indexNum;
        FileInputFormat.addInputPath(conf, new Path(path));
    }

    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if (compressionType.equals("none")) {
        SequenceFileOutputFormat.setCompressOutput(conf, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(conf, true);

        if (compressionType.equals("record")) {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
            conf.setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    conf.setInputFormat(TrecWebDocumentInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(TrecWebDocument.class);

    conf.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}

From source file: edu.umd.cloud9.collection.trecweb.RepackWt10gDocuments.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String basePath = args[0];
    String outputPath = args[1];
    String compressionType = args[2];

    if (!compressionType.equals("block") && !compressionType.equals("record")
            && !compressionType.equals("none")) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        System.exit(-1);
    }

    // this is the default block size
    int blocksize = 1000000;

    JobConf conf = new JobConf(RepackWt10gDocuments.class);
    conf.setJobName("RepackWt10gDocuments");

    sLogger.info("Tool name: RepackWt10gDocuments");
    sLogger.info(" - base path: " + basePath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - compression type: " + compressionType);

    if (compressionType.equals("block")) {
        sLogger.info(" - block size: " + blocksize);
    }

    int mapTasks = 10;

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(50);

    // The Wt10g collection is partitioned into directories WTX001 through WTX104.
    for (int i = 1; i <= 104; i++) {
        String path = basePath + "/WTX";
        String indexNum = Integer.toString(i);
        if (indexNum.length() == 1) {
            path += "00";
        }
        if (indexNum.length() == 2) {
            path += "0";
        }
        path += indexNum;
        FileInputFormat.addInputPath(conf, new Path(path));
    }

    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if (compressionType.equals("none")) {
        SequenceFileOutputFormat.setCompressOutput(conf, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(conf, true);

        if (compressionType.equals("record")) {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
            conf.setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    conf.setInputFormat(TrecWebDocumentInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(TrecWebDocument.class);

    conf.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}

From source file: edu.umd.cloud9.collection.wikipedia.BuildWikipediaDocnoMapping.java

License: Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("tmp output directory")
            .create(OUTPUT_PATH_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file")
            .create(OUTPUT_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));
    options.addOption(KEEP_ALL_OPTION, false, "keep all pages");

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_PATH_OPTION)
            || !cmdline.hasOption(OUTPUT_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_PATH_OPTION);
    String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION);
    boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - output file: " + outputFile);
    LOG.info(" - keep all pages: " + keepAll);
    LOG.info(" - language: " + language);

    JobConf conf = new JobConf(getConf(), BuildWikipediaDocnoMapping.class);
    conf.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language));

    conf.setBoolean(KEEP_ALL_OPTION, keepAll);
    if (language != null) {
        conf.set("wiki.language", language);
    }
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(WikipediaPageInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    RunningJob job = JobClient.runJob(conf);
    Counters c = job.getCounters();
    long cnt = keepAll ? c.getCounter(PageTypes.TOTAL) : c.getCounter(PageTypes.ARTICLE);

    WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(conf), outputPath + "/part-00000", (int) cnt,
            outputFile);

    return 0;
}

From source file: edu.umd.cloud9.collection.wikipedia.BuildWikipediaLinkGraph.java

License: Apache License

private void task1(String inputPath, String outputPath, int partitions) throws IOException {
    LOG.info("Exracting edges...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), BuildWikipediaLinkGraph.class);
    conf.setJobName(String.format("BuildWikipediaLinkGraph:Edges[input: %s, output: %s, num_partitions: %d]",
            inputPath, outputPath, partitions));

    conf.setNumReduceTasks(partitions);

    SequenceFileInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfStringInt.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper1.class);
    conf.setReducerClass(MyReducer1.class);
    conf.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}