Example usage for org.apache.hadoop.mapred.JobConf.setOutputValueClass

Introduction

This page collects example usages of the setOutputValueClass method of org.apache.hadoop.mapred.JobConf.

Prototype

public void setOutputValueClass(Class<?> theClass)

Source Link

Document

Set the value class for job outputs.

Usage

From source file:edu.umd.cloud9.collection.aquaint2.DemoCountAquaint2Documents.java

License:Apache License

/**
 * Runs this tool./* w  w w .j a v a  2 s.com*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String mappingFile = args[2];
    int mapTasks = Integer.parseInt(args[3]);

    System.out.println("input dir: " + inputPath);
    System.out.println("output dir: " + outputPath);
    System.out.println("mapping file: " + mappingFile);
    System.out.println("number of mappers: " + mapTasks);

    JobConf conf = new JobConf(DemoCountAquaint2Documents.class);
    conf.setJobName("DemoCountAquaint2Documents");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(Aquaint2DocumentInputFormatOld.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    // clean up
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return 0;
}

From source file:edu.umd.cloud9.collection.aquaint2.NumberAquaint2Documents.java

License:Apache License

/**
 * Runs this tool./*from www .  j  ava2  s .  c o m*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];
    int mapTasks = 10;

    LOG.info("Tool: " + NumberAquaint2Documents.class.getCanonicalName());
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Output file: " + outputFile);

    JobConf conf = new JobConf(NumberAquaint2Documents.class);
    conf.setJobName(NumberAquaint2Documents.class.getSimpleName());

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(Aquaint2DocumentInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    Aquaint2DocnoMapping.writeDocnoData(new Path(outputPath + "/part-00000"), new Path(outputFile),
            FileSystem.get(conf));

    return 0;
}

From source file:edu.umd.cloud9.collection.clue.ClueWarcForwardIndexBuilder.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Parses the collection and index options, runs a single-reducer job over every file found
 * directly under the collection path, and converts the job's {@code part-00000} output into a
 * binary index file. The index file starts with the {@code ClueWarcForwardIndex} class name and
 * the collection path (both via {@code writeUTF}) and the block count (int), followed by one
 * (int docno, int offset, short fileno) record per line of job output.
 *
 * <p>The job output goes to a randomly named temporary directory that is deleted after the
 * index has been written successfully.
 *
 * @param args command-line arguments; both the collection and the index option are required
 * @return 0 on success, or -1 if the command line cannot be parsed or a required option is
 *         missing
 * @throws RuntimeException if the number of index records written differs from the job's block
 *         counter
 * @throws Exception propagated from file system access or job execution
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("(required) collection path (must be block-compressed SequenceFiles)")
            .create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path")
            .create(INDEX_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    JobConf conf = new JobConf(getConf(), ClueWarcForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);

    LOG.info("Tool name: " + ClueWarcForwardIndexBuilder.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    // Temporary job output directory; the random suffix keeps concurrent runs apart.
    Random random = new Random();
    Path outputPath = new Path(
            "tmp-" + ClueWarcForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000));

    conf.setJobName(ClueWarcForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);

    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(1);

    // Note, we have to add the files one by one, otherwise, SequenceFileInputFormat
    // thinks it's a MapFile.
    for (FileStatus status : fs.listStatus(new Path(collectionPath))) {
        FileInputFormat.addInputPath(conf, status.getPath());
    }
    FileOutputFormat.setOutputPath(conf, outputPath);
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    fs.delete(outputPath, true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();

    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    int cnt = 0;
    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    try {
        FSDataOutputStream out = fs.create(new Path(indexFile), true);
        try {
            // Index header: implementing class, collection location, number of records.
            out.writeUTF(ClueWarcForwardIndex.class.getCanonicalName());
            out.writeUTF(collectionPath);
            out.writeInt(blocks);

            Text line = new Text();
            while (reader.readLine(line) > 0) {
                // Each job output line holds: docno, offset, file number.
                String[] arr = line.toString().split("\\s+");

                int docno = Integer.parseInt(arr[0]);
                int offset = Integer.parseInt(arr[1]);
                short fileno = Short.parseShort(arr[2]);

                out.writeInt(docno);
                out.writeInt(offset);
                out.writeShort(fileno);

                cnt++;

                if (cnt % 100000 == 0) {
                    LOG.info(cnt + " blocks written");
                }
            }
        } finally {
            // Close the index stream even if reading or parsing fails part-way through.
            out.close();
        }
    } finally {
        // Close the job-output reader even if the index file cannot be created or written.
        reader.close();
    }

    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }

    fs.delete(outputPath, true);
    return 0;
}

From source file:edu.umd.cloud9.collection.clue.RepackClueWarcRecords.java

License:Apache License

/**
 * Runs this tool./*from  ww  w.j  av  a 2  s. c  o  m*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        printUsage();
        return -1;
    }

    String basePath = args[0];
    String outputPath = args[1];
    int segment = Integer.parseInt(args[2]);
    String data = args[3];
    String compressionType = args[4];

    if (!compressionType.equals("block") && !compressionType.equals("record")
            && !compressionType.equals("none")) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        System.exit(-1);
    }

    // Default block size.
    int blocksize = 1000000;

    JobConf conf = new JobConf(RepackClueWarcRecords.class);
    conf.setJobName("RepackClueWarcRecords:segment" + segment);

    conf.set("DocnoMappingDataFile", data);

    LOG.info("Tool name: RepackClueWarcRecords");
    LOG.info(" - base path: " + basePath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - segment number: " + segment);
    LOG.info(" - docno mapping data file: " + data);
    LOG.info(" - compression type: " + compressionType);

    if (compressionType.equals("block")) {
        LOG.info(" - block size: " + blocksize);
    }

    int mapTasks = 10;

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    ClueCollectionPathConstants.addEnglishCollectionPart(conf, basePath, segment);

    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if (compressionType.equals("none")) {
        SequenceFileOutputFormat.setCompressOutput(conf, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(conf, true);

        if (compressionType.equals("record")) {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
            conf.setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    conf.setInputFormat(ClueWarcInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ClueWarcRecord.class);

    conf.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}

From source file:edu.umd.cloud9.collection.line.NumberTextDocuments.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Runs a single-reducer job over the input path and then passes the job's first output
 * partition ({@code part-00000}) to {@code TextDocnoMapping.writeDocnoData} together with the
 * requested output file.
 *
 * @param args exactly four values: input path, job output path, docno data output file, number
 *        of map tasks
 * @return 0 once the docno data has been written, or -1 (after printing usage) if the argument
 *         count is wrong
 * @throws Exception propagated from argument parsing, file system access or job execution
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];
    int mapTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: NumberTextDocuments");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Output file: " + outputFile);
    sLogger.info("Launching with " + mapTasks + " mappers...");

    JobConf conf = new JobConf(getConf(), NumberTextDocuments.class);
    conf.setJobName("NumberTextDocuments");

    // A single reducer, so all output lands in one partition file.
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(TextDocumentInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    // Join with exactly one separator; the file name itself must not carry a leading slash,
    // otherwise the resulting path always contains "//".
    String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "part-00000";
    TextDocnoMapping.writeDocnoData(input, outputFile, FileSystem.get(getConf()));

    return 0;
}

From source file:edu.umd.cloud9.collection.medline.DemoCountMedlineCitations.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Configures and runs a map-only job (zero reducers) over the input path, with the docno
 * mapping file registered in the distributed cache.
 *
 * @param args exactly three values: input path, output path, docno mapping file
 * @return 0 once the job has run, or -1 (after printing usage) if the argument count is wrong
 * @throws Exception propagated from URI parsing, file system access or job execution
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    final String citations = args[0];
    final String outputDir = args[1];
    final String docnoMappingFile = args[2];

    sLogger.info("input: " + citations);
    sLogger.info("output dir: " + outputDir);
    sLogger.info("docno mapping file: " + docnoMappingFile);

    JobConf job = new JobConf(DemoCountMedlineCitations.class);
    job.setJobName("DemoCountMedlineCitations");

    // Map-only job: no reduce phase.
    job.setNumReduceTasks(0);

    // The mapping class is passed by name as a String. This keeps the mapper general: it can
    // load any collection of Indexable objects whose docid/docno mapping is specified by a
    // DocnoMapping object.
    job.set("DocnoMappingClass", "edu.umd.cloud9.collection.medline.MedlineDocnoMapping");

    // Ship the mapping file through the distributed cache so every map worker has it.
    DistributedCache.addCacheFile(new URI(docnoMappingFile), job);

    FileInputFormat.setInputPaths(job, new Path(citations));
    Path jobOutput = new Path(outputDir);
    FileOutputFormat.setOutputPath(job, jobOutput);
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormat(MedlineCitationInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);

    // Remove any output left over from an earlier run.
    FileSystem.get(job).delete(jobOutput, true);

    JobClient.runJob(job);

    return 0;
}

From source file:edu.umd.cloud9.collection.medline.NumberMedlineCitations.java

License:Apache License

/**
 * Runs this tool./*from   w w w  .ja  v a  2  s  .c  om*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];
    int mapTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool name: NumberMedlineCitations");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Output file: " + outputFile);
    sLogger.info("Launching with " + mapTasks + " mappers...");

    JobConf conf = new JobConf(getConf(), NumberMedlineCitations.class);
    conf.setJobName("NumberMedlineCitations");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(MedlineCitationInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    RunningJob job = JobClient.runJob(conf);

    // write out various properties
    Counters counters = job.getCounters();
    Counter counter = counters.findCounter("edu.umd.cloud9.collection.medline.NumberMedlineCitations$Citations",
            0, "");

    int numdocs = (int) counter.getCounter();
    sLogger.info("total number of docs: " + numdocs);

    MedlineDocnoMapping.writeDocidData(outputPath + "/part-00000", outputFile);

    return 0;
}

From source file:edu.umd.cloud9.collection.spinn3r.DemoCountSpinn3rEnglishPosts.java

License:Apache License

/**
 * Runs this tool./*from  w  w w  .  j  av a2  s  . c  o m*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    sLogger.info("input dir: " + inputPath);
    sLogger.info("output dir: " + outputPath);

    JobConf conf = new JobConf(DemoCountSpinn3rEnglishPosts.class);
    conf.setJobName("DemoCountSpinn3rEnglishPosts");

    conf.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(Spinn3rItemInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    // clean up
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return 0;
}

From source file:edu.umd.cloud9.collection.trec.BuildTrecForwardIndex.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Runs a single-reducer job over the collection and converts the job's {@code part-00000}
 * output into a binary index file. The index file starts with the {@code TrecForwardIndex}
 * class name and the collection path (both via {@code writeUTF}) and the document count (int),
 * followed by one (long offset, int length) record per line of job output.
 *
 * <p>When {@code mapred.job.tracker} is {@code "local"} the mapping file is passed through a
 * job property; otherwise (including when the property is unset) it is added to the
 * distributed cache.
 *
 * @param args exactly four values: collection path, job output path, index file, docno mapping
 *        file
 * @return 0 on success, or -1 (after printing usage) if the argument count is wrong
 * @throws RuntimeException if the number of index records written differs from the job's
 *         document counter
 * @throws Exception propagated from URI parsing, file system access or job execution
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(getConf(), BuildTrecForwardIndex.class);
    FileSystem fs = FileSystem.get(getConf());

    String collectionPath = args[0];
    String outputPath = args[1];
    String indexFile = args[2];
    String mappingFile = args[3];

    LOG.info("Tool name: " + BuildTrecForwardIndex.class.getCanonicalName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - mapping file: " + mappingFile);

    conf.setJobName(BuildTrecForwardIndex.class.getSimpleName());

    conf.set("mapred.child.java.opts", "-Xmx1024m");
    conf.setNumReduceTasks(1);

    // Constant-first comparison: the property may be unset, in which case get() returns null
    // and the job is treated as non-local.
    if ("local".equals(conf.get("mapred.job.tracker"))) {
        conf.set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), conf);
    }

    FileInputFormat.setInputPaths(conf, new Path(collectionPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(TrecDocumentInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getCounter();

    String inputFile = outputPath + "/" + "part-00000";

    LOG.info("Writing " + numDocs + " doc offseta to " + indexFile);
    int cnt = 0;
    FSLineReader reader = new FSLineReader(inputFile, fs);
    try {
        FSDataOutputStream writer = fs.create(new Path(indexFile), true);
        try {
            // Index header: implementing class, collection location, number of records.
            writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
            writer.writeUTF(collectionPath);
            writer.writeInt(numDocs);

            Text line = new Text();
            while (reader.readLine(line) > 0) {
                // Tab-separated job output; only fields 1 (offset) and 2 (length) are stored.
                String[] arr = line.toString().split("\\t");
                long offset = Long.parseLong(arr[1]);
                int len = Integer.parseInt(arr[2]);

                writer.writeLong(offset);
                writer.writeInt(len);

                cnt++;
                if (cnt % 100000 == 0) {
                    LOG.info(cnt + " docs");
                }
            }
        } finally {
            // Close the index stream even if reading or parsing fails part-way through.
            writer.close();
        }
    } finally {
        // Close the job-output reader even if the index file cannot be created or written.
        reader.close();
    }
    LOG.info(cnt + " docs total. Done!");

    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }

    return 0;
}

From source file:edu.umd.cloud9.collection.trec.DemoCountTrecDocuments.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Configures and runs a map-only job (zero reducers) over the input path, with the docno
 * mapping file registered in the distributed cache.
 *
 * @param args exactly three values: input path, output path, docno mapping file
 * @return 0 once the job has run, or -1 (after printing usage) if the argument count is wrong
 * @throws Exception propagated from URI parsing, file system access or job execution
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    final String documents = args[0];
    final String outputDir = args[1];
    final String docnoMappingFile = args[2];

    LOG.info("Tool: " + DemoCountTrecDocuments.class.getCanonicalName());
    LOG.info(" - input: " + documents);
    LOG.info(" - output dir: " + outputDir);
    LOG.info(" - docno mapping file: " + docnoMappingFile);

    JobConf job = new JobConf(getConf(), DemoCountTrecDocuments.class);
    job.setJobName(DemoCountTrecDocuments.class.getSimpleName());

    // Map-only job: no reduce phase.
    job.setNumReduceTasks(0);

    // The mapping class is passed by name as a String. This keeps the mapper general: it can
    // load any collection of Indexable objects whose docid/docno mapping is specified by a
    // DocnoMapping object.
    job.set("DocnoMappingClass", edu.umd.cloud9.collection.trec.TrecDocnoMapping.class.getCanonicalName());

    // Ship the mapping file through the distributed cache so every map worker has it.
    DistributedCache.addCacheFile(new URI(docnoMappingFile), job);

    FileInputFormat.setInputPaths(job, new Path(documents));
    Path jobOutput = new Path(outputDir);
    FileOutputFormat.setOutputPath(job, jobOutput);
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormat(TrecDocumentInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);

    // Remove any output left over from an earlier run.
    FileSystem.get(job).delete(jobOutput, true);

    JobClient.runJob(job);

    return 0;
}