Example usage for org.apache.hadoop.mapreduce Job getCounters

Introduction

This page collects example usages of the getCounters method of org.apache.hadoop.mapreduce.Job.

Prototype

public Counters getCounters() throws IOException

Source Link

Document

Gets the counters for this job.

Usage

From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java

License:Apache License

/**
 * pass1: generate collocations, ngrams/*from  ww w . j a  va2s. c  o  m*/
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport, Window mode, int winsize)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    con.set(WINDOW_TYPE, mode.toString());
    con.setInt(WINDOW_SIZE, winsize);

    if (mode.toString().equalsIgnoreCase("DOCUMENT")) {
        con.setInt("mapred.job.map.memory.mb", 3000);

        con.set("mapred.child.java.opts", "-Xmx2900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx8000M");

        con.setInt("mapred.job.reduce.memory.mb", 8120);
    } else {
        con.setInt("mapred.job.map.memory.mb", 2000);

        con.set("mapred.child.java.opts", "-Xmx1900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx2900M");

        con.setInt("mapred.job.reduce.memory.mb", 3000);
    }
    con.setBoolean("mapred.compress.map.output", true);
    con.setStrings("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setBoolean("mapred.compress.output", true);
    con.setStrings("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setInt("mapred.task.timeout", 6000000);
    con.setInt("io.sort.factor", 50);
    con.setInt("mapreduce.map.tasks", 256);
    con.setInt("dfs.replication", 1);
    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(512);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}

From source file:dz.lab.mapred.counter.StartsWithCountJob_PrintCounters.java

/**
 * Configures and runs the "StartsWithCountJob" job, then prints every counter group and each
 * of its counters to standard output.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @return 0 if the job completed successfully, 1 otherwise
 */
@Override
public int run(String[] args) throws Exception {
    // Setting "mapreduce.framework.name" to "local" on this configuration would make
    // mapreduce use its packaged local job runner.
    final Job job = Job.getInstance(getConf(), "StartsWithCountJob");
    job.setJarByClass(getClass());

    // input side
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(args[0]));

    // processing classes; the reducer doubles as the combiner
    job.setMapperClass(StartsWithCountMapper.class);
    job.setReducerClass(StartsWithCountReducer.class);
    job.setCombinerClass(StartsWithCountReducer.class);

    // output side
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    TextOutputFormat.setOutputPath(job, new Path(args[1]));

    final boolean succeeded = job.waitForCompletion(true);
    System.out.println("Job is complete! Printing Counters:");

    final Counters allCounters = job.getCounters();
    for (String name : allCounters.getGroupNames()) {
        final CounterGroup counterGroup = allCounters.getGroup(name);
        System.out.println(counterGroup.getDisplayName());

        for (Counter entry : counterGroup.getUnderlyingGroup()) {
            System.out.println(" " + entry.getDisplayName() + "=" + entry.getValue());
        }
    }

    if (succeeded) {
        return 0;
    }
    return 1;
}

From source file:edu.bigdata.training.core.mapreduce.WordCount.java

/**
 * Runs the "word count" job and prints the {@code METRICS.TOTAL_WORDS} counter.
 *
 * <p>The process exits with status 0 when the job succeeds, 1 when it fails and 2 when fewer
 * than two arguments are supplied.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 */
public static void main(String args[]) throws IOException, ClassNotFoundException, InterruptedException {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <output path>");
        System.exit(2);
        return;
    }

    System.out.println("arg[0]-->" + args[0]);
    System.out.println("arg[1]-->" + args[1]);

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(SimpleMapper.class);
    job.setCombinerClass(Reduce.class);
    // NOTE(review): the combiner is Reduce but the reducer is Reducer -- confirm that
    // Reducer is really the intended class here.
    job.setReducerClass(Reducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    boolean succeeded = job.waitForCompletion(true);
    if (succeeded) {
        // Report the counter before exiting; nothing placed after System.exit ever runs.
        System.out.println("Total Words:" + job.getCounters().findCounter(METRICS.TOTAL_WORDS).getValue());
    }
    System.exit(succeeded ? 0 : 1);
}

From source file:edu.isi.mavuno.app.nlp.ProcessStanfordNLP.java

License:Apache License

/**
 * Runs the ProcessStanfordNLP job described by this tool's configuration.
 *
 * <p>Reads the required {@code Mavuno.ProcessStanfordNLP.CorpusPath},
 * {@code Mavuno.ProcessStanfordNLP.CorpusClass} and {@code Mavuno.ProcessStanfordNLP.OutputPath}
 * parameters and the optional {@code Mavuno.ProcessStanfordNLP.UseSUTime} and
 * {@code Mavuno.ProcessStanfordNLP.TextOutputFormat} flags, runs a map-only job and logs the
 * document, sentence and token counters.
 *
 * @return 0 if the job completed successfully, 1 if it failed
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded, or if
 *         raised while the job runs
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    // required parameters
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.OutputPath", conf);

    // optional parameters
    String suTime = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.UseSUTime", conf);
    String textOutput = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.TextOutputFormat", conf);

    sLogger.info("Tool name: ProcessStanfordNLP");
    sLogger.info(" - Input path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    // The SUTime flag is only reported here; this method does not act on it otherwise.
    if (suTime != null && Boolean.parseBoolean(suTime)) {
        sLogger.info("- SUTime enabled");
    }

    boolean textOutputFormat = false;
    if (textOutput != null && Boolean.parseBoolean(textOutput)) {
        sLogger.info("- Text output format enabled");
        textOutputFormat = true;
    }

    Job job = new Job(conf);
    job.setJobName("ProcessStanfordNLP");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));

    // output format -- either plain text or sequencefile (default)
    if (textOutputFormat) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // sequence-file output is block-compressed
        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StanfordParsedDocument.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StanfordParsedDocument.class);

    job.setMapperClass(MyMapper.class);

    job.setJarByClass(ProcessStanfordNLP.class);

    // no reducers needed
    job.setNumReduceTasks(0);

    // run job; a failed job must not be reported to the caller as success
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        sLogger.error("ProcessStanfordNLP job failed");
        return 1;
    }

    // print job statistics
    Counters counters = job.getCounters();
    sLogger.info(" - Total documents: " + counters.findCounter(MyCounters.TOTAL_DOCUMENTS).getValue());
    sLogger.info(" - Total sentences: " + counters.findCounter(MyCounters.TOTAL_SENTENCES).getValue());
    sLogger.info(" - Total tokens: " + counters.findCounter(MyCounters.TOTAL_TOKENS).getValue());

    return 0;
}

From source file:edu.isi.mavuno.app.nlp.TratzParse.java

License:Apache License

/**
 * Runs the TratzParse job described by this tool's configuration.
 *
 * <p>Reads the required {@code Mavuno.Parse.CorpusPath}, {@code Mavuno.Parse.CorpusClass} and
 * {@code Mavuno.Parse.OutputPath} parameters and the optional
 * {@code Mavuno.Parse.TextOutputFormat} flag, runs a map-only job and logs the counters
 * collected in {@code StatCounters}.
 *
 * @return 0 if the job completed successfully, 1 if it failed
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded, or if
 *         raised while the job runs
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.Parse.OutputPath", conf);

    // optional parameter that allows the parsed documents to be output in text format
    String textOutput = MavunoUtils.getOptionalParam("Mavuno.Parse.TextOutputFormat", conf);
    boolean textOutputFormat = false;
    if (textOutput != null && Boolean.parseBoolean(textOutput)) {
        textOutputFormat = true;
    }

    sLogger.info("Tool name: TratzParse");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("TratzParse");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));

    // output format -- either plain text or sequencefile (default)
    if (textOutputFormat) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // sequence-file output is block-compressed
        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(TratzParsedDocument.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TratzParsedDocument.class);

    job.setMapperClass(MyMapper.class);

    job.setJarByClass(TratzParse.class);

    // no reducers needed
    job.setNumReduceTasks(0);

    // run job; a failed job must not be reported to the caller as success
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        sLogger.error("TratzParse job failed");
        return 1;
    }

    // print job statistics
    Counters counters = job.getCounters();
    sLogger.info(" - Total documents: " + counters.findCounter(StatCounters.TOTAL_DOCUMENTS).getValue());
    sLogger.info(" - Total sentences: " + counters.findCounter(StatCounters.TOTAL_SENTENCES).getValue());
    sLogger.info(" - Total tokens: " + counters.findCounter(StatCounters.TOTAL_TOKENS).getValue());
    sLogger.info(" - Total dropped sentences: "
            + counters.findCounter(StatCounters.TOTAL_DROPPED_SENTENCES).getValue());
    sLogger.info(
            " - Total tokenization time (ms): " + counters.findCounter(StatCounters.TOKENIZE_TIME).getValue());
    sLogger.info(
            " - Total POS tagging time (ms): " + counters.findCounter(StatCounters.POSTAG_TIME).getValue());
    sLogger.info(" - Total chunking time (ms): " + counters.findCounter(StatCounters.CHUNK_TIME).getValue());
    sLogger.info(" - Total named entity tagging time (ms): "
            + counters.findCounter(StatCounters.NETAG_TIME).getValue());
    sLogger.info(" - Total parse time (ms): " + counters.findCounter(StatCounters.PARSE_TIME).getValue());

    return 0;
}

From source file:edu.rosehulman.CollocDriver.java

License:Apache License

/**
 * pass1: generate collocations, ngrams/*w ww.jav a  2  s  .c o  m*/
 */
@SuppressWarnings("deprecation")
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}

From source file:edu.umd.cloud9.collection.medline.CountMedlineCitations.java

License:Apache License

/**
 * Counts the documents of a Medline collection with a map-only job.
 *
 * @param args command-line arguments; the collection, output and mapping options are required
 * @return the value of the {@code Count.DOCS} counter narrowed to {@code int}, or -1 when the
 *         command line cannot be parsed or a required option is missing
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options opts = new Options();
    opts.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("(required) collection path").create(COLLECTION_OPTION));
    opts.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("(required) output path").create(OUTPUT_OPTION));
    opts.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("(required) DocnoMapping data").create(MAPPING_OPTION));

    CommandLine parsed;
    try {
        parsed = new GnuParser().parse(opts, args);
    } catch (ParseException parseFailure) {
        System.err.println("Error parsing command line: " + parseFailure.getMessage());
        return -1;
    }

    boolean allRequiredPresent = parsed.hasOption(COLLECTION_OPTION) && parsed.hasOption(OUTPUT_OPTION)
            && parsed.hasOption(MAPPING_OPTION);
    if (!allRequiredPresent) {
        // Print both the tool's own options and the generic Hadoop options.
        new HelpFormatter().printHelp(this.getClass().getName(), opts);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String collection = parsed.getOptionValue(COLLECTION_OPTION);
    String outputDir = parsed.getOptionValue(OUTPUT_OPTION);
    String docnoMapping = parsed.getOptionValue(MAPPING_OPTION);

    LOG.info("Tool: " + CountMedlineCitations.class.getSimpleName());
    LOG.info(" - input: " + collection);
    LOG.info(" - output dir: " + outputDir);
    LOG.info(" - docno mapping file: " + docnoMapping);

    Job countJob = new Job(getConf(), CountMedlineCitations.class.getSimpleName() + ":" + collection);
    countJob.setJarByClass(CountMedlineCitations.class);

    // map-only job
    countJob.setNumReduceTasks(0);
    countJob.setMapperClass(MyMapper.class);

    // The mapping class is handed over by name so that the mapper can work with any
    // collection of Indexable objects whose docid/docno mapping is given by a DocnoMapping.
    countJob.getConfiguration().set("DocnoMappingClass", MedlineDocnoMapping.class.getCanonicalName());

    // Ship the mapping file through the distributed cache so every map worker has it.
    DistributedCache.addCacheFile(new URI(docnoMapping), countJob.getConfiguration());

    countJob.setInputFormatClass(MedlineCitationInputFormat.class);
    FileInputFormat.setInputPaths(countJob, new Path(collection));

    Path outputLocation = new Path(outputDir);
    FileOutputFormat.setOutputPath(countJob, outputLocation);
    FileOutputFormat.setCompressOutput(countJob, false);
    countJob.setOutputKeyClass(Text.class);
    countJob.setOutputValueClass(IntWritable.class);

    // Remove any output left over from an earlier run.
    FileSystem.get(countJob.getConfiguration()).delete(outputLocation, true);

    countJob.waitForCompletion(true);

    int docsRead = (int) countJob.getCounters().findCounter(Count.DOCS).getValue();
    LOG.info("Read " + docsRead + " docs.");

    return docsRead;
}

From source file:edu.umd.cloud9.collection.trec.BuildTrecForwardIndex2.java

License:Apache License

/**
 * Runs this tool./*from   w ww.ja  va2 s .  c o m*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    Job job = new Job(getConf(), BuildTrecForwardIndex2.class.getCanonicalName());
    job.setJarByClass(BuildTrecForwardIndex2.class);
    FileSystem fs = FileSystem.get(getConf());

    String collectionPath = args[0];
    String outputPath = args[1];
    String indexFile = args[2];
    String mappingFile = args[3];

    LOG.info("Tool name: " + BuildTrecForwardIndex2.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - mapping file: " + mappingFile);

    job.getConfiguration().set("mapred.child.java.opts", "-Xmx1024m");
    job.setNumReduceTasks(1);

    if (job.getConfiguration().get("mapred.job.tracker").equals("local")) {
        job.getConfiguration().set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());
    }

    FileInputFormat.setInputPaths(job, new Path(collectionPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TrecDocumentInputFormat2.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();

    String inputFile = outputPath + "/" + "part-r-00000";

    LOG.info("Writing " + numDocs + " doc offseta to " + indexFile);
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));

    FSDataOutputStream writer = fs.create(new Path(indexFile), true);

    writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);

        writer.writeLong(offset);
        writer.writeInt(len);

        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    LOG.info(cnt + " docs total. Done!");

    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }

    return 0;
}

From source file:edu.umd.cloud9.collection.trec.CountTrecDocuments.java

License:Apache License

/**
 * Runs this tool: counts the documents of a TREC collection with a map-only job and
 * optionally writes the count to a file.
 *
 * @param args command-line arguments; the collection, output and mapping options are
 *        required, the count-file option is optional
 * @return the value of the {@code Count.DOCS} counter narrowed to {@code int}, or -1 when the
 *         command line cannot be parsed or a required option is missing
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path")
            .create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output path")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data")
            .create(MAPPING_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("(optional) output file to write the number of records").create(COUNT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);

    LOG.info("Tool: " + CountTrecDocuments.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);

    Job job = new Job(getConf(), CountTrecDocuments.class.getSimpleName());
    job.setJarByClass(CountTrecDocuments.class);

    job.setNumReduceTasks(0);

    // Pass in the class name as a String; this makes the mapper general in being able to load
    // any collection of Indexable objects that has docid/docno mapping specified by a DocnoMapping
    // object.
    job.getConfiguration().set("DocnoMappingClass", TrecDocnoMapping.class.getCanonicalName());

    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TrecDocumentInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();
    LOG.info("Read " + numDocs + " docs.");

    if (cmdline.hasOption(COUNT_OPTION)) {
        String f = cmdline.getOptionValue(COUNT_OPTION);
        FileSystem fs = FileSystem.get(getConf());
        FSDataOutputStream out = fs.create(new Path(f));
        try {
            // Explicit charset: the bytes written must not depend on the platform default.
            out.write(Integer.toString(numDocs).getBytes("UTF-8"));
        } finally {
            // Close the stream even when the write fails.
            out.close();
        }
    }

    return numDocs;
}

From source file:edu.umd.cloud9.collection.trec.TrecForwardIndexBuilder.java

License:Apache License

/**
 * Runs this tool./* w  w  w  .j av a  2 s .  c  o m*/
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path")
            .create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path")
            .create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data")
            .create(MAPPING_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)
            || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    String tmpDir = "tmp-" + TrecForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    Job job = new Job(getConf(), TrecForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
    job.setJarByClass(TrecForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(getConf());

    LOG.info("Tool name: " + TrecForwardIndexBuilder.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - DocnoMapping file: " + mappingFile);
    LOG.info(" - temp output directory: " + tmpDir);

    job.setNumReduceTasks(1);

    if (job.getConfiguration().get("mapred.job.tracker").equals("local")) {
        job.getConfiguration().set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());
    }

    FileInputFormat.setInputPaths(job, new Path(collectionPath));
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TrecDocumentInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(getConf()).delete(new Path(tmpDir), true);

    job.waitForCompletion(true);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();

    String inputFile = tmpDir + "/" + "part-r-00000";

    LOG.info("Writing " + numDocs + " doc offseta to " + indexFile);
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));

    FSDataOutputStream writer = fs.create(new Path(indexFile), true);

    writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);

        writer.writeLong(offset);
        writer.writeInt(len);

        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    LOG.info(cnt + " docs total. Done!");

    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }

    fs.delete(new Path(tmpDir), true);

    return 0;
}