Example usage for org.apache.hadoop.mapreduce Job setJobName

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setJobName.

Prototype

public void setJobName(String name) throws IllegalStateException 

Document

Set the user-specified job name.
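For example, a minimal sketch (the job name itself is hypothetical); note that setJobName throws IllegalStateException if called after the job has been submitted:

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    // The name is advisory: it appears in job listings, the web UI, and logs.
    job.setJobName("set-job-name-demo");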

Usage

From source file:edu.gslis.ts.hadoop.ThriftBulkLoader.java

License:Apache License

public int run(String[] args) throws Exception {
    String tableName = args[0];
    String inputPath = args[1];
    String outputPath = args[2];
    Path topicsFile = new Path(args[3]);
    Path vocabFile = new Path(args[4]);
    Path dateBinFile = new Path(args[5]);

    Configuration config = getConf();
    config.set("hbase.table.name", tableName);
    HBaseConfiguration.addHbaseResources(config);

    Job job = Job.getInstance(config);
    job.setJarByClass(ThriftBulkLoader.class);
    job.setJobName("Bulk Loading HBase Table::" + tableName);
    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapperClass(ThriftFilterMapper.class);

    Path output = new Path(outputPath);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputDirRecursive(job, true);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapOutputValueClass(Put.class);

    job.addCacheFile(topicsFile.toUri());
    job.addCacheFile(vocabFile.toUri());
    job.addCacheFile(dateBinFile.toUri());

    job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
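    // Note: "mapred.map.output.compression.codec" is the deprecated spelling of
    // "mapreduce.map.output.compress.codec"; Hadoop 2 translates the old key.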
    job.getConfiguration().setClass("mapred.map.output.compression.codec",
            org.apache.hadoop.io.compress.SnappyCodec.class,
            org.apache.hadoop.io.compress.CompressionCodec.class);
    job.getConfiguration().set("hfile.compression", Compression.Algorithm.SNAPPY.getName());

    //RegionLocator regionLocator = conn.getRegionLocator(tableName);
    //HFileOutputFormat2.configureIncrementalLoad(job, new HTable(config,tableName));

    Connection con = ConnectionFactory.createConnection(config);
    TableName htableName = TableName.valueOf(tableName);
    HFileOutputFormat2.configureIncrementalLoad(job, con.getTable(htableName),
            con.getRegionLocator(htableName));

    job.waitForCompletion(true);
    if (job.isSuccessful()) {
        // Couldn't find a better way to do this. The LoadIncrementalHFiles
        // seems to want 777 permissions on the output directory.
        try {
            Runtime rt = Runtime.getRuntime();
            rt.exec("hadoop fs -chmod -R 777 " + output);
        } catch (Exception e) {
            e.printStackTrace();
        }
        /*
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config);
        HTable htable = new HTable(config, tableName);
        loader.doBulkLoad(new Path(outputPath), htable);
        */

    } else {
        throw new IOException("error with job");
    }

    return 0;

}
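The "hadoop fs -chmod" shell-out above can also be done through the FileSystem API. A hedged sketch, assuming the same config and output variables as in the job above; FileSystem.setPermission is not recursive, so the helper walks the tree:

    private static void chmodRecursive(FileSystem fs, Path path) throws IOException {
        // setPermission only touches the given path, so recurse into directories.
        fs.setPermission(path, new FsPermission((short) 0777));
        if (fs.getFileStatus(path).isDirectory()) {
            for (FileStatus child : fs.listStatus(path)) {
                chmodRecursive(fs, child.getPath());
            }
        }
    }

    // In place of rt.exec("hadoop fs -chmod -R 777 " + output):
    // chmodRecursive(FileSystem.get(config), output);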

From source file:edu.indiana.d2i.htrc.exp.PartialVectorsFromTokenizedDoc.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    // all directories are in HDFS
    tokenizedDocDir = args[0];
    dictDir = args[1];
    outputDir = args[2];
    numReducers = Integer.valueOf(args[3]);

    logger.info("PartialVectorsFromTokenizedDoc ");
    logger.info(" - tokenizedDocDir: " + tokenizedDocDir);
    logger.info(" - dictDir: " + dictDir);
    logger.info(" - outputDir: " + outputDir);
    logger.info(" - numReducers: " + numReducers);

    Path tokenizedDocPath = new Path(tokenizedDocDir);
    Path dictPath = new Path(dictDir);
    Path outputPath = new Path(outputDir);

    // get dimension
    Configuration conf = getConf();

    int dimension = 0;
    for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictPath, true,
            conf)) {
        dimension++;
    }
    logger.info("dimension of a vector: " + dimension);

    // submit job
    long t0 = System.currentTimeMillis();
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    DistributedCache.setCacheFiles(new URI[] { dictPath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("PartialVectorsFromTokenizedDoc::MakePartialVectors: input-folder: " + tokenizedDocDir
            + ", dictionary-file: " + dictDir);
    job.setJarByClass(PartialVectorsFromTokenizedDoc.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    FileInputFormat.setInputPaths(job, tokenizedDocPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    HadoopUtil.delete(conf, outputPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    job.waitForCompletion(true);

    long t1 = System.currentTimeMillis();
    logger.info("PartialVectorsFromTokenizedDoc takes " + (double) (t1 - t0) / 1000 + " seconds.");

    return 0;
}
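Note that new Job(conf) and DistributedCache.setCacheFiles are both deprecated in Hadoop 2. The modern equivalents, as a sketch using the same variables:

    Job job = Job.getInstance(conf);     // replaces new Job(conf)
    job.addCacheFile(dictPath.toUri());  // replaces DistributedCache.setCacheFiles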

From source file:edu.isi.mavuno.app.ie.ExtractRelations.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String typesPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.TypesPath", conf);
    String primaryTypes = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.PrimaryTypes", conf);
    String patternsPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.PatternsPath", conf);
    String instancesPath = MavunoUtils.getOptionalParam("Mavuno.ExtractRelations.InstancesPath", conf);
    String plaintextPath = MavunoUtils.getOptionalParam("Mavuno.ExtractRelations.PlaintextPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.CorpusPath", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.ExtractorArgs", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.OutputPath", conf);

    sLogger.info("Tool name: ExtractRelations");
    sLogger.info(" - Types path: " + typesPath);
    sLogger.info(" - Primary types: " + primaryTypes);
    sLogger.info(" - Patterns path: " + patternsPath);
    if (instancesPath != null) {
        sLogger.info(" - Instances path: " + instancesPath);
    }
    if (plaintextPath != null) {
        sLogger.info(" - Plaintext path: " + plaintextPath);
    }
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor arguments: " + extractorArgs);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("ExtractRelations");

    FileInputFormat.addInputPath(job, new Path(corpusPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}

From source file:edu.isi.mavuno.app.ie.HarvestSAPInstances.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.CorpusClass", conf);
    int minMatches = Integer
            .parseInt(MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.MinMatches", conf));
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.OutputPath", conf);

    sLogger.info("Tool name: HarvestSAPInstances");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Minimum matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestSAPInstances");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}
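MyCombiner is not shown in this excerpt. As a hedged sketch of the usual shape for a Text/LongWritable pipeline like this one, a combiner pre-aggregates map output locally and must use the map output key/value types for both input and output:

    // Hypothetical summing combiner; the real MyCombiner lives in the Mavuno source.
    public static class SumCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
        private final LongWritable sum = new LongWritable();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long total = 0;
            for (LongWritable value : values) {
                total += value.get();
            }
            sum.set(total);
            context.write(key, sum);
        }
    }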

From source file:edu.isi.mavuno.app.ie.HarvestUDAPInstances.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.OutputPath", conf);

    sLogger.info("Tool name: HarvestUDAPInstances");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestUDAPInstances");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}

From source file:edu.isi.mavuno.app.mine.HarvestContextPatternPairs.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorClass",
            conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorArgs",
            conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.OutputPath", conf);

    sLogger.info("Tool name: HarvestContextPatternPairs");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestContextPatternPairs");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}
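ContextPatternWritable.Comparator and FullPartitioner are Mavuno classes not shown here; the general contract is that the partitioner and the sort comparator must examine compatible parts of the key, or a reducer can receive an incomplete group. A generic, hypothetical illustration over a tab-separated Text key:

    // Routes records by the first key field so equal fields meet in one reducer.
    public static class FirstFieldPartitioner extends Partitioner<Text, LongWritable> {
        @Override
        public int getPartition(Text key, LongWritable value, int numPartitions) {
            String first = key.toString().split("\t", 2)[0];
            // Mask the sign bit before the modulo to keep the result non-negative.
            return (first.hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }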

From source file:edu.isi.mavuno.app.mine.HarvestParaphraseCandidates.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorClass",
            conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorArgs",
            conf);
    String numResults = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.NumResults", conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.OutputPath", conf);

    MavunoUtils.createDirectory(conf, outputPath);

    sLogger.info("Tool name: HarvestParaphraseCandidates");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestParaphraseCandidates");

    // harvest all (context, pattern) triples
    conf.set("Mavuno.HarvestContextPatternPairs.CorpusPath", corpusPath);
    conf.set("Mavuno.HarvestContextPatternPairs.CorpusClass", corpusClass);
    conf.set("Mavuno.HarvestContextPatternPairs.ExtractorClass", extractorClass);
    conf.set("Mavuno.HarvestContextPatternPairs.ExtractorArgs", extractorArgs);
    conf.set("Mavuno.HarvestContextPatternPairs.MinMatches", minMatches);
    conf.set("Mavuno.HarvestContextPatternPairs.OutputPath", outputPath + "/triples");
    new HarvestContextPatternPairs(conf).run();

    FileInputFormat.addInputPath(job, new Path(outputPath + "/triples"));
    FileOutputFormat.setOutputPath(job, new Path(outputPath + "/patterns-all"));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class);
    job.setMapOutputValueClass(TextLongPairWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    // combine scores
    //      conf.set("Mavuno.CombineScores.InputPath", outputPath + "/patterns-all");
    //      conf.set("Mavuno.CombineScores.OutputPath", outputPath + "/patterns");
    //      new CombineScores(conf).run();
    // only retain the top paraphrases
    conf.set("Mavuno.GetTopResults.InputPath", outputPath + "/patterns-all");
    conf.set("Mavuno.GetTopResults.OutputPath", outputPath + "/top-k");
    conf.set("Mavuno.GetTopResults.NumResults", numResults);
    conf.setBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false);
    new GetTopResults(conf).run();

    MavunoUtils.removeDirectory(conf, outputPath + "/patterns-all");

    return 0;
}
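One subtlety in this chained setup: Job copies its Configuration at construction time, so the conf.set(...) calls above affect the HarvestContextPatternPairs and GetTopResults runs (which receive conf directly) but not the job object created earlier. A minimal illustration:

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);                     // job takes a private copy here
    conf.set("example.key", "value");                    // not visible to job
    job.getConfiguration().set("example.key", "value");  // visible to job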

From source file:edu.isi.mavuno.app.mine.HarvestSentences.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String patternPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.PatternPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.OutputPath", conf);

    sLogger.info("Tool name: HarvestSentences");
    sLogger.info(" - Pattern file: " + patternPath);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestSentences");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setNumReduceTasks(0);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.isi.mavuno.app.nlp.HarvestParseGraph.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.OutputPath", conf);

    sLogger.info("Tool name: HarvestParseGraph");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestParseGraph");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.isi.mavuno.app.nlp.ProcessStanfordNLP.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    // required parameters
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.OutputPath", conf);

    // optional parameters
    String suTime = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.UseSUTime", conf);
    String textOutput = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.TextOutputFormat", conf);

    sLogger.info("Tool name: ProcessStanfordNLP");
    sLogger.info(" - Input path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    if (suTime != null && Boolean.parseBoolean(suTime)) {
        sLogger.info("- SUTime enabled");
    }

    boolean textOutputFormat = false;
    if (textOutput != null && Boolean.parseBoolean(textOutput)) {
        sLogger.info("- Text output format enabled");
        textOutputFormat = true;
    }

    Job job = new Job(conf);
    job.setJobName("ProcessStanfordNLP");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));

    // output format -- either plain text or sequencefile (default)
    if (textOutputFormat) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StanfordParsedDocument.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StanfordParsedDocument.class);

    job.setMapperClass(MyMapper.class);

    job.setJarByClass(ProcessStanfordNLP.class);

    // no reducers needed
    job.setNumReduceTasks(0);

    // run job
    job.waitForCompletion(true);

    // print job statistics
    Counters counters = job.getCounters();
    sLogger.info(" - Total documents: " + counters.findCounter(MyCounters.TOTAL_DOCUMENTS).getValue());
    sLogger.info(" - Total sentences: " + counters.findCounter(MyCounters.TOTAL_SENTENCES).getValue());
    sLogger.info(" - Total tokens: " + counters.findCounter(MyCounters.TOTAL_TOKENS).getValue());

    return 0;
}
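MyCounters is not shown in this excerpt. Counters like these are typically declared as an enum and incremented from the mapper; a hedged reconstruction:

    // Hypothetical counter enum matching the names read above.
    enum MyCounters { TOTAL_DOCUMENTS, TOTAL_SENTENCES, TOTAL_TOKENS }

    // Inside MyMapper.map(...); the framework aggregates increments across tasks.
    context.getCounter(MyCounters.TOTAL_DOCUMENTS).increment(1);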