Example usage for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setCompressOutput

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setCompressOutput.

Prototype

public static void setCompressOutput(Job job, boolean compress)

Source Link

Document

Set whether the output of the job is compressed.

Usage

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.utils.PagesByURLExtractor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    for (Map.Entry<String, String> next : job.getConfiguration()) {
        System.out.println(next.getKey() + ": " + next.getValue());
    }//from www.  j a  v  a2 s .c om

    job.setJarByClass(PagesByURLExtractor.class);
    job.setJobName(PagesByURLExtractor.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);

    // input
    job.setInputFormatClass(WARCInputFormat.class);

    // output
    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);
    FileOutputFormat.setCompressOutput(job, true);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    // load IDs to be searched for
    job.getConfiguration().set(MAPREDUCE_MAPPER_URLS, loadURLs(args[2]));

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.utils.URIExtractor.java

License:Apache License

/**
 * {@inheritDoc}/*from   w  ww  .  ja v  a2s. co m*/
 */
@Override
public int run(String[] args) throws Exception {

    Job job = Job.getInstance(getConf());
    // set from the command line
    job.setJarByClass(URIExtractor.class);
    job.setJobName(URIExtractor.class.getName());

    // mapper
    job.setMapperClass(URIExtractorMapper.class);
    job.setReducerClass(URIExtractorReducer.class);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // is necessary, so that Hadoop does not mix the map input format up.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.experiments.dip.hadoop.ClueWebTRECIdFileExtractor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    for (Map.Entry<String, String> next : job.getConfiguration()) {
        System.out.println(next.getKey() + ": " + next.getValue());
    }// w w w .  ja v a 2s .c  o  m

    job.setJarByClass(ClueWebTRECIdFileExtractor.class);
    job.setJobName(ClueWebTRECIdFileExtractor.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);

    // input
    job.setInputFormatClass(WARCInputFormat.class);

    // output
    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);
    FileOutputFormat.setCompressOutput(job, true);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    // load IDs to be searched for
    job.getConfiguration().set(MAPREDUCE_MAPPER_TREC_IDS, loadTrecIds(args[2]));

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:edu.isi.mavuno.app.mine.HarvestContextPatternPairs.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorClass",
            conf);/*from   w ww .  j a va 2 s  . c o  m*/
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorArgs",
            conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.OutputPath", conf);

    sLogger.info("Tool name: HarvestContextPatternPairs");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestContextPatternPairs");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.isi.mavuno.app.nlp.ProcessStanfordNLP.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    // required parameters
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.OutputPath", conf);

    // optional parameters
    String suTime = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.UseSUTime", conf);
    String textOutput = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.TextOutputFormat", conf);

    sLogger.info("Tool name: ProcessStanfordNLP");
    sLogger.info(" - Input path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    if (suTime != null && Boolean.parseBoolean(suTime)) {
        sLogger.info("- SUTime enabled");
    }/* ww  w. j a v  a 2 s. co  m*/

    boolean textOutputFormat = false;
    if (textOutput != null && Boolean.parseBoolean(textOutput)) {
        sLogger.info("- Text output format enabled");
        textOutputFormat = true;
    }

    Job job = new Job(conf);
    job.setJobName("ProcessStanfordNLP");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));

    // output format -- either plain text or sequencefile (default)
    if (textOutputFormat) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StanfordParsedDocument.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StanfordParsedDocument.class);

    job.setMapperClass(MyMapper.class);

    job.setJarByClass(ProcessStanfordNLP.class);

    // no reducers needed
    job.setNumReduceTasks(0);

    // run job
    job.waitForCompletion(true);

    // print job statistics
    Counters counters = job.getCounters();
    sLogger.info(" - Total documents: " + counters.findCounter(MyCounters.TOTAL_DOCUMENTS).getValue());
    sLogger.info(" - Total sentences: " + counters.findCounter(MyCounters.TOTAL_SENTENCES).getValue());
    sLogger.info(" - Total tokens: " + counters.findCounter(MyCounters.TOTAL_TOKENS).getValue());

    return 0;
}

From source file:edu.isi.mavuno.app.nlp.TratzParse.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.Parse.OutputPath", conf);

    // optional parameter that allows the parsed documents to be output in text format
    String textOutput = MavunoUtils.getOptionalParam("Mavuno.Parse.TextOutputFormat", conf);
    boolean textOutputFormat = false;
    if (textOutput != null && Boolean.parseBoolean(textOutput)) {
        textOutputFormat = true;/*from  w ww.j  av  a2  s .  co m*/
    }

    sLogger.info("Tool name: TratzParse");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("TratzParse");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));

    // output format -- either plain text or sequencefile (default)
    if (textOutputFormat) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(TratzParsedDocument.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TratzParsedDocument.class);

    job.setMapperClass(MyMapper.class);

    job.setJarByClass(TratzParse.class);

    // no reducers needed
    job.setNumReduceTasks(0);

    // run job
    job.waitForCompletion(true);

    // print job statistics
    Counters counters = job.getCounters();
    sLogger.info(" - Total documents: " + counters.findCounter(StatCounters.TOTAL_DOCUMENTS).getValue());
    sLogger.info(" - Total sentences: " + counters.findCounter(StatCounters.TOTAL_SENTENCES).getValue());
    sLogger.info(" - Total tokens: " + counters.findCounter(StatCounters.TOTAL_TOKENS).getValue());
    sLogger.info(" - Total dropped sentences: "
            + counters.findCounter(StatCounters.TOTAL_DROPPED_SENTENCES).getValue());
    sLogger.info(
            " - Total tokenization time (ms): " + counters.findCounter(StatCounters.TOKENIZE_TIME).getValue());
    sLogger.info(
            " - Total POS tagging time (ms): " + counters.findCounter(StatCounters.POSTAG_TIME).getValue());
    sLogger.info(" - Total chunking time (ms): " + counters.findCounter(StatCounters.CHUNK_TIME).getValue());
    sLogger.info(" - Total named entity tagging time (ms): "
            + counters.findCounter(StatCounters.NETAG_TIME).getValue());
    sLogger.info(" - Total parse time (ms): " + counters.findCounter(StatCounters.PARSE_TIME).getValue());

    return 0;
}

From source file:edu.isi.mavuno.extract.CombineGlobalStats.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.OutputPath", conf);
    int numSplits = conf.getInt("Mavuno.CombineGlobalStats.TotalSplits", 1);

    sLogger.info("Tool name: CombineGlobalStats");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Number of splits: " + numSplits);

    Job job = new Job(conf);
    job.setJobName("CombineGlobalStats");

    for (int split = 0; split < numSplits; split++) {
        FileInputFormat.addInputPath(job, new Path(inputPath + "/" + split));
    }/* w w w.  j a v  a2s  . co m*/
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.isi.mavuno.extract.CombineSplits.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String examplesPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExamplesPath", conf);
    String exampleStatsPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExampleStatsPath", conf);
    String splitKey = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.SplitKey", conf).toLowerCase();
    int numSplits = conf.getInt("Mavuno.CombineSplits.TotalSplits", 1);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.OutputPath", conf);

    sLogger.info("Tool name: CombineSplits");
    sLogger.info(" - Examples path: " + examplesPath);
    sLogger.info(" - Example stats path: " + exampleStatsPath);
    sLogger.info(" - Split key: " + splitKey);
    sLogger.info(" - Total splits: " + numSplits);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("CombineSplits");

    for (int split = 0; split < numSplits; split++) {
        FileInputFormat.addInputPath(job, new Path(examplesPath + "/" + split));
    }/*from w ww .j  a va 2 s.  c  om*/

    if (MavunoUtils.pathExists(conf, exampleStatsPath)) {
        FileInputFormat.addInputPath(job, new Path(exampleStatsPath));
    }

    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);

    if ("pattern".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else if ("context".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    } else if ("pattern+context".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else {
        throw new RuntimeException("Invalid SplitKey in CombineSplits! -- " + splitKey);
    }

    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}

From source file:edu.isi.mavuno.extract.Extract.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorTarget", conf).toLowerCase();
    int minContextMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.Extract.MinMatches", conf));
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.OutputPath", conf);

    sLogger.info("Tool name: Extract");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor arguments: " + extractorArgs);
    sLogger.info(" - Extractor target: " + extractorTarget);
    sLogger.info(" - Min context matches: " + minContextMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("Extract");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    if ("pattern".equals(extractorTarget)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class);
    } else if ("context".equals(extractorTarget)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
        job.setPartitionerClass(ContextPatternWritable.IdPatternPartitioner.class);
    } else {/*from w w  w. j  a va 2 s . co m*/
        throw new RuntimeException("Invalid extractor target in Extract -- " + extractorTarget);
    }

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}

From source file:edu.isi.mavuno.extract.ExtractGlobalStats.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorTarget", conf)
            .toLowerCase();//from w ww . j a v a  2 s  .c o m
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.OutputPath", conf);

    // split examples
    conf.set("Mavuno.Split.InputPath", inputPath);
    conf.set("Mavuno.Split.OutputPath", outputPath + "/../split");
    conf.set("Mavuno.Split.SplitKey", extractorTarget);
    new Split(conf).run();

    // get splits
    FileStatus[] files = MavunoUtils.getDirectoryListing(conf, outputPath + "/../split");
    int split = 0;
    for (FileStatus file : files) {
        if (!file.getPath().getName().endsWith(".examples")) {
            continue;
        }

        conf.set("Mavuno.ExtractGlobalStats.ExamplesPath", file.getPath().toString());

        sLogger.info("Tool name: ExtractGlobalStats");
        sLogger.info(" - Input path: " + inputPath);
        sLogger.info(" - Examples path: " + file.getPath());
        sLogger.info(" - Example split: " + split);
        sLogger.info(" - Corpus path: " + corpusPath);
        sLogger.info(" - Corpus class: " + corpusClass);
        sLogger.info(" - Extractor class: " + extractorClass);
        sLogger.info(" - Extractor class: " + extractorArgs);
        sLogger.info(" - Extractor target: " + extractorTarget);
        sLogger.info(" - Output path: " + outputPath);

        Job job = new Job(conf);
        job.setJobName("ExtractGlobalStats");

        MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath + "/../split/" + split));

        job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        job.setMapOutputKeyClass(ContextPatternWritable.class);
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
        job.setMapOutputValueClass(ContextPatternStatsWritable.class);

        job.setOutputKeyClass(ContextPatternWritable.class);
        job.setOutputValueClass(ContextPatternStatsWritable.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        job.waitForCompletion(true);

        split++;
    }

    // combine splits
    conf.setInt("Mavuno.CombineGlobalStats.TotalSplits", split);
    conf.set("Mavuno.CombineGlobalStats.InputPath", outputPath + "/../split/");
    conf.set("Mavuno.CombineGlobalStats.OutputPath", outputPath);
    new CombineGlobalStats(conf).run();

    MavunoUtils.removeDirectory(conf, outputPath + "/../split");

    return 0;
}