Example usage for org.apache.hadoop.mapreduce.lib.output SequenceFileOutputFormat setOutputCompressionType


Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.setOutputCompressionType.

Prototype

public static void setOutputCompressionType(Job job, CompressionType style) 

Document

Set the CompressionType for the output SequenceFile.
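
Before the project examples below, here is a minimal, self-contained sketch (not taken from any of the source files listed under Usage) of how the method is typically used: enable output compression on the job, pick a codec, and then choose how SequenceFile records are compressed. The class name, the args-based paths, and the DefaultCodec choice are illustrative placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class CompressedSequenceFileWriteJob {

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "compressed-seqfile-output");
        job.setJarByClass(CompressedSequenceFileWriteJob.class);

        // Map-only identity job: text input is repacked into a SequenceFile.
        job.setNumReduceTasks(0);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path(args[0]));

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

        // With the default (identity) mapper, TextInputFormat emits <LongWritable, Text> pairs.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Turn on output compression, select a codec, and compress whole blocks of
        // records rather than individual records (CompressionType.RECORD and NONE
        // are the other options).
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

BLOCK compression usually compresses better than RECORD because the codec sees a whole batch of keys and values at once, which is why most of the examples below pass CompressionType.BLOCK.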

Usage

From source file: org.apache.mahout.math.hadoop.stochasticsvd.YtYJob.java

License: Apache License

public static void run(Configuration conf, Path[] inputPaths, Path outputPath, int k, int p, long seed)
        throws ClassNotFoundException, InterruptedException, IOException {

    Job job = new Job(conf);
    job.setJobName("YtY-job");
    job.setJarByClass(YtYJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(YtYMapper.class);

    job.getConfiguration().setLong(PROP_OMEGA_SEED, seed);
    job.getConfiguration().setInt(PROP_K, k);
    job.getConfiguration().setInt(PROP_P, p);

    /*
     * we must reduce to just one matrix which means we need only one reducer.
     * But it's ok since each mapper outputs only one vector (a packed
     * UpperTriangular) so even if there're thousands of mappers, one reducer
     * should cope just fine.
     */
    job.setNumReduceTasks(1);

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("YtY job unsuccessful.");
    }

}

From source file: org.apache.sqoop.mapreduce.ImportJobBase.java

License: Apache License

/**
 * Configure the output format to use for the job.
 */
@Override
protected void configureOutputFormat(Job job, String tableName, String tableClassName)
        throws ClassNotFoundException, IOException {

    job.setOutputFormatClass(getOutputFormatClass());

    if (isHCatJob) {
        LOG.debug("Configuring output format for HCatalog  import job");
        SqoopHCatUtilities.configureImportOutputFormat(options, job, getContext().getConnManager(), tableName,
                job.getConfiguration());
        return;
    }

    if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
        job.getConfiguration().set("mapred.output.value.class", tableClassName);
    }

    if (options.shouldUseCompression()) {
        FileOutputFormat.setCompressOutput(job, true);

        String codecName = options.getCompressionCodec();
        Class<? extends CompressionCodec> codecClass;
        if (codecName == null) {
            codecClass = GzipCodec.class;
        } else {
            Configuration conf = job.getConfiguration();
            codecClass = CodecMap.getCodec(codecName, conf).getClass();
        }
        FileOutputFormat.setOutputCompressorClass(job, codecClass);

        if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
            SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        }

        // SQOOP-428: Avro expects not a fully qualified class name but a "short"
        // name instead (e.g. "snappy") and it needs to be set in a custom
        // configuration option called "avro.output.codec".
        // The default codec is "deflate".
        if (options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
            if (codecName != null) {
                String shortName = CodecMap.getCodecShortNameByName(codecName, job.getConfiguration());
                // Avro only knows about "deflate" and not "default"
                if (shortName.equalsIgnoreCase("default")) {
                    shortName = "deflate";
                }
                job.getConfiguration().set(AvroJob.OUTPUT_CODEC, shortName);
            } else {
                job.getConfiguration().set(AvroJob.OUTPUT_CODEC, DataFileConstants.DEFLATE_CODEC);
            }
        }
    }

    Path outputPath = context.getDestination();
    FileOutputFormat.setOutputPath(job, outputPath);
}

From source file: org.opensextant.mapreduce.XponentsTaggerDemo.java

License: Apache License

/**
 * Returns 0 = SUCCESS, other than 0 is a FAILURE mode.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    /*
     * Run:
     *    /input/path  /output/path  phase CC  yyyymmdd:yyyymmdd
     *    
     *    phase = geotag | xtax
     */
    LongOpt[] options = { new LongOpt("in", LongOpt.REQUIRED_ARGUMENT, null, 'i'),
            new LongOpt("out", LongOpt.REQUIRED_ARGUMENT, null, 'o'),
            new LongOpt("phase", LongOpt.REQUIRED_ARGUMENT, null, 'p'),
            new LongOpt("cc", LongOpt.REQUIRED_ARGUMENT, null, 'c'),
            new LongOpt("date-range", LongOpt.REQUIRED_ARGUMENT, null, 'd'),
            new LongOpt("utc-range", LongOpt.REQUIRED_ARGUMENT, null, 'u'),
            new LongOpt("log4j-extra-config", LongOpt.REQUIRED_ARGUMENT, null, 'L') };

    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("socgeo", args, "", options);
    String inPath = null;
    String outPath = null;
    String phase = null;
    String cc = null;
    String dateRange = null;
    String utcRange = null;
    String xponentsArchive = null;
    String log4jExtraConfig = null;

    System.out.println(Arrays.toString(args));
    int c;
    while ((c = opts.getopt()) != -1) {
        switch (c) {

        case 0:
            // 0 = Long opt processed.
            break;

        case 'i':
            inPath = opts.getOptarg();
            break;

        case 'o':
            outPath = opts.getOptarg();
            break;

        case 'p':
            phase = opts.getOptarg();
            break;

        case 'c':
            cc = opts.getOptarg();
            break;

        case 'd':
            dateRange = opts.getOptarg();
            break;

        case 'u':
            utcRange = opts.getOptarg();
            break;

        case 'L':
            log4jExtraConfig = opts.getOptarg();
            break;

        default:
            return -2;

        }
    }

    /* Helper resources -- possibly just replaced by existing SocGeoBase utilities.
     */
    XponentsTaggerDemo.initResources();

    /* Job App Configuration 
     */
    Job job = Job.getInstance(getConf(), "Xponents-Tagging");
    job.setJarByClass(XponentsTaggerDemo.class);

    // REQUIRED:
    if (cc == null || phase == null) {
        System.err.println("Phase and CC must be set");
        return -2;
    }
    job.getConfiguration().set("phase", phase);
    job.getConfiguration().set("country", cc);
    if (dateRange != null) {
        job.getConfiguration().set("date-range", dateRange);
    }
    if (utcRange != null) {
        job.getConfiguration().set("utc-range", utcRange);
    }
    /* Mapper */
    if (phase.equalsIgnoreCase("geotag")) {
        job.setMapperClass(GeoTaggerMapper.class);
    } else if (phase.equalsIgnoreCase("xtax")) {
        job.setMapperClass(KeywordTaggerMapper.class);
    }

    if (log4jExtraConfig != null) {
        job.getConfiguration().set(LOG4J_SUPPLEMENTARY_CONFIGURATION, log4jExtraConfig);
    }

    /* Single (default identity) reduce gathers all map output into one file */
    job.setNumReduceTasks(1);

    /* Job Input */
    job.setInputFormatClass(SequenceFileInputFormat.class);

    /* Map Phase output */
    SequenceFileInputFormat.addInputPaths(job, inPath);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);

    /* Job Output */
    job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
    job.getConfiguration().set("mapreduce.map.output.compress.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setOutputPath(job, new Path(outPath));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    job.getConfiguration().set("mapreduce.output.fileoutputformat.compress.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;

}

From source file: org.rdfhdt.mrbuilder.HDTBuilderDriver.java

License: Open Source License

protected boolean runDictionaryJobSampling() throws IOException, ClassNotFoundException, InterruptedException {
    boolean jobOK;
    Job job = null;

    // if input path does not exist, fail
    if (!this.inputFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if samples path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionarySamplesPath())) {
        if (this.conf.getDeleteDictionarySamplesPath()) { // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionarySamplesPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Dictionary samples path does exist: " + this.conf.getDictionarySamplesPath());
            System.out.println("Select other path or use option -ds to overwrite");
            System.exit(-1);
        }
    }

    // Job to create a SequenceInputFormat with Roles
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 1");
    job.setJarByClass(HDTBuilderDriver.class);

    System.out.println("input = " + this.conf.getInputPath());
    System.out.println("samples = " + this.conf.getDictionarySamplesPath());

    FileInputFormat.addInputPath(job, this.conf.getInputPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionarySamplesPath());

    job.setInputFormatClass(LzoTextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setMapperClass(DictionarySamplerMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setCombinerClass(DictionarySamplerReducer.class);
    job.setReducerClass(DictionarySamplerReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(this.conf.getDictionarySampleReducers());

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    return jobOK;
}

From source file: org.rdfhdt.mrbuilder.HDTBuilderDriver.java

License: Open Source License

protected boolean runDictionaryJob()
        throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
    boolean jobOK;
    Job job = null;
    BufferedWriter bufferedWriter;

    // if output path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
        if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
            System.out.println("Select other path or use option -dd to overwrite");
            System.exit(-1);
        }
    }

    // Sample the SequenceInputFormat to do TotalSort and create final output
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    System.out.println("samples = " + this.conf.getDictionarySamplesPath());
    System.out.println("output = " + this.conf.getDictionaryOutputPath());

    FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    // Identity Mapper
    // job.setMapperClass(Mapper.class);
    job.setCombinerClass(DictionaryCombiner.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setReducerClass(DictionaryReducer.class);

    job.setNumReduceTasks(this.conf.getDictionaryReducers());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
    this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
    this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
    this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));

    bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");

    bufferedWriter.close();

    return jobOK;
}

From source file: org.rdfhdt.mrbuilder.HDTBuilderDriver.java

License: Open Source License

protected boolean runTriplesJobSampling() throws ClassNotFoundException, IOException, InterruptedException {
    Job job = null;
    boolean jobOK;
    BufferedWriter bufferedWriter;

    // if input path does not exist, fail
    if (!this.inputFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if dictionary output path does not exist, fail
    if (!this.dictionaryFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary output path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if triples samples path exists...
    if (this.dictionaryFS.exists(this.conf.getTriplesSamplesPath())) {
        if (this.conf.getDeleteTriplesSamplesPath()) { // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getTriplesSamplesPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Triples samples path does exist: " + this.conf.getTriplesSamplesPath());
            System.out.println("Select other path or use option -dst to overwrite");
            System.exit(-1);
        }
    }

    this.conf.setProperty("mapred.child.java.opts",
            "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m");

    // Job to create a SequenceInputFormat
    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 1");

    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getInputPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesSamplesPath());

    job.setInputFormatClass(LzoTextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setMapperClass(TriplesSPOMapper.class);
    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setGroupingComparatorClass(TripleSPOComparator.class);
    job.setMapOutputKeyClass(TripleSPOWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setNumReduceTasks(this.conf.getTriplesReducers());

    DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration());

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue();
    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile())));
    bufferedWriter.write(this.numTriples.toString() + "\n");
    bufferedWriter.close();

    return jobOK;
}

From source file: org.rdfhdt.mrbuilder.HDTBuilderDriver.java

License: Open Source License

protected boolean runTriplesJob()
        throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
    Job job = null;
    boolean jobOK;

    // if triples output path exists...
    if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
        if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively
            this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath());
            System.out.println("Select other path or use option -dt to overwrite");
            System.exit(-1);
        }
    }

    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getTriplesSamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setGroupingComparatorClass(TripleSPOComparator.class);

    job.setPartitionerClass(TotalOrderPartitioner.class);

    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setNumReduceTasks(this.conf.getTriplesReducers());

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    return jobOK;
}

From source file: wikiduper.clir.rp.RepackText.java

License: Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file")
            .create(MAPPING_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
            .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg().withDescription("two-letter language code")
            .create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

    if (!"block".equals(compressionType) && !"record".equals(compressionType)
            && !"none".equals(compressionType)) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    // this is the default block size
    int blocksize = 1000000;

    Job job = Job.getInstance(getConf());
    job.setJarByClass(RepackText.class);
    job.setJobName(String.format("RepackText[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language));

    job.getConfiguration().set(DOCNO_MAPPING_FIELD, mappingFile);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - compression type: " + compressionType);
    LOG.info(" - language: " + language);

    if ("block".equals(compressionType)) {
        LOG.info(" - block size: " + blocksize);
    }

    job.setNumReduceTasks(0);

    SequenceFileInputFormat.addInputPath(job, new Path(inputPath));
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    if ("none".equals(compressionType)) {
        SequenceFileOutputFormat.setCompressOutput(job, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(job, true);

        if ("record".equals(compressionType)) {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
            job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(TextDocument.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}