Example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job.setNumReduceTasks.

Prototype

public void setNumReduceTasks(int tasks) throws IllegalStateException 

Document

Set the number of reduce tasks for the job.
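As a quick orientation, here is a minimal driver sketch (not taken from any of the sources quoted below) that only configures the reduce parallelism and otherwise relies on Hadoop's default identity Mapper and Reducer; the class name and the command-line input/output paths are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetNumReduceTasksExample { // placeholder driver class

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setNumReduceTasks-example");
        job.setJarByClass(SetNumReduceTasksExample.class);

        // setNumReduceTasks must be called while the job is still being defined;
        // calling it after submission throws IllegalStateException.
        job.setNumReduceTasks(4); // e.g. 4 reducers; 0 makes the job map-only

        // Default TextInputFormat plus the identity Mapper/Reducer pass
        // LongWritable offsets and Text lines straight through.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // placeholder input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // placeholder output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The number of reduce tasks determines how many part-r-* files the job produces; the single-reducer examples below read back exactly one part-r-00000 file.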

Usage

From source file:com.datasalt.pangool.tuplemr.mapred.TestRollup.java

License:Apache License

@Test
public void test1() throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, TupleMRException {

    String input = TEST_OUT + "/input";
    String output = TEST_OUT + "/output";

    String[] inputElements = new String[] { "ES 20 listo 250", "US 14 beber 202", "US 14 perro 180",
            "US 14 perro 170", "US 15 jauja 160", "US 16 listo 160", "XE 20 listo 230" };

    Schema schema = new Schema("schema", Fields.parse("country:string, age:int, name:string, height:int"));
    ITuple[] tuples = new ITuple[inputElements.length];
    int i = 0;
    for (String inputElement : inputElements) {
        withInput(input, writable(inputElement));
        tuples[i++] = createTuple(inputElement, schema);
    }
    Path outputPath = new Path(output);

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("country", "age", "name");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC).add("name", Order.ASC)
            .add("height", Order.DESC));
    builder.setRollupFrom("country");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class,
            Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new Map());

    Job job = builder.createJob();
    try {
        job.setNumReduceTasks(1);
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    FileSystem fs = FileSystem.get(getConf());
    Path outputFile = new Path(output + "/part-r-00000");
    checkRollupOutput(outputFile, 0, 2);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, outputFile, getConf());

    assertOutput(reader, "OPEN 0", tuples[0]);
    assertOutput(reader, "OPEN 1", tuples[0]);
    assertOutput(reader, "OPEN 2", tuples[0]);
    assertOutput(reader, "ELEMENT", tuples[0]);
    assertOutput(reader, "CLOSE 2", tuples[0]);
    assertOutput(reader, "CLOSE 1", tuples[0]);
    assertOutput(reader, "CLOSE 0", tuples[0]);

    assertOutput(reader, "OPEN 0", tuples[1]);
    assertOutput(reader, "OPEN 1", tuples[1]);
    assertOutput(reader, "OPEN 2", tuples[1]);
    assertOutput(reader, "ELEMENT", tuples[1]);
    assertOutput(reader, "CLOSE 2", tuples[1]);

    assertOutput(reader, "OPEN 2", tuples[2]);
    assertOutput(reader, "ELEMENT", tuples[2]);
    assertOutput(reader, "ELEMENT", tuples[3]);
    assertOutput(reader, "CLOSE 2", tuples[3]);
    assertOutput(reader, "CLOSE 1", tuples[3]);

    assertOutput(reader, "OPEN 1", tuples[4]);
    assertOutput(reader, "OPEN 2", tuples[4]);
    assertOutput(reader, "ELEMENT", tuples[4]);
    assertOutput(reader, "CLOSE 2", tuples[4]);
    assertOutput(reader, "CLOSE 1", tuples[4]);

    assertOutput(reader, "OPEN 1", tuples[5]);
    assertOutput(reader, "OPEN 2", tuples[5]);
    assertOutput(reader, "ELEMENT", tuples[5]);
    assertOutput(reader, "CLOSE 2", tuples[5]);
    assertOutput(reader, "CLOSE 1", tuples[5]);
    assertOutput(reader, "CLOSE 0", tuples[5]);

    assertOutput(reader, "OPEN 0", tuples[6]);
    assertOutput(reader, "OPEN 1", tuples[6]);
    assertOutput(reader, "OPEN 2", tuples[6]);
    assertOutput(reader, "ELEMENT", tuples[6]);
    assertOutput(reader, "CLOSE 2", tuples[6]);
    assertOutput(reader, "CLOSE 1", tuples[6]);
    assertOutput(reader, "CLOSE 0", tuples[6]);

    reader.close();
    cleanUp();
    trash(TEST_OUT);
}

From source file:com.datasalt.pangool.tuplemr.mapred.TestRollup.java

License:Apache License

@Test
public void test2() throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, TupleMRException {

    String input = TEST_OUT + "/input";
    String output = TEST_OUT + "/output";

    String[] inputElements = new String[] { "ES 20 listo 250", "US 14 beber 202", "US 14 perro 180",
            "US 14 perro 170", "US 15 jauja 160", "US 16 listo 160", "XE 16 listo 230" };

    Schema schema = new Schema("schema", Fields.parse("country:string, age:int, name:string, height:int"));
    ITuple[] tuples = new ITuple[inputElements.length];
    int i = 0;
    for (String inputElement : inputElements) {
        withInput(input, writable(inputElement));
        tuples[i++] = createTuple(inputElement, schema);
    }
    Path outputPath = new Path(output);

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("age", "name", "country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC).add("name", Order.ASC)
            .add("height", Order.DESC));
    builder.setRollupFrom("age");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class,
            Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new Map());

    Job job = builder.createJob();
    try {
        job.setNumReduceTasks(1);
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    FileSystem fs = FileSystem.get(getConf());
    Path outputFile = new Path(output + "/part-r-00000");
    checkRollupOutput(outputFile, 1, 2);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, outputFile, getConf());

    assertOutput(reader, "OPEN 1", tuples[0]);
    assertOutput(reader, "OPEN 2", tuples[0]);
    assertOutput(reader, "ELEMENT", tuples[0]);
    assertOutput(reader, "CLOSE 2", tuples[0]);
    assertOutput(reader, "CLOSE 1", tuples[0]);

    assertOutput(reader, "OPEN 1", tuples[1]);
    assertOutput(reader, "OPEN 2", tuples[1]);
    assertOutput(reader, "ELEMENT", tuples[1]);
    assertOutput(reader, "CLOSE 2", tuples[1]);

    assertOutput(reader, "OPEN 2", tuples[2]);
    assertOutput(reader, "ELEMENT", tuples[2]);
    assertOutput(reader, "ELEMENT", tuples[3]);
    assertOutput(reader, "CLOSE 2", tuples[3]);
    assertOutput(reader, "CLOSE 1", tuples[3]);

    assertOutput(reader, "OPEN 1", tuples[4]);
    assertOutput(reader, "OPEN 2", tuples[4]);
    assertOutput(reader, "ELEMENT", tuples[4]);
    assertOutput(reader, "CLOSE 2", tuples[4]);
    assertOutput(reader, "CLOSE 1", tuples[4]);

    assertOutput(reader, "OPEN 1", tuples[5]);
    assertOutput(reader, "OPEN 2", tuples[5]);
    assertOutput(reader, "ELEMENT", tuples[5]);
    assertOutput(reader, "CLOSE 2", tuples[5]);
    assertOutput(reader, "CLOSE 1", tuples[5]);

    assertOutput(reader, "OPEN 1", tuples[6]);
    assertOutput(reader, "OPEN 2", tuples[6]);
    assertOutput(reader, "ELEMENT", tuples[6]);
    assertOutput(reader, "CLOSE 2", tuples[6]);
    assertOutput(reader, "CLOSE 1", tuples[6]);

    reader.close();
    cleanUp();
    trash(TEST_OUT);
}

From source file:com.datasalt.pangool.tuplemr.mapred.TestRollup.java

License:Apache License

/**
 * Tests the case in which the reducer receives no data.
 */
@Test
public void testNoDataReducer() throws IOException, InterruptedException, ClassNotFoundException,
        InstantiationException, IllegalAccessException, TupleMRException {

    String input = TEST_OUT + "/input";
    String output = TEST_OUT + "/output";

    withInput(input, writable("ES 20 listo 250"));

    Schema schema = new Schema("schema", Fields.parse("country:string, age:int, name:string, height:int"));
    Path outputPath = new Path(output);

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("age", "name", "country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC).add("name", Order.ASC));
    builder.setRollupFrom("age");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class,
            Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new DoNothingMap());

    Job job = builder.createJob();
    try {
        job.setNumReduceTasks(1);
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    cleanUp();
    trash(TEST_OUT);
}

From source file:com.datasalt.pangool.tuplemr.mapred.TestTupleMRJob.java

License:Apache License

@Test
public void testFillingTuplesJob()
        throws IOException, ClassNotFoundException, InterruptedException, TupleMRException {
    int NUM_ROWS_TO_GENERATE = 100;

    Configuration conf = getConf();
    String input = TestTupleMRJob.class + "-input";
    String output = TestTupleMRJob.class + "-output";

    ITuple tuple = new Tuple(SCHEMA);
    for (int i = 0; i < NUM_ROWS_TO_GENERATE; i++) {
        withTupleInput(input, fillTuple(true, tuple));
    }

    TupleMRBuilder builder = new TupleMRBuilder(getConf(), "test");
    builder.addTupleInput(new Path(input), new TupleMapper<ITuple, NullWritable>() {

        @Override
        public void map(ITuple iTuple, NullWritable nullWritable, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException {
            collector.write(fillTuple(true, iTuple));
        }
    });
    builder.setTupleReducer(new TupleReducer<ITuple, NullWritable>() {
        @Override
        public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException, TupleMRException {
            for (ITuple tuple : tuples) {
                collector.write(fillTuple(true, tuple), NullWritable.get());
            }
        }
    });
    builder.addIntermediateSchema(SCHEMA);
    builder.setGroupByFields(SCHEMA.getField(0).getName());
    builder.setTupleOutput(new Path(output), SCHEMA);

    Job job = builder.createJob();
    job.setNumReduceTasks(1);
    try {
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    final AtomicInteger count = new AtomicInteger();
    readTuples(new Path(output + "/part-r-00000"), getConf(), new TupleVisitor() {

        @Override
        public void onTuple(ITuple tuple) {
            count.incrementAndGet();
        }
    });

    assertEquals(NUM_ROWS_TO_GENERATE, count.get());

    trash(input);
    trash(output);
}

From source file:com.datasalt.pangool.tuplemr.mapred.TestTupleMRJob.java

License:Apache License

@Test
public void testJobWithNulls()
        throws IOException, TupleMRException, ClassNotFoundException, InterruptedException {
    Configuration conf = getConf();
    String input1 = TestTupleMRJob.class.getCanonicalName() + "-input1";
    String input2 = TestTupleMRJob.class.getCanonicalName() + "-input2";
    String output = TestTupleMRJob.class.getCanonicalName() + "-output";

    final Schema schemaNoNulls = new Schema("NoNulls", Fields.parse("f1:int,f2:string"));
    final Schema schemaNulls = new Schema("Nulls", Fields.parse("f1:int?,f2:string?"));
    Tuple t1 = new Tuple(schemaNoNulls);
    Tuple t2 = new Tuple(schemaNulls);

    t1.set(0, 0);
    t1.set(1, "nn");
    withTupleInput(input1, t1);

    Object tuples[][] = new Object[][] { new Object[] { 0, null }, new Object[] { 0, "n1" },
            new Object[] { null, "n2" } };
    for (Object[] tuple : tuples) {
        t2.set(0, tuple[0]);
        t2.set(1, tuple[1]);
        withTupleInput(input2, t2);
    }

    TupleMRBuilder builder = new TupleMRBuilder(getConf(), "test");
    builder.addTupleInput(new Path(input1), new IdentityTupleMapper());
    builder.addTupleInput(new Path(input2), new IdentityTupleMapper());

    builder.setTupleReducer(new TupleReducer<ITuple, NullWritable>() {
        @Override
        public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException, TupleMRException {
            int count = 0;
            for (ITuple tuple : tuples) {
                Tuple t = new Tuple(schemaNulls);
                t.set(0, tuple.get(0));
                t.set(1, tuple.get(1));
                collector.write(t, NullWritable.get());
                count++;
            }
            if (group.get(0) == null) {
                assertEquals(1, count);
            } else if (((Integer) group.get(0)) == 0) {
                assertEquals(3, count);
            }
        }
    });
    builder.addIntermediateSchema(schemaNoNulls);
    builder.addIntermediateSchema(schemaNulls);
    builder.setGroupByFields("f1");
    builder.setOrderBy(OrderBy.parse("f1:desc|null_smallest").addSchemaOrder(Criteria.Order.ASC));
    builder.setSpecificOrderBy("NoNulls", OrderBy.parse("f2:asc|null_biggest"));
    builder.setSpecificOrderBy("Nulls", OrderBy.parse("f2:asc|null_biggest"));
    builder.setTupleOutput(new Path(output), schemaNulls);

    Job job = builder.createJob();
    job.setNumReduceTasks(1);
    try {
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    final Object expectedOutput[][] = new Object[][] { new Object[] { 0, "nn" }, new Object[] { 0, "n1" },
            new Object[] { 0, null }, new Object[] { null, "n2" } };

    boolean debug = false;
    if (debug) {
        readTuples(new Path(output + "/part-r-00000"), getConf(), new TupleVisitor() {
            @Override
            public void onTuple(ITuple t) {
                System.out.println(t);
            }
        });
    }

    readTuples(new Path(output + "/part-r-00000"), getConf(), new TupleVisitor() {
        int i = 0;

        @Override
        public void onTuple(ITuple t) {
            assertEqualsNull(expectedOutput[i][0], t.get(0));
            Object f2 = t.get(1);
            f2 = (f2 != null) ? f2.toString() : f2;
            assertEqualsNull(expectedOutput[i][1], f2);
            i++;
        }
    });

    trash(input1);
    trash(input2);
    trash(output);
}

From source file:com.declum.squzer.example.hbase.table2file.Export.java

License:Apache License

/**
 * Sets up the actual job.
 * 
 * @param conf
 *            The current configuration.
 * @param args
 *            The command line parameters.
 * @return The newly created job.
 * @throws IOException
 *             When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    String tableName = args[0];
    Path outputDir = new Path(args[1]);

    Job job = Job.getInstance(conf);
    job.setJobName(NAME + "_" + tableName);
    job.setJarByClass(Exporter.class);
    // TODO: Allow passing filter and subset of rows/columns.
    Scan s = new Scan();
    // Optional arguments.
    int versions = args.length > 2 ? Integer.parseInt(args[2]) : 1;
    s.setMaxVersions(versions);
    long startTime = args.length > 3 ? Long.parseLong(args[3]) : 0L;
    long endTime = args.length > 4 ? Long.parseLong(args[4]) : Long.MAX_VALUE;
    s.setTimeRange(startTime, endTime);
    s.setCacheBlocks(false);
    if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) {
        s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY)));
    }
    LOG.info("verisons=" + versions + ", starttime=" + startTime + ", endtime=" + endTime);
    TableMapReduceUtil.initTableMapperJob(tableName, s, Exporter.class, null, null, job);
    // No reducers. Just write straight to output files.
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Result.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    return job;
}
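A possible driver for this helper (not part of the quoted source) simply builds the job and blocks on completion; HBaseConfiguration.create() is assumed to supply the base configuration:

public static void main(String[] args) throws Exception {
    // Hypothetical entry point: args[0] = table name, args[1] = output directory,
    // optional args[2..4] = max versions, start time, end time (as above).
    Configuration conf = HBaseConfiguration.create();
    Job job = createSubmittableJob(conf, args);
    // Because setNumReduceTasks(0) makes this a map-only job, the mappers
    // write the SequenceFile output directly.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}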

From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java

License:Apache License

/**
 * Converts the input documents into token arrays using the
 * {@link StringTuple}. The input documents have to be in the
 * {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document is to be created
 * @param type
 *            The annotation type representing the tokens
 * @param feature
 *            The name of the feature holding the token value
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void tokenizeDocuments(Path input, String type, String feature, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of conf
    // values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(TOKEN_TYPE, type);
    conf.set(FEATURE_NAME, feature);

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::BehemothTokenizer: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(BehemothTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}
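
A hedged invocation sketch for the method above; the annotation type "Token", the feature name "string", and the paths are assumptions, not values mandated by the quoted code:

// Tokenize Behemoth documents stored as SequenceFiles into StringTuple token arrays.
Path docs = new Path("/data/behemoth/corpus");    // assumed input directory
Path tokens = new Path("/data/behemoth/tokens");  // assumed output directory
BehemothDocumentProcessor.tokenizeDocuments(docs, "Token", "string", tokens);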

From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java

License:Apache License

/**
 * Converts the input documents into token arrays using the
 * {@link StringTuple}. The input documents have to be in the
 * {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document is to be created
 * @param analyzerClass
 *            The Lucene {@link Analyzer} for tokenizing the UTF-8 text
 */
public static void tokenizeDocuments(Path input, Class<? extends Analyzer> analyzerClass, Path output,
        Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf
    // values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(ANALYZER_CLASS, analyzerClass.getName());

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::LuceneTokenizer: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(LuceneTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");

}

From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java

License:Apache License

public static void dumpLabels(Path input, Path output, Configuration baseConf)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf
    // values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::LabelDumper: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(BehemothLabelMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");

}

From source file:com.digitalpebble.behemoth.mahout.DocumentProcessor.java

License:Apache License

/**
 * Converts the input documents into token arrays using the
 * {@link StringTuple}. The input documents have to be in the
 * {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document is to be created
 * @param type
 *            The annotation type representing the tokens
 * @param feature
 *            The name of the feature holding the token value
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void tokenizeDocuments(Path input, String type, String feature, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of conf
    // values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(TOKEN_TYPE, type);
    conf.set(FEATURE_NAME, feature);

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + input);
    job.setJarByClass(DocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(SequenceFileTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);

    job.waitForCompletion(true);
}