Example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass

List of usage examples for the setOutputFormatClass method of org.apache.hadoop.mapreduce.Job

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setOutputFormatClass.

Prototype

public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException 

Source Link

Document

Set the OutputFormat for the job.

Usage

From source file:com.github.libsml.commons.util.HadoopUtils.java

License:Apache License

/**
 * Creates a {@link Job} from {@code conf} and configures its input, mapper, reducer and output.
 *
 * <p>The job jar is located via the reducer class unless the reducer is {@code Reducer.class}
 * itself, in which case the mapper class is used. Map output key/value classes are only set
 * when the corresponding argument is non-null, and {@code mapred.compress.map.output} is
 * switched on.
 *
 * @param inputPath    value written to {@code mapred.input.dir}
 * @param outputPath   value written to {@code mapred.output.dir}
 * @param inputFormat  input format class of the job
 * @param mapper       mapper class
 * @param mapperKey    map output key class, or {@code null} to leave it unset
 * @param mapperValue  map output value class, or {@code null} to leave it unset
 * @param reducer      reducer class
 * @param reducerKey   job output key class
 * @param reducerValue job output value class
 * @param outputFormat output format class of the job
 * @param conf         configuration handed to {@code Job.getInstance}
 * @return the configured, not yet submitted job
 * @throws IOException declared for job creation
 * @throws IllegalStateException if both {@code mapper} and {@code reducer} are the base
 *         {@code Mapper}/{@code Reducer} classes
 */
public static Job prepareJob(String inputPath, String outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
        Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
        Class<? extends OutputFormat> outputFormat, Configuration conf) throws IOException {

    final Job job = Job.getInstance(conf);
    final Configuration jobConf = job.getConfiguration();

    // The jar is found through a user-supplied class: the reducer if it is a real
    // implementation, otherwise the mapper. If neither is user code there is nothing to go on.
    final boolean baseReducer = reducer.equals(Reducer.class);
    if (baseReducer && mapper.equals(Mapper.class)) {
        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
    }
    job.setJarByClass(baseReducer ? mapper : reducer);

    // Input side.
    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath);

    // Map side; the intermediate key/value classes are optional.
    job.setMapperClass(mapper);
    if (mapperKey != null) {
        job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
        job.setMapOutputValueClass(mapperValue);
    }
    jobConf.setBoolean("mapred.compress.map.output", true);

    // Reduce side.
    job.setReducerClass(reducer);
    job.setOutputKeyClass(reducerKey);
    job.setOutputValueClass(reducerValue);

    // Output side.
    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath);

    return job;
}

From source file:com.github.sakserv.minicluster.mapreduce.Driver.java

License:Apache License

/**
 * Runs a job that uses {@code WordMapper} and {@code SumReducer} over text input.
 *
 * <p>Expects exactly two arguments: the input path and the output path. With any other
 * argument count a usage line is printed and the JVM exits with status -1. The
 * {@code configuration} field is created here when it has not been set beforehand.
 *
 * @param args {@code [input, output]}
 * @throws Exception propagated from job setup or execution
 */
public static void main(String[] args) throws Exception {

    if (args.length != 2) {
        System.out.println("usage: [input] [output]");
        System.exit(-1);
    }

    if (configuration == null) {
        configuration = new Configuration();
    }

    final Job wordCountJob = Job.getInstance(configuration);
    wordCountJob.setJarByClass(Driver.class);

    // Processing classes.
    wordCountJob.setMapperClass(WordMapper.class);
    wordCountJob.setReducerClass(SumReducer.class);

    // Output record types.
    wordCountJob.setOutputKeyClass(Text.class);
    wordCountJob.setOutputValueClass(IntWritable.class);

    // Plain-text formats on both ends.
    wordCountJob.setInputFormatClass(TextInputFormat.class);
    wordCountJob.setOutputFormatClass(TextOutputFormat.class);

    final Path inputPath = new Path(args[0]);
    final Path outputPath = new Path(args[1]);
    FileInputFormat.setInputPaths(wordCountJob, inputPath);
    FileOutputFormat.setOutputPath(wordCountJob, outputPath);

    // Blocks until the job finishes; the success flag is not inspected.
    wordCountJob.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.InLinks.java

License:Apache License

/**
 * Runs the job that computes the number of in-links for every page.
 *
 * <p>The implementation is very similar to the classic word count example. Results are
 * written as a sequence file into the {@code inlinks} subdirectory of {@code outputDir}.
 *
 * @param conf      configuration used to create the job
 * @param linksFile input path with the links
 * @param outputDir parent directory for the {@code inlinks} output
 * @throws Exception propagated from job setup or execution
 */
private void computeInLinks(Configuration conf, Path linksFile, Path outputDir) throws Exception {

    final Job inLinksJob = Job.getInstance(conf, "InLinks:Computation");
    inLinksJob.setJarByClass(InLinks.class);

    // Text in, sequence file out.
    inLinksJob.setInputFormatClass(TextInputFormat.class);
    inLinksJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    // The reducer class is also registered as the combiner.
    inLinksJob.setMapperClass(InLinksMapper.class);
    inLinksJob.setCombinerClass(InLinksReducer.class);
    inLinksJob.setReducerClass(InLinksReducer.class);

    inLinksJob.setOutputKeyClass(IntWritable.class);
    inLinksJob.setOutputValueClass(IntWritable.class);

    final Path inLinksDir = new Path(outputDir, "inlinks");
    FileInputFormat.addInputPath(inLinksJob, linksFile);
    FileOutputFormat.setOutputPath(inLinksJob, inLinksDir);

    inLinksJob.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.InLinks.java

License:Apache License

/**
 * Runs the top-N job over the {@code inlinks} output and writes a text result.
 *
 * <p>N is read from the {@code inlinks.top_results} configuration entry. Input is the
 * {@code inlinks} subdirectory of {@code outputDir}; output goes to
 * {@code inlinks-top<N>} in the same directory. A single reduce task is used.
 *
 * @param conf      configuration used to create the job and to read N
 * @param outputDir directory holding the {@code inlinks} data and receiving the summary
 * @throws Exception propagated from job setup or execution
 */
private void summarizeResults(Configuration conf, Path outputDir) throws Exception {

    final String topResultsSetting = conf.get("inlinks.top_results");
    final int topResults = Integer.parseInt(topResultsSetting);

    final Job topNJob = Job.getInstance(conf, "InLinks:TopN");
    topNJob.setJarByClass(InLinks.class);

    // Sequence file in, text out.
    topNJob.setInputFormatClass(SequenceFileInputFormat.class);
    topNJob.setOutputFormatClass(TextOutputFormat.class);

    topNJob.setMapperClass(InLinksTopNMapper.class);
    topNJob.setMapOutputKeyClass(IntWritable.class);
    topNJob.setMapOutputValueClass(IntWritable.class);

    topNJob.setReducerClass(InLinksTopNReducer.class);
    topNJob.setOutputKeyClass(IntWritable.class);
    topNJob.setOutputValueClass(IntWritable.class);

    final Path inLinksDir = new Path(outputDir, "inlinks");
    final Path summaryDir = new Path(outputDir, "inlinks-top" + topResults);
    FileInputFormat.addInputPath(topNJob, inLinksDir);
    FileOutputFormat.setOutputPath(topNJob, summaryDir);

    topNJob.setNumReduceTasks(1);
    topNJob.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.PageRank.java

License:Apache License

/**
 * Runs the job that turns the links input file into the transition matrix.
 *
 * <p>The job reads the links-simple-sorted.txt input file and generates the corresponding
 * transition matrix. The matrix is divided into square blocks and each block is represented
 * by its nonzero entries; see Section 5.2 (and 5.2.3 in particular) of Mining of Massive
 * Datasets (http://infolab.stanford.edu/~ullman/mmds.html) for details. The output is
 * written to the "M" subdir of the output dir.
 *
 * @param conf      configuration used to create the job
 * @param linksFile input path with the links
 * @param outputDir parent directory for the "M" output
 * @throws Exception propagated from job setup or execution
 */
private void createTransitionMatrix(Configuration conf, Path linksFile, Path outputDir) throws Exception {

    final Job matrixJob = Job.getInstance(conf, "PageRank:Matrix");
    final Configuration matrixConf = matrixJob.getConfiguration();

    matrixJob.setJarByClass(PageRank.class);
    matrixJob.setInputFormatClass(TextInputFormat.class);

    // Map side: compressed intermediate output using DefaultCodec.
    matrixJob.setMapperClass(PageRankMatrixMapper.class);
    matrixConf.setBoolean("mapreduce.map.output.compress", true);
    matrixConf.setClass("mapreduce.map.output.compress.codec", DefaultCodec.class, CompressionCodec.class);
    matrixJob.setMapOutputKeyClass(ShortArrayWritable.class);
    matrixJob.setMapOutputValueClass(ShortArrayWritable.class);

    // Reduce side: block-compressed sequence file output.
    matrixJob.setReducerClass(PageRankMatrixReducer.class);
    SequenceFileOutputFormat.setCompressOutput(matrixJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(matrixJob, CompressionType.BLOCK);
    SequenceFileOutputFormat.setOutputCompressorClass(matrixJob, DefaultCodec.class);
    matrixJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    matrixJob.setOutputKeyClass(ShortArrayWritable.class);
    matrixJob.setOutputValueClass(MatrixBlockWritable.class);

    final Path matrixDir = new Path(outputDir, "M");
    FileInputFormat.addInputPath(matrixJob, linksFile);
    FileOutputFormat.setOutputPath(matrixJob, matrixDir);

    matrixJob.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.PageRank.java

License:Apache License

/**
 * Runs one iteration of the power iteration method used to compute PageRank.
 *
 * <p>The map task processes each block M_{i,j}, loads the corresponding stripe j of the
 * vector v_{k-1} and produces the partial result of the stripe i of the vector v_k. The
 * reduce task sums all the partial results of v_k and adds the teleportation factor (the
 * combiner only sums all the partial results). See Section 5.2 (and 5.2.3 in particular) of
 * Mining of Massive Datasets (http://infolab.stanford.edu/~ullman/mmds.html) for details.
 * The output is written in a "vk" subdir of the output dir, where k is the iteration
 * number. MapFileOutputFormat is used to keep an array of the stripes of v.
 *
 * @param iter      iteration number k, used to name the output subdir
 * @param conf      configuration used to create the job
 * @param outputDir directory holding the "M" input and receiving the "vk" output
 * @throws Exception propagated from job setup or execution
 */
private void pageRankIteration(int iter, Configuration conf, Path outputDir) throws Exception {

    final Job iterationJob = Job.getInstance(conf, "PageRank:Iteration");
    iterationJob.setJarByClass(PageRank.class);

    // Sequence file in, map file out.
    iterationJob.setInputFormatClass(SequenceFileInputFormat.class);
    iterationJob.setOutputFormatClass(MapFileOutputFormat.class);

    iterationJob.setMapperClass(PageRankIterationMapper.class);
    iterationJob.setMapOutputKeyClass(ShortWritable.class);
    iterationJob.setMapOutputValueClass(FloatArrayWritable.class);

    iterationJob.setCombinerClass(PageRankIterationCombiner.class);

    iterationJob.setReducerClass(PageRankIterationReducer.class);
    iterationJob.setOutputKeyClass(ShortWritable.class);
    iterationJob.setOutputValueClass(FloatArrayWritable.class);

    final Path matrixDir = new Path(outputDir, "M");
    final Path vectorDir = new Path(outputDir, "v" + iter);
    FileInputFormat.addInputPath(iterationJob, matrixDir);
    FileOutputFormat.setOutputPath(iterationJob, vectorDir);

    iterationJob.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.PageRank.java

License:Apache License

/**
 * Runs the job that creates a plain text file with the top N PageRanks and page titles.
 *
 * <p>Each map task emits the top N PageRanks it receives, and the reduce task merges the
 * partial results into the global top N PageRanks. A single reducer is used in the job in
 * order to have access to all the individual top N PageRanks from the mappers. The reducer
 * looks up the titles in the index built by TitleIndex. This job was designed considering
 * that N is small. N is read from the {@code pagerank.top_results} configuration entry.
 *
 * @param iter      iteration number whose "v" + iter output is summarized
 * @param conf      configuration used to create the job and to read N
 * @param outputDir directory holding the iteration output and receiving the summary
 * @throws Exception propagated from job setup or execution
 */
private void summarizeResults(int iter, Configuration conf, Path outputDir) throws Exception {

    final String topResultsSetting = conf.get("pagerank.top_results");
    final int topResults = Integer.parseInt(topResultsSetting);

    final Job topNJob = Job.getInstance(conf, "PageRank:TopN");
    topNJob.setJarByClass(PageRank.class);

    // Sequence file in, text out.
    topNJob.setInputFormatClass(SequenceFileInputFormat.class);
    topNJob.setOutputFormatClass(TextOutputFormat.class);

    topNJob.setMapperClass(PageRankTopNMapper.class);
    topNJob.setMapOutputKeyClass(FloatWritable.class);
    topNJob.setMapOutputValueClass(IntWritable.class);

    topNJob.setReducerClass(PageRankTopNReducer.class);
    topNJob.setOutputKeyClass(FloatWritable.class);
    topNJob.setOutputValueClass(Text.class);

    final Path vectorDir = new Path(outputDir, "v" + iter);
    final Path summaryDir = new Path(outputDir, "v" + iter + "-top" + topResults);
    FileInputFormat.addInputPath(topNJob, vectorDir);
    FileOutputFormat.setOutputPath(topNJob, summaryDir);

    topNJob.setNumReduceTasks(1);
    topNJob.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.TitleIndex.java

License:Apache License

/**
 * Runs the job that creates a MapFile of the titles indexed by the page id.
 *
 * @param args {@code [titles-sorted.txt, output-dir]}
 * @return 0 when the job succeeds, 1 when the job fails, 2 when the argument count is wrong
 * @throws Exception propagated from job setup or execution
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Usage: TitleIndex <titles-sorted.txt> <output-dir>");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    Path titlesFile = new Path(args[0]);
    Path outputDir = new Path(args[1]);

    Configuration conf = getConf();

    // Do not create _SUCCESS files. MapFileOutputFormat.getReaders calls
    // try to read the _SUCCESS as another MapFile dir.
    conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false");

    // This job creates a MapFile of the titles indexed by the page id.
    // UnsplittableTextInputFormat is used to ensure that the same map task
    // gets all the lines in the titlesFile and it can count the line
    // numbers. The number of reduce tasks is set to 0.

    Job job = Job.getInstance(conf, "TitleIndex");

    // NOTE(review): the jar is located through InLinks rather than this class;
    // presumably both ship in the same jar - confirm before changing.
    job.setJarByClass(InLinks.class);
    job.setInputFormatClass(UnsplittableTextInputFormat.class);
    job.setMapperClass(TitleIndexMapper.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, titlesFile);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setNumReduceTasks(0);

    // Report job failure through the exit status instead of always returning 0.
    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : 1;
}

From source file:com.goldsaxfoundation.bigdata.Module5.SimpleMapReduce.java

License:Apache License

/**
 * Configures and runs the "wordcount" job with text input and text output.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @throws Exception propagated from job setup or execution
 */
public static void main(String[] args) throws Exception {
    final Configuration configuration = new Configuration();
    final Job wordCount = new Job(configuration, "wordcount");
    wordCount.setJarByClass(SimpleMapReduce.class);

    // Processing classes.
    wordCount.setMapperClass(Map.class);
    wordCount.setReducerClass(Reduce.class);

    // Output record types.
    wordCount.setOutputKeyClass(Text.class);
    wordCount.setOutputValueClass(IntWritable.class);

    // Plain-text formats on both ends.
    wordCount.setInputFormatClass(TextInputFormat.class);
    wordCount.setOutputFormatClass(TextOutputFormat.class);

    final Path inputPath = new Path(args[0]);
    final Path outputPath = new Path(args[1]);
    FileInputFormat.addInputPath(wordCount, inputPath);
    FileOutputFormat.setOutputPath(wordCount, outputPath);

    // Blocks until the job finishes; the success flag is not inspected.
    wordCount.waitForCompletion(true);
}

From source file:com.google.cloud.bigtable.mapreduce.Export.java

License:Apache License

/**
 * Sets up the actual job./*from   w  w w  .  java 2 s  . c  o  m*/
 *
 * @param conf  The current configuration.
 * @param args  The command line parameters.
 * @return The newly created job.
 * @throws java.io.IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    conf.setIfUnset("hbase.client.connection.impl", BigtableConfiguration.getConnectionClass().getName());
    conf.setIfUnset(BigtableOptionsFactory.BIGTABLE_RPC_TIMEOUT_MS_KEY, "60000");
    conf.setBoolean(TableInputFormat.SHUFFLE_MAPS, true);

    String tableName = args[0];
    Path outputDir = new Path(args[1]);
    Job job = Job.getInstance(conf, NAME + "_" + tableName);
    job.setJobName(NAME + "_" + tableName);
    job.setJarByClass(Export.class);
    // Set optional scan parameters
    Scan s = getConfiguredScanForJob(conf, args);
    TableMapReduceUtil.initTableMapperJob(tableName, s, IdentityTableMapper.class, ImmutableBytesWritable.class,
            Result.class, job, false);
    // No reducers.  Just write straight to output files.
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Result.class);
    FileOutputFormat.setOutputPath(job, outputDir); // job conf doesn't contain the conf so doesn't have a default fs.
    return job;
}