List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass
public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException
From source file:csc555.ebratt.depaul.edu.GildPercentDriverPass1.java
License:Open Source License
/** * /*from w w w. j av a 2s . c om*/ * Runs the driver by creating a new hadoop Job based on the configuration. * Defines the path in/out based on the first two arguments. Allows for an * optional combiner based on the 4th argument. * * @param args * [0] the input directory on HDFS * @param args * [1] the output directory on HDFS * @param args * [2] tells the system whether or not to use a combiner ("yes") * and, if so, it will use the GildPercentReducerPass1.class as the * combiner. * @throws Exception * if there is an issue with any of the arguments * */ @Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String groupBy = getConf().get("groupBy"); StringBuffer sb = new StringBuffer(); sb.append("gild percent of: "); sb.append(groupBy); job.setJobName(sb.toString()); Path in = new Path(args[0]); Path out = new Path(args[1]); FileInputFormat.setInputPaths(job, in); FileOutputFormat.setOutputPath(job, out); // testing -- ensure each node gets 2 reducers JobConf jobConf = new JobConf(getConf(), GildPercentDriverPass1.class); JobClient jobClient = new JobClient(jobConf); ClusterStatus cluster = jobClient.getClusterStatus(); job.setNumReduceTasks(cluster.getTaskTrackers() * 2); // Mapper and Reducer Classes to use job.setMapperClass(GildPercentMapperPass1.class); job.setReducerClass(GildPercentReducerPass1.class); // Mapper output classes job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(DoubleWritable.class); // Input format class job.setInputFormatClass(TextInputFormat.class); // Reducer output classes job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); // Output format class job.setOutputFormatClass(TextOutputFormat.class); // Combiner if (args[2].equals("yes")) { job.setCombinerClass(GildPercentReducerPass1.class); } // The Jar file to run job.setJarByClass(GildPercentDriverPass1.class); boolean success = job.waitForCompletion(true); System.exit(success ? 0 : 1); return 0; }
From source file:csc555.ebratt.depaul.edu.GildPercentDriverPass2.java
License:Open Source License
/** * /*from w ww .j ava2 s. c o m*/ * Runs the driver by creating a new hadoop Job based on the configuration. * Defines the path in/out based on the first two arguments. Allows for an * optional combiner based on the 4th argument. * * @param args * [0] the input directory on HDFS * @param args * [1] the output directory on HDFS * @param args * [2] tells the system whether or not to use a combiner ("yes") * and, if so, it will use the GildPercentReducerPass2.class as * the combiner. * @throws Exception * if there is an issue with any of the arguments * */ @Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); StringBuffer sb = new StringBuffer(); sb.append("sorted gild percent"); job.setJobName(sb.toString()); Path in = new Path(args[0]); Path out = new Path(args[1]); FileInputFormat.setInputPaths(job, in); FileOutputFormat.setOutputPath(job, out); // to ensure output is sorted job.setNumReduceTasks(1); // Mapper and Reducer Classes to use job.setMapperClass(GildPercentMapperPass2.class); job.setReducerClass(GildPercentReducerPass2.class); // Mapper output classes job.setMapOutputKeyClass(DoubleWritable.class); job.setMapOutputValueClass(Text.class); // Input format class job.setInputFormatClass(TextInputFormat.class); // Reducer output classes job.setOutputKeyClass(DoubleWritable.class); job.setOutputValueClass(Text.class); // Output format class job.setOutputFormatClass(TextOutputFormat.class); // Combiner if (args[2].equals("yes")) { job.setCombinerClass(GildPercentReducerPass2.class); } // sort in descending order job.setSortComparatorClass(DoubleWritableDescendingComparator.class); // The Jar file to run job.setJarByClass(GildPercentDriverPass2.class); boolean success = job.waitForCompletion(true); System.exit(success ? 0 : 1); return 0; }
From source file:csc555.ebratt.depaul.edu.RCTop10Driver.java
License:Open Source License
/** * //from w ww. ja va 2s . co m * Runs the driver by creating a new hadoop Job based on the configuration. * Defines the path in/out based on the first two arguments. * * @param args * [0] the input directory on HDFS * @param args * [1] the output directory on HDFS * @throws Exception * if there is an issue with any of the arguments * */ public int run(String[] args) throws Exception { Job job = new Job(getConf(), "Top 10 Reddit"); Path in = new Path(args[0]); Path out = new Path(args[1]); FileInputFormat.setInputPaths(job, in); FileOutputFormat.setOutputPath(job, out); // ensure 1 reduce tasks for ranking job.setNumReduceTasks(1); // Mapper and Reducer Classes to use job.setMapperClass(RCTop10Mapper.class); job.setReducerClass(RCTop10Reducer.class); // Mapper output classes job.setMapOutputKeyClass(GroupByCountPair.class); job.setMapOutputValueClass(Text.class); // set custom partitioner job.setPartitionerClass(GroupByCountPairPartitioner.class); // set custom grouping comparator job.setGroupingComparatorClass(GroupByGroupingComparator.class); // input class job.setInputFormatClass(KeyValueTextInputFormat.class); // Reducer output classes job.setOutputKeyClass(GroupByCountPair.class); job.setOutputValueClass(Text.class); job.setOutputFormatClass(TextOutputFormat.class); // The Jar file to run job.setJarByClass(RCTop10Driver.class); boolean success = job.waitForCompletion(true); System.exit(success ? 0 : 1); return 0; }
From source file:csc555.ebratt.depaul.edu.RCWordCountAcronymsDriver.java
License:Open Source License
/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the fourth argument.
 *
 * @param args [0] the input directory on HDFS;
 *             [1] the output directory on HDFS;
 *             [3] "yes" to use RCWordCountReducer.class as the combiner
 * @return 0 if the job succeeds, 1 if it fails
 * @throws Exception if there is an issue with any of the arguments
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String aggregate = getConf().get("aggregate");
    String groupBy = getConf().get("groupBy");
    StringBuilder sb = new StringBuilder();
    sb.append("count of acronyms in: ");
    sb.append(aggregate);
    sb.append("; grouped by: ");
    sb.append(groupBy);
    job.setJobName(sb.toString());

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // Size the reduce phase to the cluster: two reducers per task tracker.
    JobConf jobConf = new JobConf(getConf(), RCWordCountAcronymsDriver.class);
    JobClient jobClient = new JobClient(jobConf);
    ClusterStatus cluster = jobClient.getClusterStatus();
    job.setNumReduceTasks(cluster.getTaskTrackers() * 2);

    // Mapper and Reducer classes to use
    job.setMapperClass(RCWordCountMapper.class);
    job.setReducerClass(RCWordCountReducer.class);

    // Mapper output classes
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Optional combiner
    if (args[3].equals("yes")) {
        job.setCombinerClass(RCWordCountReducer.class);
    }

    // The jar file to run
    job.setJarByClass(RCWordCountAcronymsDriver.class);

    boolean success = job.waitForCompletion(true);
    // BUG FIX: the original called System.exit() here, making "return 0"
    // unreachable and preventing in-process reuse; return the code instead.
    return success ? 0 : 1;
}
From source file:csc555.ebratt.depaul.edu.RCWordCountDriver.java
License:Open Source License
/** * /* ww w. j av a 2 s. c om*/ * Runs the driver by creating a new hadoop Job based on the configuration. * Defines the path in/out based on the first two arguments. Allows for an * optional combiner based on the 4th argument. * * @param args * [0] the input directory on HDFS * @param args * [1] the output directory on HDFS * @param args * [3] tells the system whether or not to use a combiner ("yes") * and, if so, it will use the RCWordCountReducer.class as the * combiner. * @throws Exception * if there is an issue with any of the arguments * */ @Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String aggregate = getConf().get("aggregate"); String groupBy = getConf().get("groupBy"); StringBuffer sb = new StringBuffer(); sb.append("count of: "); sb.append(aggregate); sb.append("; grouped by: "); sb.append(groupBy); job.setJobName(sb.toString()); Path in = new Path(args[0]); Path out = new Path(args[1]); FileInputFormat.setInputPaths(job, in); FileOutputFormat.setOutputPath(job, out); // testing -- ensure each node gets 2 reducers JobConf jobConf = new JobConf(getConf(), RCWordCountDriver.class); JobClient jobClient = new JobClient(jobConf); ClusterStatus cluster = jobClient.getClusterStatus(); job.setNumReduceTasks(cluster.getTaskTrackers() * 5); // Mapper and Reducer Classes to use job.setMapperClass(RCWordCountMapper.class); job.setReducerClass(RCWordCountReducer.class); // Mapper output classes job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // Input format class job.setInputFormatClass(TextInputFormat.class); // Reducer output classes job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Output format class job.setOutputFormatClass(TextOutputFormat.class); // Combiner if (args[3].equals("yes")) { job.setCombinerClass(RCWordCountReducer.class); } // The Jar file to run job.setJarByClass(RCWordCountDriver.class); boolean success = job.waitForCompletion(true); 
System.exit(success ? 0 : 1); return 0; }
From source file:csc555.ebratt.depaul.edu.VoteCountDriver.java
License:Open Source License
/** * /*from w w w .ja va2 s . c om*/ * Runs the driver by creating a new hadoop Job based on the configuration. * Defines the path in/out based on the first two arguments. Allows for an * optional combiner based on the 4th argument. * * @param args * [0] the input directory on HDFS * @param args * [1] the output directory on HDFS * @param args * [2] tells the system whether or not to use a combiner ("yes") * and, if so, it will use the VoteCountReducer.class as the * combiner. * @throws Exception * if there is an issue with any of the arguments * */ @Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String groupBy = getConf().get("groupBy"); StringBuffer sb = new StringBuffer(); sb.append("count of votes grouped by: "); sb.append(groupBy); job.setJobName(sb.toString()); Path in = new Path(args[0]); Path out = new Path(args[1]); FileInputFormat.setInputPaths(job, in); FileOutputFormat.setOutputPath(job, out); // testing -- ensure each node gets 2 reducers JobConf jobConf = new JobConf(getConf(), VoteCountDriver.class); JobClient jobClient = new JobClient(jobConf); ClusterStatus cluster = jobClient.getClusterStatus(); job.setNumReduceTasks(cluster.getTaskTrackers() * 2); // Mapper and Reducer Classes to use job.setMapperClass(VoteCountMapper.class); job.setReducerClass(LongSumReducer.class); // Mapper output classes job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); // Input format class job.setInputFormatClass(TextInputFormat.class); // Reducer output classes job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); // Output format class job.setOutputFormatClass(TextOutputFormat.class); // Combiner if (args[2].equals("yes")) { job.setCombinerClass(LongSumReducer.class); } // The Jar file to run job.setJarByClass(VoteCountDriver.class); boolean success = job.waitForCompletion(true); System.exit(success ? 0 : 1); return 0; }
From source file:csc555.ebratt.depaul.edu.VoteSorterDriver.java
License:Open Source License
/** * /*from w w w . j a v a2s. c o m*/ * Runs the driver by creating a new hadoop Job based on the configuration. * Defines the path in/out based on the first two arguments. Allows for an * optional combiner based on the 4th argument. * * @param args * [0] the input directory on HDFS * @param args * [1] the output directory on HDFS * @param args * [2] tells the system whether or not to use a combiner ("yes") * and, if so, it will use the VoteSorterReducer.class as the * combiner. * @throws Exception * if there is an issue with any of the arguments * */ @Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); StringBuffer sb = new StringBuffer(); sb.append("sorted vote counts"); job.setJobName(sb.toString()); Path in = new Path(args[0]); Path out = new Path(args[1]); FileInputFormat.setInputPaths(job, in); FileOutputFormat.setOutputPath(job, out); // to ensure output is sorted job.setNumReduceTasks(1); // Mapper and Reducer Classes to use job.setMapperClass(VoteSorterMapper.class); job.setReducerClass(VoteSorterReducer.class); // Mapper output classes job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); // Input format class job.setInputFormatClass(TextInputFormat.class); // Reducer output classes job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(Text.class); // Output format class job.setOutputFormatClass(TextOutputFormat.class); // Combiner if (args[2].equals("yes")) { job.setCombinerClass(VoteSorterReducer.class); } // sort in descending order job.setSortComparatorClass(LongWritable.DecreasingComparator.class); // The Jar file to run job.setJarByClass(VoteSorterDriver.class); boolean success = job.waitForCompletion(true); System.exit(success ? 0 : 1); return 0; }
From source file:DAAL.CovarianceDense.java
License:Open Source License
/**
 * Configures and runs the dense-covariance MapReduce job: a step-1 mapper
 * feeding a step-2 reducer, writing (IntWritable, WriteableData) pairs to a
 * SequenceFile.
 *
 * @param args [0] the input path on HDFS; [1] the output path on HDFS
 * @return 0 if the job succeeds, 1 if it fails
 * @throws Exception on job-setup or submission failure
 */
@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    // Ship the native DAAL/TBB/OpenMP libraries to every node; the "#name"
    // URI fragment names the symlink created in each task's working dir.
    DistributedCache.createSymlink(conf);
    DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libJavaAPI.so#libJavaAPI.so"), conf);
    DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libtbb.so.2#libtbb.so.2"), conf);
    DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libiomp5.so#libiomp5.so"), conf);

    Job covarianceJob = new Job(conf, "CovarianceDense Job");

    FileInputFormat.setInputPaths(covarianceJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(covarianceJob, new Path(args[1]));

    covarianceJob.setJarByClass(CovarianceDense.class);
    covarianceJob.setMapperClass(CovarianceDenseStep1Mapper.class);
    covarianceJob.setReducerClass(CovarianceDenseStep2Reducer.class);

    covarianceJob.setInputFormatClass(TextInputFormat.class);
    covarianceJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    covarianceJob.setOutputKeyClass(IntWritable.class);
    covarianceJob.setOutputValueClass(WriteableData.class);

    return covarianceJob.waitForCompletion(true) ? 0 : 1;
}
From source file:DAAL.CovarianceSparse.java
License:Open Source License
/**
 * Configures and runs the sparse-covariance MapReduce job: a step-1 mapper
 * feeding a step-2 reducer, writing (IntWritable, WriteableData) pairs to a
 * SequenceFile.
 *
 * @param args [0] the input path on HDFS; [1] the output path on HDFS
 * @return 0 if the job succeeds, 1 if it fails
 * @throws Exception on job-setup or submission failure
 */
@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    // Ship the native DAAL/TBB/OpenMP libraries to every node; the "#name"
    // URI fragment names the symlink created in each task's working dir.
    DistributedCache.createSymlink(conf);
    DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libJavaAPI.so#libJavaAPI.so"), conf);
    DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libtbb.so.2#libtbb.so.2"), conf);
    DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libiomp5.so#libiomp5.so"), conf);

    Job covarianceJob = new Job(conf, "CovarianceSparse Job");

    FileInputFormat.setInputPaths(covarianceJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(covarianceJob, new Path(args[1]));

    covarianceJob.setJarByClass(CovarianceSparse.class);
    covarianceJob.setMapperClass(CovarianceSparseStep1Mapper.class);
    covarianceJob.setReducerClass(CovarianceSparseStep2Reducer.class);

    covarianceJob.setInputFormatClass(TextInputFormat.class);
    covarianceJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    covarianceJob.setOutputKeyClass(IntWritable.class);
    covarianceJob.setOutputValueClass(WriteableData.class);

    return covarianceJob.waitForCompletion(true) ? 0 : 1;
}
From source file:DAAL.Kmeans.java
License:Open Source License
/**
 * Runs K-means as a chain of MapReduce jobs: one initialization job that
 * writes starting centroids to /Hadoop/Kmeans/initResults, followed by
 * nIterations compute jobs. Intermediate iterations write to per-iteration
 * scratch directories; the final iteration writes to the user-supplied
 * output path.
 *
 * @param args [0] the input path on HDFS; [1] the final output path on HDFS
 * @return 0 if every job succeeds, 1 as soon as any job fails
 * @throws Exception on job-setup or submission failure
 */
@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    // Ship the native DAAL/TBB/OpenMP libraries to every node via the
    // distributed cache (the "#name" fragment names the task-dir symlink).
    DistributedCache.createSymlink(conf);
    DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libJavaAPI.so#libJavaAPI.so"), conf);
    DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libtbb.so.2#libtbb.so.2"), conf);
    DistributedCache.addCacheFile(new URI("/Hadoop/Libraries/libiomp5.so#libiomp5.so"), conf);

    Job initJob = new Job(conf, "K-means Init Job");
    FileInputFormat.setInputPaths(initJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(initJob, new Path("/Hadoop/Kmeans/initResults"));
    initJob.setMapperClass(KmeansInitStep1Mapper.class);
    initJob.setReducerClass(KmeansInitStep2Reducer.class);
    initJob.setInputFormatClass(TextInputFormat.class);
    initJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    initJob.setOutputKeyClass(IntWritable.class);
    initJob.setOutputValueClass(WriteableData.class);
    initJob.setJarByClass(Kmeans.class);

    // BUG FIX: the original ignored the init job's result and ran the
    // compute iterations even when initialization had failed.
    if (!initJob.waitForCompletion(true)) {
        return 1;
    }

    for (int i = 0; i < nIterations; i++) {
        Job job = new Job(conf, "K-means compute Job");
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // The last iteration writes to the caller's output directory;
        // earlier iterations write to per-iteration scratch paths.
        if (i == nIterations - 1) {
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
        } else {
            FileOutputFormat.setOutputPath(job, new Path("/Hadoop/Kmeans/ResultsIter" + i));
        }
        job.setMapperClass(KmeansStep1Mapper.class);
        job.setReducerClass(KmeansStep2Reducer.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(WriteableData.class);
        job.setJarByClass(Kmeans.class);

        // BUG FIX: fail fast on a failed iteration instead of continuing to
        // iterate; the original only reported the LAST iteration's status,
        // so earlier failures were silently masked.
        if (!job.waitForCompletion(true)) {
            return 1;
        }
    }
    return 0;
}