Usage examples for org.apache.hadoop.mapreduce.Job#setOutputKeyClass

public void setOutputKeyClass(Class<?> theClass) throws IllegalStateException

Sets the key class for the job output data. Throws IllegalStateException if the job has already been submitted.
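For orientation before the collected examples, here is a minimal driver sketch showing where setOutputKeyClass fits in a typical word-count job. It uses Hadoop's bundled TokenCounterMapper and IntSumReducer; the assumption that the input and output paths arrive as the two program arguments is illustrative, not taken from any of the source files below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class MinimalWordCountDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "minimal word count");
        job.setJarByClass(MinimalWordCountDriver.class);
        job.setMapperClass(TokenCounterMapper.class);  // emits <Text, IntWritable> pairs
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);

        // Declare the classes of the final (reducer) output records.
        // Must be called before submission; afterwards the Job throws IllegalStateException.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // assumed: input path as first argument
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // assumed: output path as second argument
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}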
From source file:co.nubetech.hiho.merge.MergeJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    populateConfiguration(args);
    try {
        checkMandatoryConfs();
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }
    Class inputFormatClass = Class.forName(inputFormat);
    Class outputFormatClass = Class.forName(outputFormat);
    Class inputKeyClass = Class.forName(inputKeyClassName);
    Class inputValueClass = Class.forName(inputValueClassName);
    Configuration conf = getConf();
    conf.set(HIHOConf.MERGE_OLD_PATH, oldPath);
    conf.set(HIHOConf.MERGE_NEW_PATH, newPath);
    Job job = new Job(conf);
    job.setJobName("Merge job");
    job.setJarByClass(MergeJob.class);
    if (mergeBy.equals("key")) {
        job.setMapperClass(MergeKeyMapper.class);
        job.setReducerClass(MergeKeyReducer.class);
    } else if (mergeBy.equals("value")) {
        job.setMapperClass(MergeValueMapper.class);
        job.setReducerClass(MergeValueReducer.class);
    }
    job.setInputFormatClass(inputFormatClass);
    DelimitedTextInputFormat.setProperties(job, delimiter, column);
    job.setMapOutputKeyClass(HihoTuple.class);
    job.setMapOutputValueClass(HihoValue.class);
    job.setOutputKeyClass(inputKeyClass);
    job.setOutputValueClass(inputValueClass);
    FileInputFormat.setInputPaths(job, oldPath + "," + newPath);
    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    try {
        logger.debug("Output format class is " + job.getOutputFormatClass());
        logger.debug("Class is " + ReflectionUtils
                .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
        job.waitForCompletion(false);
        if (job.isComplete()) {
            Counters counters = job.getCounters();
            totalRecordsOld = counters.findCounter(MergeRecordCounter.TOTAL_RECORDS_OLD).getValue();
            totalRecordsNew = counters.findCounter(MergeRecordCounter.TOTAL_RECORDS_NEW).getValue();
            badRecords = counters.findCounter(MergeRecordCounter.BAD_RECORD).getValue();
            output = counters.findCounter(MergeRecordCounter.OUTPUT).getValue();
            logger.info("Total old records read are: " + totalRecordsOld);
            logger.info("Total new records read are: " + totalRecordsNew);
            logger.info("Bad Records are: " + badRecords);
            logger.info("Output records are: " + output);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return 0;
}
From source file:co.nubetech.hiho.similarity.ngram.NGramJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    populateConfiguration(args);
    try {
        checkMandatoryConfs();
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }
    Job job = new Job(conf);
    job.setJobName("NGram job");
    job.setJarByClass(NGramJob.class);
    Class inputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat");
    Class outputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat");
    // org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
    // org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
    Class inputKeyClass = Class.forName("org.apache.hadoop.io.Text");
    Class inputValueClass = Class.forName("org.apache.hadoop.io.Text");
    Class outputKeyClass = Class.forName("co.nubetech.hiho.similarity.ngram.ValuePair");
    Class outputValueClass = Class.forName("org.apache.hadoop.io.IntWritable");
    job.setMapperClass(NGramMapper.class);
    job.setReducerClass(NGramReducer.class);
    job.setInputFormatClass(inputFormatClass);
    job.setMapOutputKeyClass(inputKeyClass);
    job.setMapOutputValueClass(inputValueClass);
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    job.setOutputFormatClass(outputFormatClass);
    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path("outputOfNGramJob"));
    int ret = 0;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:co.nubetech.hiho.similarity.ngram.ScoreJob.java
License:Apache License
@Override
public int run(String[] arg0) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);
    job.setJobName("Score job");
    job.setJarByClass(ScoreJob.class);
    Class inputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat");
    Class outputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat");
    // org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
    // org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
    Class inputKeyClass = Class.forName("co.nubetech.hiho.similarity.ngram.ValuePair");
    Class inputValueClass = Class.forName("org.apache.hadoop.io.IntWritable");
    Class outputKeyClass = Class.forName("co.nubetech.hiho.similarity.ngram.ValuePair");
    Class outputValueClass = Class.forName("org.apache.hadoop.io.LongWritable");
    job.setMapperClass(ScoreMapper.class);
    job.setReducerClass(ScoreReducer.class);
    job.setInputFormatClass(inputFormatClass);
    job.setMapOutputKeyClass(inputKeyClass);
    job.setMapOutputValueClass(inputValueClass);
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    job.setOutputFormatClass(outputFormatClass);
    FileInputFormat.setInputPaths(job, "outputOfNGramJob");
    FileOutputFormat.setOutputPath(job, new Path("outputOfScoreJob"));
    int ret = 0;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:code.DemoWordCount.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;
    LOG.info("Tool: " + DemoWordCount.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJobName(DemoWordCount.class.getSimpleName());
    job.setJarByClass(DemoWordCount.class);
    job.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);
    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
From source file:com.accumulobook.advanced.mapreduce.MapReduceFilesExample.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(this.getConf());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(WordCount.WordCountMapper.class);
    job.setCombinerClass(WordCount.WordCountCombiner.class);
    job.setReducerClass(WordCount.WordCountReducer.class);
    // clone the articles table
    ZooKeeperInstance inst = new ZooKeeperInstance(args[0], args[1]);
    Connector conn = inst.getConnector(args[2], new PasswordToken(args[3]));
    conn.tableOperations().clone(WikipediaConstants.ARTICLES_TABLE, WikipediaConstants.ARTICLES_TABLE_CLONE,
            true, Collections.EMPTY_MAP, Collections.EMPTY_SET);
    // take cloned table offline, waiting until the operation is complete
    boolean wait = true;
    conn.tableOperations().offline(WikipediaConstants.ARTICLES_TABLE_CLONE, wait);
    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);
    // input
    job.setInputFormatClass(AccumuloInputFormat.class);
    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE_CLONE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));
    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    // configure to use underlying RFiles
    AccumuloInputFormat.setOfflineTableScan(job, true);
    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    BatchWriterConfig bwConfig = new BatchWriterConfig();
    AccumuloOutputFormat.setBatchWriterOptions(job, bwConfig);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);
    job.setJarByClass(WordCount.class);
    job.waitForCompletion(true);
    //job.submit();
    return 0;
}
From source file:com.accumulobook.advanced.mapreduce.WordCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountCombiner.class);
    job.setReducerClass(WordCountReducer.class);
    // input
    job.setInputFormatClass(AccumuloInputFormat.class);
    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);
    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));
    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    BatchWriterConfig config = new BatchWriterConfig();
    AccumuloOutputFormat.setBatchWriterOptions(job, config);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);
    job.setJarByClass(WordCount.class);
    job.submit();
    return 0;
}
From source file:com.aerospike.hadoop.examples.aggregateintinput.AggregateIntInput.java
License:Apache License
public int run(final String[] args) throws Exception {
    final Configuration conf = getConf();
    @SuppressWarnings("deprecation")
    final Job job = new Job(conf, "AerospikeAggregateIntInput");
    log.info("run starting on bin " + binName);
    job.setJarByClass(AggregateIntInput.class);
    job.setInputFormatClass(AerospikeInputFormat.class);
    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(LongWritable.class);
    // job.setCombinerClass(Reduce.class); // no combiner
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(args[0]));
    int status = job.waitForCompletion(true) ? 0 : 1;
    log.info("run finished, status=" + status);
    return status;
}
From source file:com.ailk.oci.ocnosql.tools.load.single.SingleColumnImportTsv.java
License:Apache License
/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This
 * <ul>
 * <li>Inspects the table to configure a total order partitioner</li>
 * <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 * <li>Sets the number of reduce tasks to match the current number of regions</li>
 * <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
 * <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 * PutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    Class<? extends Partitioner> topClass;
    try {
        topClass = getTotalOrderPartitionerClass();
    } catch (ClassNotFoundException e) {
        throw new IOException("Failed getting TotalOrderPartitioner", e);
    }
    // partitioner
    job.setPartitionerClass(topClass);
    // Set the key class for the job output data
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    // Set the value class for job outputs
    job.setOutputValueClass(KeyValue.class);
    // output format: HFile
    job.setOutputFormatClass(HFileOutputFormat2.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(SingleColumnReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    // get the start key of each region
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " + "to match current region count");
    // one reduce task per region
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_" + UUID.randomUUID());
    LOG.info("Writing partition information to " + partitionsPath);
    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);
    URI cacheUri;
    try {
        // Below we make explicit reference to the bundled TOP. Its cheating.
        // We are assume the define in the hbase bundled TOP is as it is in
        // hadoop (whether 0.20 or 0.22, etc.)
        /*
        cacheUri = new URI(partitionsPath.toString() + "#"
                + org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner.DEFAULT_PATH);
        */
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    // Set compression algorithms based on column families
    configureCompression(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    LOG.info("Incremental table output configured.");
}
From source file:com.airline.analytics.AirlineDelayAnalytics.java
@Override public int run(String[] strings) throws Exception { Job job = Job.getInstance(getConf(), "Hadoop Airline Delay Analytics"); job.setJarByClass(AirlineDelayAnalytics.class); job.setMapperClass(AirlineMapper.class); // job.setCombinerClass(AirlineReducer.class); job.setReducerClass(AirlineReducer.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(strings[0])); FileOutputFormat.setOutputPath(job, new Path(strings[1])); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.airline.analytics.AirlineUniqueRoutesAnalytics.java
@Override public int run(String[] strings) throws Exception { Job job = Job.getInstance(getConf(), "Hadoop Airline Orign Destination Analytics"); job.setJarByClass(getClass());/*from www . ja v a 2 s . c om*/ // Distributed Cache job.addCacheFile(new URI("/airline/codes.csv")); job.setMapperClass(AirlineMapper.class); // job.setCombinerClass(AirlineReducer.class); job.setReducerClass(AirlineReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(strings[0])); FileOutputFormat.setOutputPath(job, new Path(strings[1])); return job.waitForCompletion(true) ? 0 : 1; }