Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Set the key class for the map output data.
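
This call is only needed when the mapper's output key class differs from the job's final output key class; if it is never made, the class passed to setOutputKeyClass is assumed for the intermediate map output as well. It must also happen before the job is submitted, otherwise it throws IllegalStateException. The following minimal driver is a sketch, not taken from the indexed examples below: MyMapper and MyReducer are hypothetical classes assumed to emit <IntWritable, Text> pairs from the map phase and write <Text, IntWritable> from the reduce phase, so the intermediate types must be declared separately.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetMapOutputKeyClassExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setMapOutputKeyClass example");
        job.setJarByClass(SetMapOutputKeyClassExample.class);

        job.setMapperClass(MyMapper.class);   // hypothetical: emits <IntWritable, Text>
        job.setReducerClass(MyReducer.class); // hypothetical: writes <Text, IntWritable>

        // Intermediate (map output) types. These calls must be made while the
        // job is still being defined; after submission they throw
        // IllegalStateException.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Final (reduce output) types. Without the two calls above, these
        // classes would also be assumed for the intermediate data.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}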

Usage

From source file:edu.ucla.sspace.hadoop.HadoopJob.java

License:Open Source License

/**
 * Executes the word co-occurrence counting job on the corpus files in the
 * input directory using the current Hadoop instance, returning an iterator
 * over all the co-occurrence frequencies found in the corpus.
 *
 * @param inputPaths the directories on the Hadoop distributed file system
 *        containing all the corpus files that will be processed
 *
 * @return an iterator over the unique {@link WordCooccurrence} counts found
 *         in the corpus.  Note that if two words co-occur the same distance
 *         apart multiple times, only one {@code WordCooccurrence} is
 *         returned, where the number of co-occurrences is reflected by
 *         the {@link WordCooccurrence#getCount() getCount()} method.
 *
 * @throws Exception if Hadoop throws an {@code Exception} during its
 *         execution or if the resulting output files cannot be read.
 */
public HadoopJobResults run(Collection<String> inputPaths) throws Exception {

    // Create a mostly unique file name for the output directory.
    String outputDir = "output-" + System.currentTimeMillis();
    //conf.setBoolean("mapred.task.profile", true);

    Job job = new Job(conf, mapperClass.getName() + "-" + reducerClass.getName());

    job.setJarByClass(HadoopJob.class);
    job.setMapperClass(mapperClass);
    job.setReducerClass(reducerClass);

    job.setMapOutputKeyClass(mapperOutputKey);
    job.setMapOutputValueClass(mapperOutputValue);
    job.setOutputKeyClass(outputKey);
    job.setOutputValueClass(outputValue);

    // Add all the specified directories as input paths for the job
    for (String inputDir : inputPaths)
        FileInputFormat.addInputPath(job, new Path(inputDir));
    Path outputDirPath = new Path(outputDir);
    FileOutputFormat.setOutputPath(job, outputDirPath);

    job.waitForCompletion(true);

    // From the output directory, collect all the results files 
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] outputFiles = fs.listStatus(outputDirPath, new OutputFilePathFilter());
    Collection<Path> paths = new LinkedList<Path>();
    for (FileStatus status : outputFiles) {
        paths.add(status.getPath());
    }

    return new HadoopJobResults(fs, paths);
}

From source file:edu.udel.mxv.Mxv.java

@Override
public int run(String[] args) throws Exception {

    if (args.length != 4) {
        System.err.println(USAGE);
        System.exit(1);
    }

    int n = Integer.parseInt(args[0]);
    String input_matrix = args[1];
    String input_vector = args[2];
    String output = args[3];

    Configuration conf = getConf();
    conf.set("vector.path", input_vector);
    conf.setInt("vector.n", n);

    Job job = new Job(conf);
    job.setJobName("mxv");
    job.setJarByClass(getClass());

    // mapper
    job.setMapperClass(MxvMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    // reducer
    job.setReducerClass(MxvRed.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    //        job.setNumReduceTasks(num_red);

    FileInputFormat.addInputPath(job, new Path(input_matrix));
    FileOutputFormat.setOutputPath(job, new Path(output));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java

License:Apache License

private String phase1(String inputPath, int reduceNo, String lang)
        throws IOException, InterruptedException, ClassNotFoundException {

    String output = "tmp/wiki-link/phase1";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 1");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");

    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    if ("en".equals(lang)) {
        job.setInputFormatClass(WikipediaPageInputFormat.class);
    } else
        throw new InterruptedException("Wikipedia dump with language " + lang + " is not supported ");

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfStringInt.class);

    job.setMapperClass(LinkEmitMapClass.class);
    job.setReducerClass(RedirectResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}

From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java

License:Apache License

private String phase2(String inputPath, int reduceNo)
        throws IOException, InterruptedException, ClassNotFoundException {

    String output = "tmp/wiki-link/phase2";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 2");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");

    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfStringInt.class);

    job.setReducerClass(DestinationIdResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}

From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java

License:Apache License

private String phase3(String inputPath, int reduceNo)
        throws IOException, InterruptedException, ClassNotFoundException {

    String output = "trace/phase3";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 3");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");

    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setReducerClass(SourceIdResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}

From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java

License:Apache License

private void task1(String inputPath, String outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    LOG.info("Exracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class);
    job.setJobName(
            String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

    // 10 reducers is reasonable.
    job.setNumReduceTasks(10);

    // Prefer the job's user classpath over the Hadoop installation's.
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");

    // increase heap
    job.getConfiguration().set("mapreduce.map.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m");

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(PairOfStringInt.class);
    job.setMapOutputValueClass(PairOfStrings.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PairOfIntString.class);

    job.setMapperClass(MyMapper1.class);
    job.setReducerClass(MyReducer1.class);
    job.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);
}

From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java

License:Apache License

private void task2(String inputPath, String outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    LOG.info("Exracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class);
    job.setJobName(
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    job.setNumReduceTasks(1);

    // Prefer the job's user classpath over the Hadoop installation's.
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");

    // increase heap
    job.getConfiguration().set("mapreduce.map.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(HMapSIW.class);

    job.setMapperClass(MyMapper2.class);
    job.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    // Clean up intermediate data.
    FileSystem.get(job.getConfiguration()).delete(new Path(inputPath), true);
}

From source file:edu.umd.cloud9.example.bfs.EncodeBfsGraph.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("nodeid").hasArg().withDescription("source node").create(SRC_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(SRC_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    int src = Integer.parseInt(cmdline.getOptionValue(SRC_OPTION));

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - src: " + src);

    Job job = Job.getInstance(getConf());
    job.setJobName(String.format("EncodeBfsGraph[%s: %s, %s: %s, %s: %d]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, SRC_OPTION, src));
    job.setJarByClass(EncodeBfsGraph.class);

    job.setNumReduceTasks(0);

    job.getConfiguration().setInt(SRC_OPTION, src);
    job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BfsNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BfsNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.umd.cloud9.example.bfs.FindNodeAtDistance.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("distance").create(DISTANCE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DISTANCE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    int distance = Integer.parseInt(cmdline.getOptionValue(DISTANCE_OPTION));

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - distance: " + distance);

    Job job = Job.getInstance(getConf());
    job.setJobName(String.format("FindNodeAtDistance[%s: %s, %s: %s, %s: %d]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, DISTANCE_OPTION, distance));
    job.setJarByClass(FindNodeAtDistance.class);

    job.setNumReduceTasks(0);

    job.getConfiguration().setInt(DISTANCE_OPTION, distance);
    job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BfsNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BfsNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}