Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Set the key class for the map output data.
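
This call is only needed when the mapper's output key class differs from the job's final output key class; if it is never made, the class passed to setOutputKeyClass is assumed for the intermediate map output as well. It must also happen before the job is submitted, otherwise it throws IllegalStateException. The following minimal driver is a sketch, not taken from the indexed examples below: MyMapper and MyReducer are hypothetical classes assumed to emit <IntWritable, Text> pairs from the map phase and write <Text, IntWritable> from the reduce phase, so the intermediate types must be declared separately.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetMapOutputKeyClassExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setMapOutputKeyClass example");
        job.setJarByClass(SetMapOutputKeyClassExample.class);

        job.setMapperClass(MyMapper.class);   // hypothetical: emits <IntWritable, Text>
        job.setReducerClass(MyReducer.class); // hypothetical: writes <Text, IntWritable>

        // Intermediate (map output) types. These calls must be made while the
        // job is still being defined; after submission they throw
        // IllegalStateException.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Final (reduce output) types. Without the two calls above, these
        // classes would also be assumed for the intermediate data.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}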

Usage

From source file:edu.ucla.sspace.hadoop.HadoopJob.java

License:Open Source License

/**
 * Executes the word co-occurrence counting job on the corpus files in the
 * input directory using the current Hadoop instance, returning an iterator
 * over all the co-occurrence frequencies found in the corpus.
 *
 * @param inputPaths the directories on the Hadoop distributed file system
 *        containing all the corpus files that will be processed
 *
 * @return an iterator over the unique {@link WordCooccurrence} counts found
 *         in the corpus.  Note that if two words co-occur the same distance
 *         apart multiple times, only one {@code WordCooccurrence} is
 *         returned, where the number of co-occurrences is reflected by
 *         the {@link WordCooccurrence#getCount() getCount()} method.
 *
 * @throws Exception if Hadoop throws an {@code Exception} during its
 *         execution or if the resulting output files cannot be read.
 */
public HadoopJobResults run(Collection<String> inputPaths) throws Exception {

    // Create a mostly unique file name for the output directory.
    String outputDir = "output-" + System.currentTimeMillis();
    //conf.setBoolean("mapred.task.profile", true);

    Job job = new Job(conf, mapperClass.getName() + "-" + reducerClass.getName());

    job.setJarByClass(HadoopJob.class);
    job.setMapperClass(mapperClass);
    job.setReducerClass(reducerClass);

    job.setMapOutputKeyClass(mapperOutputKey);
    job.setMapOutputValueClass(mapperOutputValue);
    job.setOutputKeyClass(outputKey);
    job.setOutputValueClass(outputValue);

    // Add all the specified directories as input paths for the job
    for (String inputDir : inputPaths)
        FileInputFormat.addInputPath(job, new Path(inputDir));
    Path outputDirPath = new Path(outputDir);
    FileOutputFormat.setOutputPath(job, outputDirPath);

    job.waitForCompletion(true);

    // From the output directory, collect all the results files 
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] outputFiles = fs.listStatus(outputDirPath, new OutputFilePathFilter());
    Collection<Path> paths = new LinkedList<Path>();
    for (FileStatus status : outputFiles) {
        paths.add(status.getPath());
    }

    return new HadoopJobResults(fs, paths);
}

From source file:edu.udel.mxv.Mxv.java

@Override
public int run(String[] args) throws Exception {

    if (args.length != 4) {
        System.err.println(USAGE);
        System.exit(1);
    }

    int n = Integer.parseInt(args[0]);
    String input_matrix = args[1];
    String input_vector = args[2];
    String output = args[3];

    Configuration conf = getConf();
    conf.set("vector.path", input_vector);
    conf.setInt("vector.n", n);

    Job job = new Job(conf);
    job.setJobName("mxv");
    job.setJarByClass(getClass());

    // mapper
    job.setMapperClass(MxvMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    // reducer
    job.setReducerClass(MxvRed.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    //        job.setNumReduceTasks(num_red);

    FileInputFormat.addInputPath(job, new Path(input_matrix));
    FileOutputFormat.setOutputPath(job, new Path(output));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java

License:Apache License

private String phase1(String inputPath, int reduceNo, String lang)
        throws IOException, InterruptedException, ClassNotFoundException {

    String output = "tmp/wiki-link/phase1";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 1");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");

    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    if ("en".equals(lang)) {
        job.setInputFormatClass(WikipediaPageInputFormat.class);
    } else
        throw new InterruptedException("Wikipedia dump with language " + lang + " is not supported ");

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfStringInt.class);

    job.setMapperClass(LinkEmitMapClass.class);
    job.setReducerClass(RedirectResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}

From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java

License:Apache License

private String phase2(String inputPath, int reduceNo)
        throws IOException, InterruptedException, ClassNotFoundException {

    String output = "tmp/wiki-link/phase2";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 2");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");

    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfStringInt.class);

    job.setReducerClass(DestinationIdResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}

From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java

License:Apache License

private String phase3(String inputPath, int reduceNo)
        throws IOException, InterruptedException, ClassNotFoundException {

    String output = "trace/phase3";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 3");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");

    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setReducerClass(SourceIdResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}

From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java

License:Apache License

private void task1(String inputPath, String outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    LOG.info("Exracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class);
    job.setJobName(
            String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

    // 10 reducers is reasonable.
    job.setNumReduceTasks(10);

    // Prefer the job's user classpath over the Hadoop installation's.
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");

    // increase heap
    job.getConfiguration().set("mapreduce.map.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m");

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(PairOfStringInt.class);
    job.setMapOutputValueClass(PairOfStrings.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PairOfIntString.class);

    job.setMapperClass(MyMapper1.class);
    job.setReducerClass(MyReducer1.class);
    job.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);
}

From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java

License:Apache License

private void task2(String inputPath, String outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    LOG.info("Exracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class);
    job.setJobName(
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    job.setNumReduceTasks(1);

    // Prefer the job's user classpath over the Hadoop installation's.
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");

    // increase heap
    job.getConfiguration().set("mapreduce.map.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(HMapSIW.class);

    job.setMapperClass(MyMapper2.class);
    job.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    // Clean up intermediate data.
    FileSystem.get(job.getConfiguration()).delete(new Path(inputPath), true);
}

From source file:edu.umd.cloud9.example.bfs.EncodeBfsGraph.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("nodeid").hasArg().withDescription("source node").create(SRC_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(SRC_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    int src = Integer.parseInt(cmdline.getOptionValue(SRC_OPTION));

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - src: " + src);

    Job job = Job.getInstance(getConf());
    job.setJobName(String.format("EncodeBfsGraph[%s: %s, %s: %s, %s: %d]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, SRC_OPTION, src));
    job.setJarByClass(EncodeBfsGraph.class);

    job.setNumReduceTasks(0);

    job.getConfiguration().setInt(SRC_OPTION, src);
    job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BfsNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BfsNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.umd.cloud9.example.bfs.FindNodeAtDistance.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("distance").create(DISTANCE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DISTANCE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    int distance = Integer.parseInt(cmdline.getOptionValue(DISTANCE_OPTION));

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - distance: " + distance);

    Job job = Job.getInstance(getConf());
    job.setJobName(String.format("FindNodeAtDistance[%s: %s, %s: %s, %s: %d]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, DISTANCE_OPTION, distance));
    job.setJarByClass(FindNodeAtDistance.class);

    job.setNumReduceTasks(0);

    job.getConfiguration().setInt(DISTANCE_OPTION, distance);
    job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BfsNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BfsNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}