List of usage examples for org.apache.hadoop.mapred JobConf setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> theClass)
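In the old org.apache.hadoop.mapred API, setPartitionerClass registers the Partitioner implementation that decides which reduce task receives each map output key. Every example below installs a custom partitioner; as a baseline, a minimal hash-style partitioner might look like the following sketch (the class name and the Text/IntWritable key and value types are illustrative assumptions, not taken from any example on this page):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

// Illustrative partitioner: routes each key to a reducer by its hash code.
public class HashTextPartitioner implements Partitioner<Text, IntWritable> {

    public void configure(JobConf job) {
        // No per-job configuration needed for this sketch.
    }

    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Mask the sign bit so the partition index is always non-negative.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

A driver would then install it with job.setPartitionerClass(HashTextPartitioner.class), exactly as the examples below install their own partitioners.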
From source file:edu.ucsb.cs.sort.norm.NormSortMain.java
License:Apache License
/**
 * Main method sets the job configurations including the mapper and reducer
 * classes to do the sorting. Some of the produced partitions might be
 * merged later to reflect the number of partitions chosen by the user.
 */
public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJobName("NormSort");
    job.setJarByClass(NormSortMain.class);
    job.setMapperClass(NormSortMapper.class);
    job.setMapOutputKeyClass(FloatWritable.class);
    job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class);
    job.setPartitionerClass(NormRangePartitioner.class);
    job.setReducerClass(NormSortReducer.class);
    job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE));
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FeatureWeightArrayWritable.class);

    //
    // set input & output
    //
    String inputDir = SortDriver.INPUT_DIR;
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input path not set");
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputDir));
    Path outputPath = new Path(SortDriver.OUTPUT_DIR);
    FileSystem.get(job).delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    //
    // run
    //
    JobSubmitter.run(job, "Sort By p-norm", -1);
}
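The NormRangePartitioner source is not reproduced on this page. For illustration only, a range-style partitioner over the FloatWritable norm keys could be sketched as follows, assuming norms fall in a known [0, maxNorm) interval; the class name and the "sort.max.norm" property are invented for this sketch:

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

// Hypothetical range partitioner: splits [0, maxNorm) into equal-width buckets
// so each reducer receives one contiguous range of norm values.
public class FloatRangePartitioner implements Partitioner<FloatWritable, Writable> {
    private float maxNorm;

    public void configure(JobConf job) {
        // "sort.max.norm" is an assumed property name, not part of the original job.
        maxNorm = job.getFloat("sort.max.norm", 1.0f);
    }

    public int getPartition(FloatWritable key, Writable value, int numPartitions) {
        int partition = (int) ((key.get() / maxNorm) * numPartitions);
        // Clamp values at or beyond maxNorm into the last bucket.
        return Math.min(Math.max(partition, 0), numPartitions - 1);
    }
}

Range partitioning is what lets the merged reducer outputs form one globally sorted sequence, which plain hash partitioning would not guarantee.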
From source file:edu.ucsb.cs.sort.signature.SigSortMain.java
License:Apache License
/**
 * Sets the job configurations including the mapper and reducer classes to
 * do the sorting based on signatures.
 */
public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJobName(SigSortMain.class.getSimpleName());
    job.setJarByClass(SigSortMain.class);
    job.setMapperClass(SigSortMapper.class);
    job.setMapOutputKeyClass(BitSignature.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setPartitionerClass(SigRangePartitioner.class);
    job.setReducerClass(SigSortReducer.class);
    job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE));
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BitSignature.class);

    //
    // set input & output
    //
    String inputDir = SortDriver.INPUT_DIR;
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input path not set");
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputDir));
    Path outputPath = new Path(OUTPUT_PATH);
    FileSystem.get(job).delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    //
    // run
    //
    JobSubmitter.run(job, "Sort By Signature Bytes", -1);
}
From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaLinkGraph.java
License:Apache License
private void task1(String inputPath, String outputPath, int partitions) throws IOException {
    LOG.info("Extracting edges...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), BuildWikipediaLinkGraph.class);
    conf.setJobName(String.format("BuildWikipediaLinkGraph:Edges[input: %s, output: %s, num_partitions: %d]",
        inputPath, outputPath, partitions));

    conf.setNumReduceTasks(partitions);

    SequenceFileInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfStringInt.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper1.class);
    conf.setReducerClass(MyReducer1.class);
    conf.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}
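MyPartitioner1 is not shown here (the same applies to the two phase-1 jobs below). A common pattern with composite PairOfStringInt keys is to partition on the left (string) element alone, so every record for the same page lands on one reducer while the integer component still participates in the sort. A hypothetical sketch of that pattern, with an invented class name and the pair type assumed to live in Cloud9's edu.umd.cloud9.io.pair package:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

import edu.umd.cloud9.io.pair.PairOfStringInt;

// Sketch: partition composite (string, int) keys on the string element only.
public class LeftStringPartitioner implements Partitioner<PairOfStringInt, Text> {

    public void configure(JobConf job) {
        // Nothing to configure for this sketch.
    }

    public int getPartition(PairOfStringInt key, Text value, int numPartitions) {
        return (key.getLeftElement().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}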
From source file:edu.umd.cloud9.collection.wikipedia.graph.ExtractWikipediaAnchorText.java
License:Apache License
private void task1(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
        String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

    // 10 reducers is reasonable.
    conf.setNumReduceTasks(10);

    SequenceFileInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfStringInt.class);
    conf.setMapOutputValueClass(PairOfStrings.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PairOfIntString.class);

    conf.setMapperClass(MyMapper1.class);
    conf.setReducerClass(MyReducer1.class);
    conf.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}
From source file:edu.umd.cloud9.collection.wikipedia.graph.ExtractWikipediaLinkGraph.java
License:Apache License
private void task1(String inputPath, String outputPath, int partitions) throws IOException {
    LOG.info("Extracting edges...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaLinkGraph.class);
    conf.setJobName(String.format("ExtractWikipediaLinkGraph:Edges[input: %s, output: %s, num_partitions: %d]",
        inputPath, outputPath, partitions));

    conf.setNumReduceTasks(partitions);

    SequenceFileInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfStringInt.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper1.class);
    conf.setReducerClass(MyReducer1.class);
    conf.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}
From source file:edu.umd.cloud9.demo.DemoWordCondProbJSON.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: DemoWordCondProbJSON");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(DemoWordCondProbJSON.class);
    conf.setJobName("DemoWordCondProbJSON");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setOutputKeyClass(MyTuple.class);
    conf.setOutputValueClass(FloatWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    // This is a potential gotcha! Can't use ReduceClass for combine because
    // we have not collected all the counts yet, so we can't divide through
    // to compute the conditional probabilities.
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:edu.umd.cloud9.demo.DemoWordCondProbTuple.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: DemoWordCondProbTuple");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(DemoWordCondProbTuple.class);
    conf.setJobName("DemoWordCondProbTuple");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(FloatWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    // This is a potential gotcha! Can't use ReduceClass for combine because
    // we have not collected all the counts yet, so we can't divide through
    // to compute the conditional probabilities.
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:edu.umd.cloud9.examples.BigramRelativeFrequency.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: BigramRelativeFrequency");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(BigramRelativeFrequency.class);
    conf.setJobName("BigramRelativeFrequency");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    // Note that these must match the Class arguments given in the mapper.
    conf.setOutputKeyClass(PairOfStrings.class);
    conf.setOutputValueClass(FloatWritable.class);

    conf.setMapperClass(MyMapper.class);
    conf.setCombinerClass(MyCombiner.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(outputDir.toUri(), conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
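For the relative frequencies to come out right, every joint count (w, w') and the marginal count (w, *) must reach the same reducer, which is why the job installs MyPartitioner instead of relying on the default hash of the whole pair. MyPartitioner's source is not shown; a sketch of the usual approach, partitioning on the left word only (class name invented for illustration, and the pair class assumed to live in edu.umd.cloud9.io.pair):

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

import edu.umd.cloud9.io.pair.PairOfStrings;

// Sketch: hash only the left word so (w, *) and all (w, w') pairs co-locate.
public class LeftWordPartitioner implements Partitioner<PairOfStrings, FloatWritable> {

    public void configure(JobConf job) {
        // Nothing to configure for this sketch.
    }

    public int getPartition(PairOfStrings key, FloatWritable value, int numPartitions) {
        return (key.getLeftElement().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

The same idea applies to the two DemoWordCondProb examples above, whose tuple-keyed MyPartitioner classes are likewise not reproduced here.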
From source file:edu.umd.cloud9.pagerank.PartitionGraph.java
License:Apache License
public int run(String[] args) throws IOException {
    if (args.length != 5) {
        printUsage();
        return -1;
    }

    String inPath = args[0];
    String outPath = args[1];
    int numParts = Integer.parseInt(args[2]);
    boolean useRange = Integer.parseInt(args[3]) != 0;
    int nodeCount = Integer.parseInt(args[4]);

    sLogger.info("Tool name: PartitionGraph");
    sLogger.info(" - inputDir: " + inPath);
    sLogger.info(" - outputDir: " + outPath);
    sLogger.info(" - numPartitions: " + numParts);
    sLogger.info(" - useRange?: " + useRange);
    sLogger.info(" - nodeCnt: " + nodeCount);

    JobConf conf = new JobConf(PartitionGraph.class);
    conf.setJobName("Partition Graph " + numParts);
    conf.setNumReduceTasks(numParts);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("NodeCount", nodeCount);

    FileInputFormat.setInputPaths(conf, new Path(inPath));
    FileOutputFormat.setOutputPath(conf, new Path(outPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    conf.setMapperClass(MapClass.class);
    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    FileSystem.get(conf).delete(new Path(outPath), true);

    JobClient.runJob(conf);

    return 0;
}
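The RangePartitioner used here is not reproduced on this page; presumably it carves the node-id space into contiguous blocks, which would explain why the driver stores "NodeCount" in the configuration. A hypothetical sketch of such a partitioner, assuming node ids run from 1 to NodeCount (class name invented):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

// Hypothetical sketch: assign contiguous blocks of node ids to reducers.
public class NodeRangePartitioner implements Partitioner<IntWritable, Writable> {
    private int nodeCount;

    public void configure(JobConf job) {
        // Reads the same "NodeCount" property the driver sets above.
        nodeCount = job.getInt("NodeCount", 0);
    }

    public int getPartition(IntWritable key, Writable value, int numPartitions) {
        // Integer ceiling of nodeCount / numPartitions gives each block's width;
        // Math.max guards against a zero width when nodeCount is unset.
        int perPartition = Math.max(1, (nodeCount + numPartitions - 1) / numPartitions);
        int partition = (key.get() - 1) / perPartition;
        // Clamp in case an id falls outside the declared range.
        return Math.min(Math.max(partition, 0), numPartitions - 1);
    }
}

RunPageRankBasic below toggles the same RangePartitioner with its useRange flag.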
From source file:edu.umd.cloud9.pagerank.RunPageRankBasic.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    sLogger.info("PageRank: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    conf.setInt("NodeCount", n);

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.set("PageRankMassPath", outm);

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    JobClient.runJob(conf);

    float mass = Float.NEGATIVE_INFINITY;
    FileSystem fs = FileSystem.get(conf);
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}