Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
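
Before the project-specific examples, here is a minimal, self-contained sketch of the call in isolation. The names PartitionerExample, FirstCharPartitioner and buildJob are illustrative only and do not come from any of the projects listed under Usage; the sketch assumes the standard new-API Partitioner contract, where getPartition must return a value in the range [0, numPartitions).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

public class PartitionerExample {

    // Illustrative partitioner: all keys that share the same first character
    // are routed to the same reduce task.
    public static class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            int c = key.getLength() > 0 ? key.charAt(0) : 0;
            return (c & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static Job buildJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "setPartitionerClass example");
        job.setJarByClass(PartitionerExample.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // The partitioner only matters when there is more than one reducer;
        // with a single reduce task every record goes to partition 0.
        job.setNumReduceTasks(4);
        // Must be called before the job is submitted; on an already submitted
        // job the setter throws IllegalStateException.
        job.setPartitionerClass(FirstCharPartitioner.class);
        return job;
    }
}

The project examples below follow the same pattern, pairing setPartitionerClass with grouping comparators (secondary sort), custom partitioners, or TotalOrderPartitioner, depending on the job.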

Usage

From source file:org.chombo.mr.RecordSetModifier.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "record set modifier  MR";
    job.setJobName(jobName);

    job.setJarByClass(RecordSetModifier.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(RecordSetModifier.ModifierMapper.class);
    job.setReducerClass(RecordSetModifier.ModifierReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    int numReducer = job.getConfiguration().getInt("rsm.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.TimeGapSequenceGenerator.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Time sequence to time gap sequence conversion";
    job.setJobName(jobName);

    job.setJarByClass(TimeGapSequenceGenerator.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "chombo", true);
    job.setMapperClass(TimeGapSequenceGenerator.TimeGapMapper.class);
    job.setReducerClass(TimeGapSequenceGenerator.TimeGapReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    int numReducer = job.getConfiguration().getInt("tgs.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.TimeSequenceFilter.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Time sequence to time gap sequence conversion";
    job.setJobName(jobName);

    job.setJarByClass(TimeSequenceFilter.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "chombo");
    job.setMapperClass(TimeGapSequenceGenerator.TimeGapMapper.class);
    job.setReducerClass(TimeSequenceFilter.FilterReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    int numReducer = job.getConfiguration().getInt("tsf.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.WeightedAverage.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Weighted average calculating MR";
    job.setJobName(jobName);

    job.setJarByClass(WeightedAverage.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(WeightedAverage.AverageMapper.class);
    job.setReducerClass(WeightedAverage.AverageReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    Utility.setConfiguration(job.getConfiguration());

    if (job.getConfiguration().getInt("group.by.field", -1) >= 0) {
        //group by
        job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
        job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);
    }

    int numReducer = job.getConfiguration().getInt("wea.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.clueweb.clueweb12.app.DuplicateFiltering.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access", "deprecation" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path (pfor format expected, add * to retrieve files)")
            .create(DOCVECTOR_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(TREC_RESULT_FILE));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("dictionary").create(DICTIONARY_OPTION));
    options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("topk").create(TOPK));
    options.addOption(OptionBuilder.withArgName("float [0-1]").hasArg()
            .withDescription("cosine similarity threshold").create(SIM_THRESHOLD));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(DOCVECTOR_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(TREC_RESULT_FILE)
            || !cmdline.hasOption(SIM_THRESHOLD) || !cmdline.hasOption(TOPK)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String docvector = cmdline.getOptionValue(DOCVECTOR_OPTION);
    String trecinput = cmdline.getOptionValue(TREC_RESULT_FILE);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION);
    String simThreshold = cmdline.getOptionValue(SIM_THRESHOLD);
    String topk = cmdline.getOptionValue(TOPK);

    LOG.info("Tool name: " + DuplicateFiltering.class.getSimpleName());
    LOG.info(" - docvector: " + docvector);
    LOG.info(" - trecinputfile: " + trecinput);
    LOG.info(" - output: " + output);
    LOG.info(" - dictionary: " + dictionary);
    LOG.info(" - cosine similarity threshold: " + SIM_THRESHOLD);
    LOG.info(" - topk: " + topk);

    Configuration conf = getConf();
    conf.set(DICTIONARY_OPTION, dictionary);
    conf.setFloat(SIM_THRESHOLD, Float.parseFloat(simThreshold));
    conf.set(TREC_RESULT_FILE, trecinput);
    conf.setInt(TOPK, Integer.parseInt(topk));

    conf.set("mapred.task.timeout", "6000000");// default is 600000

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(output)))
        fs.delete(new Path(output));

    Job job = new Job(conf, DuplicateFiltering.class.getSimpleName() + ":" + docvector);
    job.setJarByClass(DuplicateFiltering.class);

    FileInputFormat.setInputPaths(job, docvector);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setMapOutputKeyClass(PairOfIntString.class);
    job.setMapOutputValueClass(FloatArrayWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);
    job.setPartitionerClass(MyPartitioner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    int numDuplicates = (int) job.getCounters().findCounter(Records.DUPLICATES).getValue();
    LOG.info("Number of duplicates: " + numDuplicates);

    return 0;
}

From source file:org.clueweb.clueweb12.app.RMRetrieval.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access", "deprecation" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path (pfor format expected, add * to retrieve files)")
            .create(DOCVECTOR_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("dictionary").create(DICTIONARY_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("queries").create(QUERIES_OPTION));
    options.addOption(
            OptionBuilder.withArgName("float").hasArg().withDescription("smoothing").create(SMOOTHING));
    options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("topk").create(TOPK));
    options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg()
            .withDescription("preprocessing").create(PREPROCESSING));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("rmmodel file").create(RMMODEL));
    options.addOption(
            OptionBuilder.withArgName("float").hasArg().withDescription("queryLambda").create(QUERY_LAMBDA));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(DOCVECTOR_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(QUERIES_OPTION)
            || !cmdline.hasOption(SMOOTHING) || !cmdline.hasOption(TOPK) || !cmdline.hasOption(QUERY_LAMBDA)
            || !cmdline.hasOption(PREPROCESSING)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String docvector = cmdline.getOptionValue(DOCVECTOR_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION);
    String queries = cmdline.getOptionValue(QUERIES_OPTION);
    String smoothing = cmdline.getOptionValue(SMOOTHING);
    String topk = cmdline.getOptionValue(TOPK);
    String preprocessing = cmdline.getOptionValue(PREPROCESSING);
    String rmmodel = cmdline.getOptionValue(RMMODEL);
    String queryLambda = cmdline.getOptionValue(QUERY_LAMBDA);

    LOG.info("Tool name: " + RMRetrieval.class.getSimpleName());
    LOG.info(" - docvector: " + docvector);
    LOG.info(" - output: " + output);
    LOG.info(" - dictionary: " + dictionary);
    LOG.info(" - queries: " + queries);
    LOG.info(" - smoothing: " + smoothing);
    LOG.info(" - topk: " + topk);
    LOG.info(" - preprocessing: " + preprocessing);
    LOG.info(" - rmmodel: " + rmmodel);
    LOG.info(" - queryLambda: " + queryLambda);

    Configuration conf = getConf();
    conf.set(DICTIONARY_OPTION, dictionary);
    conf.set(QUERIES_OPTION, queries);
    conf.setFloat(SMOOTHING, Float.parseFloat(smoothing));
    conf.setInt(TOPK, Integer.parseInt(topk));
    conf.set(PREPROCESSING, preprocessing);
    conf.set(RMMODEL, rmmodel);
    conf.setFloat(QUERY_LAMBDA, Float.parseFloat(queryLambda));

    conf.set("mapreduce.map.memory.mb", "10048");
    conf.set("mapreduce.map.java.opts", "-Xmx10048m");
    conf.set("mapreduce.reduce.memory.mb", "10048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx10048m");
    conf.set("mapred.task.timeout", "6000000");// default is 600000

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(output)))
        fs.delete(new Path(output));

    Job job = new Job(conf, RMRetrieval.class.getSimpleName() + ":" + docvector);
    job.setJarByClass(RMRetrieval.class);

    FileInputFormat.setInputPaths(job, docvector);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setMapOutputKeyClass(PairOfIntString.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);
    job.setPartitionerClass(MyPartitioner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}

From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java

License:Open Source License

/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    GridGgfsPath inDir = new GridGgfsPath(PATH_INPUT);

    ggfs.mkdirs(inDir);

    GridGgfsPath inFile = new GridGgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        ggfs.delete(new GridGgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        //To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        //For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(ggfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(ggfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(GridHadoopWordCount2.class);

        GridFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));

        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));

        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));

        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));

        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));

        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));

    }
}

From source file:org.hdp.wrdcount.custompartitioner.WordCountCustomPartitionerJob.java

@Override
public int run(String[] args) throws Exception {
    // TODO Auto-generated method stub
    Job job = Job.getInstance(getConf(), "Word Count Job");
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileSystem fs = FileSystem.get(getConf());
    // does not use the HDFS setting that is configured for the Eclipse environment
    Path pathOut = new Path("/test/wordcount/custompartitioner/op");
    if (fs.exists(pathOut)) {
        fs.delete(out, true);
    }
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setMapperClass(WordCountCustomPartitionerMapper.class);
    job.setReducerClass(WordCountCustomPartitionerReducer.class);
    job.setPartitionerClass(WordCountPartitioner.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(3);
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:org.imageterrier.indexers.hadoop.HadoopIndexer.java

License:Mozilla Public License

protected Job createJob(HadoopIndexerOptions options) throws IOException {
    final Job job = new Job(getConf());
    job.setJobName("terrierIndexing");

    if (options.getInputMode() == InputMode.QUANTISED_FEATURES) {
        job.setMapperClass(QFIndexerMapper.class);
    } else {
        if (options.shardPerThread) {
            job.setMapperClass(MultithreadedMapper.class);
            MultithreadedMapper.setMapperClass(job, MTImageIndexerMapper.class);
            MultithreadedMapper.setNumberOfThreads(job, options.getMultithread());
        } else {
            job.setMapperClass(ImageIndexerMapper.class);
        }
    }
    // Load quantiser (if it exists), extract header, count codebook size
    if (options.getInputModeOptions().hasQuantiserFile()) {
        final String quantFile = options.getInputModeOptions().getQuantiserFile();
        System.out.println("Loading codebook to see its size");
        final SpatialClusters<?> quantiser = readClusters(options);
        System.out.println("Setting codebook size: " + quantiser.numClusters());
        job.getConfiguration().setInt(QUANTISER_SIZE, quantiser.numClusters());
        if (quantiser.numClusters() < options.getNumReducers())
            options.setNumReducers(quantiser.numClusters());
    }
    job.setReducerClass(IndexerReducer.class);

    FileOutputFormat.setOutputPath(job, options.getOutputPath());
    job.setMapOutputKeyClass(NewSplitEmittedTerm.class);
    job.setMapOutputValueClass(MapEmittedPostingList.class);
    job.getConfiguration().setBoolean("indexing.hadoop.multiple.indices", options.isDocumentPartitionMode());

    // if
    // (!job.getConfiguration().get("mapred.job.tracker").equals("local")) {
    // job.getConfiguration().set("mapred.map.output.compression.codec",
    // GzipCodec.class.getCanonicalName());
    // job.getConfiguration().setBoolean("mapred.compress.map.output",
    // true);
    // } else {
    job.getConfiguration().setBoolean("mapred.compress.map.output", false);
    // }

    job.setInputFormatClass(PositionAwareSequenceFileInputFormat.class); // important

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setSortComparatorClass(NewSplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    job.setGroupingComparatorClass(NewSplitEmittedTerm.SETRawComparatorTerm.class);

    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    SequenceFileInputFormat.setInputPaths(job, options.getInputPaths());

    job.setNumReduceTasks(options.getNumReducers());
    if (options.getNumReducers() > 1) {
        if (options.isDocumentPartitionMode()) {
            job.setPartitionerClass(NewSplitEmittedTerm.SETPartitioner.class);
        } else {
            // job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
            if (job.getConfiguration().getInt(QUANTISER_SIZE, -1) == -1) {
                job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerHashedTerm.class);
            } else {
                job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerCodebookAwareTerm.class);
            }

        }
    } else {
        // for JUnit tests, we seem to need to restore the original
        // partitioner class
        job.setPartitionerClass(HashPartitioner.class);
    }

    job.setJarByClass(this.getClass());

    return job;
}

From source file:org.kiji.mapreduce.output.HFileMapReduceJobOutput.java

License:Apache License

/**
 * Configures the partitioner for generating HFiles.
 *
 * <p>Each generated HFile should fit within a region of the target table.
 * Additionally, it's optimal to have only one HFile to load into each region, since a
 * read from that region will require reading from each HFile under management (until
 * compaction happens and merges them all back into one HFile).</p>
 *
 * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the
 * records output from the Mapper based on their rank in a total ordering of the
 * keys.  The <code>startKeys</code> argument should contain a list of the first key in
 * each of those partitions.</p>
 *
 * @param job The job to configure.
 * @param startKeys A list of keys that will mark the boundaries between the partitions
 *     for the sorted map output records.
 * @throws IOException If there is an error.
 */
private static void configurePartitioner(Job job, List<HFileKeyValue> startKeys) throws IOException {
    job.setPartitionerClass(TotalOrderPartitioner.class);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);

    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
        final URI cacheUri = new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
}
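
The kiji method above builds the partition file itself from a list of region start keys. For a generic job where split points are not known in advance, a similar total-order setup is usually driven by InputSampler; the sketch below is a minimal, generic illustration (the helper name configureTotalOrder and the sampling parameters are assumptions, not part of HFileMapReduceJobOutput).

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSetup {

    // Sketch: sample the job's input to pick split points, write them to the
    // partition file, and point TotalOrderPartitioner at that file so the
    // concatenated reducer outputs form one globally sorted sequence.
    // job.setNumReduceTasks(...) must already have been called, since the
    // sampler writes numReduceTasks - 1 split points.
    public static void configureTotalOrder(Job job, Path partitionFile)
            throws IOException, ClassNotFoundException, InterruptedException {
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        // Sample about 10% of records, capped at 10000 samples from at most 10 splits.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Object, Object>(0.1, 10000, 10));
    }
}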