Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
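
Before the project-specific examples, here is a minimal, self-contained sketch of the call in isolation. The names PartitionerExample, FirstCharPartitioner and buildJob are illustrative only and do not come from any of the projects listed under Usage; the sketch assumes the standard new-API Partitioner contract, where getPartition must return a value in the range [0, numPartitions).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

public class PartitionerExample {

    // Illustrative partitioner: all keys that share the same first character
    // are routed to the same reduce task.
    public static class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            int c = key.getLength() > 0 ? key.charAt(0) : 0;
            return (c & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static Job buildJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "setPartitionerClass example");
        job.setJarByClass(PartitionerExample.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // The partitioner only matters when there is more than one reducer;
        // with a single reduce task every record goes to partition 0.
        job.setNumReduceTasks(4);
        // Must be called before the job is submitted; on an already submitted
        // job the setter throws IllegalStateException.
        job.setPartitionerClass(FirstCharPartitioner.class);
        return job;
    }
}

The project examples below follow the same pattern, pairing setPartitionerClass with grouping comparators (secondary sort), custom partitioners, or TotalOrderPartitioner, depending on the job.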

Usage

From source file:org.chombo.mr.RecordSetModifier.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "record set modifier  MR";
    job.setJobName(jobName);

    job.setJarByClass(RecordSetModifier.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(RecordSetModifier.ModifierMapper.class);
    job.setReducerClass(RecordSetModifier.ModifierReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    int numReducer = job.getConfiguration().getInt("rsm.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.TimeGapSequenceGenerator.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Time sequence to time gap sequence conversion";
    job.setJobName(jobName);

    job.setJarByClass(TimeGapSequenceGenerator.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "chombo", true);
    job.setMapperClass(TimeGapSequenceGenerator.TimeGapMapper.class);
    job.setReducerClass(TimeGapSequenceGenerator.TimeGapReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    int numReducer = job.getConfiguration().getInt("tgs.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.TimeSequenceFilter.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Time sequence to time gap sequence conversion";
    job.setJobName(jobName);

    job.setJarByClass(TimeSequenceFilter.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "chombo");
    job.setMapperClass(TimeGapSequenceGenerator.TimeGapMapper.class);
    job.setReducerClass(TimeSequenceFilter.FilterReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    int numReducer = job.getConfiguration().getInt("tsf.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.WeightedAverage.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Weighted average calculating MR";
    job.setJobName(jobName);

    job.setJarByClass(WeightedAverage.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(WeightedAverage.AverageMapper.class);
    job.setReducerClass(WeightedAverage.AverageReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    Utility.setConfiguration(job.getConfiguration());

    if (job.getConfiguration().getInt("group.by.field", -1) >= 0) {
        //group by
        job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
        job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);
    }

    int numReducer = job.getConfiguration().getInt("wea.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.clueweb.clueweb12.app.DuplicateFiltering.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access", "deprecation" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path (pfor format expected, add * to retrieve files)")
            .create(DOCVECTOR_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(TREC_RESULT_FILE));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("dictionary").create(DICTIONARY_OPTION));
    options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("topk").create(TOPK));
    options.addOption(OptionBuilder.withArgName("float [0-1]").hasArg()
            .withDescription("cosine similarity threshold").create(SIM_THRESHOLD));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(DOCVECTOR_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(TREC_RESULT_FILE)
            || !cmdline.hasOption(SIM_THRESHOLD) || !cmdline.hasOption(TOPK)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String docvector = cmdline.getOptionValue(DOCVECTOR_OPTION);
    String trecinput = cmdline.getOptionValue(TREC_RESULT_FILE);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION);
    String simThreshold = cmdline.getOptionValue(SIM_THRESHOLD);
    String topk = cmdline.getOptionValue(TOPK);

    LOG.info("Tool name: " + DuplicateFiltering.class.getSimpleName());
    LOG.info(" - docvector: " + docvector);
    LOG.info(" - trecinputfile: " + trecinput);
    LOG.info(" - output: " + output);
    LOG.info(" - dictionary: " + dictionary);
    LOG.info(" - cosine similarity threshold: " + SIM_THRESHOLD);
    LOG.info(" - topk: " + topk);

    Configuration conf = getConf();
    conf.set(DICTIONARY_OPTION, dictionary);
    conf.setFloat(SIM_THRESHOLD, Float.parseFloat(simThreshold));
    conf.set(TREC_RESULT_FILE, trecinput);
    conf.setInt(TOPK, Integer.parseInt(topk));

    conf.set("mapred.task.timeout", "6000000");// default is 600000

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(output)))
        fs.delete(new Path(output));

    Job job = new Job(conf, DuplicateFiltering.class.getSimpleName() + ":" + docvector);
    job.setJarByClass(DuplicateFiltering.class);

    FileInputFormat.setInputPaths(job, docvector);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setMapOutputKeyClass(PairOfIntString.class);
    job.setMapOutputValueClass(FloatArrayWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);
    job.setPartitionerClass(MyPartitioner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    int numDuplicates = (int) job.getCounters().findCounter(Records.DUPLICATES).getValue();
    LOG.info("Number of duplicates: " + numDuplicates);

    return 0;
}

From source file:org.clueweb.clueweb12.app.RMRetrieval.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access", "deprecation" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path (pfor format expected, add * to retrieve files)")
            .create(DOCVECTOR_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("dictionary").create(DICTIONARY_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("queries").create(QUERIES_OPTION));
    options.addOption(
            OptionBuilder.withArgName("float").hasArg().withDescription("smoothing").create(SMOOTHING));
    options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("topk").create(TOPK));
    options.addOption(OptionBuilder.withArgName("string " + AnalyzerFactory.getOptions()).hasArg()
            .withDescription("preprocessing").create(PREPROCESSING));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("rmmodel file").create(RMMODEL));
    options.addOption(
            OptionBuilder.withArgName("float").hasArg().withDescription("queryLambda").create(QUERY_LAMBDA));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(DOCVECTOR_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DICTIONARY_OPTION) || !cmdline.hasOption(QUERIES_OPTION)
            || !cmdline.hasOption(SMOOTHING) || !cmdline.hasOption(TOPK) || !cmdline.hasOption(QUERY_LAMBDA)
            || !cmdline.hasOption(PREPROCESSING)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String docvector = cmdline.getOptionValue(DOCVECTOR_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String dictionary = cmdline.getOptionValue(DICTIONARY_OPTION);
    String queries = cmdline.getOptionValue(QUERIES_OPTION);
    String smoothing = cmdline.getOptionValue(SMOOTHING);
    String topk = cmdline.getOptionValue(TOPK);
    String preprocessing = cmdline.getOptionValue(PREPROCESSING);
    String rmmodel = cmdline.getOptionValue(RMMODEL);
    String queryLambda = cmdline.getOptionValue(QUERY_LAMBDA);

    LOG.info("Tool name: " + RMRetrieval.class.getSimpleName());
    LOG.info(" - docvector: " + docvector);
    LOG.info(" - output: " + output);
    LOG.info(" - dictionary: " + dictionary);
    LOG.info(" - queries: " + queries);
    LOG.info(" - smoothing: " + smoothing);
    LOG.info(" - topk: " + topk);
    LOG.info(" - preprocessing: " + preprocessing);
    LOG.info(" - rmmodel: " + rmmodel);
    LOG.info(" - queryLambda: " + queryLambda);

    Configuration conf = getConf();
    conf.set(DICTIONARY_OPTION, dictionary);
    conf.set(QUERIES_OPTION, queries);
    conf.setFloat(SMOOTHING, Float.parseFloat(smoothing));
    conf.setInt(TOPK, Integer.parseInt(topk));
    conf.set(PREPROCESSING, preprocessing);
    conf.set(RMMODEL, rmmodel);
    conf.setFloat(QUERY_LAMBDA, Float.parseFloat(queryLambda));

    conf.set("mapreduce.map.memory.mb", "10048");
    conf.set("mapreduce.map.java.opts", "-Xmx10048m");
    conf.set("mapreduce.reduce.memory.mb", "10048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx10048m");
    conf.set("mapred.task.timeout", "6000000");// default is 600000

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(output)))
        fs.delete(new Path(output));

    Job job = new Job(conf, RMRetrieval.class.getSimpleName() + ":" + docvector);
    job.setJarByClass(RMRetrieval.class);

    FileInputFormat.setInputPaths(job, docvector);
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setMapOutputKeyClass(PairOfIntString.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);
    job.setPartitionerClass(MyPartitioner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}

From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java

License:Open Source License

/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    GridGgfsPath inDir = new GridGgfsPath(PATH_INPUT);

    ggfs.mkdirs(inDir);

    GridGgfsPath inFile = new GridGgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        ggfs.delete(new GridGgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        //To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        //For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(ggfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(ggfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(GridHadoopWordCount2.class);

        GridFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));

        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));

        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));

        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));

        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));

        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));

    }
}

From source file:org.hdp.wrdcount.custompartitioner.WordCountCustomPartitionerJob.java

@Override
public int run(String[] args) throws Exception {
    // TODO Auto-generated method stub
    Job job = Job.getInstance(getConf(), "Word Count Job");
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileSystem fs = FileSystem.get(getConf());
    // does not use the HDFS setting that is configured for the Eclipse environment
    Path pathOut = new Path("/test/wordcount/custompartitioner/op");
    if (fs.exists(pathOut)) {
        fs.delete(out, true);
    }
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setMapperClass(WordCountCustomPartitionerMapper.class);
    job.setReducerClass(WordCountCustomPartitionerReducer.class);
    job.setPartitionerClass(WordCountPartitioner.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(3);
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:org.imageterrier.indexers.hadoop.HadoopIndexer.java

License:Mozilla Public License

protected Job createJob(HadoopIndexerOptions options) throws IOException {
    final Job job = new Job(getConf());
    job.setJobName("terrierIndexing");

    if (options.getInputMode() == InputMode.QUANTISED_FEATURES) {
        job.setMapperClass(QFIndexerMapper.class);
    } else {
        if (options.shardPerThread) {
            job.setMapperClass(MultithreadedMapper.class);
            MultithreadedMapper.setMapperClass(job, MTImageIndexerMapper.class);
            MultithreadedMapper.setNumberOfThreads(job, options.getMultithread());
        } else {
            job.setMapperClass(ImageIndexerMapper.class);
        }
    }
    // Load quantiser (if it exists), extract header, count codebook size
    if (options.getInputModeOptions().hasQuantiserFile()) {
        final String quantFile = options.getInputModeOptions().getQuantiserFile();
        System.out.println("Loading codebook to see its size");
        final SpatialClusters<?> quantiser = readClusters(options);
        System.out.println("Setting codebook size: " + quantiser.numClusters());
        job.getConfiguration().setInt(QUANTISER_SIZE, quantiser.numClusters());
        if (quantiser.numClusters() < options.getNumReducers())
            options.setNumReducers(quantiser.numClusters());
    }
    job.setReducerClass(IndexerReducer.class);

    FileOutputFormat.setOutputPath(job, options.getOutputPath());
    job.setMapOutputKeyClass(NewSplitEmittedTerm.class);
    job.setMapOutputValueClass(MapEmittedPostingList.class);
    job.getConfiguration().setBoolean("indexing.hadoop.multiple.indices", options.isDocumentPartitionMode());

    // if
    // (!job.getConfiguration().get("mapred.job.tracker").equals("local")) {
    // job.getConfiguration().set("mapred.map.output.compression.codec",
    // GzipCodec.class.getCanonicalName());
    // job.getConfiguration().setBoolean("mapred.compress.map.output",
    // true);
    // } else {
    job.getConfiguration().setBoolean("mapred.compress.map.output", false);
    // }

    job.setInputFormatClass(PositionAwareSequenceFileInputFormat.class); // important

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setSortComparatorClass(NewSplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    job.setGroupingComparatorClass(NewSplitEmittedTerm.SETRawComparatorTerm.class);

    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    SequenceFileInputFormat.setInputPaths(job, options.getInputPaths());

    job.setNumReduceTasks(options.getNumReducers());
    if (options.getNumReducers() > 1) {
        if (options.isDocumentPartitionMode()) {
            job.setPartitionerClass(NewSplitEmittedTerm.SETPartitioner.class);
        } else {
            // job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
            if (job.getConfiguration().getInt(QUANTISER_SIZE, -1) == -1) {
                job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerHashedTerm.class);
            } else {
                job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerCodebookAwareTerm.class);
            }

        }
    } else {
        // for JUnit tests, we seem to need to restore the original
        // partitioner class
        job.setPartitionerClass(HashPartitioner.class);
    }

    job.setJarByClass(this.getClass());

    return job;
}

From source file:org.kiji.mapreduce.output.HFileMapReduceJobOutput.java

License:Apache License

/**
 * Configures the partitioner for generating HFiles.
 *
 * <p>Each generated HFile should fit within a region of the target table.
 * Additionally, it's optimal to have only one HFile to load into each region, since a
 * read from that region will require reading from each HFile under management (until
 * compaction happens and merges them all back into one HFile).</p>
 *
 * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the
 * records output from the Mapper based on their rank in a total ordering of the
 * keys.  The <code>startKeys</code> argument should contain a list of the first key in
 * each of those partitions.</p>
 *
 * @param job The job to configure.
 * @param startKeys A list of keys that will mark the boundaries between the partitions
 *     for the sorted map output records.
 * @throws IOException If there is an error.
 */
private static void configurePartitioner(Job job, List<HFileKeyValue> startKeys) throws IOException {
    job.setPartitionerClass(TotalOrderPartitioner.class);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);

    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
        final URI cacheUri = new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
}
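
The kiji method above builds the partition file itself from a list of region start keys. For a generic job where split points are not known in advance, a similar total-order setup is usually driven by InputSampler; the sketch below is a minimal, generic illustration (the helper name configureTotalOrder and the sampling parameters are assumptions, not part of HFileMapReduceJobOutput).

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSetup {

    // Sketch: sample the job's input to pick split points, write them to the
    // partition file, and point TotalOrderPartitioner at that file so the
    // concatenated reducer outputs form one globally sorted sequence.
    // job.setNumReduceTasks(...) must already have been called, since the
    // sampler writes numReduceTasks - 1 split points.
    public static void configureTotalOrder(Job job, Path partitionFile)
            throws IOException, ClassNotFoundException, InterruptedException {
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        // Sample about 10% of records, capped at 10000 samples from at most 10 splits.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Object, Object>(0.1, 10000, 10));
    }
}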