List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
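Before the per-project examples, a minimal sketch of the usual pattern: subclass org.apache.hadoop.mapreduce.Partitioner and register it on the Job before submission. The FirstCharPartitioner class, its key/value types, and the job name below are hypothetical, chosen only to illustrate the call; they do not come from any of the sources listed further down.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: routes keys by their first character so that
// keys sharing a leading character land on the same reducer.
public class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        if (key.getLength() == 0)
            return 0;
        // Mask the sign bit before taking the modulus, as HashPartitioner does.
        return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
    }
}

// Registration must happen before the job is submitted; afterwards
// setPartitionerClass throws IllegalStateException.
//   Job job = Job.getInstance(new Configuration(), "example");
//   job.setPartitionerClass(FirstCharPartitioner.class);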
From source file:org.apache.hadoop.examples.SecondarySort.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysort <in> <out>");
        System.exit(2);
    }
    Job job = Job.getInstance(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
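The FirstPartitioner registered above is defined elsewhere in the same SecondarySort class and is not reproduced in this listing. A sketch of what such a partitioner looks like, assuming IntPair exposes a getFirst() accessor for the leading int of the composite key, follows; it dispatches on the first element only, so that the grouping comparator can then group by that element within each reducer.

// Sketch of a first-field partitioner for the secondary-sort pattern
// (getFirst() is assumed to return the leading int of the composite key).
public static class FirstPartitioner extends Partitioner<IntPair, IntWritable> {
    @Override
    public int getPartition(IntPair key, IntWritable value, int numPartitions) {
        // Partition on the first int only; values sharing it reach the same reducer.
        return Math.abs(key.getFirst() * 127) % numPartitions;
    }
}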
From source file:org.apache.hadoop.examples.terasort.TeraSort.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        usage();
        return 2;
    }
    LOG.info("starting");
    Job job = Job.getInstance(getConf());
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    boolean useSimplePartitioner = getUseSimplePartitioner(job);
    TeraInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TeraInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, TeraInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
        try {
            TeraInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            LOG.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }
    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    int ret = job.waitForCompletion(true) ? 0 : 1;
    LOG.info("done");
    return ret;
}
From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 *
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000,
        "key5", 12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(GridHadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1),
            createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
            flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
            flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
            flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
            flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI,
            "key3\t15000\n" + "key6\t18000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI,
            "key1\t10000\n" + "key4\t7000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI,
            "key2\t20000\n" + "key5\t12000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.HadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 *
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000,
        "key5", 12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        HadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        HadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(HadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
            createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
            flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
            flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
            flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
            flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI,
            "key3\t15000\n" + "key6\t18000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI,
            "key1\t10000\n" + "key4\t7000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI,
            "key2\t20000\n" + "key5\t12000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.HadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 *
 * @param striped Whether output should be striped or not.
 * @throws Exception If fails.
 */
public void checkMultiReducerWholeMapReduceExecution(boolean striped) throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000,
        "key5", 12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        if (striped)
            jobConf.set(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), "true");
        else
            jobConf.set(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), "false");

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        HadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        HadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI, false);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(HadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
            createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
            flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
            flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
            flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
            flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI,
            "key3\t15000\n" + "key6\t18000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI,
            "key1\t10000\n" + "key4\t7000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI,
            "key2\t20000\n" + "key5\t12000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.HadoopTeraSortTest.java
License:Apache License
/**
 * Creates Job instance and sets up necessary properties for it.
 *
 * @param conf The Job config.
 * @return The job.
 * @throws Exception On error.
 */
private Job setupConfig(JobConf conf) throws Exception {
    Job job = Job.getInstance(conf);

    Path inputDir = new Path(generateOutDir);
    Path outputDir = new Path(sortOutDir);

    boolean useSimplePartitioner = TeraSort.getUseSimplePartitioner(job);

    TeraInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setJobName("TeraSort");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TeraInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);

    if (useSimplePartitioner)
        job.setPartitionerClass(TeraSort.SimplePartitioner.class);
    else {
        long start = System.currentTimeMillis();

        Path partFile = new Path(outputDir, PARTITION_FILENAME);

        URI partUri = new URI(partFile.toString() + "#" + PARTITION_FILENAME);

        try {
            TeraInputFormat.writePartitionFile(job, partFile);
        } catch (Throwable e) {
            throw new RuntimeException(e);
        }

        job.addCacheFile(partUri);

        long end = System.currentTimeMillis();

        System.out.println("Spent " + (end - start) + "ms computing partitions. "
            + "Partition file added to distributed cache: " + partUri);

        job.setPartitionerClass(getTeraSortTotalOrderPartitioner() /*TeraSort.TotalOrderPartitioner.class*/);
    }

    job.getConfiguration().setInt("dfs.replication", TeraSort.getOutputReplication(job));

    /* TeraOutputFormat.setFinalSync(job, true); */
    Method m = TeraOutputFormat.class.getDeclaredMethod("setFinalSync", JobContext.class, boolean.class);
    m.setAccessible(true);
    m.invoke(null, job, true);

    return job;
}
From source file:org.apache.jena.tdbloader4.Utils.java
License:Apache License
public static void setReducers(Job job, Configuration configuration, Logger log) {
    boolean runLocal = configuration.getBoolean(Constants.OPTION_RUN_LOCAL, Constants.OPTION_RUN_LOCAL_DEFAULT);

    int num_reducers = configuration.getInt(Constants.OPTION_NUM_REDUCERS, Constants.OPTION_NUM_REDUCERS_DEFAULT);

    // TODO: should we comment this out and let Hadoop decide the number of reducers?
    if (runLocal) {
        if (log != null)
            log.debug("Setting number of reducers to {}", 1);
        job.setNumReduceTasks(1);
    } else {
        if (Constants.NAME_FOURTH.equals(job.getJobName())) {
            job.setPartitionerClass(TotalOrderPartitioner.class);
            num_reducers = 9 * num_reducers;
        }
        job.setNumReduceTasks(num_reducers);
        if (log != null)
            log.debug("Setting number of reducers to {}", num_reducers);
    }
}
From source file:org.apache.kylin.storage.hbase.steps.HFileOutputFormat3.java
License:Apache License
/**
 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
 * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
 */
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints) throws IOException {
    Configuration conf = job.getConfiguration();

    // create the partitions file
    FileSystem fs = FileSystem.get(conf);
    Path partitionsPath = new Path(conf.get("hbase.fs.tmp.dir"), "partitions_" + RandomUtil.randomUUID());
    fs.makeQualified(partitionsPath);
    writePartitions(conf, partitionsPath, splitPoints);
    fs.deleteOnExit(partitionsPath);

    // configure job to use it
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
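One constraint worth noting here: TotalOrderPartitioner expects the partition file to contain exactly numReduceTasks - 1 split points, so callers of a helper like this normally size the reducer count to match. A hedged sketch of such a call site follows; the splitPoints list is assumed to hold the sorted region boundaries computed elsewhere.

// Assumed call site: one reducer per key range, i.e. splitPoints.size() + 1 reducers,
// so the reducer count lines up with the ranges the TotalOrderPartitioner will read.
job.setNumReduceTasks(splitPoints.size() + 1);
configurePartitioner(job, splitPoints);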
From source file:org.apache.mahout.cf.taste.hadoop.als.PredictionJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addOption("pairs", "p", "path containing the test ratings, each line must be: userID,itemID", true);
    addOption("userFeatures", "u", "path to the user feature matrix", true);
    addOption("itemFeatures", "i", "path to the item feature matrix", true);
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path pairs = new Path(parsedArgs.get("--pairs"));
    Path userFeatures = new Path(parsedArgs.get("--userFeatures"));
    Path itemFeatures = new Path(parsedArgs.get("--itemFeatures"));

    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path convertedPairs = new Path(tempDirPath, "convertedPairs");
    Path convertedUserFeatures = new Path(tempDirPath, "convertedUserFeatures");
    Path convertedItemFeatures = new Path(tempDirPath, "convertedItemFeatures");

    Path pairsJoinedWithItemFeatures = new Path(tempDirPath, "pairsJoinedWithItemFeatures");

    Job convertPairs = prepareJob(pairs, convertedPairs, TextInputFormat.class, PairsMapper.class,
        TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class, TaggedVarIntWritable.class,
        VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    convertPairs.waitForCompletion(true);

    Job convertUserFeatures = prepareJob(userFeatures, convertedUserFeatures, SequenceFileInputFormat.class,
        FeaturesMapper.class, TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
        TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    convertUserFeatures.waitForCompletion(true);

    Job convertItemFeatures = prepareJob(itemFeatures, convertedItemFeatures, SequenceFileInputFormat.class,
        FeaturesMapper.class, TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
        TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    convertItemFeatures.waitForCompletion(true);

    Job joinPairsWithItemFeatures = prepareJob(new Path(convertedPairs + "," + convertedItemFeatures),
        pairsJoinedWithItemFeatures, SequenceFileInputFormat.class, Mapper.class, TaggedVarIntWritable.class,
        VectorWithIndexWritable.class, JoinProbesWithItemFeaturesReducer.class, TaggedVarIntWritable.class,
        VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    joinPairsWithItemFeatures.setPartitionerClass(HashPartitioner.class);
    joinPairsWithItemFeatures.setGroupingComparatorClass(TaggedVarIntWritable.GroupingComparator.class);
    joinPairsWithItemFeatures.waitForCompletion(true);

    Job predictRatings = prepareJob(new Path(pairsJoinedWithItemFeatures + "," + convertedUserFeatures),
        getOutputPath(), SequenceFileInputFormat.class, Mapper.class, TaggedVarIntWritable.class,
        VectorWithIndexWritable.class, PredictRatingReducer.class, Text.class, NullWritable.class,
        TextOutputFormat.class);
    predictRatings.setPartitionerClass(HashPartitioner.class);
    predictRatings.setGroupingComparatorClass(TaggedVarIntWritable.GroupingComparator.class);
    predictRatings.waitForCompletion(true);

    return 0;
}
From source file:org.apache.mahout.classifier.svm.mapreduce.MapReduceUtil.java
License:Apache License
public static void setJobPartitioner(Job job, Class<? extends Partitioner> partitioner) {
    job.setPartitionerClass(partitioner);
}
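A call site for this helper might look like the following; the configuration and the choice of HashPartitioner are illustrative only and are not taken from the Mahout sources.

// Illustrative usage of the helper above.
Job job = Job.getInstance(new Configuration(), "svm-train");
MapReduceUtil.setJobPartitioner(job, org.apache.hadoop.mapreduce.lib.partition.HashPartitioner.class);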