Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.Job.setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
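
The snippet below is a minimal, hedged sketch of the typical pattern: subclass Partitioner, return a reducer index in the range [0, numPartitions), and register the class with setPartitionerClass before the job is submitted. The class and job names (PartitionerExample, FirstLetterPartitioner, "partitioner-example") are illustrative only and do not come from any of the projects listed under Usage.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

public class PartitionerExample {

    // Route each key to a reducer based on its first letter.
    public static class FirstLetterPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            // getPartition must return an index in [0, numPartitions).
            String s = key.toString();
            int bucket = s.isEmpty() ? 0 : Character.toLowerCase(s.charAt(0)) - 'a';
            return Math.max(0, bucket) % numPartitions;
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "partitioner-example");
        job.setJarByClass(PartitionerExample.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // A custom partitioner only matters when there is more than one reducer.
        job.setNumReduceTasks(4);
        // Throws IllegalStateException if the job has already been submitted.
        job.setPartitionerClass(FirstLetterPartitioner.class);
        // Mapper, reducer, and input/output paths omitted; see the full examples below.
    }
}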

Usage

From source file:io.druid.indexer.IndexGeneratorJob.java

License:Apache License

public boolean run() {
    try {
        Job job = Job.getInstance(new Configuration(),
                String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.23");

        JobHelper.injectSystemProperties(job);
        config.addJobProperties(job);

        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(BytesWritable.class);

        SortableBytes.useSortableBytesAsMapOutputKey(job);

        int numReducers = Iterables.size(config.getAllBuckets().get());
        if (numReducers == 0) {
            throw new RuntimeException("No buckets?? seems there is no data to index.");
        }

        if (config.getSchema().getTuningConfig().getUseCombiner()) {
            job.setCombinerClass(IndexGeneratorCombiner.class);
            job.setCombinerKeyGroupingComparatorClass(BytesWritable.Comparator.class);
        }

        job.setNumReduceTasks(numReducers);
        job.setPartitionerClass(IndexGeneratorPartitioner.class);

        setReducerClass(job);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);

        // hack to get druid.processing.bitmap property passed down to hadoop job.
        // once IndexIO doesn't rely on globally injected properties, we can move this into the HadoopTuningConfig.
        final String bitmapProperty = "druid.processing.bitmap.type";
        final String bitmapType = HadoopDruidIndexerConfig.properties.getProperty(bitmapProperty);
        if (bitmapType != null) {
            for (String property : new String[] { "mapreduce.reduce.java.opts", "mapreduce.map.java.opts" }) {
                // prepend property to allow overriding using hadoop.xxx properties by JobHelper.injectSystemProperties above
                String value = Strings.nullToEmpty(job.getConfiguration().get(property));
                job.getConfiguration().set(property,
                        String.format("-D%s=%s %s", bitmapProperty, bitmapType, value));
            }
        }

        config.intoConfiguration(job);

        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), job);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:io.fluo.stress.trie.Init.java

License:Apache License

private int buildTree(int nodeSize, FluoConfiguration props, Path tmp, int stopLevel) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Init.class);

    job.setJobName(Init.class.getName() + "_load");

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.getConfiguration().setInt(TRIE_NODE_SIZE_PROP, nodeSize);
    job.getConfiguration().setInt(TRIE_STOP_LEVEL_PROP, stopLevel);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(tmp, "nums"));

    job.setMapperClass(InitMapper.class);
    job.setCombinerClass(InitCombiner.class);
    job.setReducerClass(InitReducer.class);

    job.setOutputFormatClass(AccumuloFileOutputFormat.class);

    job.setPartitionerClass(RangePartitioner.class);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    Connector conn = AccumuloUtil.getConnector(props);

    Path splitsPath = new Path(tmp, "splits.txt");

    Collection<Text> splits1 = writeSplits(props, fs, conn, splitsPath);

    RangePartitioner.setSplitFile(job, splitsPath.toString());
    job.setNumReduceTasks(splits1.size() + 1);

    Path outPath = new Path(tmp, "out");
    AccumuloFileOutputFormat.setOutputPath(job, outPath);

    boolean success = job.waitForCompletion(true);

    if (success) {
        Path failPath = new Path(tmp, "failures");
        fs.mkdirs(failPath);
        conn.tableOperations().importDirectory(props.getAccumuloTable(), outPath.toString(),
                failPath.toString(), false);
    }
    return success ? 0 : 1;
}

From source file:ipldataanalysis3.IPLDataAnalysis3.java

@Override
public int run(String[] args) throws Exception {

    if (args.length != 2) {
        System.out.printf("Two parameters are required for Data Analysis for IPL- <input dir> <output dir>\n");
        return -1;
    }

    Job job = new Job(getConf(), "Job1");
    job.setJarByClass(IPLDataAnalysis3.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setMapperClass(DataAnalysisMapper.class);
    job.setNumReduceTasks(13);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setPartitionerClass(DataAnalysisPartitioner.class);
    job.setReducerClass(DataAnalysisReducer.class);
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;

}

From source file:it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java

License:Apache License

private static void setupPipesJob(Job job) throws IOException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        job.setMapperClass(PipesMapper.class);
        // Save the user's partitioner and hook in ours.
        setJavaPartitioner(conf, job.getPartitionerClass());
        job.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        job.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            job.setOutputFormatClass(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
        job.setInputFormatClass(PipesNonJavaInputFormat.class);
    }

    if (avroInput != null) {
        if (explicitInputFormat) {
            conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
        } // else let the bridge fall back to the appropriate Avro IF
        switch (avroInput) {
        case K:
            job.setInputFormatClass(PydoopAvroInputKeyBridge.class);
            break;
        case V:
            job.setInputFormatClass(PydoopAvroInputValueBridge.class);
            break;
        case KV:
            job.setInputFormatClass(PydoopAvroInputKeyValueBridge.class);
            break;
        default:
            throw new IllegalArgumentException("Bad Avro input type");
        }
    }
    if (avroOutput != null) {
        if (explicitOutputFormat) {
            conf.setClass(Submitter.OUTPUT_FORMAT, job.getOutputFormatClass(), OutputFormat.class);
        } // else let the bridge fall back to the appropriate Avro OF
        conf.set(props.getProperty("AVRO_OUTPUT"), avroOutput.name());
        switch (avroOutput) {
        case K:
            job.setOutputFormatClass(PydoopAvroOutputKeyBridge.class);
            break;
        case V:
            job.setOutputFormatClass(PydoopAvroOutputValueBridge.class);
            break;
        case KV:
            job.setOutputFormatClass(PydoopAvroOutputKeyValueBridge.class);
            break;
        default:
            throw new IllegalArgumentException("Bad Avro output type");
        }
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        String msg = "No application program defined.";
        throw new IllegalArgumentException(msg);
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    //FIXME: this is kind of useless if the pipes program is not in c++
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce task
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        String msg = "Problem parsing executable URI " + exec;
        IOException ie = new IOException(msg);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}
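
In setupPipesJob above, the user's partitioner is saved into the configuration and PipesPartitioner is hooked in to forward partitioning decisions to it at runtime. The class below is a rough, hypothetical sketch of that wrap-and-delegate pattern; the real PipesPartitioner differs, and the configuration key name is made up for illustration.

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.ReflectionUtils;

public class DelegatingPartitioner extends Partitioner<Text, Text> implements Configurable {

    // Hypothetical config key under which the original partitioner class was saved.
    static final String SAVED_PARTITIONER = "example.saved.partitioner";

    private Configuration conf;
    private Partitioner<Text, Text> delegate;

    @Override
    @SuppressWarnings("unchecked")
    public void setConf(Configuration conf) {
        this.conf = conf;
        // Recreate the partitioner the user originally set, defaulting to hash partitioning.
        delegate = (Partitioner<Text, Text>) ReflectionUtils.newInstance(
                conf.getClass(SAVED_PARTITIONER, HashPartitioner.class, Partitioner.class), conf);
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        // Forward the decision to the wrapped partitioner.
        return delegate.getPartition(key, value, numPartitions);
    }
}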

From source file:it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java

License:Apache License

public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
        cli.printUsage();
        return 1;
    }
    try {
        Job job = new Job(new Configuration());
        job.setJobName(getClass().getName());
        Configuration conf = job.getConfiguration();
        CommandLine results = cli.parse(conf, args);
        if (results.hasOption("input")) {
            Path path = new Path(results.getOptionValue("input"));
            FileInputFormat.setInputPaths(job, path);
        }
        if (results.hasOption("output")) {
            Path path = new Path(results.getOptionValue("output"));
            FileOutputFormat.setOutputPath(job, path);
        }
        if (results.hasOption("jar")) {
            job.setJar(results.getOptionValue("jar"));
        }
        if (results.hasOption("inputformat")) {
            explicitInputFormat = true;
            setIsJavaRecordReader(conf, true);
            job.setInputFormatClass(getClass(results, "inputformat", conf, InputFormat.class));
        }
        if (results.hasOption("javareader")) {
            setIsJavaRecordReader(conf, true);
        }
        if (results.hasOption("map")) {
            setIsJavaMapper(conf, true);
            job.setMapperClass(getClass(results, "map", conf, Mapper.class));
        }
        if (results.hasOption("partitioner")) {
            job.setPartitionerClass(getClass(results, "partitioner", conf, Partitioner.class));
        }
        if (results.hasOption("reduce")) {
            setIsJavaReducer(conf, true);
            job.setReducerClass(getClass(results, "reduce", conf, Reducer.class));
        }
        if (results.hasOption("reduces")) {
            job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
        }
        if (results.hasOption("writer")) {
            explicitOutputFormat = true;
            setIsJavaRecordWriter(conf, true);
            job.setOutputFormatClass(getClass(results, "writer", conf, OutputFormat.class));
        }
        if (results.hasOption("lazyOutput")) {
            if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
                LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormatClass());
            }
        }
        if (results.hasOption("avroInput")) {
            avroInput = AvroIO.valueOf(results.getOptionValue("avroInput").toUpperCase());
        }
        if (results.hasOption("avroOutput")) {
            avroOutput = AvroIO.valueOf(results.getOptionValue("avroOutput").toUpperCase());
        }

        if (results.hasOption("program")) {
            setExecutable(conf, results.getOptionValue("program"));
        }
        // if they gave us a jar file, include it into the class path
        String jarFile = job.getJar();
        if (jarFile != null) {
            final URL[] urls = new URL[] { FileSystem.getLocal(conf).pathToFile(new Path(jarFile)).toURL() };
            // FindBugs complains that creating a URLClassLoader should be
            // in a doPrivileged() block.
            ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                public ClassLoader run() {
                    return new URLClassLoader(urls);
                }
            });
            conf.setClassLoader(loader);
        }
        setupPipesJob(job);
        return job.waitForCompletion(true) ? 0 : 1;
    } catch (ParseException pe) {
        LOG.info("Error : " + pe);
        cli.printUsage();
        return 1;
    }
}

From source file:it.crs4.seal.demux.Demux.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    Configuration conf = getConf();
    DemuxOptionParser parser = new DemuxOptionParser();
    parser.parse(conf, args);

    conf.setBoolean(CONF_NO_INDEX_READS, parser.getNoIndexReads());
    conf.setBoolean(CONF_SEPARATE_READS, parser.getSeparateReads());

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");
    if (parser.getNoIndexReads())
        LOG.info("Not expecting to find any index reads.  Will demultiplex based only on lane.");

    // load sample sheet to fail early in case of problems
    DemuxUtils.loadSampleSheet(parser.getSampleSheetPath(), conf);

    // must be called before creating the job, since the job
    // *copies* the Configuration.
    distributeSampleSheet(parser.getSampleSheetPath());

    // Create a Job using the processed conf
    Job job = new Job(getConf(), makeJobName(parser.getInputPaths().get(0)));

    job.setJarByClass(Demux.class);

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName("qseq")));

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(SequencedFragment.class);

    job.setPartitionerClass(SequenceIdLocationPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);
    job.setSortComparatorClass(TwoOneThreeSortComparator.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SequencedFragment.class);

    // output
    job.setOutputFormatClass(DemuxOutputFormat.class);
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        if (parser.getCreateLaneContent())
            createLaneContentFiles(parser.getOutputPath(), parser.getSampleSheetPath());
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}

From source file:it.crs4.seal.prq.PairReadsQSeq.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // defaults
    conf.set(PrqOptionParser.INPUT_FORMAT_CONF, PrqOptionParser.InputFormatDefault);

    // parse command line
    PrqOptionParser parser = new PrqOptionParser();
    parser.parse(conf, args);

    Job job = new Job(conf, "PairReadsQSeq " + parser.getInputPaths().get(0));
    job.setJarByClass(PairReadsQSeq.class);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
    job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName("prq")));

    job.setMapperClass(PrqMapper.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);

    job.setReducerClass(PrqReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReadPair.class);

    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    return (job.waitForCompletion(true) ? 0 : 1);
}

From source file:it.crs4.seal.read_sort.ReadSort.java

License:Open Source License

public int run(String[] args) throws Exception {
    LOG.info("starting");

    Configuration conf = getConf();

    ReadSortOptionParser parser = new ReadSortOptionParser();
    parser.parse(conf, args);

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    // Create a Job using the processed conf
    Job job = new Job(conf, makeJobName(parser.getInputPaths().get(0)));
    job.setJarByClass(ReadSort.class);

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setMapperClass(ReadSortSamMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(WholeReferencePartitioner.class);

    job.setReducerClass(ReadSortSamReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // output path
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        return 0;
    } else {
        LOG.fatal("ReadSort failed!");
        return 1;
    }
}

From source file:it.crs4.seal.tsv_sort.TsvSort.java

License:Apache License

public int run(String[] args) throws Exception {
    LOG.info("starting");

    TsvSortOptionParser parser = new TsvSortOptionParser();
    parser.parse(getConf(), args);

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    Job job = new Job(getConf());

    job.setJobName("TsvSort " + parser.getInputPaths().get(0));
    job.setJarByClass(TsvSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TsvInputFormat.class);
    job.setOutputFormatClass(TextValueOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);

    // output path
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    FileSystem fs = parser.getOutputPath().getFileSystem(job.getConfiguration());
    /*
     *
     * Pick a random name for the partition file in the same directory as the
     * output path.  So, TsvSort /user/me/input /user/me/output
     * results in the partition file being placed in /user/me/_partition.lst.12340921387402174
     *
     * Why not place it directly in the input path?
     *
     *   We wouldn't be able to run two sorts on the same data at the same time.
     *   We've received complaints about this in the past, so it has been a
     *   limit in practice.
     *
     * Why not place it directly in the output path?
     *
     *   We'd have to create the output path before the output format did.
     *   For this to work we'd have to disable the FileOutputFormat's default check
     *   that verifies that the output directory doesn't exist.  This means that we'd
     *   need some other way to ensure that we're not writing to the same path where
     *   some other job wrote.
     */
    Path partitionFile;
    Random rnd = new Random();
    do {
        partitionFile = new Path(parser.getOutputPath().getParent(),
                String.format("_partition.lst.%012d", Math.abs(rnd.nextLong())));
    } while (fs.exists(partitionFile)); // this is still subject to a race condition between it and another instance of this program
    partitionFile = partitionFile.makeQualified(fs);
    LOG.info("partition file path: " + partitionFile);

    URI partitionUri = new URI(partitionFile.toString() + "#" + PARTITION_SYMLINK);
    LOG.debug("partitionUri for distributed cache: " + partitionUri);

    // input paths
    for (Path p : parser.getInputPaths())
        TsvInputFormat.addInputPath(job, p);

    LOG.info("sampling input");
    TextSampler.writePartitionFile(new TsvInputFormat(), job, partitionFile);
    LOG.info("created partitions");
    try {
        DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
        DistributedCache.createSymlink(job.getConfiguration());

        int retcode = job.waitForCompletion(true) ? 0 : 1;
        LOG.info("done");
        return retcode;
    } finally {
        LOG.debug("deleting partition file " + partitionFile);
        fs.delete(partitionFile, false);
    }
}
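
TsvSort writes its own partition file with TextSampler and ships it to the tasks through the distributed cache. For comparison, here is a hedged sketch of the stock Hadoop route using InputSampler together with TotalOrderPartitioner.setPartitionFile; the class name, reducer count, sampling parameters, and partition-file location are illustrative, not taken from Seal.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSortExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "total-order-sort");
        job.setJarByClass(TotalOrderSortExample.class);

        // KeyValueTextInputFormat produces Text keys, and the default identity Mapper
        // passes them through, so the sampled keys match the map output key type.
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setNumReduceTasks(8);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // The partitioner reads its cut points (numReduceTasks - 1 keys) from this file,
        // placed next to the output directory much as TsvSort does above.
        Path partitionFile = new Path(new Path(args[1]).getParent(), "_partition.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);

        // Sample the input up front to compute balanced cut points.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(0.01, 1000, 10));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}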

From source file:it.crs4.seal.usort.USort.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // defaults
    conf.set(SealToolParser.INPUT_FORMAT_CONF, USortOptionParser.InputFormatDefault);
    conf.set(SealToolParser.OUTPUT_FORMAT_CONF, USortOptionParser.OutputFormatDefault);

    // parse command line
    USortOptionParser parser = new USortOptionParser();
    parser.parse(conf, args);

    Job job = new Job(conf, "USort " + parser.getInputPaths().get(0));
    job.setJarByClass(USort.class);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
    job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName()));

    job.setMapperClass(Demux.Map.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(SequencedFragment.class);

    job.setPartitionerClass(USortPartitioner.class);

    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SequencedFragment.class);

    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    boolean result = job.waitForCompletion(true);

    if (!result) {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    } else
        return 0;
}