Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.Job.setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
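
The snippet below is a minimal, hedged sketch of the typical pattern: subclass Partitioner, return a reducer index in the range [0, numPartitions), and register the class with setPartitionerClass before the job is submitted. The class and job names (PartitionerExample, FirstLetterPartitioner, "partitioner-example") are illustrative only and do not come from any of the projects listed under Usage.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

public class PartitionerExample {

    // Route each key to a reducer based on its first letter.
    public static class FirstLetterPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            // getPartition must return an index in [0, numPartitions).
            String s = key.toString();
            int bucket = s.isEmpty() ? 0 : Character.toLowerCase(s.charAt(0)) - 'a';
            return Math.max(0, bucket) % numPartitions;
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "partitioner-example");
        job.setJarByClass(PartitionerExample.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // A custom partitioner only matters when there is more than one reducer.
        job.setNumReduceTasks(4);
        // Throws IllegalStateException if the job has already been submitted.
        job.setPartitionerClass(FirstLetterPartitioner.class);
        // Mapper, reducer, and input/output paths omitted; see the full examples below.
    }
}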

Usage

From source file:io.druid.indexer.IndexGeneratorJob.java

License:Apache License

public boolean run() {
    try {
        Job job = Job.getInstance(new Configuration(),
                String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.23");

        JobHelper.injectSystemProperties(job);
        config.addJobProperties(job);

        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(BytesWritable.class);

        SortableBytes.useSortableBytesAsMapOutputKey(job);

        int numReducers = Iterables.size(config.getAllBuckets().get());
        if (numReducers == 0) {
            throw new RuntimeException("No buckets?? seems there is no data to index.");
        }

        if (config.getSchema().getTuningConfig().getUseCombiner()) {
            job.setCombinerClass(IndexGeneratorCombiner.class);
            job.setCombinerKeyGroupingComparatorClass(BytesWritable.Comparator.class);
        }

        job.setNumReduceTasks(numReducers);
        job.setPartitionerClass(IndexGeneratorPartitioner.class);

        setReducerClass(job);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);

        // hack to get druid.processing.bitmap property passed down to hadoop job.
        // once IndexIO doesn't rely on globally injected properties, we can move this into the HadoopTuningConfig.
        final String bitmapProperty = "druid.processing.bitmap.type";
        final String bitmapType = HadoopDruidIndexerConfig.properties.getProperty(bitmapProperty);
        if (bitmapType != null) {
            for (String property : new String[] { "mapreduce.reduce.java.opts", "mapreduce.map.java.opts" }) {
                // prepend property to allow overriding using hadoop.xxx properties by JobHelper.injectSystemProperties above
                String value = Strings.nullToEmpty(job.getConfiguration().get(property));
                job.getConfiguration().set(property,
                        String.format("-D%s=%s %s", bitmapProperty, bitmapType, value));
            }
        }

        config.intoConfiguration(job);

        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), job);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:io.fluo.stress.trie.Init.java

License:Apache License

private int buildTree(int nodeSize, FluoConfiguration props, Path tmp, int stopLevel) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Init.class);

    job.setJobName(Init.class.getName() + "_load");

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.getConfiguration().setInt(TRIE_NODE_SIZE_PROP, nodeSize);
    job.getConfiguration().setInt(TRIE_STOP_LEVEL_PROP, stopLevel);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(tmp, "nums"));

    job.setMapperClass(InitMapper.class);
    job.setCombinerClass(InitCombiner.class);
    job.setReducerClass(InitReducer.class);

    job.setOutputFormatClass(AccumuloFileOutputFormat.class);

    job.setPartitionerClass(RangePartitioner.class);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    Connector conn = AccumuloUtil.getConnector(props);

    Path splitsPath = new Path(tmp, "splits.txt");

    Collection<Text> splits1 = writeSplits(props, fs, conn, splitsPath);

    RangePartitioner.setSplitFile(job, splitsPath.toString());
    job.setNumReduceTasks(splits1.size() + 1);

    Path outPath = new Path(tmp, "out");
    AccumuloFileOutputFormat.setOutputPath(job, outPath);

    boolean success = job.waitForCompletion(true);

    if (success) {
        Path failPath = new Path(tmp, "failures");
        fs.mkdirs(failPath);
        conn.tableOperations().importDirectory(props.getAccumuloTable(), outPath.toString(),
                failPath.toString(), false);
    }
    return success ? 0 : 1;
}

From source file:ipldataanalysis3.IPLDataAnalysis3.java

@Override
public int run(String[] args) throws Exception {

    if (args.length != 2) {
        System.out.printf("Two parameters are required for Data Analysis for IPL- <input dir> <output dir>\n");
        return -1;
    }

    Job job = new Job(getConf(), "Job1");
    job.setJarByClass(IPLDataAnalysis3.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setMapperClass(DataAnalysisMapper.class);
    job.setNumReduceTasks(13);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setPartitionerClass(DataAnalysisPartitioner.class);
    job.setReducerClass(DataAnalysisReducer.class);
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;

}

From source file:it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java

License:Apache License

private static void setupPipesJob(Job job) throws IOException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        job.setMapperClass(PipesMapper.class);
        // Save the user's partitioner and hook in ours.
        setJavaPartitioner(conf, job.getPartitionerClass());
        job.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        job.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            job.setOutputFormatClass(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
        job.setInputFormatClass(PipesNonJavaInputFormat.class);
    }

    if (avroInput != null) {
        if (explicitInputFormat) {
            conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
        } // else let the bridge fall back to the appropriate Avro IF
        switch (avroInput) {
        case K:
            job.setInputFormatClass(PydoopAvroInputKeyBridge.class);
            break;
        case V:
            job.setInputFormatClass(PydoopAvroInputValueBridge.class);
            break;
        case KV:
            job.setInputFormatClass(PydoopAvroInputKeyValueBridge.class);
            break;
        default:
            throw new IllegalArgumentException("Bad Avro input type");
        }
    }
    if (avroOutput != null) {
        if (explicitOutputFormat) {
            conf.setClass(Submitter.OUTPUT_FORMAT, job.getOutputFormatClass(), OutputFormat.class);
        } // else let the bridge fall back to the appropriate Avro OF
        conf.set(props.getProperty("AVRO_OUTPUT"), avroOutput.name());
        switch (avroOutput) {
        case K:
            job.setOutputFormatClass(PydoopAvroOutputKeyBridge.class);
            break;
        case V:
            job.setOutputFormatClass(PydoopAvroOutputValueBridge.class);
            break;
        case KV:
            job.setOutputFormatClass(PydoopAvroOutputKeyValueBridge.class);
            break;
        default:
            throw new IllegalArgumentException("Bad Avro output type");
        }
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        String msg = "No application program defined.";
        throw new IllegalArgumentException(msg);
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    //FIXME: this is kind of useless if the pipes program is not in c++
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce task
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        String msg = "Problem parsing executable URI " + exec;
        IOException ie = new IOException(msg);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}
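
In setupPipesJob above, the user's partitioner is saved into the configuration and PipesPartitioner is hooked in to forward partitioning decisions to it at runtime. The class below is a rough, hypothetical sketch of that wrap-and-delegate pattern; the real PipesPartitioner differs, and the configuration key name is made up for illustration.

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.ReflectionUtils;

public class DelegatingPartitioner extends Partitioner<Text, Text> implements Configurable {

    // Hypothetical config key under which the original partitioner class was saved.
    static final String SAVED_PARTITIONER = "example.saved.partitioner";

    private Configuration conf;
    private Partitioner<Text, Text> delegate;

    @Override
    @SuppressWarnings("unchecked")
    public void setConf(Configuration conf) {
        this.conf = conf;
        // Recreate the partitioner the user originally set, defaulting to hash partitioning.
        delegate = (Partitioner<Text, Text>) ReflectionUtils.newInstance(
                conf.getClass(SAVED_PARTITIONER, HashPartitioner.class, Partitioner.class), conf);
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        // Forward the decision to the wrapped partitioner.
        return delegate.getPartition(key, value, numPartitions);
    }
}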

From source file:it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java

License:Apache License

public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
        cli.printUsage();
        return 1;
    }
    try {
        Job job = new Job(new Configuration());
        job.setJobName(getClass().getName());
        Configuration conf = job.getConfiguration();
        CommandLine results = cli.parse(conf, args);
        if (results.hasOption("input")) {
            Path path = new Path(results.getOptionValue("input"));
            FileInputFormat.setInputPaths(job, path);
        }
        if (results.hasOption("output")) {
            Path path = new Path(results.getOptionValue("output"));
            FileOutputFormat.setOutputPath(job, path);
        }
        if (results.hasOption("jar")) {
            job.setJar(results.getOptionValue("jar"));
        }
        if (results.hasOption("inputformat")) {
            explicitInputFormat = true;
            setIsJavaRecordReader(conf, true);
            job.setInputFormatClass(getClass(results, "inputformat", conf, InputFormat.class));
        }
        if (results.hasOption("javareader")) {
            setIsJavaRecordReader(conf, true);
        }
        if (results.hasOption("map")) {
            setIsJavaMapper(conf, true);
            job.setMapperClass(getClass(results, "map", conf, Mapper.class));
        }
        if (results.hasOption("partitioner")) {
            job.setPartitionerClass(getClass(results, "partitioner", conf, Partitioner.class));
        }
        if (results.hasOption("reduce")) {
            setIsJavaReducer(conf, true);
            job.setReducerClass(getClass(results, "reduce", conf, Reducer.class));
        }
        if (results.hasOption("reduces")) {
            job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
        }
        if (results.hasOption("writer")) {
            explicitOutputFormat = true;
            setIsJavaRecordWriter(conf, true);
            job.setOutputFormatClass(getClass(results, "writer", conf, OutputFormat.class));
        }
        if (results.hasOption("lazyOutput")) {
            if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
                LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormatClass());
            }
        }
        if (results.hasOption("avroInput")) {
            avroInput = AvroIO.valueOf(results.getOptionValue("avroInput").toUpperCase());
        }
        if (results.hasOption("avroOutput")) {
            avroOutput = AvroIO.valueOf(results.getOptionValue("avroOutput").toUpperCase());
        }

        if (results.hasOption("program")) {
            setExecutable(conf, results.getOptionValue("program"));
        }
        // if they gave us a jar file, include it into the class path
        String jarFile = job.getJar();
        if (jarFile != null) {
            final URL[] urls = new URL[] { FileSystem.getLocal(conf).pathToFile(new Path(jarFile)).toURL() };
            // FindBugs complains that creating a URLClassLoader should be
            // in a doPrivileged() block.
            ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                public ClassLoader run() {
                    return new URLClassLoader(urls);
                }
            });
            conf.setClassLoader(loader);
        }
        setupPipesJob(job);
        return job.waitForCompletion(true) ? 0 : 1;
    } catch (ParseException pe) {
        LOG.info("Error : " + pe);
        cli.printUsage();
        return 1;
    }
}

From source file:it.crs4.seal.demux.Demux.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    Configuration conf = getConf();
    DemuxOptionParser parser = new DemuxOptionParser();
    parser.parse(conf, args);

    conf.setBoolean(CONF_NO_INDEX_READS, parser.getNoIndexReads());
    conf.setBoolean(CONF_SEPARATE_READS, parser.getSeparateReads());

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");
    if (parser.getNoIndexReads())
        LOG.info("Not expecting to find any index reads.  Will demultiplex based only on lane.");

    // load sample sheet to fail early in case of problems
    DemuxUtils.loadSampleSheet(parser.getSampleSheetPath(), conf);

    // must be called before creating the job, since the job
    // *copies* the Configuration.
    distributeSampleSheet(parser.getSampleSheetPath());

    // Create a Job using the processed conf
    Job job = new Job(getConf(), makeJobName(parser.getInputPaths().get(0)));

    job.setJarByClass(Demux.class);

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName("qseq")));

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(SequencedFragment.class);

    job.setPartitionerClass(SequenceIdLocationPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);
    job.setSortComparatorClass(TwoOneThreeSortComparator.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SequencedFragment.class);

    // output
    job.setOutputFormatClass(DemuxOutputFormat.class);
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        if (parser.getCreateLaneContent())
            createLaneContentFiles(parser.getOutputPath(), parser.getSampleSheetPath());
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}

From source file:it.crs4.seal.prq.PairReadsQSeq.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // defaults
    conf.set(PrqOptionParser.INPUT_FORMAT_CONF, PrqOptionParser.InputFormatDefault);

    // parse command line
    PrqOptionParser parser = new PrqOptionParser();
    parser.parse(conf, args);

    Job job = new Job(conf, "PairReadsQSeq " + parser.getInputPaths().get(0));
    job.setJarByClass(PairReadsQSeq.class);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
    job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName("prq")));

    job.setMapperClass(PrqMapper.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);

    job.setReducerClass(PrqReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReadPair.class);

    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    return (job.waitForCompletion(true) ? 0 : 1);
}

From source file:it.crs4.seal.read_sort.ReadSort.java

License:Open Source License

public int run(String[] args) throws Exception {
    LOG.info("starting");

    Configuration conf = getConf();

    ReadSortOptionParser parser = new ReadSortOptionParser();
    parser.parse(conf, args);

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    // Create a Job using the processed conf
    Job job = new Job(conf, makeJobName(parser.getInputPaths().get(0)));
    job.setJarByClass(ReadSort.class);

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setMapperClass(ReadSortSamMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(WholeReferencePartitioner.class);

    job.setReducerClass(ReadSortSamReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // output path
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        return 0;
    } else {
        LOG.fatal("ReadSort failed!");
        return 1;
    }
}

From source file:it.crs4.seal.tsv_sort.TsvSort.java

License:Apache License

public int run(String[] args) throws Exception {
    LOG.info("starting");

    TsvSortOptionParser parser = new TsvSortOptionParser();
    parser.parse(getConf(), args);

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    Job job = new Job(getConf());

    job.setJobName("TsvSort " + parser.getInputPaths().get(0));
    job.setJarByClass(TsvSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TsvInputFormat.class);
    job.setOutputFormatClass(TextValueOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);

    // output path
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    FileSystem fs = parser.getOutputPath().getFileSystem(job.getConfiguration());
    /*
     *
     * Pick a random name for the partition file in the same directory as the
     * output path.  So, TsvSort /user/me/input /user/me/output
     * results in the partition file being placed in /user/me/_partition.lst.12340921387402174
     *
     * Why not place it directly in the input path?
     *
     *   We wouldn't be able to run two sorts on the same data at the same time.
     *   We've received complaints about this in the past, so it has been a
     *   limit in practice.
     *
     * Why not place it directly in the output path?
     *
     *   We'd have to create the output path before the output format did.
     *   For this to work we'd have to disable the FileOutputFormat's default check
     *   that verifies that the output directory doesn't exist.  This means that we'd
     *   need some other way to ensure that we're not writing to the same path where
     *   some other job wrote.
     */
    Path partitionFile;
    Random rnd = new Random();
    do {
        partitionFile = new Path(parser.getOutputPath().getParent(),
                String.format("_partition.lst.%012d", Math.abs(rnd.nextLong())));
    } while (fs.exists(partitionFile)); // this is still subject to a race condition between it and another instance of this program
    partitionFile = partitionFile.makeQualified(fs);
    LOG.info("partition file path: " + partitionFile);

    URI partitionUri = new URI(partitionFile.toString() + "#" + PARTITION_SYMLINK);
    LOG.debug("partitionUri for distributed cache: " + partitionUri);

    // input paths
    for (Path p : parser.getInputPaths())
        TsvInputFormat.addInputPath(job, p);

    LOG.info("sampling input");
    TextSampler.writePartitionFile(new TsvInputFormat(), job, partitionFile);
    LOG.info("created partitions");
    try {
        DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
        DistributedCache.createSymlink(job.getConfiguration());

        int retcode = job.waitForCompletion(true) ? 0 : 1;
        LOG.info("done");
        return retcode;
    } finally {
        LOG.debug("deleting partition file " + partitionFile);
        fs.delete(partitionFile, false);
    }
}
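
TsvSort writes its own partition file with TextSampler and ships it to the tasks through the distributed cache. For comparison, here is a hedged sketch of the stock Hadoop route using InputSampler together with TotalOrderPartitioner.setPartitionFile; the class name, reducer count, sampling parameters, and partition-file location are illustrative, not taken from Seal.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSortExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "total-order-sort");
        job.setJarByClass(TotalOrderSortExample.class);

        // KeyValueTextInputFormat produces Text keys, and the default identity Mapper
        // passes them through, so the sampled keys match the map output key type.
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setNumReduceTasks(8);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // The partitioner reads its cut points (numReduceTasks - 1 keys) from this file,
        // placed next to the output directory much as TsvSort does above.
        Path partitionFile = new Path(new Path(args[1]).getParent(), "_partition.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);

        // Sample the input up front to compute balanced cut points.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(0.01, 1000, 10));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}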

From source file:it.crs4.seal.usort.USort.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // defaults
    conf.set(SealToolParser.INPUT_FORMAT_CONF, USortOptionParser.InputFormatDefault);
    conf.set(SealToolParser.OUTPUT_FORMAT_CONF, USortOptionParser.OutputFormatDefault);

    // parse command line
    USortOptionParser parser = new USortOptionParser();
    parser.parse(conf, args);

    Job job = new Job(conf, "USort " + parser.getInputPaths().get(0));
    job.setJarByClass(USort.class);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
    job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName()));

    job.setMapperClass(Demux.Map.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(SequencedFragment.class);

    job.setPartitionerClass(USortPartitioner.class);

    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SequencedFragment.class);

    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    boolean result = job.waitForCompletion(true);

    if (!result) {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    } else
        return 0;
}