Example usage for org.apache.hadoop.mapreduce.lib.partition TotalOrderPartitioner DEFAULT

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce.lib.partition TotalOrderPartitioner DEFAULT_PATH.

Prototype

String DEFAULT_PATH

To view the source code for org.apache.hadoop.mapreduce.lib.partition TotalOrderPartitioner DEFAULT_PATH.

Click Source Link

Usage

From source file:com.ailk.oci.ocnosql.tools.load.single.SingleColumnImportTsv.java

License:Apache License

/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This/*from  ww  w  .j  a  v a2 s . c  o  m*/
 * <ul>
 *   <li>Inspects the table to configure a total order partitioner</li>
 *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 *   <li>Sets the number of reduce tasks to match the current number of regions</li>
 *   <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
 *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 *     PutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    Class<? extends Partitioner> topClass;
    try {
        topClass = getTotalOrderPartitionerClass();
    } catch (ClassNotFoundException e) {
        throw new IOException("Failed getting TotalOrderPartitioner", e);
    }
    //partition
    job.setPartitionerClass(topClass);
    //Set the key class for the job output data
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    //Set the value class for job outputs
    job.setOutputValueClass(KeyValue.class);
    //outputformatHfile
    job.setOutputFormatClass(HFileOutputFormat2.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(SingleColumnReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    //?regionstarkey
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " + "to match current region count");

    //?region?reduce?
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_" + UUID.randomUUID());
    LOG.info("Writing partition information to " + partitionsPath);

    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);

    URI cacheUri;
    try {
        // Below we make explicit reference to the bundled TOP.  Its cheating.
        // We are assume the define in the hbase bundled TOP is as it is in
        // hadoop (whether 0.20 or 0.22, etc.)
        /*
          cacheUri = new URI(partitionsPath.toString() + "#" +
            org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner.DEFAULT_PATH);
            */
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    // Set compression algorithms based on column families
    configureCompression(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    LOG.info("Incremental table output configured.");
}

From source file:com.moz.fiji.mapreduce.output.HFileMapReduceJobOutput.java

License:Apache License

/**
 * Configures the partitioner for generating HFiles.
 *
 * <p>Each generated HFile should fit within a region of of the target table.
 * Additionally, it's optimal to have only one HFile to load into each region, since a
 * read from that region will require reading from each HFile under management (until
 * compaction happens and merges them all back into one HFile).</p>
 *
 * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the
 * records output from the Mapper based on their rank in a total ordering of the
 * keys.  The <code>startKeys</code> argument should contain a list of the first key in
 * each of those partitions.</p>/*from  w w w .j a  v  a  2s  . c om*/
 *
 * @param job The job to configure.
 * @param startKeys A list of keys that will mark the boundaries between the partitions
 *     for the sorted map output records.
 * @throws IOException If there is an error.
 */
public static void configurePartitioner(Job job, List<HFileKeyValue> startKeys) throws IOException {
    FijiMRPlatformBridge.get().setTotalOrderPartitionerClass(job);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);

    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
        final URI cacheUri = new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
    public int run(String[] args) throws Exception {
        Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
        if (job == null) {
            return -1;
        }//w  ww .  j  a v  a  2 s.c  o m

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

        job.setPartitionerClass(TotalOrderPartitioner.class);

        InputSampler.Sampler<IntWritable, Text> sampler = new InputSampler.RandomSampler<IntWritable, Text>(0.1,
                10000, 10);

        InputSampler.writePartitionFile(job, sampler);

        // Add to DistributedCache
        Configuration conf = job.getConfiguration();
        String partitionFile = TotalOrderPartitioner.getPartitionFile(conf);
        URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(partitionUri, conf);
        DistributedCache.createSymlink(conf);

        return job.waitForCompletion(true) ? 0 : 1;
    }

From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java

License:Open Source License

protected boolean runDictionaryJob()
        throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
    boolean jobOK;
    Job job = null;//from   ww  w.  j  av a2 s .c  o m
    BufferedWriter bufferedWriter;

    // if output path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
        if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
            System.out.println("Select other path or use option -dd to overwrite");
            System.exit(-1);
        }
    }

    // Sample the SequenceInputFormat to do TotalSort and create final output
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    System.out.println("samples = " + this.conf.getDictionarySamplesPath());
    System.out.println("output = " + this.conf.getDictionaryOutputPath());

    FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    // Identity Mapper
    // job.setMapperClass(Mapper.class);
    job.setCombinerClass(DictionaryCombiner.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setReducerClass(DictionaryReducer.class);

    job.setNumReduceTasks(this.conf.getDictionaryReducers());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
    this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
    this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
    this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));

    bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");

    bufferedWriter.close();

    return jobOK;
}

From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java

License:Open Source License

protected boolean runTriplesJob()
        throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
    Job job = null;/*from ww w .ja  va 2 s  .c o m*/
    boolean jobOK;

    // if triples output path exists...
    if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
        if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively
            this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath());
            System.out.println("Select other path or use option -dt to overwrite");
            System.exit(-1);
        }
    }

    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getTriplesSamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setGroupingComparatorClass(TripleSPOComparator.class);

    job.setPartitionerClass(TotalOrderPartitioner.class);

    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setNumReduceTasks(this.conf.getTriplesReducers());

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    return jobOK;
}

Example usage for org.apache.hadoop.mapreduce.lib.partition TotalOrderPartitioner DEFAULT_PATH

Introduction

Prototype

Usage