List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
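setPartitionerClass must be called before the job is submitted; otherwise it throws IllegalStateException. A minimal, self-contained sketch of the call (the FirstCharPartitioner class and the job wiring in the comments are illustrative, not taken from any of the source files below):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative custom partitioner: routes records to reducers by the first
// character of the key, so keys sharing a first letter land on the same reducer.
public class FirstCharPartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String k = key.toString();
        if (k.isEmpty()) {
            return 0;
        }
        return (Character.toLowerCase(k.charAt(0)) & Integer.MAX_VALUE) % numPartitions;
    }
}

// Usage (before job submission):
//   Job job = Job.getInstance(new Configuration(), "example");
//   job.setPartitionerClass(FirstCharPartitioner.class);
//   job.setNumReduceTasks(4);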
From source file:uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator.java
License:Open Source License
protected Job createJob(String[] args) throws Exception {
    Job job = new Job();
    job.setJobName("ArchiveCDXGenerator" + "_" + System.currentTimeMillis());

    Configuration conf = job.getConfiguration();
    this.setup(args, conf);

    Path input = new Path(this.inputPath);
    FileInputFormat.addInputPath(job, input);
    Path outputPath = new Path(this.outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(ArchiveToCDXFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");
    conf.set("cdx.format", this.cdxFormat);
    conf.set("cdx.hdfs", Boolean.toString(this.hdfs));
    conf.set("cdx.metatag", this.metaTag);
    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // General config:
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(this.numReducers);
    job.setJarByClass(ArchiveCDXGenerator.class);

    // POST directly to the tinycdxserver:
    if (this.cdxserver != null) {
        conf.set("tinycdxserver.endpoint", this.cdxserver);
        conf.setInt("tinycdxserver.batch_size", this.cdxserver_batch_size);
        // Perform the update in the Map phase (difficult to control number of clients):
        // job.setMapperClass(TinyCDXServerMapper.class);
        // job.setReducerClass(Reducer.class);
        // Perform the update in the reduce phase:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(TinyCDXServerReducer.class);
    } else {
        // Default to the pass-through mapper and reducer:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        // Set up the split:
        if (this.splitFile != null) {
            log.info("Setting splitFile to " + this.splitFile);
            AlphaPartitioner.setPartitionPath(conf, this.splitFile);
            job.setPartitionerClass(AlphaPartitioner.class);
        } else {
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                    new Path(outputPath, "_partitions.lst"));
            // FIXME This probably won't work - need to update to recent API
            JobConf jc = new JobConf(conf);
            InputSampler.writePartitionFile(jc, new InputSampler.RandomSampler(1, 10000));
        }
    }

    FileSystem fs = input.getFileSystem(conf);
    FileStatus inputStatus = fs.getFileStatus(input);
    FileInputFormat.setMaxInputSplitSize(job,
            inputStatus.getLen() / getNumMapTasks(new Path(this.inputPath), conf));
    return job;
}
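The FIXME in the example above notes that mixing the old-API JobConf/InputSampler with a new-API Job is unlikely to work. A hedged sketch of how that sampling step might look using only the org.apache.hadoop.mapreduce.lib.partition classes, assuming the job's input paths, input format (producing Text keys and values), map output key class, and number of reducers have already been set, as in createJob() above:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

// Hypothetical helper (not part of the original source): configures total-order
// partitioning on a new-API Job and writes the partition file with the
// new-API sampler.
public class TotalOrderSetup {

    public static void configureTotalOrder(Job job, Path outputPath) throws Exception {
        job.setPartitionerClass(TotalOrderPartitioner.class);
        // The partition file must be set before sampling, as the sampler writes to it.
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                new Path(outputPath, "_partitions.lst"));
        // Sample up to 10,000 keys from the job's input to pick reducer boundaries,
        // mirroring the (1, 10000) settings used in the original code.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(1.0, 10000));
    }
}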
From source file:uk.gov.gchq.gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java
License:Apache License
private void setUpPartitionerGenerateSplitsFile(final Job job, final AddElementsFromHdfs operation,
        final AccumuloStore store) throws IOException {
    final String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE_PATH);
    LOGGER.info("Creating splits file in location {} from table {}", splitsFilePath,
            store.getProperties().getTable());
    final int maxReducers = intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS);
    final int minReducers = intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    if (maxReducers != -1 && minReducers != -1) {
        if (minReducers > maxReducers) {
            LOGGER.error("Minimum number of reducers must be less than the maximum number of reducers: "
                    + "minimum was {} maximum was {}", minReducers, maxReducers);
            throw new IOException("Minimum number of reducers must be less than the maximum number of reducers");
        }
    }

    int numSplits;
    try {
        if (maxReducers == -1) {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath));
        } else {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath), maxReducers - 1);
        }
    } catch (final StoreException e) {
        throw new RuntimeException(e.getMessage(), e);
    }

    int numReducers = numSplits + 1;
    LOGGER.info("Number of splits is {}; number of reducers is {}", numSplits, numReducers);

    // If neither min nor max is specified then there is nothing to do; if max is specified and min is not,
    // that has already been taken care of. If min is specified and the number of reducers is less than it,
    // set the appropriate number of subbins.
    if (minReducers != -1) {
        if (numReducers < minReducers) {
            LOGGER.info("Number of reducers is {} which is less than the specified minimum number of {}",
                    numReducers, minReducers);
            int factor = (minReducers / numReducers) + 1;
            LOGGER.info("Setting number of subbins on KeyRangePartitioner to {}", factor);
            KeyRangePartitioner.setNumSubBins(job, factor);
            numReducers = numReducers * factor;
            LOGGER.info("Number of reducers is {}", numReducers);
        }
    }

    job.setNumReduceTasks(numReducers);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}
From source file:uk.gov.gchq.gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java
License:Apache License
private void setUpPartitionerFromUserProvidedSplitsFile(final Job job, final AddElementsFromHdfs operation)
        throws IOException {
    final String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE_PATH);
    if (intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS) != -1
            || intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS) != -1) {
        LOGGER.info("Using splits file provided by user {}, ignoring options {} and {}", splitsFilePath,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    } else {
        LOGGER.info("Using splits file provided by user {}", splitsFilePath);
    }
    final int numSplits = IngestUtils.getNumSplits(FileSystem.get(job.getConfiguration()),
            new Path(splitsFilePath));
    job.setNumReduceTasks(numSplits + 1);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}
From source file:validpartitionnacluster.ValidPartitionNacluster.java
public int run(String[] allArgs) throws Exception {
    String[] args = new GenericOptionsParser(getConf(), allArgs).getRemainingArgs();

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ValidPartitionNacluster.class); // necessary (corrected)
    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(PartitionMapper.class);
    job.setPartitionerClass(AgePartitioner.class);
    job.setReducerClass(ParitionReducer.class);

    // Number of reducer tasks.
    job.setNumReduceTasks(4);
    job.setJobName("Partition for Machine Count");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(job, new Path(args[0]));
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
    return 0;
}
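The AgePartitioner class referenced above is not included in this listing. A hypothetical sketch of what such a partitioner commonly looks like, assuming the map output key is Text, the value is an IntWritable age, and the four age bands below (all of which are illustrative, not taken from the original project):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical AgePartitioner: routes records into age bands so that, with
// four reducers, each reducer receives one band.
public class AgePartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        int age = value.get();
        int band;
        if (age < 20) {
            band = 0;
        } else if (age < 40) {
            band = 1;
        } else if (age < 60) {
            band = 2;
        } else {
            band = 3;
        }
        // Guard against jobs configured with fewer reducers than bands.
        return band % numPartitions;
    }
}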
From source file:voldemort.store.readonly.mapreduce.HadoopStoreBuilder.java
License:Apache License
/**
 * Run the job.
 */
public void build() {
    try {
        Job job = new Job(config);
        job.getConfiguration().setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        job.getConfiguration().set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        job.getConfiguration().set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        job.getConfiguration().setBoolean("save.keys", saveKeys);
        job.getConfiguration().set("final.output.dir", outputDir.toString());
        job.getConfiguration().set("checksum.type", CheckSum.toString(checkSumType));
        job.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
        job.setMapperClass(mapperClass);
        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setReducerClass(HadoopStoreBuilderReducer.class);
        job.setInputFormatClass(inputFormatClass);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setJarByClass(getClass());
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(job.getConfiguration());
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // Delete the temporary output dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(job.getConfiguration());
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        int numChunks = Math.max(
                (int) (storeDef.getReplicationFactor() * size / cluster.getNumberOfNodes() / chunkSizeBytes),
                1);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes
                + ", num.chunks = " + numChunks);
        job.getConfiguration().setInt("num.chunks", numChunks);
        int numReduces = cluster.getNumberOfNodes() * numChunks;
        job.setNumReduceTasks(numReduces);
        logger.info("Number of reduces: " + numReduces);

        logger.info("Building store...");
        job.waitForCompletion(true);

        ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
        if (saveKeys)
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
        else
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());

        // Check that every node folder exists and write the format metadata file into it
        for (Node node : cluster.getNodes()) {
            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());
            if (!outputFs.exists(nodePath)) {
                outputFs.mkdirs(nodePath); // Create empty folder
            }
            // Write metadata
            FSDataOutputStream metadataStream = outputFs.create(new Path(nodePath, ".metadata"));
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }

        if (checkSumType != CheckSumType.NONE) {
            // Generate a checksum for every node
            FileStatus[] nodes = outputFs.listStatus(outputDir);

            // Do a checksum-of-checksums, similar to HDFS
            CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
            if (checkSumGenerator == null) {
                throw new VoldemortException("Could not generate checksum digests");
            }

            for (FileStatus node : nodes) {
                if (node.isDir()) {
                    FileStatus[] storeFiles = outputFs.listStatus(node.getPath(), new PathFilter() {

                        public boolean accept(Path arg0) {
                            if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                                return true;
                            }
                            return false;
                        }
                    });

                    if (storeFiles != null) {
                        Arrays.sort(storeFiles, new IndexFileLastComparator());
                        for (FileStatus file : storeFiles) {
                            FSDataInputStream input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            checkSumGenerator.update(fileCheckSum);
                            outputFs.delete(file.getPath(), true);
                        }
                        FSDataOutputStream checkSumStream = outputFs.create(
                                new Path(node.getPath(), CheckSum.toString(checkSumType) + "checkSum.txt"));
                        checkSumStream.write(checkSumGenerator.getCheckSum());
                        checkSumStream.flush();
                        checkSumStream.close();
                    }
                }
            }
        }
    } catch (Exception e) {
        logger.error("Error = " + e);
        throw new VoldemortException(e);
    }
}