Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
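
A minimal sketch of typical usage (the class and method names below are illustrative, not taken from the examples on this page): define a custom Partitioner and register it on the Job before submission.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: spreads Text keys across reducers by hash.
public class ExampleHashPartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Mask the sign bit so the index is always in [0, numPartitions).
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }

    // Builds a Job that uses the partitioner. setPartitionerClass throws
    // IllegalStateException if the job has already been submitted.
    public static Job newJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setPartitionerClass(ExampleHashPartitioner.class);
        job.setNumReduceTasks(4); // passed to getPartition as numPartitions
        return job;
    }
}

The default partitioner is HashPartitioner, so setPartitionerClass is only needed when records must be routed to specific reducers, as the examples below do for CDX sorting, Accumulo bulk import and Voldemort store building.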

Usage

From source file:uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator.java

License:Open Source License

protected Job createJob(String[] args) throws Exception {

    Job job = new Job();
    job.setJobName("ArchiveCDXGenerator" + "_" + System.currentTimeMillis());

    Configuration conf = job.getConfiguration();

    this.setup(args, conf);

    Path input = new Path(this.inputPath);
    FileInputFormat.addInputPath(job, input);
    Path outputPath = new Path(this.outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setInputFormatClass(ArchiveToCDXFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");
    conf.set("cdx.format", this.cdxFormat);
    conf.set("cdx.hdfs", Boolean.toString(this.hdfs));
    conf.set("cdx.metatag", this.metaTag);
    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // General config:
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(this.numReducers);
    job.setJarByClass(ArchiveCDXGenerator.class);

    // POST directly to the tinycdxserver:
    if (this.cdxserver != null) {
        conf.set("tinycdxserver.endpoint", this.cdxserver);
        conf.setInt("tinycdxserver.batch_size", this.cdxserver_batch_size);
        // Perform the update in the Map phase (difficult to control number
        // of clients)
        // job.setMapperClass(TinyCDXServerMapper.class);
        // job.setReducerClass(Reducer.class);
        // Perform the update in the reduce phase:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(TinyCDXServerReducer.class);
    } else {
        // Default to the pass-through mapper and reducer:
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        // Set up the split:
        if (this.splitFile != null) {
            log.info("Setting splitFile to " + this.splitFile);
            AlphaPartitioner.setPartitionPath(conf, this.splitFile);
            job.setPartitionerClass(AlphaPartitioner.class);
        } else {
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                    new Path(outputPath, "_partitions.lst"));
            // FIXME This probably won't work - need to update to recent API
            JobConf jc = new JobConf(conf);
            InputSampler.writePartitionFile(jc, new InputSampler.RandomSampler(1, 10000));
        }
    }

    FileSystem fs = input.getFileSystem(conf);
    FileStatus inputStatus = fs.getFileStatus(input);
    FileInputFormat.setMaxInputSplitSize(job,
            inputStatus.getLen() / getNumMapTasks(new Path(this.inputPath), conf));
    return job;
}
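
Note on the FIXME in the TotalOrderPartitioner branch above: InputSampler.writePartitionFile is called there through the old JobConf API. A rough sketch of the equivalent setup against the org.apache.hadoop.mapreduce API (a hypothetical helper, assuming Text map output keys and an input format already configured on the job) might look like this:

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

// Hypothetical helper, not part of ArchiveCDXGenerator.
public class TotalOrderSetupSketch {

    public static void configure(Job job, Path outputPath)
            throws IOException, ClassNotFoundException, InterruptedException {
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                new Path(outputPath, "_partitions.lst"));
        // Sample the job's input to build the partition file:
        // 10% sampling frequency, at most 10,000 sampled keys.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(0.1, 10000));
    }
}

Because writePartitionFile derives the number of partitions from job.getNumReduceTasks(), the reducer count should be set before the sampler runs.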

From source file:uk.gov.gchq.gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java

License:Apache License

private void setUpPartitionerGenerateSplitsFile(final Job job, final AddElementsFromHdfs operation,
        final AccumuloStore store) throws IOException {
    final String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE_PATH);
    LOGGER.info("Creating splits file in location {} from table {}", splitsFilePath,
            store.getProperties().getTable());
    final int maxReducers = intOptionIsValid(operation,
            AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS);
    final int minReducers = intOptionIsValid(operation,
            AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    if (maxReducers != -1 && minReducers != -1) {
        if (minReducers > maxReducers) {
            LOGGER.error(
                    "Minimum number of reducers must be less than the maximum number of reducers: minimum was {} "
                            + "maximum was {}",
                    minReducers, maxReducers);
            throw new IOException(
                    "Minimum number of reducers must be less than the maximum number of reducers");
        }
    }
    int numSplits;
    try {
        if (maxReducers == -1) {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath));
        } else {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath), maxReducers - 1);
        }
    } catch (final StoreException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
    int numReducers = numSplits + 1;
    LOGGER.info("Number of splits is {}; number of reducers is {}", numSplits, numReducers);
    // If neither min nor max is specified then there is nothing to do; if max is specified and min is not, it has already been taken care of.
    // If min is specified and the number of reducers is not greater than that then set the appropriate number of
    // subbins.
    if (minReducers != -1) {
        if (numReducers < minReducers) {
            LOGGER.info("Number of reducers is {} which is less than the specified minimum number of {}",
                    numReducers, minReducers);
            int factor = (minReducers / numReducers) + 1;
            LOGGER.info("Setting number of subbins on KeyRangePartitioner to {}", factor);
            KeyRangePartitioner.setNumSubBins(job, factor);
            numReducers = numReducers * factor;
            LOGGER.info("Number of reducers is {}", numReducers);
        }
    }
    job.setNumReduceTasks(numReducers);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}

From source file:uk.gov.gchq.gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java

License:Apache License

private void setUpPartitionerFromUserProvidedSplitsFile(final Job job, final AddElementsFromHdfs operation)
        throws IOException {
    final String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE_PATH);
    if (intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS) != -1
            || intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS) != -1) {
        LOGGER.info("Using splits file provided by user {}, ignoring options {} and {}", splitsFilePath,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    } else {
        LOGGER.info("Using splits file provided by user {}", splitsFilePath);
    }
    final int numSplits = IngestUtils.getNumSplits(FileSystem.get(job.getConfiguration()),
            new Path(splitsFilePath));
    job.setNumReduceTasks(numSplits + 1);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}

From source file:validpartitionnacluster.ValidPartitionNacluster.java

public int run(String[] allArgs) throws Exception {
    String[] args = new GenericOptionsParser(getConf(), allArgs).getRemainingArgs();

    Job job = Job.getInstance(getConf());

    job.setJarByClass(ValidPartitionNacluster.class); // necessary (fixed)

    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(PartitionMapper.class);
    job.setPartitionerClass(AgePartitioner.class);
    job.setReducerClass(ParitionReducer.class);

    //Number of Reducer tasks.
    job.setNumReduceTasks(4);

    job.setJobName("Partition for Machine Count");
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(job, new Path(args[0]));
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);

    return 0;
}

From source file:voldemort.store.readonly.mapreduce.HadoopStoreBuilder.java

License:Apache License

/**
 * Run the job
 */
public void build() {
    try {
        Job job = new Job(config);
        job.getConfiguration().setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        job.getConfiguration().set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        job.getConfiguration().set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        job.getConfiguration().setBoolean("save.keys", saveKeys);
        job.getConfiguration().set("final.output.dir", outputDir.toString());
        job.getConfiguration().set("checksum.type", CheckSum.toString(checkSumType));
        job.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
        job.setMapperClass(mapperClass);
        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setReducerClass(HadoopStoreBuilderReducer.class);
        job.setInputFormatClass(inputFormatClass);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setJarByClass(getClass());

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(job.getConfiguration());
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // delete the temp dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(job.getConfiguration());
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        int numChunks = Math.max(
                (int) (storeDef.getReplicationFactor() * size / cluster.getNumberOfNodes() / chunkSizeBytes),
                1);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes
                + ",  num.chunks = " + numChunks);
        job.getConfiguration().setInt("num.chunks", numChunks);
        int numReduces = cluster.getNumberOfNodes() * numChunks;
        job.setNumReduceTasks(numReduces);
        logger.info("Number of reduces: " + numReduces);

        logger.info("Building store...");
        job.waitForCompletion(true);

        ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
        if (saveKeys)
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
        else
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());

        // Check if all folder exists and with format file
        for (Node node : cluster.getNodes()) {
            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());
            if (!outputFs.exists(nodePath)) {
                outputFs.mkdirs(nodePath); // Create empty folder
            }

            // Write metadata
            FSDataOutputStream metadataStream = outputFs.create(new Path(nodePath, ".metadata"));
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }

        if (checkSumType != CheckSumType.NONE) {

            // Generate checksum for every node
            FileStatus[] nodes = outputFs.listStatus(outputDir);

            // Do a CheckSumOfCheckSum - Similar to HDFS
            CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
            if (checkSumGenerator == null) {
                throw new VoldemortException("Could not generate checksum digests");
            }

            for (FileStatus node : nodes) {
                if (node.isDir()) {
                    FileStatus[] storeFiles = outputFs.listStatus(node.getPath(), new PathFilter() {

                        public boolean accept(Path arg0) {
                            if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                                return true;
                            }
                            return false;
                        }
                    });

                    if (storeFiles != null) {
                        Arrays.sort(storeFiles, new IndexFileLastComparator());
                        for (FileStatus file : storeFiles) {
                            FSDataInputStream input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            checkSumGenerator.update(fileCheckSum);
                            outputFs.delete(file.getPath(), true);
                        }
                        FSDataOutputStream checkSumStream = outputFs.create(
                                new Path(node.getPath(), CheckSum.toString(checkSumType) + "checkSum.txt"));
                        checkSumStream.write(checkSumGenerator.getCheckSum());
                        checkSumStream.flush();
                        checkSumStream.close();

                    }
                }
            }
        }
    } catch (Exception e) {
        logger.error("Error = " + e);
        throw new VoldemortException(e);
    }

}