Example usage for org.apache.hadoop.mapreduce JobContext getNumReduceTasks

List of usage examples for org.apache.hadoop.mapreduce JobContext getNumReduceTasks

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce JobContext getNumReduceTasks.

Prototype

public int getNumReduceTasks();

Source Link

Document

Get configured the number of reduce tasks for this job.

Usage

From source file:com.cloudera.recordservice.examples.terasort.TeraInputFormat.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample/*  w  ww.j  a  v a 2s.  c  o m*/
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {

                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}

From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext context) {
    final Configuration hadoopConfiguration = context.getConfiguration();
    final InfiniteMongoConfig conf = new InfiniteMongoConfig(hadoopConfiguration);
    List<InputSplit> splits = InfiniteMongoSplitter.calculateSplits(conf);

    if (conf.getSelfMerge() != null) {
        //check if we need to grab existing records and add them to the splits
        final Configuration existingConfiguration = context.getConfiguration();
        existingConfiguration.set("mongo.input.uri", conf.getSelfMerge());
        BasicDBObject query = new BasicDBObject();
        //add on this query to only get items previous to now if no reducer is specified (otherwise
        //we will leak any items we map on the first run back in before this split runs)
        if (context.getNumReduceTasks() == 0)
            query.put("_id", new BasicDBObject(MongoDbManager.lt_, new ObjectId()));
        existingConfiguration.set("mongo.input.query", query.toString());
        final InfiniteMongoConfig existingConf = new InfiniteMongoConfig(existingConfiguration);
        splits.addAll(InfiniteMongoSplitter.calculateSplits(existingConf));
    }/*from w  ww. j a va 2s . c  o  m*/
    return splits;
}

From source file:com.phantom.hadoop.examples.terasort.TeraInputFormat.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * //from  ww  w. j  a  v  a  2s  . c o  m
 * @param job
 *            the job to sample
 * @param partFile
 *            where to write the output file to
 * @throws Throwable
 *             if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {

                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}

From source file:info.halo9pan.word2vec.hadoop.mr.SortInputFormat.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * //  ww  w .  j a va 2  s  .c  om
 * @param job
 *            the job to sample
 * @param partFile
 *            where to write the output file to
 * @throws Throwable
 *             if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final SortInputFormat inFormat = new SortInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {

                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}

From source file:it.crs4.seal.tsv_sort.TextSampler.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 20 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param inFormat The input to sample/*from  w  w w  .java  2  s.  co  m*/
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(FileInputFormat<Text, Text> inFormat, JobContext job, Path partFile)
        throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    TaskAttemptContext taskContext = Utils.getTaskAttemptContext(conf);

    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE_CONF, SAMPLE_SIZE_DEFAULT);
    List<InputSplit> splits = inFormat.getSplits(job);
    int samples = Math.min(MAX_SLICES_SAMPLED, splits.size());
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.size() / samples;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        InputSplit isplit = splits.get(sampleStep * i);
        RecordReader<Text, Text> reader = inFormat.createRecordReader(isplit, taskContext);
        reader.initialize(isplit, taskContext);
        while (reader.nextKeyValue()) {
            sampler.addKey(reader.getCurrentKey());
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile))
        outFs.delete(partFile, false);

    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}

From source file:org.apache.blur.mapreduce.lib.BlurOutputFormat.java

License:Apache License

@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
    CheckOutputSpecs.checkOutputSpecs(context.getConfiguration(), context.getNumReduceTasks());
}

From source file:org.apache.hadoop.examples.terasort.TeraInputFormat.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample/*from  w w  w.  j  av a2s .c o m*/
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(),
            TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(
            conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(), TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS),
            splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {

                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}

From source file:org.bigsolr.hadoop.SolrInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    log.info("SolrInputFormat -> getSplits");

    Configuration conf = context.getConfiguration();
    String collectionName = conf.get(COLLECTION_NAME);
    int numSplits = context.getNumReduceTasks();
    SolrServer solr = SolrOperations.getSolrServer(conf);

    final SolrQuery solrQuery = new SolrQuery(conf.get(SOLR_QUERY));
    solrQuery.setFields(ID_FIELD);//from  w  ww  .ja  v  a2 s . c o m
    solrQuery.setRows(50);
    solrQuery.set("collection", collectionName);
    solrQuery.setStart(0);

    QueryResponse response;
    try {
        response = solr.query(solrQuery);
    } catch (final SolrServerException e) {
        throw new IOException(e);
    }

    int numResults = (int) response.getResults().getNumFound();
    int numDocsPerSplit = (numResults / numSplits);
    int currentDoc = 0;

    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < numSplits - 1; i++) {
        splits.add(new SolrInputSplit(currentDoc, numDocsPerSplit));
        currentDoc += numDocsPerSplit;
    }
    splits.add(new SolrInputSplit(currentDoc, numResults - currentDoc));

    return splits;
}