Example usage for org.apache.hadoop.mapred JobConf getNumReduceTasks

List of usage examples for org.apache.hadoop.mapred JobConf getNumReduceTasks

Introduction

On this page you can find example usages of org.apache.hadoop.mapred JobConf getNumReduceTasks.

Prototype

public int getNumReduceTasks() 

Source Link

Document

Get the configured number of reduce tasks for this job.

Usage

From source file:bixo.examples.JDBCCrawlWorkflow.java

License:Open Source License

/**
 * Builds the Cascading flow for one loop of the JDBC-backed crawl.
 *
 * <p>URLs are read from a JDBC source tap, grouped by URL, fetched, and parsed. The flow
 * has four tails: fetch status (text), fetched content, parsed content (both sequence
 * files under {@code curLoopDirPath}), and the merged set of URL datums, which is
 * written back through a second JDBC tap.
 *
 * @param inputDir             directory that must exist before the flow is built
 * @param curLoopDirPath       directory for this loop's content, parse and status output
 * @param userAgent            user agent passed to the HTTP fetcher
 * @param fetcherPolicy        policy passed to the HTTP fetcher
 * @param urlFilter            filter applied to URLs extracted from outlinks
 * @param maxThreads           thread count passed to the HTTP fetcher
 * @param debug                passed to {@code HadoopUtils.getDefaultProperties}
 * @param persistentDbLocation location handed to the JDBC tap factory for both taps
 * @return the connected (not yet run) flow
 * @throws IllegalStateException if {@code inputDir} does not exist
 */
public static Flow createFlow(Path inputDir, Path curLoopDirPath, UserAgent userAgent,
        FetcherPolicy fetcherPolicy, BaseUrlFilter urlFilter, int maxThreads, boolean debug,
        String persistentDbLocation) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    // Configured reduce-task count multiplied by the value HadoopUtils reports for task trackers.
    int numReducers = conf.getNumReduceTasks() * HadoopUtils.getTaskTrackers(conf);
    FileSystem fs = curLoopDirPath.getFileSystem(conf);

    if (!fs.exists(inputDir)) {
        throw new IllegalStateException(String.format("Input directory %s doesn't exist", inputDir));
    }

    Tap inputSource = JDBCTapFactory.createUrlsSourceJDBCTap(persistentDbLocation);

    // Read _everything_ in initially
    // Split that pipe into URLs we want to fetch for the fetch pipe
    Pipe importPipe = new Pipe("url importer");
    importPipe = new GroupBy(importPipe, new Fields(CrawlDbDatum.URL_FIELD));
    importPipe = new Every(importPipe, new BestUrlToFetchBuffer(), Fields.RESULTS);

    // NOTE(review): fetched content is written under CRAWLDB_SUBDIR_NAME rather than a
    // content-specific sub-directory - confirm this is intended.
    Path contentPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentPath.toString());

    Path parsePath = new Path(curLoopDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parsePath.toString());

    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // NOTE: The source and sink for CrawlDbDatums is essentially the same database -
    // since cascading doesn't allow you to use the same tap for source and
    // sink we fake it by creating two separate taps.
    Tap urlSink = JDBCTapFactory.createUrlsSinkJDBCTap(persistentDbLocation);

    // Create the sub-assembly that runs the fetch job
    BaseFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, numReducers);

    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());

    // Take content and split it into content output plus parse to extract URLs.
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), new SimpleParser());
    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));

    // Take status and output updated UrlDatum's. Again, since we are using
    // the same database we need to create a new tap.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", fetchPipe.getStatusTailPipe());
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());

    // Now we need to join the URLs we get from parsing content with the
    // URLs we got from the status output, so we have a unified stream
    // of all known URLs.
    Pipe urlPipe = new GroupBy("url pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe),
            new Fields(UrlDatum.URL_FN));
    urlPipe = new Every(urlPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    // NOTE(review): the Pipe named "output pipe" is discarded by the reassignment on the
    // next line, so the tail's name comes from whatever Each derives from urlPipe. The
    // sink map below stays consistent because it is keyed by outputPipe.getName().
    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(urlPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(FetchPipe.CONTENT_PIPE_NAME, contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(outputPipe.getName(), urlSink);

    // Finally we can run it.
    FlowConnector flowConnector = new FlowConnector(
            HadoopUtils.getDefaultProperties(JDBCCrawlWorkflow.class, debug, conf));
    return flowConnector.connect(inputSource, sinkMap, statusPipe, fetchPipe.getContentTailPipe(),
            parsePipe.getTailPipe(), outputPipe);

}

From source file:bixo.examples.SimpleCrawlWorkflow.java

License:Open Source License

/**
 * Builds the Cascading flow for one loop of the file-based crawl.
 *
 * <p>The crawl db (a sequence file of {@code CrawlDbDatum}s) is split into already-fetched
 * and to-be-fetched entries. The latter are fetched and parsed, and the resulting URLs are
 * merged with the URLs from the fetch status and with the untouched db entries to produce
 * the next crawl db. The flow also writes content, parse and status output under
 * {@code curWorkingDirPath}.
 *
 * <p>Side effect: this method calls {@code setValidMimeTypes} on the supplied
 * {@code fetcherPolicy}, restricting it to {@code text/plain} and {@code text/html}.
 *
 * @param curWorkingDirPath directory for this loop's crawldb, content, parse and status output
 * @param crawlDbPath       location of the input crawl db; must exist
 * @param fetcherPolicy     policy passed to the HTTP fetcher (mutated, see above)
 * @param userAgent         user agent passed to the HTTP fetcher
 * @param urlFilter         filter applied to URLs extracted from outlinks
 * @param options           supplies the debug-logging flag and the fetcher thread count
 * @return the connected (not yet run) flow
 * @throws RuntimeException if {@code crawlDbPath} does not exist
 */
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, SimpleCrawlToolOptions options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    // Configured reduce-task count multiplied by the value HadoopUtils reports for task trackers.
    int numReducers = conf.getNumReduceTasks() * HadoopUtils.getTaskTrackers(conf);
    Properties props = HadoopUtils.getDefaultProperties(SimpleCrawlWorkflow.class, options.isDebugLogging(),
            conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());
    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks :
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types you want to restrict what content type you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s) lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    SimpleParser parser = new SimpleParser();
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it
    // NOTE(review): the Pipe named "url from fetch" is discarded by the reassignment on the
    // next line; the Each is attached directly to statusPipe.
    Pipe urlFromFetchPipe = new Pipe("url from fetch");
    urlFromFetchPipe = new Each(statusPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    // NOTE(review): the Pipe named "output pipe" is discarded by the reassignment on the
    // next line. The crawldb sink below is keyed by crawlDbPipe.getName() while the tail
    // passed to connect() is outputPipe - presumably both resolve to the same name; verify.
    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(crawlDbPipe.getName(), loopCrawldbSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(),
            outputPipe);

    return flow;
}

From source file:cascading.flow.hadoop.HadoopFlowStep.java

License:Open Source License

/**
 * Creates the {@link JobConf} used to submit this flow step as a MapReduce job.
 *
 * <p>The configuration starts as a copy of {@code parentConfig} (or a fresh {@code JobConf}
 * when it is {@code null}) and is then populated with the step's job name, key/value
 * classes, mapper/reducer classes, serializations, source/sink/trap settings, task counts,
 * partitioner and comparators, and finally the packed map and reduce node state.
 *
 * @param flowProcess  passed through to the source, sink and trap initializers
 * @param parentConfig configuration to copy, or {@code null} to start from defaults
 * @return the initialized configuration
 * @throws FlowException if the step has a group, the sink scheme reports zero sink parts,
 *                       and neither the configured reduce-task count nor
 *                       {@code FlowRuntimeProps.GATHER_PARTITIONS} yields a non-zero value
 */
public JobConf createInitializedConfig(FlowProcess<JobConf> flowProcess, JobConf parentConfig) {
    JobConf conf = parentConfig == null ? new JobConf() : HadoopUtil.copyJobConf(parentConfig);

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepDisplayName(conf.getInt("cascading.display.id.truncate", Util.ID_LENGTH)));

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapRunnerClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(flowProcess, conf);

    initFromSink(flowProcess, conf);

    initFromTraps(flowProcess, conf);

    initFromStepConfigDef(conf);

    int numSinkParts = getSink().getScheme().getNumSinkParts();

    // A non-zero sink part count takes precedence over any configured task counts.
    if (numSinkParts != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(numSinkParts);
        else
            conf.setNumMapTasks(numSinkParts);
    } else if (getGroup() != null) {
        // Otherwise use the configured reduce-task count, falling back to the
        // gather-partitions property when that count is zero.
        int gatherPartitions = conf.getNumReduceTasks();

        if (gatherPartitions == 0)
            gatherPartitions = conf.getInt(FlowRuntimeProps.GATHER_PARTITIONS, 0);

        if (gatherPartitions == 0)
            throw new FlowException(getName(),
                    "a default number of gather partitions must be set, see FlowRuntimeProps");

        conf.setNumReduceTasks(gatherPartitions);
    }

    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);
        conf.setPartitionerClass(GroupingPartitioner.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getKeySelectors(), this, getGroup());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors(), this,
                    getGroup());

        // Any group that is not a GroupBy gets the co-grouping partitioner and comparators.
        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        // Sorted groups replace the partitioner, map output key class and comparators set above.
        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingSortingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    String versionString = Version.getRelease();

    if (versionString != null)
        conf.set("cascading.version", versionString);

    conf.set(CASCADING_FLOW_STEP_ID, getID());
    conf.set("cascading.flow.step.num", Integer.toString(getOrdinal()));

    HadoopUtil.setIsInflow(conf);

    // The first node in topological order is packed as the map state; the second,
    // if there is one, as the reduce state (otherwise null is passed to pack()).
    Iterator<FlowNode> iterator = getFlowNodeGraph().getTopologicalIterator();

    String mapState = pack(iterator.next(), conf);
    String reduceState = pack(iterator.hasNext() ? iterator.next() : null, conf);

    // hadoop 20.2 doesn't like dist cache when using local mode
    int maxSize = Short.MAX_VALUE;

    int length = mapState.length() + reduceState.length();

    // In local mode, or when the combined state is shorter than maxSize, the state is stored
    // inline in the conf. Otherwise it is written through writeStateToDistCache and only the
    // returned value is stored, under the corresponding ".path" key.
    if (isHadoopLocalMode(conf) || length < maxSize) // seems safe
    {
        conf.set("cascading.flow.step.node.map", mapState);

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce", reduceState);
    } else {
        conf.set("cascading.flow.step.node.map.path",
                HadoopMRUtil.writeStateToDistCache(conf, getID(), "map", mapState));

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce.path",
                    HadoopMRUtil.writeStateToDistCache(conf, getID(), "reduce", reduceState));
    }

    return conf;
}

From source file:cascading.flow.hadoop.HadoopStepStats.java

License:Open Source License

/**
 * Records the map and reduce task counts of the running job on this stats object.
 *
 * <p>The counts are read from a {@link JobConf} built from the running job's job file.
 * Nothing is recorded when no running job is available.
 */
public void captureJobStats() {
    RunningJob job = getRunningJob();

    if (job != null) {
        // Load the configuration the job was actually submitted with.
        JobConf submittedConf = new JobConf(job.getJobFile());

        setNumMapTasks(submittedConf.getNumMapTasks());
        setNumReducerTasks(submittedConf.getNumReduceTasks());
    }
}

From source file:colossal.pipe.ColHadoopMapper.java

License:Apache License

/**
 * Configures this Hadoop mapper from the job configuration.
 *
 * <p>Instantiates the user mapper and the map output prototype via reflection, records
 * whether the job is map-only (zero reduce tasks), reads the group-by and sort-by
 * settings, and, for {@link TextInputFormat} input, decides whether input lines are
 * handed over as {@code String}, as {@code Text}, or parsed as JSON against the schema
 * of the configured input class. Finally passes the configuration to the user mapper.
 *
 * @param conf the job configuration
 * @throws RuntimeException if setup fails; checked exceptions are wrapped, runtime
 *                          exceptions are rethrown unchanged
 */
@SuppressWarnings("unchecked")
public void configure(JobConf conf) {
    this.mapper = ReflectionUtils.newInstance(conf.getClass(ColPhase.MAPPER, BaseMapper.class, ColMapper.class),
            conf);
    this.isMapOnly = (0 == conf.getNumReduceTasks());

    try {
        this.out = (OUT) ReflectionUtils
                .newInstance(conf.getClass(ColPhase.MAP_OUT_CLASS, Object.class, Object.class), conf);
        this.schema = ColPhase.getSchema(this.out);
        this.groupBy = conf.get(ColPhase.GROUP_BY);
        this.sortBy = conf.get(ColPhase.SORT_BY);

        boolean readsTextLines = conf.getInputFormat() instanceof TextInputFormat;
        if (readsTextLines) {
            Class<?> declaredInputType = conf.getClass(ColPhase.MAP_IN_CLASS, Object.class, Object.class);
            if (String.class == declaredInputType) {
                isStringInput = true;
            } else if (Text.class == declaredInputType) {
                isTextInput = true;
            } else {
                // Any other input class is treated as a JSON-mapped type with its own schema.
                isJsonInput = true;
                inSchema = ColPhase.getSchema((IN) ReflectionUtils.newInstance(declaredInputType, conf));
            }
        }
    } catch (RuntimeException unchecked) {
        throw unchecked;
    } catch (Exception checked) {
        throw new RuntimeException(checked);
    }

    mapper.setConf(conf);
}

From source file:com.alexholmes.hadooputils.sort.Sort.java

License:Apache License

/**
 * The driver for the sort MapReduce job.
 *
 * <p>When {@code numReduceTasks} is {@code null}, the reducer count defaults to 90% of the
 * cluster's maximum reduce tasks, or to {@code test.sort.reduces_per_host} times the number
 * of task trackers when that property is set. When a sampler is supplied, a total-order
 * partition file named {@code _sortPartitioning} is written next to the first input path
 * and registered in the distributed cache.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks, or {@code null} to leave the configured value
 * @param numReduceTasks    number of reduce tasks, or {@code null} to derive it from the cluster
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException           if something went wrong
 * @throws URISyntaxException    if a URI wasn't correctly formed
 * @throws NumberFormatException if {@code test.sort.reduces_per_host} is set but is not an integer
 */
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }
    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }

        // Set user-supplied (possibly default) job configs
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        // Resolve the file system from the input path itself rather than from the default
        // file system, so inputs that live on a non-default file system are checked against
        // the right one.
        FileSystem fileSystem = inputDir.getFileSystem(jobConf);

        // If the first input is a single file, place the partition file in its directory.
        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(fileSystem);
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
        // The URI fragment is the symlink name the partition file gets in the task directory.
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        // Index the output only when it was actually written with the LZOP codec.
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}

From source file:com.alexholmes.hadooputils.sort.SortInputSampler.java

License:Apache License

/**
 * Samples the job's input and writes the split points for a total-order sort.
 *
 * <p>The samples are sorted with the job's output key comparator, and one key per
 * partition boundary (reduce-task count minus one) is appended to the partition file
 * configured via {@code TotalOrderPartitioner}. An existing partition file is deleted first.
 *
 * @param job     the job whose input format, reduce-task count and comparator are used
 * @param sampler the sampler that draws keys from the input
 * @param <K>     key type
 * @param <V>     value type
 * @throws IOException if sampling or writing the partition file fails
 */
public static <K, V> void writePartitionFile(JobConf job, Sampler<K, V> sampler) throws IOException {
    Configuration conf = job;
    // Use the input format defined in the job, NOT the one provided by
    // the parent class's writePartitionFile() method, which will be a plain
    // TextInputFormat by default.
    final InputFormat inf = job.getInputFormat();
    int numPartitions = job.getNumReduceTasks();
    K[] samples = (K[]) sampler.getSample(inf, job);
    RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(),
            NullWritable.class);
    // Close the writer even when an append fails, so the underlying stream is not leaked.
    try {
        NullWritable nullValue = NullWritable.get();
        float stepSize = samples.length / (float) numPartitions;
        int last = -1;
        for (int i = 1; i < numPartitions; ++i) {
            int k = Math.round(stepSize * i);
            // Skip forward past keys equal to the previously written split point.
            while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
                ++k;
            }
            writer.append(samples[k], nullValue);
            last = k;
        }
    } finally {
        writer.close();
    }
}

From source file:com.benchmark.mapred.terasort.TeraInputFormat.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param conf the job to sample/*  w w  w.  j  a  v  a 2  s . c  om*/
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf, Path partFile) throws IOException {
    TeraInputFormat inFormat = new TeraInputFormat();
    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        RecordReader<Text, Text> reader = inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        while (reader.next(key, value)) {
            sampler.addKey(key);
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}

From source file:com.datasalt.utils.viewbuilder.ShardedSolrDocumentConverter.java

License:Apache License

/**
 * Stores the configuration and, when it is non-null, derives the sharding setup from it.
 *
 * <p>The number of shards is the job's reduce-task count, and the partitioner is
 * instantiated reflectively from the class name stored under {@code PARTITIONER_CONF}.
 *
 * @param conf the job configuration; must be a {@link JobConf} when non-null
 * @throws RuntimeException if {@code PARTITIONER_CONF} is not set, or names a class that
 *                          cannot be loaded
 */
@Override
@SuppressWarnings({ "rawtypes", "unchecked" })
public void setConf(Configuration conf) {
    super.setConf(conf);
    if (conf != null) {
        JobConf jobConf = (JobConf) conf;
        numShards = jobConf.getNumReduceTasks();
        shardNumberShift = (int) Math.ceil(numDigits(Long.MAX_VALUE)) - 1
                - (int) Math.ceil(numDigits(numShards));
        log.info("Num shards : " + numShards);

        // Check for a missing setting explicitly: Class.forName(null) would fail with a
        // bare NullPointerException and never reach the explanatory message.
        String partitionerClassName = jobConf.get(PARTITIONER_CONF);
        if (partitionerClassName == null) {
            throw new RuntimeException(
                    "Partitioner not set.Use conf.set(ShardedSolrDocument.PARTITIONER_CONF,partitionerClassName)");
        }

        Class partitionerClass;
        try {
            partitionerClass = Class.forName(partitionerClassName);
        } catch (ClassNotFoundException e) {
            // The cause travels with the rethrown exception, so no separate stack-trace print.
            throw new RuntimeException("Partitioner class not found: " + partitionerClassName
                    + ". Use conf.set(ShardedSolrDocument.PARTITIONER_CONF,partitionerClassName)", e);
        }
        log.info("Partitioner class : " + partitionerClass);
        partitioner = (Partitioner<KEY_TYPE, VALUE_TYPE>) ReflectionUtils.newInstance(partitionerClass, conf);
    }
}

From source file:com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner.java

License:Apache License

/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link/*w  w  w. ja va2s. c  om*/
org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void configure(JobConf job) {
    try {
        String parts = getPartitionFile(job);
        final Path partFile = new Path(parts);
        final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(job) // assume in DistributedCache
                : partFile.getFileSystem(job);

        //Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
        K[] splitPoints = readPartitions(fs, partFile, (Class<K>) Tuple.class, job);
        if (splitPoints.length != job.getNumReduceTasks() - 1) {
            throw new IOException("Wrong number of partitions in keyset");
        }
        RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();
        for (int i = 0; i < splitPoints.length - 1; ++i) {
            if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                throw new IOException("Split points are out of order");
            }
        }
        boolean natOrder = job.getBoolean("total.order.partitioner.natural.order", true);
        if (natOrder && BinaryComparable.class.isAssignableFrom(Tuple.class)) {
            partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                    job.getInt("total.order.partitioner.max.trie.depth", 2));
        } else {
            partitions = new BinarySearchNode(splitPoints, comparator);
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Can't read partitions file", e);
    }
}