List of usage examples for org.apache.hadoop.mapreduce.Job setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
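setPartitionerClass registers the Partitioner that decides which reducer receives each map output key; it must be called while the job is still being defined, otherwise it throws IllegalStateException. Before the real-world usages below, here is a minimal, self-contained sketch; the job name, input/output paths, reducer count, and the identity Mapper/Reducer are illustrative assumptions, not taken from any of the source files listed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class SetPartitionerExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "set-partitioner-example"); // illustrative job name
        job.setJarByClass(SetPartitionerExample.class);
        job.setMapperClass(Mapper.class);    // identity mapper, keeps the example minimal
        job.setReducerClass(Reducer.class);  // identity reducer
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        // Choose which reducer receives each map output key; must be set before the job
        // is submitted, otherwise setPartitionerClass throws IllegalStateException.
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(4);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // illustrative paths
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

HashPartitioner is also Hadoop's default, so the explicit call above only makes the routing choice visible; the examples below swap in custom partitioners such as GramKeyPartitioner, PcapPartitioner, KeyRangePartitioner, and TotalOrderPartitioner.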
From source file:org.apache.mahout.math.hadoop.similarity.RowSimilarityJob.java
License:Apache License
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix");
    addOption("similarityClassname", "s",
        "Name of distributed similarity class to instantiate, alternatively use "
            + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerRow", "m",
        "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
        String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
    String similarityClassnameArg = parsedArgs.get("--similarityClassname");
    String distributedSimilarityClassname;
    try {
        distributedSimilarityClassname = SimilarityType.valueOf(similarityClassnameArg)
            .getSimilarityImplementationClassName();
    } catch (IllegalArgumentException iae) {
        distributedSimilarityClassname = similarityClassnameArg;
    }

    int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));
    Path weightsPath = new Path(tempDirPath, "weights");
    Path pairwiseSimilarityPath = new Path(tempDirPath, "pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job weights = prepareJob(inputPath, weightsPath, SequenceFileInputFormat.class, RowWeightMapper.class,
            VarIntWritable.class, WeightedOccurrence.class, WeightedOccurrencesPerColumnReducer.class,
            VarIntWritable.class, WeightedOccurrenceArray.class, SequenceFileOutputFormat.class);
        weights.getConfiguration().set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        weights.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, SequenceFileInputFormat.class,
            CooccurrencesMapper.class, WeightedRowPair.class, Cooccurrence.class, SimilarityReducer.class,
            SimilarityMatrixEntryKey.class, MatrixEntryWritable.class, SequenceFileOutputFormat.class);
        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        pairwiseSimilarity.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, outputPath, SequenceFileInputFormat.class,
            Mapper.class, SimilarityMatrixEntryKey.class, MatrixEntryWritable.class,
            EntriesToVectorsReducer.class, IntWritable.class, VectorWritable.class,
            SequenceFileOutputFormat.class);
        asMatrix.setPartitionerClass(HashPartitioner.class);
        asMatrix.setGroupingComparatorClass(
            SimilarityMatrixEntryKey.SimilarityMatrixEntryKeyGroupingComparator.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        asMatrix.waitForCompletion(true);
    }

    return 0;
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java
License:Apache License
/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    job.waitForCompletion(true);

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
From source file:org.apache.mahout.vectorizer.collocations.llr.CollocDriver.java
License:Apache License
/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
From source file:org.apache.metron.pcap.mr.PcapJob.java
License:Apache License
public <T> Job createJob(Path basePath, Path outputPath, long beginNS, long endNS, int numReducers, T fields,
        Configuration conf, FileSystem fs, PcapFilterConfigurator<T> filterImpl) throws IOException {
    conf.set(START_TS_CONF, Long.toUnsignedString(beginNS));
    conf.set(END_TS_CONF, Long.toUnsignedString(endNS));
    conf.set(WIDTH_CONF, "" + findWidth(beginNS, endNS, numReducers));
    filterImpl.addToConfig(fields, conf);

    Job job = Job.getInstance(conf);
    job.setJarByClass(PcapJob.class);
    job.setMapperClass(PcapJob.PcapMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(numReducers);
    job.setReducerClass(PcapReducer.class);
    job.setPartitionerClass(PcapPartitioner.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    SequenceFileInputFormat.addInputPaths(job, Joiner.on(',').join(getPaths(fs, basePath, beginNS, endNS)));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    return job;
}
From source file:org.apache.mrql.GroupByJoinPlan.java
License:Apache License
/** the GroupByJoin operation:
 * an equi-join combined with a group-by implemented using hashing
 * @param left_join_key_fnc left join key function from a to k
 * @param right_join_key_fnc right join key function from b to k
 * @param left_groupby_fnc left group-by function from a to k1
 * @param right_groupby_fnc right group-by function from b to k2
 * @param accumulator_fnc accumulator function from (c,(a,b)) to c
 * @param zero the left zero of accumulator of type c
 * @param reduce_fnc reduce function from ((k1,k2),c) to d
 * @param X left data set of type {a}
 * @param Y right data set of type {b}
 * @param num_reducers number of reducers
 * @param n left dimension of the reducer grid
 * @param m right dimension of the reducer grid
 * @param stop_counter optional counter used in repeat operation
 * @return a DataSet that contains the result of type {d}
 */
public final static DataSet groupByJoin(Tree left_join_key_fnc,  // left join key function
        Tree right_join_key_fnc,  // right join key function
        Tree left_groupby_fnc,    // left group-by function
        Tree right_groupby_fnc,   // right group-by function
        Tree accumulator_fnc,     // accumulator function
        Tree zero,                // the left zero of accumulator
        Tree reduce_fnc,          // reduce function
        DataSet X,                // left data set
        DataSet Y,                // right data set
        int num_reducers,         // number of reducers
        int n, int m,             // dimensions of the reducer grid
        String stop_counter)      // optional counter used in repeat operation
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.join.key.left", left_join_key_fnc.toString());
    conf.set("mrql.join.key.right", right_join_key_fnc.toString());
    conf.set("mrql.groupby.left", left_groupby_fnc.toString());
    conf.set("mrql.groupby.right", right_groupby_fnc.toString());
    conf.setInt("mrql.m", m);
    conf.setInt("mrql.n", n);
    conf.set("mrql.accumulator", accumulator_fnc.toString());
    conf.set("mrql.zero", zero.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setMapOutputKeyClass(GroupByJoinKey.class);
    job.setJarByClass(GroupByJoinPlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(GroupByJoinPartitioner.class);
    job.setSortComparatorClass(GroupByJoinSortComparator.class);
    job.setGroupingComparatorClass(GroupByJoinGroupingComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
            (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class);
    for (DataSource p : Y.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
            (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class);
    job.setReducerClass(JoinReducer.class);
    if (num_reducers > 0)
        job.setNumReduceTasks(num_reducers);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
        : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = false;
    return new DataSet(s, c, MapReducePlan.outputRecords(job));
}
From source file:org.apache.mrql.JoinOperation.java
License:Apache License
/** The MapReduce2 physical operator (a reduce-side join)
 * @param mx left mapper function
 * @param my right mapper function
 * @param combine_fnc optional in-mapper combiner function
 * @param reduce_fnc reducer function
 * @param acc_fnc optional accumulator function
 * @param zero optional the zero value for the accumulator
 * @param X left data set
 * @param Y right data set
 * @param num_reduces number of reducers
 * @param stop_counter optional counter used in repeat operation
 * @param orderp does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce2(Tree mx,  // left mapper function
        Tree my,              // right mapper function
        Tree combine_fnc,     // optional in-mapper combiner function
        Tree reduce_fnc,      // reducer function
        Tree acc_fnc,         // optional accumulator function
        Tree zero,            // optional the zero value for the accumulator
        DataSet X,            // left data set
        DataSet Y,            // right data set
        int num_reduces,      // number of reducers
        String stop_counter,  // optional counter used in repeat operation
        boolean orderp)       // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper.left", mx.toString());
    conf.set("mrql.mapper.right", my.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setMapOutputKeyClass(JoinKey.class);
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerJoinPartitioner.class);
    job.setSortComparatorClass(MRContainerSortComparator.class);
    job.setGroupingComparatorClass(MRContainerGroupingComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
            (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class);
    for (DataSource p : Y.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
            (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce2_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce2 reducer");
    job.setReducerClass(JoinReducer.class);
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
        : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}
From source file:org.apache.mrql.MapReduceOperation.java
License:Apache License
/**
 * The MapReduce physical operator
 * @param map_fnc the mapper function
 * @param combine_fnc optional in-mapper combiner function
 * @param reduce_fnc the reducer function
 * @param acc_fnc optional accumulator function
 * @param zero optional the zero value for the accumulator
 * @param source the input data source
 * @param num_reduces number of reducers
 * @param stop_counter optional counter used in repeat operation
 * @param orderp does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce(Tree map_fnc,  // mapper function
        Tree combine_fnc,     // optional in-mapper combiner function
        Tree reduce_fnc,      // reducer function
        Tree acc_fnc,         // optional accumulator function
        Tree zero,            // optional the zero value for the accumulator
        DataSet source,       // input data source
        int num_reduces,      // number of reducers
        String stop_counter,  // optional counter used in repeat operation
        boolean orderp)       // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper", map_fnc.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) { // will use in-mapper combiner
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(source, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerPartitioner.class);
    job.setSortComparatorClass(MRContainerKeyComparator.class);
    job.setGroupingComparatorClass(MRContainerKeyComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    for (DataSource p : source.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
            (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MRMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setReducerClass(MRReducer.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce reducer");
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
        : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}
From source file:org.apache.phoenix.mapreduce.MultiHfileOutputFormat.java
License:Apache License
/**
 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
 * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
 */
static void configurePartitioner(Job job, Set<TableRowkeyPair> tablesStartKeys) throws IOException {
    Configuration conf = job.getConfiguration();
    // create the partitions file
    Path partitionsPath = new Path(conf.get("hadoop.tmp.dir"), "partitions_" + UUID.randomUUID());
    FileSystem fs = partitionsPath.getFileSystem(conf);
    fs.makeQualified(partitionsPath);
    writePartitions(conf, partitionsPath, tablesStartKeys);
    fs.deleteOnExit(partitionsPath);

    // configure job to use it
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
From source file:org.apache.rya.accumulo.mr.merge.CopyTool.java
License:Apache License
private void setupSplitsFile(final Job job, final TableOperations parentTableOperations,
        final String parentTableName, final String childTableName) throws Exception {
    final FileSystem fs = FileSystem.get(conf);
    fs.setPermission(getPath(baseOutputDir, childTableName),
        new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    final Path splitsPath = getPath(baseOutputDir, childTableName, "splits.txt");
    final Collection<Text> splits = parentTableOperations.listSplits(parentTableName, 100);
    log.info("Creating splits file at: " + splitsPath);
    try (PrintStream out = new PrintStream(new BufferedOutputStream(fs.create(splitsPath)), false,
            StandardCharsets.UTF_8.name())) {
        for (final Text split : splits) {
            final String encoded = new String(Base64.encodeBase64(TextUtil.getBytes(split)),
                StandardCharsets.UTF_8);
            out.println(encoded);
        }
    }
    fs.setPermission(splitsPath, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));

    final String userDir = System.getProperty("user.dir");
    // The splits file has a symlink created in the user directory for some reason.
    // It might be better to copy the entire file for Windows but it doesn't seem to matter if
    // the user directory symlink is broken.
    java.nio.file.Files.deleteIfExists(new File(userDir, "splits.txt").toPath());
    //Files.copy(new File(splitsPath.toString()), new File(userDir, "splits.txt"));

    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsPath.toString());
    job.setNumReduceTasks(splits.size() + 1);
}
From source file:org.apache.solr.hadoop.ForkedMapReduceIndexerTool.java
License:Apache License
public static int runIndexingPipeline(Job job, JobProcessCallback callback, Configuration conf, Options options,
        long programStartTime, FileSystem fs, Path fullInputList, long numFiles, int realMappers, int reducers)
        throws IOException, KeeperException, InterruptedException, ClassNotFoundException, FileNotFoundException {
    long startTime;
    float secs;

    Path outputResultsDir = new Path(options.outputDir, RESULTS_DIR);
    Path outputReduceDir = new Path(options.outputDir, "reducers");
    Path outputTreeMergeStep = new Path(options.outputDir, "mtree-merge-output");

    FileOutputFormat.setOutputPath(job, outputReduceDir);

    if (job.getConfiguration().get(JobContext.REDUCE_CLASS_ATTR) == null) { // enable customization
        job.setReducerClass(SolrReducer.class);
    }
    if (options.updateConflictResolver == null) {
        throw new IllegalArgumentException("updateConflictResolver must not be null");
    }
    job.getConfiguration().set(SolrReducer.UPDATE_CONFLICT_RESOLVER, options.updateConflictResolver);
    job.getConfiguration().setInt(SolrOutputFormat.SOLR_RECORD_WRITER_MAX_SEGMENTS, options.maxSegments);

    if (options.zkHost != null) {
        assert options.collection != null;
        /*
         * MapReduce partitioner that partitions the Mapper output such that each
         * SolrInputDocument gets sent to the SolrCloud shard that it would have
         * been sent to if the document were ingested via the standard SolrCloud
         * Near Real Time (NRT) API.
         *
         * In other words, this class implements the same partitioning semantics
         * as the standard SolrCloud NRT API. This enables to mix batch updates
         * from MapReduce ingestion with updates from standard NRT ingestion on
         * the same SolrCloud cluster, using identical unique document keys.
         */
        if (job.getConfiguration().get(JobContext.PARTITIONER_CLASS_ATTR) == null) { // enable customization
            job.setPartitionerClass(ForkedSolrCloudPartitioner.class);
        }
        job.getConfiguration().set(ForkedSolrCloudPartitioner.ZKHOST, options.zkHost);
        job.getConfiguration().set(ForkedSolrCloudPartitioner.COLLECTION, options.collection);
    }
    job.getConfiguration().setInt(ForkedSolrCloudPartitioner.SHARDS, options.shards);

    job.setOutputFormatClass(SolrOutputFormat.class);
    if (options.solrHomeDir != null) {
        SolrOutputFormat.setupSolrHomeCache(options.solrHomeDir, job);
    } else {
        assert options.zkHost != null;
        // use the config that this collection uses for the SolrHomeCache.
        ForkedZooKeeperInspector zki = new ForkedZooKeeperInspector();
        SolrZkClient zkClient = zki.getZkClient(options.zkHost);
        try {
            String configName = zki.readConfigName(zkClient, options.collection);
            File tmpSolrHomeDir = zki.downloadConfigDir(zkClient, configName);
            SolrOutputFormat.setupSolrHomeCache(tmpSolrHomeDir, job);
            LOG.debug("Using " + tmpSolrHomeDir + " as solr home");
            options.solrHomeDir = tmpSolrHomeDir;
        } finally {
            zkClient.close();
        }
    }

    // MorphlineMapRunner runner = setupMorphline(job, options);
    // if (options.isDryRun && runner != null) {
    //     LOG.info("Indexing {} files in dryrun mode", numFiles);
    //     startTime = System.currentTimeMillis();
    //     dryRun(job, runner, fs, fullInputList);
    //     secs = (System.currentTimeMillis() - startTime) / 1000.0f;
    //     LOG.info("Done. Indexing {} files in dryrun mode took {} secs", numFiles, secs);
    //     goodbye(null, programStartTime);
    //     return 0;
    // }
    // job.getConfiguration().set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, options.morphlineFile.getName());

    job.setNumReduceTasks(reducers);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SolrInputDocumentWritable.class);
    LOG.info("Indexing data into {} reducers", new Object[] { reducers });
    startTime = System.currentTimeMillis();
    job.submit();
    callback.jobStarted(job.getJobID().toString(), job.getTrackingURL());
    if (!waitForCompletion(job, options.isVerbose)) {
        return -1; // job failed
    }

    secs = (System.currentTimeMillis() - startTime) / 1000.0f;
    LOG.info("Done. Indexing data into {} reducers took {} secs", new Object[] { reducers, secs });

    int mtreeMergeIterations = 0;
    if (reducers > options.shards) {
        mtreeMergeIterations = (int) Math.round(log(options.fanout, reducers / options.shards));
    }
    LOG.debug("MTree merge iterations to do: {}", mtreeMergeIterations);
    int mtreeMergeIteration = 1;
    while (reducers > options.shards) { // run a mtree merge iteration
        job = Job.getInstance(conf);
        job.setJarByClass(ForkedMapReduceIndexerTool.class);
        job.setJobName(ForkedMapReduceIndexerTool.class.getName() + "/"
            + Utils.getShortClassName(ForkedTreeMergeMapper.class));
        job.setMapperClass(ForkedTreeMergeMapper.class);
        job.setOutputFormatClass(ForkedTreeMergeOutputFormat.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setInputFormatClass(NLineInputFormat.class);

        Path inputStepDir = new Path(options.outputDir, "mtree-merge-input-iteration" + mtreeMergeIteration);
        fullInputList = new Path(inputStepDir, FULL_INPUT_LIST);
        LOG.debug("MTree merge iteration {}/{}: Creating input list file for mappers {}",
            new Object[] { mtreeMergeIteration, mtreeMergeIterations, fullInputList });
        numFiles = createTreeMergeInputDirList(job, outputReduceDir, fs, fullInputList);
        if (numFiles != reducers) {
            throw new IllegalStateException("Not same reducers: " + reducers + ", numFiles: " + numFiles);
        }
        NLineInputFormat.addInputPath(job, fullInputList);
        NLineInputFormat.setNumLinesPerSplit(job, options.fanout);
        FileOutputFormat.setOutputPath(job, outputTreeMergeStep);

        LOG.info("MTree merge iteration {}/{}: Merging {} shards into {} shards using fanout {}", new Object[] {
            mtreeMergeIteration, mtreeMergeIterations, reducers, (reducers / options.fanout), options.fanout });
        startTime = System.currentTimeMillis();
        job.submit();
        callback.jobStarted(job.getJobID().toString(), job.getTrackingURL());
        if (!waitForCompletion(job, options.isVerbose)) {
            return -1; // job failed
        }
        if (!renameTreeMergeShardDirs(outputTreeMergeStep, job, fs)) {
            return -1;
        }
        secs = (System.currentTimeMillis() - startTime) / 1000.0f;
        LOG.info(
            "MTree merge iteration {}/{}: Done. Merging {} shards into {} shards using fanout {} took {} secs",
            new Object[] { mtreeMergeIteration, mtreeMergeIterations, reducers, (reducers / options.fanout),
                options.fanout, secs });

        if (!delete(outputReduceDir, true, fs)) {
            return -1;
        }
        if (!rename(outputTreeMergeStep, outputReduceDir, fs)) {
            return -1;
        }
        assert reducers % options.fanout == 0;
        reducers = reducers / options.fanout;
        mtreeMergeIteration++;
    }
    assert reducers == options.shards;

    // normalize output shard dir prefix, i.e.
    // rename part-r-00000 to part-00000 (stems from zero tree merge iterations)
    // rename part-m-00000 to part-00000 (stems from > 0 tree merge iterations)
    for (FileStatus stats : fs.listStatus(outputReduceDir)) {
        String dirPrefix = SolrOutputFormat.getOutputName(job);
        Path srcPath = stats.getPath();
        if (stats.isDirectory() && srcPath.getName().startsWith(dirPrefix)) {
            String dstName = dirPrefix + srcPath.getName().substring(dirPrefix.length() + "-m".length());
            Path dstPath = new Path(srcPath.getParent(), dstName);
            if (!rename(srcPath, dstPath, fs)) {
                return -1;
            }
        }
    }

    // publish results dir
    if (!rename(outputReduceDir, outputResultsDir, fs)) {
        return -1;
    }

    if (options.goLive && !new GoLive().goLive(options, listSortedOutputShardDirs(job, outputResultsDir, fs))) {
        return -1;
    }

    goodbye(job, programStartTime);
    return 0;
}