List of usage examples for org.apache.hadoop.mapreduce.Job setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
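setPartitionerClass registers the Partitioner that decides which reducer receives each map output key; it must be called while the job is still being defined, otherwise it throws IllegalStateException. Before the real-world usages below, here is a minimal, self-contained sketch; the job name, input/output paths, reducer count, and the identity Mapper/Reducer are illustrative assumptions, not taken from any of the source files listed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class SetPartitionerExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "set-partitioner-example"); // illustrative job name
        job.setJarByClass(SetPartitionerExample.class);
        job.setMapperClass(Mapper.class);    // identity mapper, keeps the example minimal
        job.setReducerClass(Reducer.class);  // identity reducer
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        // Choose which reducer receives each map output key; must be set before the job
        // is submitted, otherwise setPartitionerClass throws IllegalStateException.
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(4);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // illustrative paths
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

HashPartitioner is also Hadoop's default, so the explicit call above only makes the routing choice visible; the examples below swap in custom partitioners such as GramKeyPartitioner, PcapPartitioner, KeyRangePartitioner, and TotalOrderPartitioner.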
From source file:org.apache.mahout.math.hadoop.similarity.RowSimilarityJob.java
License:Apache License
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix");
    addOption("similarityClassname", "s",
        "Name of distributed similarity class to instantiate, alternatively use "
            + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerRow", "m",
        "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
        String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
    String similarityClassnameArg = parsedArgs.get("--similarityClassname");
    String distributedSimilarityClassname;
    try {
        distributedSimilarityClassname = SimilarityType.valueOf(similarityClassnameArg)
            .getSimilarityImplementationClassName();
    } catch (IllegalArgumentException iae) {
        distributedSimilarityClassname = similarityClassnameArg;
    }

    int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));
    Path weightsPath = new Path(tempDirPath, "weights");
    Path pairwiseSimilarityPath = new Path(tempDirPath, "pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job weights = prepareJob(inputPath, weightsPath, SequenceFileInputFormat.class, RowWeightMapper.class,
            VarIntWritable.class, WeightedOccurrence.class, WeightedOccurrencesPerColumnReducer.class,
            VarIntWritable.class, WeightedOccurrenceArray.class, SequenceFileOutputFormat.class);
        weights.getConfiguration().set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        weights.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, SequenceFileInputFormat.class,
            CooccurrencesMapper.class, WeightedRowPair.class, Cooccurrence.class, SimilarityReducer.class,
            SimilarityMatrixEntryKey.class, MatrixEntryWritable.class, SequenceFileOutputFormat.class);
        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        pairwiseSimilarity.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, outputPath, SequenceFileInputFormat.class,
            Mapper.class, SimilarityMatrixEntryKey.class, MatrixEntryWritable.class,
            EntriesToVectorsReducer.class, IntWritable.class, VectorWritable.class,
            SequenceFileOutputFormat.class);
        asMatrix.setPartitionerClass(HashPartitioner.class);
        asMatrix.setGroupingComparatorClass(
            SimilarityMatrixEntryKey.SimilarityMatrixEntryKeyGroupingComparator.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        asMatrix.waitForCompletion(true);
    }

    return 0;
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java
License:Apache License
/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    job.waitForCompletion(true);

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
From source file:org.apache.mahout.vectorizer.collocations.llr.CollocDriver.java
License:Apache License
/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
From source file:org.apache.metron.pcap.mr.PcapJob.java
License:Apache License
public <T> Job createJob(Path basePath, Path outputPath, long beginNS, long endNS, int numReducers, T fields,
        Configuration conf, FileSystem fs, PcapFilterConfigurator<T> filterImpl) throws IOException {
    conf.set(START_TS_CONF, Long.toUnsignedString(beginNS));
    conf.set(END_TS_CONF, Long.toUnsignedString(endNS));
    conf.set(WIDTH_CONF, "" + findWidth(beginNS, endNS, numReducers));
    filterImpl.addToConfig(fields, conf);

    Job job = Job.getInstance(conf);
    job.setJarByClass(PcapJob.class);
    job.setMapperClass(PcapJob.PcapMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(numReducers);
    job.setReducerClass(PcapReducer.class);
    job.setPartitionerClass(PcapPartitioner.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    SequenceFileInputFormat.addInputPaths(job, Joiner.on(',').join(getPaths(fs, basePath, beginNS, endNS)));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    return job;
}
From source file:org.apache.mrql.GroupByJoinPlan.java
License:Apache License
/** the GroupByJoin operation:
 * an equi-join combined with a group-by implemented using hashing
 * @param left_join_key_fnc left join key function from a to k
 * @param right_join_key_fnc right join key function from b to k
 * @param left_groupby_fnc left group-by function from a to k1
 * @param right_groupby_fnc right group-by function from b to k2
 * @param accumulator_fnc accumulator function from (c,(a,b)) to c
 * @param zero the left zero of accumulator of type c
 * @param reduce_fnc reduce function from ((k1,k2),c) to d
 * @param X left data set of type {a}
 * @param Y right data set of type {b}
 * @param num_reducers number of reducers
 * @param n left dimension of the reducer grid
 * @param m right dimension of the reducer grid
 * @param stop_counter optional counter used in repeat operation
 * @return a DataSet that contains the result of type {d}
 */
public final static DataSet groupByJoin(Tree left_join_key_fnc,  // left join key function
        Tree right_join_key_fnc,  // right join key function
        Tree left_groupby_fnc,    // left group-by function
        Tree right_groupby_fnc,   // right group-by function
        Tree accumulator_fnc,     // accumulator function
        Tree zero,                // the left zero of accumulator
        Tree reduce_fnc,          // reduce function
        DataSet X,                // left data set
        DataSet Y,                // right data set
        int num_reducers,         // number of reducers
        int n, int m,             // dimensions of the reducer grid
        String stop_counter)      // optional counter used in repeat operation
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.join.key.left", left_join_key_fnc.toString());
    conf.set("mrql.join.key.right", right_join_key_fnc.toString());
    conf.set("mrql.groupby.left", left_groupby_fnc.toString());
    conf.set("mrql.groupby.right", right_groupby_fnc.toString());
    conf.setInt("mrql.m", m);
    conf.setInt("mrql.n", n);
    conf.set("mrql.accumulator", accumulator_fnc.toString());
    conf.set("mrql.zero", zero.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setMapOutputKeyClass(GroupByJoinKey.class);
    job.setJarByClass(GroupByJoinPlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(GroupByJoinPartitioner.class);
    job.setSortComparatorClass(GroupByJoinSortComparator.class);
    job.setGroupingComparatorClass(GroupByJoinGroupingComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
            (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class);
    for (DataSource p : Y.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
            (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class);
    job.setReducerClass(JoinReducer.class);
    if (num_reducers > 0)
        job.setNumReduceTasks(num_reducers);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
        : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = false;
    return new DataSet(s, c, MapReducePlan.outputRecords(job));
}
From source file:org.apache.mrql.JoinOperation.java
License:Apache License
/** The MapReduce2 physical operator (a reduce-side join)
 * @param mx left mapper function
 * @param my right mapper function
 * @param combine_fnc optional in-mapper combiner function
 * @param reduce_fnc reducer function
 * @param acc_fnc optional accumulator function
 * @param zero optional the zero value for the accumulator
 * @param X left data set
 * @param Y right data set
 * @param num_reduces number of reducers
 * @param stop_counter optional counter used in repeat operation
 * @param orderp does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce2(Tree mx,  // left mapper function
        Tree my,              // right mapper function
        Tree combine_fnc,     // optional in-mapper combiner function
        Tree reduce_fnc,      // reducer function
        Tree acc_fnc,         // optional accumulator function
        Tree zero,            // optional the zero value for the accumulator
        DataSet X,            // left data set
        DataSet Y,            // right data set
        int num_reduces,      // number of reducers
        String stop_counter,  // optional counter used in repeat operation
        boolean orderp)       // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper.left", mx.toString());
    conf.set("mrql.mapper.right", my.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setMapOutputKeyClass(JoinKey.class);
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerJoinPartitioner.class);
    job.setSortComparatorClass(MRContainerSortComparator.class);
    job.setGroupingComparatorClass(MRContainerGroupingComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
            (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class);
    for (DataSource p : Y.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
            (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce2_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce2 reducer");
    job.setReducerClass(JoinReducer.class);
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
        : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}
From source file:org.apache.mrql.MapReduceOperation.java
License:Apache License
/**
 * The MapReduce physical operator
 * @param map_fnc the mapper function
 * @param combine_fnc optional in-mapper combiner function
 * @param reduce_fnc the reducer function
 * @param acc_fnc optional accumulator function
 * @param zero optional the zero value for the accumulator
 * @param source the input data source
 * @param num_reduces number of reducers
 * @param stop_counter optional counter used in repeat operation
 * @param orderp does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce(Tree map_fnc,  // mapper function
        Tree combine_fnc,     // optional in-mapper combiner function
        Tree reduce_fnc,      // reducer function
        Tree acc_fnc,         // optional accumulator function
        Tree zero,            // optional the zero value for the accumulator
        DataSet source,       // input data source
        int num_reduces,      // number of reducers
        String stop_counter,  // optional counter used in repeat operation
        boolean orderp)       // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper", map_fnc.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) { // will use in-mapper combiner
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(source, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerPartitioner.class);
    job.setSortComparatorClass(MRContainerKeyComparator.class);
    job.setGroupingComparatorClass(MRContainerKeyComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    for (DataSource p : source.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
            (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MRMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setReducerClass(MRReducer.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce reducer");
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
        : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}
From source file:org.apache.phoenix.mapreduce.MultiHfileOutputFormat.java
License:Apache License
/**
 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
 * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
 */
static void configurePartitioner(Job job, Set<TableRowkeyPair> tablesStartKeys) throws IOException {
    Configuration conf = job.getConfiguration();
    // create the partitions file
    Path partitionsPath = new Path(conf.get("hadoop.tmp.dir"), "partitions_" + UUID.randomUUID());
    FileSystem fs = partitionsPath.getFileSystem(conf);
    fs.makeQualified(partitionsPath);
    writePartitions(conf, partitionsPath, tablesStartKeys);
    fs.deleteOnExit(partitionsPath);

    // configure job to use it
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
From source file:org.apache.rya.accumulo.mr.merge.CopyTool.java
License:Apache License
private void setupSplitsFile(final Job job, final TableOperations parentTableOperations,
        final String parentTableName, final String childTableName) throws Exception {
    final FileSystem fs = FileSystem.get(conf);
    fs.setPermission(getPath(baseOutputDir, childTableName),
        new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    final Path splitsPath = getPath(baseOutputDir, childTableName, "splits.txt");
    final Collection<Text> splits = parentTableOperations.listSplits(parentTableName, 100);
    log.info("Creating splits file at: " + splitsPath);
    try (PrintStream out = new PrintStream(new BufferedOutputStream(fs.create(splitsPath)), false,
            StandardCharsets.UTF_8.name())) {
        for (final Text split : splits) {
            final String encoded = new String(Base64.encodeBase64(TextUtil.getBytes(split)),
                StandardCharsets.UTF_8);
            out.println(encoded);
        }
    }
    fs.setPermission(splitsPath, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));

    final String userDir = System.getProperty("user.dir");
    // The splits file has a symlink created in the user directory for some reason.
    // It might be better to copy the entire file for Windows but it doesn't seem to matter if
    // the user directory symlink is broken.
    java.nio.file.Files.deleteIfExists(new File(userDir, "splits.txt").toPath());
    //Files.copy(new File(splitsPath.toString()), new File(userDir, "splits.txt"));

    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsPath.toString());
    job.setNumReduceTasks(splits.size() + 1);
}
From source file:org.apache.solr.hadoop.ForkedMapReduceIndexerTool.java
License:Apache License
public static int runIndexingPipeline(Job job, JobProcessCallback callback, Configuration conf, Options options,
        long programStartTime, FileSystem fs, Path fullInputList, long numFiles, int realMappers, int reducers)
        throws IOException, KeeperException, InterruptedException, ClassNotFoundException, FileNotFoundException {
    long startTime;
    float secs;

    Path outputResultsDir = new Path(options.outputDir, RESULTS_DIR);
    Path outputReduceDir = new Path(options.outputDir, "reducers");
    Path outputTreeMergeStep = new Path(options.outputDir, "mtree-merge-output");

    FileOutputFormat.setOutputPath(job, outputReduceDir);

    if (job.getConfiguration().get(JobContext.REDUCE_CLASS_ATTR) == null) { // enable customization
        job.setReducerClass(SolrReducer.class);
    }
    if (options.updateConflictResolver == null) {
        throw new IllegalArgumentException("updateConflictResolver must not be null");
    }
    job.getConfiguration().set(SolrReducer.UPDATE_CONFLICT_RESOLVER, options.updateConflictResolver);
    job.getConfiguration().setInt(SolrOutputFormat.SOLR_RECORD_WRITER_MAX_SEGMENTS, options.maxSegments);

    if (options.zkHost != null) {
        assert options.collection != null;
        /*
         * MapReduce partitioner that partitions the Mapper output such that each
         * SolrInputDocument gets sent to the SolrCloud shard that it would have
         * been sent to if the document were ingested via the standard SolrCloud
         * Near Real Time (NRT) API.
         *
         * In other words, this class implements the same partitioning semantics
         * as the standard SolrCloud NRT API. This enables to mix batch updates
         * from MapReduce ingestion with updates from standard NRT ingestion on
         * the same SolrCloud cluster, using identical unique document keys.
         */
        if (job.getConfiguration().get(JobContext.PARTITIONER_CLASS_ATTR) == null) { // enable customization
            job.setPartitionerClass(ForkedSolrCloudPartitioner.class);
        }
        job.getConfiguration().set(ForkedSolrCloudPartitioner.ZKHOST, options.zkHost);
        job.getConfiguration().set(ForkedSolrCloudPartitioner.COLLECTION, options.collection);
    }
    job.getConfiguration().setInt(ForkedSolrCloudPartitioner.SHARDS, options.shards);

    job.setOutputFormatClass(SolrOutputFormat.class);
    if (options.solrHomeDir != null) {
        SolrOutputFormat.setupSolrHomeCache(options.solrHomeDir, job);
    } else {
        assert options.zkHost != null;
        // use the config that this collection uses for the SolrHomeCache.
        ForkedZooKeeperInspector zki = new ForkedZooKeeperInspector();
        SolrZkClient zkClient = zki.getZkClient(options.zkHost);
        try {
            String configName = zki.readConfigName(zkClient, options.collection);
            File tmpSolrHomeDir = zki.downloadConfigDir(zkClient, configName);
            SolrOutputFormat.setupSolrHomeCache(tmpSolrHomeDir, job);
            LOG.debug("Using " + tmpSolrHomeDir + " as solr home");
            options.solrHomeDir = tmpSolrHomeDir;
        } finally {
            zkClient.close();
        }
    }

    // MorphlineMapRunner runner = setupMorphline(job, options);
    // if (options.isDryRun && runner != null) {
    //     LOG.info("Indexing {} files in dryrun mode", numFiles);
    //     startTime = System.currentTimeMillis();
    //     dryRun(job, runner, fs, fullInputList);
    //     secs = (System.currentTimeMillis() - startTime) / 1000.0f;
    //     LOG.info("Done. Indexing {} files in dryrun mode took {} secs", numFiles, secs);
    //     goodbye(null, programStartTime);
    //     return 0;
    // }
    // job.getConfiguration().set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, options.morphlineFile.getName());

    job.setNumReduceTasks(reducers);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SolrInputDocumentWritable.class);
    LOG.info("Indexing data into {} reducers", new Object[] { reducers });
    startTime = System.currentTimeMillis();
    job.submit();
    callback.jobStarted(job.getJobID().toString(), job.getTrackingURL());
    if (!waitForCompletion(job, options.isVerbose)) {
        return -1; // job failed
    }

    secs = (System.currentTimeMillis() - startTime) / 1000.0f;
    LOG.info("Done. Indexing data into {} reducers took {} secs", new Object[] { reducers, secs });

    int mtreeMergeIterations = 0;
    if (reducers > options.shards) {
        mtreeMergeIterations = (int) Math.round(log(options.fanout, reducers / options.shards));
    }
    LOG.debug("MTree merge iterations to do: {}", mtreeMergeIterations);
    int mtreeMergeIteration = 1;
    while (reducers > options.shards) { // run a mtree merge iteration
        job = Job.getInstance(conf);
        job.setJarByClass(ForkedMapReduceIndexerTool.class);
        job.setJobName(ForkedMapReduceIndexerTool.class.getName() + "/"
            + Utils.getShortClassName(ForkedTreeMergeMapper.class));
        job.setMapperClass(ForkedTreeMergeMapper.class);
        job.setOutputFormatClass(ForkedTreeMergeOutputFormat.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setInputFormatClass(NLineInputFormat.class);

        Path inputStepDir = new Path(options.outputDir, "mtree-merge-input-iteration" + mtreeMergeIteration);
        fullInputList = new Path(inputStepDir, FULL_INPUT_LIST);
        LOG.debug("MTree merge iteration {}/{}: Creating input list file for mappers {}",
            new Object[] { mtreeMergeIteration, mtreeMergeIterations, fullInputList });
        numFiles = createTreeMergeInputDirList(job, outputReduceDir, fs, fullInputList);
        if (numFiles != reducers) {
            throw new IllegalStateException("Not same reducers: " + reducers + ", numFiles: " + numFiles);
        }
        NLineInputFormat.addInputPath(job, fullInputList);
        NLineInputFormat.setNumLinesPerSplit(job, options.fanout);
        FileOutputFormat.setOutputPath(job, outputTreeMergeStep);

        LOG.info("MTree merge iteration {}/{}: Merging {} shards into {} shards using fanout {}", new Object[] {
            mtreeMergeIteration, mtreeMergeIterations, reducers, (reducers / options.fanout), options.fanout });
        startTime = System.currentTimeMillis();
        job.submit();
        callback.jobStarted(job.getJobID().toString(), job.getTrackingURL());
        if (!waitForCompletion(job, options.isVerbose)) {
            return -1; // job failed
        }
        if (!renameTreeMergeShardDirs(outputTreeMergeStep, job, fs)) {
            return -1;
        }
        secs = (System.currentTimeMillis() - startTime) / 1000.0f;
        LOG.info(
            "MTree merge iteration {}/{}: Done. Merging {} shards into {} shards using fanout {} took {} secs",
            new Object[] { mtreeMergeIteration, mtreeMergeIterations, reducers, (reducers / options.fanout),
                options.fanout, secs });

        if (!delete(outputReduceDir, true, fs)) {
            return -1;
        }
        if (!rename(outputTreeMergeStep, outputReduceDir, fs)) {
            return -1;
        }
        assert reducers % options.fanout == 0;
        reducers = reducers / options.fanout;
        mtreeMergeIteration++;
    }
    assert reducers == options.shards;

    // normalize output shard dir prefix, i.e.
    // rename part-r-00000 to part-00000 (stems from zero tree merge iterations)
    // rename part-m-00000 to part-00000 (stems from > 0 tree merge iterations)
    for (FileStatus stats : fs.listStatus(outputReduceDir)) {
        String dirPrefix = SolrOutputFormat.getOutputName(job);
        Path srcPath = stats.getPath();
        if (stats.isDirectory() && srcPath.getName().startsWith(dirPrefix)) {
            String dstName = dirPrefix + srcPath.getName().substring(dirPrefix.length() + "-m".length());
            Path dstPath = new Path(srcPath.getParent(), dstName);
            if (!rename(srcPath, dstPath, fs)) {
                return -1;
            }
        }
    }

    // publish results dir
    if (!rename(outputReduceDir, outputResultsDir, fs)) {
        return -1;
    }

    if (options.goLive && !new GoLive().goLive(options, listSortedOutputShardDirs(job, outputResultsDir, fs))) {
        return -1;
    }

    goodbye(job, programStartTime);
    return 0;
}