List of usage examples for org.apache.hadoop.mapred JobConf setOutputValueGroupingComparator
public void setOutputValueGroupingComparator(Class<? extends RawComparator> theClass)
From source file:NaivePageRank.java
License:Apache License
public static void main(String[] args) throws Exception { int iteration = -1; String inputPath = args[0];//www. j av a 2 s . com String outputPath = args[1]; int specIteration = 0; if (args.length > 2) { specIteration = Integer.parseInt(args[2]); } int numNodes = 100000; if (args.length > 3) { numNodes = Integer.parseInt(args[3]); } int numReducers = 32; if (args.length > 4) { numReducers = Integer.parseInt(args[4]); } System.out.println("specified iteration: " + specIteration); long start = System.currentTimeMillis(); /** * job to count out-going links for each url */ JobConf conf = new JobConf(NaivePageRank.class); conf.setJobName("PageRank-Count"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(CountMapper.class); conf.setReducerClass(CountReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/count")); conf.setNumReduceTasks(numReducers); JobClient.runJob(conf); /******************** Initial Rank Assignment Job ***********************/ conf = new JobConf(NaivePageRank.class); conf.setJobName("PageRank-Initialize"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(InitialRankAssignmentMapper.class); conf.setReducerClass(InitialRankAssignmentReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration)); conf.setNumReduceTasks(numReducers); // conf.setIterative(false); JobClient.runJob(conf); iteration++; do { /****************** Join Job ********************************/ conf = new JobConf(NaivePageRank.class); conf.setJobName("PageRank-Join"); conf.setOutputKeyClass(Text.class); // conf.setOutputValueClass(Text.class); conf.setMapperClass(ComputeRankMap.class); conf.setReducerClass(ComputeRankReduce.class); conf.setMapOutputKeyClass(TextPair.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setPartitionerClass(FirstPartitioner.class); conf.setOutputKeyComparatorClass(KeyComparator.class); conf.setOutputValueGroupingComparator(GroupComparator.class); // relation table FileInputFormat.setInputPaths(conf, new Path(inputPath)); // rank table FileInputFormat.addInputPath(conf, new Path(outputPath + "/i" + (iteration - 1))); // count table FileInputFormat.addInputPath(conf, new Path(outputPath + "/count")); FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration)); conf.setNumReduceTasks(numReducers); JobClient.runJob(conf); iteration++; /******************** Rank Aggregate Job ***********************/ conf = new JobConf(NaivePageRank.class); conf.setJobName("PageRank-Aggregate"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapOutputKeyClass(Text.class); conf.setMapperClass(RankAggregateMapper.class); conf.setReducerClass(RankAggregateReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(outputPath + "/i" + (iteration - 1))); FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration)); conf.setNumReduceTasks(numReducers); conf.setInt("haloop.num.nodes", numNodes); JobClient.runJob(conf); iteration++; } while (iteration < 2 * specIteration); long end = System.currentTimeMillis(); System.out.println("running time " + (end - start) / 1000 + "s"); }
From source file:cascading.flow.FlowStep.java
License:Open Source License
protected JobConf getJobConf(JobConf parentConf) throws IOException { JobConf conf = parentConf == null ? new JobConf() : new JobConf(parentConf); // set values first so they can't break things downstream if (hasProperties()) { for (Map.Entry entry : getProperties().entrySet()) conf.set(entry.getKey().toString(), entry.getValue().toString()); }/*from w w w . j a v a 2s .c o m*/ // disable warning conf.setBoolean("mapred.used.genericoptionsparser", true); conf.setJobName(getStepName()); conf.setOutputKeyClass(Tuple.class); conf.setOutputValueClass(Tuple.class); conf.setMapperClass(FlowMapper.class); conf.setReducerClass(FlowReducer.class); // set for use by the shuffling phase TupleSerialization.setSerializations(conf); initFromSources(conf); initFromSink(conf); initFromTraps(conf); if (sink.getScheme().getNumSinkParts() != 0) { // if no reducer, set num map tasks to control parts if (getGroup() != null) conf.setNumReduceTasks(sink.getScheme().getNumSinkParts()); else conf.setNumMapTasks(sink.getScheme().getNumSinkParts()); } conf.setOutputKeyComparatorClass(TupleComparator.class); if (getGroup() == null) { conf.setNumReduceTasks(0); // disable reducers } else { // must set map output defaults when performing a reduce conf.setMapOutputKeyClass(Tuple.class); conf.setMapOutputValueClass(Tuple.class); // handles the case the groupby sort should be reversed if (getGroup().isSortReversed()) conf.setOutputKeyComparatorClass(ReverseTupleComparator.class); addComparators(conf, "cascading.group.comparator", getGroup().getGroupingSelectors()); if (getGroup().isGroupBy()) addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors()); if (!getGroup().isGroupBy()) { conf.setPartitionerClass(CoGroupingPartitioner.class); conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index conf.setMapOutputValueClass(IndexTuple.class); conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index conf.setOutputValueGroupingComparator(CoGroupingComparator.class); } if (getGroup().isSorted()) { conf.setPartitionerClass(GroupingPartitioner.class); conf.setMapOutputKeyClass(TuplePair.class); if (getGroup().isSortReversed()) conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class); else conf.setOutputKeyComparatorClass(GroupingSortingComparator.class); // no need to supply a reverse comparator, only equality is checked conf.setOutputValueGroupingComparator(GroupingComparator.class); } } // perform last so init above will pass to tasks conf.setInt("cascading.flow.step.id", id); conf.set("cascading.flow.step", Util.serializeBase64(this)); return conf; }
From source file:cascading.flow.hadoop.HadoopFlowStep.java
License:Open Source License
public JobConf createInitializedConfig(FlowProcess<JobConf> flowProcess, JobConf parentConfig) { JobConf conf = parentConfig == null ? new JobConf() : HadoopUtil.copyJobConf(parentConfig); // disable warning conf.setBoolean("mapred.used.genericoptionsparser", true); conf.setJobName(getStepDisplayName(conf.getInt("cascading.display.id.truncate", Util.ID_LENGTH))); conf.setOutputKeyClass(Tuple.class); conf.setOutputValueClass(Tuple.class); conf.setMapRunnerClass(FlowMapper.class); conf.setReducerClass(FlowReducer.class); // set for use by the shuffling phase TupleSerialization.setSerializations(conf); initFromSources(flowProcess, conf);/*from www .jav a 2 s .c o m*/ initFromSink(flowProcess, conf); initFromTraps(flowProcess, conf); initFromStepConfigDef(conf); int numSinkParts = getSink().getScheme().getNumSinkParts(); if (numSinkParts != 0) { // if no reducer, set num map tasks to control parts if (getGroup() != null) conf.setNumReduceTasks(numSinkParts); else conf.setNumMapTasks(numSinkParts); } else if (getGroup() != null) { int gatherPartitions = conf.getNumReduceTasks(); if (gatherPartitions == 0) gatherPartitions = conf.getInt(FlowRuntimeProps.GATHER_PARTITIONS, 0); if (gatherPartitions == 0) throw new FlowException(getName(), "a default number of gather partitions must be set, see FlowRuntimeProps"); conf.setNumReduceTasks(gatherPartitions); } conf.setOutputKeyComparatorClass(TupleComparator.class); if (getGroup() == null) { conf.setNumReduceTasks(0); // disable reducers } else { // must set map output defaults when performing a reduce conf.setMapOutputKeyClass(Tuple.class); conf.setMapOutputValueClass(Tuple.class); conf.setPartitionerClass(GroupingPartitioner.class); // handles the case the groupby sort should be reversed if (getGroup().isSortReversed()) conf.setOutputKeyComparatorClass(ReverseTupleComparator.class); addComparators(conf, "cascading.group.comparator", getGroup().getKeySelectors(), this, getGroup()); if (getGroup().isGroupBy()) addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors(), this, getGroup()); if (!getGroup().isGroupBy()) { conf.setPartitionerClass(CoGroupingPartitioner.class); conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index conf.setMapOutputValueClass(IndexTuple.class); conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index conf.setOutputValueGroupingComparator(CoGroupingComparator.class); } if (getGroup().isSorted()) { conf.setPartitionerClass(GroupingSortingPartitioner.class); conf.setMapOutputKeyClass(TuplePair.class); if (getGroup().isSortReversed()) conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class); else conf.setOutputKeyComparatorClass(GroupingSortingComparator.class); // no need to supply a reverse comparator, only equality is checked conf.setOutputValueGroupingComparator(GroupingComparator.class); } } // perform last so init above will pass to tasks String versionString = Version.getRelease(); if (versionString != null) conf.set("cascading.version", versionString); conf.set(CASCADING_FLOW_STEP_ID, getID()); conf.set("cascading.flow.step.num", Integer.toString(getOrdinal())); HadoopUtil.setIsInflow(conf); Iterator<FlowNode> iterator = getFlowNodeGraph().getTopologicalIterator(); String mapState = pack(iterator.next(), conf); String reduceState = pack(iterator.hasNext() ? iterator.next() : null, conf); // hadoop 20.2 doesn't like dist cache when using local mode int maxSize = Short.MAX_VALUE; int length = mapState.length() + reduceState.length(); if (isHadoopLocalMode(conf) || length < maxSize) // seems safe { conf.set("cascading.flow.step.node.map", mapState); if (!Util.isEmpty(reduceState)) conf.set("cascading.flow.step.node.reduce", reduceState); } else { conf.set("cascading.flow.step.node.map.path", HadoopMRUtil.writeStateToDistCache(conf, getID(), "map", mapState)); if (!Util.isEmpty(reduceState)) conf.set("cascading.flow.step.node.reduce.path", HadoopMRUtil.writeStateToDistCache(conf, getID(), "reduce", reduceState)); } return conf; }
From source file:com.datasalt.pangool.benchmark.urlresolution.HadoopUrlResolution.java
License:Apache License
public final static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 3) { System.err.println("Usage: urlresolution <url-map> <url-register> <out>"); System.exit(2);/*from w w w . j a v a2s . c om*/ } JobConf job = new JobConf(conf); FileSystem fS = FileSystem.get(conf); fS.delete(new Path(otherArgs[2]), true); MultipleInputs.addInputPath(job, new Path(otherArgs[0]), TextInputFormat.class, UrlMapClass.class); MultipleInputs.addInputPath(job, new Path(otherArgs[1]), TextInputFormat.class, UrlRegisterMapClass.class); job.setJarByClass(HadoopUrlResolution.class); job.setPartitionerClass(KeyPartitioner.class); job.setOutputValueGroupingComparator(GroupingComparator.class); job.setMapOutputKeyClass(UrlRegJoinUrlMap.class); job.setMapOutputValueClass(NullWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); Job j = new Job(job); j.setReducerClass(Reduce.class); j.waitForCompletion(true); }
From source file:com.hadoop.secondarysort.SecondarySort_MapRed.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: secondarysrot <in> <out>"); System.exit(2);//from www. j a v a 2 s . co m } JobConf jobConf = new JobConf(conf); jobConf.setMapperClass(MapClass.class); jobConf.setReducerClass(Reduce.class); jobConf.setPartitionerClass(FirstPartitioner.class); jobConf.setOutputValueGroupingComparator(FirstGroupingComparator.class); jobConf.setMapOutputKeyClass(IntPair.class); jobConf.setMapOutputValueClass(IntWritable.class); jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(IntWritable.class); // // Job job = new Job(conf, "secondary sort"); // job.setJarByClass(SecondarySort_MapRed.class); // job.setMapperClass(MapClass.class); // job.setReducerClass(Reduce.class); // // // group and partition by the first int in the pair // job.setPartitionerClass(FirstPartitioner.class); // job.setGroupingComparatorClass(FirstGroupingComparator.class); // conf.setClass("mapred.output.key.comparator.class", // KeyComparator.class, RawComparator.class); // // job.setSortComparatorClass(SecondGroupingComparator.class); // // the map output is IntPair, IntWritable // job.setMapOutputKeyClass(IntPair.class); // job.setMapOutputValueClass(IntWritable.class); // // // the reduce output is Text, IntWritable // job.setOutputKeyClass(Text.class); // job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(jobConf, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs[1])); }
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMR.java
License:Open Source License
/** * /*from w w w . jav a 2 s . c o m*/ * @param fname null if no comparison required * @param fnameNew * @param srcFnames * @param ii * @param oi * @param rlen * @param clen * @param brlen * @param bclen * @throws DMLRuntimeException */ @SuppressWarnings({ "unused", "deprecation" }) protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi, long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException { String jobname = "ParFor-RMMR"; long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; JobConf job; job = new JobConf(ResultMergeRemoteMR.class); job.setJobName(jobname + _pfid); //maintain dml script counters Statistics.incrementNoOfCompiledMRJobs(); //warning for textcell/binarycell without compare boolean withCompare = (fname != null); if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare && ResultMergeLocalFile.ALLOW_COPY_CELLFILES) LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi) + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR."); try { Path pathCompare = null; Path pathNew = new Path(fnameNew); ///// //configure the MR job if (withCompare) { pathCompare = new Path(fname).makeQualified(FileSystem.get(job)); MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii, LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen, bclen); } else MRJobConfiguration.setResultMergeInfo(job, "null", ii, LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, bclen, bclen); //set mappers, reducers, combiners job.setMapperClass(ResultMergeRemoteMapper.class); job.setReducerClass(ResultMergeRemoteReducer.class); if (oi == OutputInfo.TextCellOutputInfo) { job.setMapOutputKeyClass(MatrixIndexes.class); job.setMapOutputValueClass(TaggedMatrixCell.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); } else if (oi == OutputInfo.BinaryCellOutputInfo) { job.setMapOutputKeyClass(MatrixIndexes.class); job.setMapOutputValueClass(TaggedMatrixCell.class); job.setOutputKeyClass(MatrixIndexes.class); job.setOutputValueClass(MatrixCell.class); } else if (oi == OutputInfo.BinaryBlockOutputInfo) { //setup partitioning, grouping, sorting for composite key (old API) job.setPartitionerClass(ResultMergeRemotePartitioning.class); //partitioning job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class); //grouping job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class); //sorting job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class); job.setMapOutputValueClass(TaggedMatrixBlock.class); job.setOutputKeyClass(MatrixIndexes.class); job.setOutputValueClass(MatrixBlock.class); } //set input format job.setInputFormat(ii.inputFormatClass); //set the input path Path[] paths = null; if (withCompare) { paths = new Path[srcFnames.length + 1]; paths[0] = pathCompare; for (int i = 1; i < paths.length; i++) paths[i] = new Path(srcFnames[i - 1]); } else { paths = new Path[srcFnames.length]; for (int i = 0; i < paths.length; i++) paths[i] = new Path(srcFnames[i]); } FileInputFormat.setInputPaths(job, paths); //set output format job.setOutputFormat(oi.outputFormatClass); //set output path MapReduceTool.deleteFileIfExistOnHDFS(fnameNew); FileOutputFormat.setOutputPath(job, pathNew); ////// //set optimization parameters //set the number of mappers and reducers //job.setNumMapTasks( _numMappers ); //use default num mappers long reducerGroups = _numReducers; if (oi == OutputInfo.BinaryBlockOutputInfo) reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1); else //textcell/binarycell reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1); job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups)); //use FLEX scheduler configuration properties if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) { job.setInt("flex.map.min", 0); job.setInt("flex.map.max", _numMappers); job.setInt("flex.reduce.min", 0); job.setInt("flex.reduce.max", _numMappers); } //disable automatic tasks timeouts and speculative task exec job.setInt("mapred.task.timeout", 0); job.setMapSpeculativeExecution(false); //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); //enables the reuse of JVMs (multiple tasks per MR task) if (_jvmReuse) job.setNumTasksToExecutePerJvm(-1); //unlimited //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower) //job.set("mapred.compress.map.output", "true"); //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); //set the replication factor for the results job.setInt("dfs.replication", _replication); //set the max number of retries per map task // disabled job-level configuration to respect cluster configuration // note: this refers to hadoop2, hence it never had effect on mr1 //job.setInt("mapreduce.map.maxattempts", _max_retry); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); ///// // execute the MR job JobClient.runJob(job); //maintain dml script counters Statistics.incrementNoOfExecutedMRJobs(); } catch (Exception ex) { throw new DMLRuntimeException(ex); } if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0); } }
From source file:com.manning.hip.ch4.joins.improved.impl.OptimizedDataJoinJob.java
License:Apache License
public static JobConf createDataJoinJob(String args[]) throws IOException { String inputDir = args[0];//from w ww. ja v a2 s . c o m String outputDir = args[1]; Class inputFormat = SequenceFileInputFormat.class; if (args[2].compareToIgnoreCase("text") != 0) { System.out.println("Using SequenceFileInputFormat: " + args[2]); } else { System.out.println("Using TextInputFormat: " + args[2]); inputFormat = TextInputFormat.class; } int numOfReducers = Integer.parseInt(args[3]); Class mapper = getClassByName(args[4]); Class reducer = getClassByName(args[5]); Class mapoutputValueClass = getClassByName(args[6]); Class outputFormat = TextOutputFormat.class; Class outputValueClass = Text.class; if (args[7].compareToIgnoreCase("text") != 0) { System.out.println("Using SequenceFileOutputFormat: " + args[7]); outputFormat = SequenceFileOutputFormat.class; outputValueClass = getClassByName(args[7]); } else { System.out.println("Using TextOutputFormat: " + args[7]); } long maxNumOfValuesPerGroup = 100; String jobName = ""; if (args.length > 8) { maxNumOfValuesPerGroup = Long.parseLong(args[8]); } if (args.length > 9) { jobName = args[9]; } Configuration defaults = new Configuration(); JobConf job = new JobConf(defaults, OptimizedDataJoinJob.class); job.setJobName("DataJoinJob: " + jobName); FileSystem fs = FileSystem.get(defaults); fs.delete(new Path(outputDir)); FileInputFormat.setInputPaths(job, inputDir); job.setInputFormat(inputFormat); job.setMapperClass(mapper); FileOutputFormat.setOutputPath(job, new Path(outputDir)); job.setOutputFormat(outputFormat); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setMapOutputKeyClass(CompositeKey.class); job.setMapOutputValueClass(mapoutputValueClass); job.setOutputKeyClass(Text.class); job.setOutputValueClass(outputValueClass); job.setReducerClass(reducer); job.setPartitionerClass(CompositeKeyPartitioner.class); job.setOutputKeyComparatorClass(CompositeKeyComparator.class); job.setOutputValueGroupingComparator(CompositeKeyOnlyComparator.class); job.setNumMapTasks(1); job.setNumReduceTasks(numOfReducers); job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup); return job; }
From source file:crunch.MaxTemperature.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 3) { JobBuilder.printUsage(this, "<ncdc input> <station input> <output>"); return -1; }// w w w.ja v a2s.com JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Join record with station name"); Path ncdcInputPath = new Path(args[0]); Path stationInputPath = new Path(args[1]); Path outputPath = new Path(args[2]); MultipleInputs.addInputPath(conf, ncdcInputPath, TextInputFormat.class, JoinRecordMapper.class); MultipleInputs.addInputPath(conf, stationInputPath, TextInputFormat.class, JoinStationMapper.class); FileOutputFormat.setOutputPath(conf, outputPath); /*[*/conf.setPartitionerClass(KeyPartitioner.class); conf.setOutputValueGroupingComparator(TextPair.FirstComparator.class);/*]*/ conf.setMapOutputKeyClass(TextPair.class); conf.setReducerClass(JoinReducer.class); conf.setOutputKeyClass(Text.class); JobClient.runJob(conf); return 0; }
From source file:crunch.MaxTemperature.java
License:Apache License
@Override public int run(String[] args) throws IOException { JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args); if (conf == null) { return -1; }/*from w w w . j ava 2 s.c om*/ conf.setMapperClass(MaxTemperatureMapper.class); /*[*/conf.setPartitionerClass(FirstPartitioner.class); /*]*/ /*[*/conf.setOutputKeyComparatorClass(KeyComparator.class); /*]*/ /*[*/conf.setOutputValueGroupingComparator(GroupComparator.class);/*]*/ conf.setReducerClass(MaxTemperatureReducer.class); conf.setOutputKeyClass(IntPair.class); conf.setOutputValueClass(NullWritable.class); JobClient.runJob(conf); return 0; }
From source file:de.l3s.streamcorpus.mapreduce.TerrierIndexing.java
License:Mozilla Public License
/** Starts the MapReduce indexing. * @param args//from w w w. j a v a 2 s . c o m * @throws Exception */ public int run(String[] args) throws Exception { long time = System.currentTimeMillis(); // For the moment: Hard-code the terrier home to quick test System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer"); boolean docPartitioned = false; int numberOfReducers = Integer .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26")); final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing"); if (args.length == 2 && args[0].equals("-p")) { logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices."); numberOfReducers = Integer.parseInt(args[1]); docPartitioned = true; } else if (args.length == 1 && args[0].equals("--merge")) { if (numberOfReducers > 1) mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers); else logger.error("No point merging 1 reduce task output"); return 0; } else if (args.length == 0) { logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index."); docPartitioned = false; if (numberOfReducers > MAX_REDUCE) { logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use " + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most"); } } /*else { logger.fatal(usage()); return 0; }*/ if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0], false) instanceof BitCompressionConfiguration)) { logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing" + " - you can recompress the inverted index later using IndexRecompressor"); return 0; } if (jf == null) throw new Exception("Could not get JobFactory from HadoopPlugin"); final JobConf conf = jf.newJob(); conf.setJarByClass(TerrierIndexing.class); conf.setJobName("StreamCorpusIndexer: Terrier Indexing"); if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH) && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) { logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + "," + ApplicationSetup.TERRIER_INDEX_PREFIX); return 0; } // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING; boolean blockIndexing = true; if (blockIndexing) { conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class); conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class); } else { conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class); conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class); } FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH)); conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX); conf.setMapOutputKeyClass(SplitEmittedTerm.class); conf.setMapOutputValueClass(MapEmittedPostingList.class); conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned); if (!conf.get("mapred.job.tracker").equals("local")) { conf.setMapOutputCompressorClass(GzipCodec.class); conf.setCompressMapOutput(true); } else { conf.setCompressMapOutput(false); } conf.setInputFormat(MultiFileCollectionInputFormat.class); conf.setOutputFormat(NullOutputFormat.class); conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class); conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class); conf.setReduceSpeculativeExecution(false); //parse the collection.spec BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC); String line = null; List<Path> paths = new ArrayList<Path>(); while ((line = specBR.readLine()) != null) { if (line.startsWith("#")) continue; paths.add(new Path(line)); } specBR.close(); FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()])); // not sure if this is effective in YARN conf.setNumMapTasks(2000); // increase the heap usage conf.set("mapreduce.map.memory.mb", "6100"); conf.set("mapred.job.map.memory.mb", "6100"); conf.set("mapreduce.reduce.memory.mb", "6144"); conf.set("mapred.job.reduce.memory.mb", "6144"); conf.set("mapreduce.map.java.opts", "-Xmx6100m"); conf.set("mapred.map.child.java.opts", "-Xmx6100m"); conf.set("mapreduce.reduce.java.opts", "-Xmx6144m"); conf.set("mapred.reduce.child.opts", "-Xmx6144m"); //conf.setBoolean("mapred.used.genericoptionsparser", true) ; // This is the nasty thing in MapReduce v2 and YARN: They always prefer their ancient jars first. Set this on to say you don't like it conf.set("mapreduce.job.user.classpath.first", "true"); // increase the yarn memory to 10 GB conf.set("yarn.nodemanager.resource.memory-mb", "12288"); conf.set("yarn.nodemanager.resource.cpu-vcores", "16"); conf.set("yarn.scheduler.minimum-allocation-mb", "4096"); conf.setNumReduceTasks(numberOfReducers); if (numberOfReducers > 1) { if (docPartitioned) conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class); else conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class); } else { //for JUnit tests, we seem to need to restore the original partitioner class conf.setPartitionerClass(HashPartitioner.class); } /*JobID jobId = null; boolean ranOK = true; try{ RunningJob rj = JobClient.runJob(conf); jobId = rj.getID(); HadoopUtility.finishTerrierJob(conf); } catch (Exception e) { logger.error("Problem running job", e); e.printStackTrace(); ranOK = false; } if (jobId != null) { deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId); } */ //if (ranOK) //{ System.out.println("Merging indices"); if (!docPartitioned) { if (numberOfReducers > 1) mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers); } Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH, docPartitioned ? numberOfReducers : 1, jf); //} System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds"); jf.close(); return 0; }