List of usage examples for org.apache.hadoop.mapreduce.Job#setGroupingComparatorClass
public void setGroupingComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
From source file:org.apache.mahout.cf.taste.hadoop.als.PredictionJob.java
License:Apache License
/**
 * Runs the rating-prediction pipeline as five MapReduce jobs executed one after another:
 * three conversion jobs (pairs, user features, item features), a job joining the converted
 * pairs with the converted item features, and a final job that reads the joined pairs
 * together with the converted user features and writes text output.
 *
 * <p>Each job consumes the output of an earlier one, so the pipeline stops as soon as any
 * job reports failure instead of running the remaining jobs on missing or partial data.
 *
 * @param args command-line arguments; {@code --pairs}, {@code --userFeatures} and
 *             {@code --itemFeatures} are registered as required options
 * @return {@code 0} if all jobs completed successfully, {@code -1} if the arguments could
 *         not be parsed or any job failed
 * @throws Exception if a job cannot be prepared or submitted
 */
@Override
public int run(String[] args) throws Exception {
    addOption("pairs", "p", "path containing the test ratings, each line must be: userID,itemID", true);
    addOption("userFeatures", "u", "path to the user feature matrix", true);
    addOption("itemFeatures", "i", "path to the item feature matrix", true);
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path pairs = new Path(parsedArgs.get("--pairs"));
    Path userFeatures = new Path(parsedArgs.get("--userFeatures"));
    Path itemFeatures = new Path(parsedArgs.get("--itemFeatures"));
    // "--tempDir" is not registered in this method; presumably the base class adds it -- confirm.
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path convertedPairs = new Path(tempDirPath, "convertedPairs");
    Path convertedUserFeatures = new Path(tempDirPath, "convertedUserFeatures");
    Path convertedItemFeatures = new Path(tempDirPath, "convertedItemFeatures");
    Path pairsJoinedWithItemFeatures = new Path(tempDirPath, "pairsJoinedWithItemFeatures");

    Job convertPairs = prepareJob(pairs, convertedPairs, TextInputFormat.class, PairsMapper.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    if (!convertPairs.waitForCompletion(true)) {
        return -1;
    }

    Job convertUserFeatures = prepareJob(userFeatures, convertedUserFeatures, SequenceFileInputFormat.class,
            FeaturesMapper.class, TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    if (!convertUserFeatures.waitForCompletion(true)) {
        return -1;
    }

    Job convertItemFeatures = prepareJob(itemFeatures, convertedItemFeatures, SequenceFileInputFormat.class,
            FeaturesMapper.class, TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    if (!convertItemFeatures.waitForCompletion(true)) {
        return -1;
    }

    // Both inputs are passed as one comma-separated path string.
    Job joinPairsWithItemFeatures = prepareJob(new Path(convertedPairs + "," + convertedItemFeatures),
            pairsJoinedWithItemFeatures, SequenceFileInputFormat.class, Mapper.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, JoinProbesWithItemFeaturesReducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    joinPairsWithItemFeatures.setPartitionerClass(HashPartitioner.class);
    joinPairsWithItemFeatures.setGroupingComparatorClass(TaggedVarIntWritable.GroupingComparator.class);
    if (!joinPairsWithItemFeatures.waitForCompletion(true)) {
        return -1;
    }

    Job predictRatings = prepareJob(new Path(pairsJoinedWithItemFeatures + "," + convertedUserFeatures),
            getOutputPath(), SequenceFileInputFormat.class, Mapper.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, PredictRatingReducer.class,
            Text.class, NullWritable.class, TextOutputFormat.class);
    predictRatings.setPartitionerClass(HashPartitioner.class);
    predictRatings.setGroupingComparatorClass(TaggedVarIntWritable.GroupingComparator.class);

    return predictRatings.waitForCompletion(true) ? 0 : -1;
}
From source file:org.apache.mahout.graph.common.EnumerateTrianglesJob.java
License:Apache License
/**
 * Enumerates triangles through a chain of five MapReduce jobs: scatter edges and sum
 * degrees, join the degree information onto the edges, build open triads, prepare the
 * closing edges, and finally join open triads with closing edges.
 *
 * <p>Every job reads the output of a previous one, so the chain aborts on the first job
 * that reports failure.
 *
 * @param args command-line arguments; {@code --text} (default {@code false}) switches the
 *             final output from sequence files to text
 * @return {@code 0} on success, {@code -1} if the arguments could not be parsed or any
 *         job failed
 * @throws Exception if a job cannot be prepared or submitted
 */
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("text", "t", "output in textformat?", String.valueOf(Boolean.FALSE));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Class<? extends FileOutputFormat> outputFormat = Boolean.parseBoolean(parsedArgs.get("--text"))
            ? TextOutputFormat.class
            : SequenceFileOutputFormat.class;

    /* scatter the edges to each of the vertices and count degree */
    Job scatter = prepareJob(getInputPath(), getTempPath(TMP_AUGMENTED_EDGES), ScatterEdgesMapper.class,
            Vertex.class, Vertex.class, SumDegreesReducer.class, UndirectedEdge.class,
            VertexWithDegree.class);
    if (!scatter.waitForCompletion(true)) {
        return -1;
    }

    /* join augmented edges with partial degree information to complete records */
    Job join = prepareJob(getTempPath(TMP_AUGMENTED_EDGES), getTempPath(TMP_EDGES_WITH_DEGREES),
            Mapper.class, UndirectedEdge.class, VertexWithDegree.class, JoinDegreesReducer.class,
            UndirectedEdgeWithDegrees.class, NullWritable.class);
    if (!join.waitForCompletion(true)) {
        return -1;
    }

    /* scatter the edges to lower degree vertex and build open triads */
    Job scatterToLower = prepareJob(getTempPath(TMP_EDGES_WITH_DEGREES), getTempPath(TMP_OPEN_TRIADS),
            ScatterEdgesToLowerDegreeVertexMapper.class, Vertex.class, Vertex.class,
            BuildOpenTriadsReducer.class, JoinableUndirectedEdge.class, VertexOrMarker.class);
    if (!scatterToLower.waitForCompletion(true)) {
        return -1;
    }

    /* necessary as long as we don't have access to an undeprecated MultipleInputs */
    Job prepareInput = prepareJob(getTempPath(TMP_EDGES_WITH_DEGREES), getTempPath(TMP_CLOSING_EDGES),
            PrepareInputMapper.class, JoinableUndirectedEdge.class, VertexOrMarker.class, Reducer.class,
            JoinableUndirectedEdge.class, VertexOrMarker.class);
    prepareInput.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    if (!prepareInput.waitForCompletion(true)) {
        return -1;
    }

    /* join opentriads and edges pairwise to get all triangles */
    Job joinTriads = prepareJob(getCombinedTempPath(TMP_OPEN_TRIADS, TMP_CLOSING_EDGES), getOutputPath(),
            SequenceFileInputFormat.class, Mapper.class, JoinableUndirectedEdge.class,
            VertexOrMarker.class, JoinTrianglesReducer.class, Triangle.class, NullWritable.class,
            outputFormat);
    joinTriads.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);

    return joinTriads.waitForCompletion(true) ? 0 : -1;
}
From source file:org.apache.mahout.graph.components.FindComponentsJob.java
License:Apache License
/**
 * Iteratively computes zone assignments for the vertices of the input edges and copies the
 * final assignments to the output path.
 *
 * <p>After an optional preparation phase, the loop repeats three jobs (scatter edges and
 * assign one zone, find interzone edges, assign new zones) until the
 * {@code ZONES_CONNECTED} counter of the interzone job is zero.
 *
 * <p>A failed job must abort the run: the loop's exit condition reads a counter from
 * {@code findInterzoneEdges}, so ignoring that job's failure could end the loop early and
 * publish stale assignments as the result.
 *
 * @param args command-line arguments
 * @return {@code 0} on success, {@code -1} if the arguments could not be parsed, any job
 *         failed, or the final copy reported failure
 * @throws Exception if a job cannot be prepared or submitted, or file system access fails
 */
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));
    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();

    AtomicInteger currentPhase = new AtomicInteger();

    Path edgesPath = inputPath;
    // NOTE(review): intermediate directories are named by the current timestamp; if the
    // preparation phase is skipped, this freshly named path has not been written by this
    // run -- confirm how phase skipping is meant to work here.
    Path zoneAssignmentsPath = new Path(tempDirPath, String.valueOf(System.currentTimeMillis()));

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /*
         * Prepare Input
         */
        Job prepareAssignments = prepareJob(edgesPath, zoneAssignmentsPath, SequenceFileInputFormat.class,
                PrepareAssignmentsFileMapper.class, Vertex.class, Vertex.class,
                PrepareAssignmentsFileReducer.class, Vertex.class, FlaggedVertex.class,
                SequenceFileOutputFormat.class);
        if (!prepareAssignments.waitForCompletion(true)) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /*
         * As long as there may be zones connected
         */
        while (true) {
            Path scatterEdgesAndAssignZoneOutputPath = new Path(tempDirPath,
                    String.valueOf(System.currentTimeMillis()));

            /*
             * Scatter edges and forward zone assignments,
             * assign one zone to edges
             */
            Job scatterEdgesAndAssignZone = prepareJob(
                    new Path(zoneAssignmentsPath.toString() + "," + edgesPath.toString()),
                    scatterEdgesAndAssignZoneOutputPath, SequenceFileInputFormat.class,
                    ScatterEdgesAndForwardZoneAssignmentsMapper.class, JoinableVertex.class,
                    FlaggedVertex.class, AssignOneZoneToEdgesReducer.class, UndirectedEdge.class,
                    Vertex.class, SequenceFileOutputFormat.class);
            scatterEdgesAndAssignZone.setGroupingComparatorClass(JoinableVertex.GroupingComparator.class);
            if (!scatterEdgesAndAssignZone.waitForCompletion(true)) {
                return -1;
            }

            Path findInterzoneEdgesOutputPath = new Path(tempDirPath,
                    String.valueOf(System.currentTimeMillis()));

            /*
             * Find interzone edges
             */
            Job findInterzoneEdges = prepareJob(scatterEdgesAndAssignZoneOutputPath,
                    findInterzoneEdgesOutputPath, SequenceFileInputFormat.class, Mapper.class,
                    UndirectedEdge.class, Vertex.class, FindInterzoneEdgesReducer.class, Vertex.class,
                    FlaggedVertex.class, SequenceFileOutputFormat.class);
            if (!findInterzoneEdges.waitForCompletion(true)) {
                return -1;
            }

            /*
             * Break if there are no new interzone edges
             */
            if (findInterzoneEdges.getCounters().findCounter(Counter.ZONES_CONNECTED).getValue() == 0L) {
                break;
            }

            Path assignNewZonesOutputPath = new Path(tempDirPath,
                    String.valueOf(System.currentTimeMillis()));

            /*
             * Assign new zones
             */
            Job assignNewZones = prepareJob(
                    new Path(zoneAssignmentsPath.toString() + "," + findInterzoneEdgesOutputPath.toString()),
                    assignNewZonesOutputPath, SequenceFileInputFormat.class,
                    BinZoneAssignmentsAndInterzoneEdgesMapper.class, JoinableVertex.class,
                    FlaggedVertex.class, AssignNewZonesToVerticesReducer.class, Vertex.class,
                    FlaggedVertex.class, SequenceFileOutputFormat.class);
            assignNewZones.setGroupingComparatorClass(JoinableVertex.GroupingComparator.class);
            if (!assignNewZones.waitForCompletion(true)) {
                return -1;
            }

            // The next iteration works on the freshly assigned zones.
            zoneAssignmentsPath = assignNewZonesOutputPath;
        }
    }

    FileSystem system = FileSystem.get(getConf());
    // FileUtil.copy signals some failures through its boolean result rather than an exception.
    boolean copied = FileUtil.copy(system, zoneAssignmentsPath, system, outputPath, false, getConf());
    return copied ? 0 : -1;
}
From source file:org.apache.mahout.graph.triangles.EnumerateTrianglesJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();//from w ww . j a va 2 s. c o m addOutputOption(); Map<String, String> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } // scatter the edges to lower degree vertex and build open triads Job scatter = prepareJob(getInputPath(), getTempPath(TMP_OPEN_TRIADS), SequenceFileInputFormat.class, ScatterEdgesToLowerDegreeVertexMapper.class, Vertex.class, Vertex.class, BuildOpenTriadsReducer.class, JoinableUndirectedEdge.class, VertexOrMarker.class, SequenceFileOutputFormat.class); scatter.waitForCompletion(true); // necessary as long as we don't have access to an undeprecated MultipleInputs Job prepareInput = prepareJob(getInputPath(), getTempPath(TMP_CLOSING_EDGES), SequenceFileInputFormat.class, PrepareInputMapper.class, JoinableUndirectedEdge.class, VertexOrMarker.class, Reducer.class, JoinableUndirectedEdge.class, VertexOrMarker.class, SequenceFileOutputFormat.class); prepareInput.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class); prepareInput.waitForCompletion(true); //join opentriads and edges pairwise to get all triangles Job joinTriads = prepareJob(getCombinedTempPath(TMP_OPEN_TRIADS, TMP_CLOSING_EDGES), getOutputPath(), SequenceFileInputFormat.class, Mapper.class, JoinableUndirectedEdge.class, VertexOrMarker.class, JoinTrianglesReducer.class, Triangle.class, NullWritable.class, SequenceFileOutputFormat.class); joinTriads.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class); joinTriads.waitForCompletion(true); return 0; }
From source file:org.apache.mahout.math.hadoop.similarity.RowSimilarityJob.java
License:Apache License
/**
 * Computes row similarities in up to three phases: row weights, pairwise similarities, and
 * conversion of the similarity entries into vectors. Each phase is gated by
 * {@code shouldRunNextPhase}.
 *
 * <p>Each phase reads the previous phase's output directory, so the run aborts as soon as
 * a job reports failure.
 *
 * @param args command-line arguments; {@code --similarityClassname} may be either the name
 *             of a {@code SimilarityType} constant or a class name, which is used verbatim
 *             when it does not match a constant
 * @return {@code 0} on success, {@code -1} if the arguments could not be parsed or any
 *         job failed
 * @throws IOException            if job submission or file system access fails
 * @throws ClassNotFoundException if a job class cannot be resolved
 * @throws InterruptedException   if waiting for a job is interrupted
 */
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix");
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerRow", "m",
            "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
    String similarityClassnameArg = parsedArgs.get("--similarityClassname");
    String distributedSimilarityClassname;
    try {
        distributedSimilarityClassname = SimilarityType.valueOf(similarityClassnameArg)
                .getSimilarityImplementationClassName();
    } catch (IllegalArgumentException iae) {
        // Not one of the predefined enum names: treat the argument as a class name.
        distributedSimilarityClassname = similarityClassnameArg;
    }

    int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path weightsPath = new Path(tempDirPath, "weights");
    Path pairwiseSimilarityPath = new Path(tempDirPath, "pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job weights = prepareJob(inputPath, weightsPath, SequenceFileInputFormat.class,
                RowWeightMapper.class, VarIntWritable.class, WeightedOccurrence.class,
                WeightedOccurrencesPerColumnReducer.class, VarIntWritable.class,
                WeightedOccurrenceArray.class, SequenceFileOutputFormat.class);
        weights.getConfiguration().set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        if (!weights.waitForCompletion(true)) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath,
                SequenceFileInputFormat.class, CooccurrencesMapper.class, WeightedRowPair.class,
                Cooccurrence.class, SimilarityReducer.class, SimilarityMatrixEntryKey.class,
                MatrixEntryWritable.class, SequenceFileOutputFormat.class);
        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        if (!pairwiseSimilarity.waitForCompletion(true)) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, outputPath, SequenceFileInputFormat.class,
                Mapper.class, SimilarityMatrixEntryKey.class, MatrixEntryWritable.class,
                EntriesToVectorsReducer.class, IntWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        asMatrix.setPartitionerClass(HashPartitioner.class);
        asMatrix.setGroupingComparatorClass(
                SimilarityMatrixEntryKey.SimilarityMatrixEntryKeyGroupingComparator.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        if (!asMatrix.waitForCompletion(true)) {
            return -1;
        }
    }

    return 0;
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java
License:Apache License
/** * pass1: generate collocations, ngrams/* ww w .j a v a 2 s .c o m*/ */ private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams, int maxNGramSize, int reduceTasks, int minSupport) throws IOException, ClassNotFoundException, InterruptedException { Configuration con = new Configuration(baseConf); con.setBoolean(EMIT_UNIGRAMS, emitUnigrams); con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize); con.setInt(CollocReducer.MIN_SUPPORT, minSupport); Job job = new Job(con); job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input); job.setJarByClass(CollocDriver.class); job.setMapOutputKeyClass(GramKey.class); job.setMapOutputValueClass(Gram.class); job.setPartitionerClass(GramKeyPartitioner.class); job.setGroupingComparatorClass(GramKeyGroupComparator.class); job.setOutputKeyClass(Gram.class); job.setOutputValueClass(Gram.class); job.setCombinerClass(CollocCombiner.class); FileInputFormat.setInputPaths(job, input); Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY); FileOutputFormat.setOutputPath(job, outputPath); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(CollocMapper.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setReducerClass(CollocReducer.class); job.setNumReduceTasks(reduceTasks); job.waitForCompletion(true); return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue(); }
From source file:org.apache.mahout.vectorizer.collocations.llr.CollocDriver.java
License:Apache License
/** * pass1: generate collocations, ngrams/*from w w w . j a v a 2 s .c o m*/ */ private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams, int maxNGramSize, int reduceTasks, int minSupport) throws IOException, ClassNotFoundException, InterruptedException { Configuration con = new Configuration(baseConf); con.setBoolean(EMIT_UNIGRAMS, emitUnigrams); con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize); con.setInt(CollocReducer.MIN_SUPPORT, minSupport); Job job = new Job(con); job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input); job.setJarByClass(CollocDriver.class); job.setMapOutputKeyClass(GramKey.class); job.setMapOutputValueClass(Gram.class); job.setPartitionerClass(GramKeyPartitioner.class); job.setGroupingComparatorClass(GramKeyGroupComparator.class); job.setOutputKeyClass(Gram.class); job.setOutputValueClass(Gram.class); job.setCombinerClass(CollocCombiner.class); FileInputFormat.setInputPaths(job, input); Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY); FileOutputFormat.setOutputPath(job, outputPath); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(CollocMapper.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setReducerClass(CollocReducer.class); job.setNumReduceTasks(reduceTasks); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { throw new IllegalStateException("Job failed!"); } return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue(); }
From source file:org.apache.mrql.GroupByJoinPlan.java
License:Apache License
/** the GroupByJoin operation: * an equi-join combined with a group-by implemented using hashing * @param left_join_key_fnc left join key function from a to k * @param right_join_key_fnc right join key function from b to k * @param left_groupby_fnc left group-by function from a to k1 * @param right_groupby_fnc right group-by function from b to k2 * @param accumulator_fnc accumulator function from (c,(a,b)) to c * @param zero the left zero of accumulator of type c * @param reduce_fnc reduce function from ((k1,k2),c) to d * @param X left data set of type {a} * @param Y right data set of type {b} * @param num_reducers number of reducers * @param n left dimension of the reducer grid * @param m right dimension of the reducer grid * @param stop_counter optional counter used in repeat operation * @return a DataSet that contains the result of type {d} *///from w ww . j av a 2 s . c o m public final static DataSet groupByJoin(Tree left_join_key_fnc, // left join key function Tree right_join_key_fnc, // right join key function Tree left_groupby_fnc, // left group-by function Tree right_groupby_fnc, // right group-by function Tree accumulator_fnc, // accumulator function Tree zero, // the left zero of accumulator Tree reduce_fnc, // reduce function DataSet X, // left data set DataSet Y, // right data set int num_reducers, // number of reducers int n, int m, // dimensions of the reducer grid String stop_counter) // optional counter used in repeat operation throws Exception { conf = MapReduceEvaluator.clear_configuration(conf); String newpath = new_path(conf); conf.set("mrql.join.key.left", left_join_key_fnc.toString()); conf.set("mrql.join.key.right", right_join_key_fnc.toString()); conf.set("mrql.groupby.left", left_groupby_fnc.toString()); conf.set("mrql.groupby.right", right_groupby_fnc.toString()); conf.setInt("mrql.m", m); conf.setInt("mrql.n", n); conf.set("mrql.accumulator", accumulator_fnc.toString()); conf.set("mrql.zero", zero.toString()); conf.set("mrql.reducer", 
reduce_fnc.toString()); conf.set("mrql.counter", stop_counter); setupSplits(new DataSet[] { X, Y }, conf); Job job = new Job(conf, newpath); distribute_compiled_arguments(job.getConfiguration()); job.setMapOutputKeyClass(GroupByJoinKey.class); job.setJarByClass(GroupByJoinPlan.class); job.setOutputKeyClass(MRContainer.class); job.setOutputValueClass(MRContainer.class); job.setPartitionerClass(GroupByJoinPartitioner.class); job.setSortComparatorClass(GroupByJoinSortComparator.class); job.setGroupingComparatorClass(GroupByJoinGroupingComparator.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(newpath)); for (DataSource p : X.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class); for (DataSource p : Y.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class); job.setReducerClass(JoinReducer.class); if (num_reducers > 0) job.setNumReduceTasks(num_reducers); job.waitForCompletion(true); long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue(); DataSource s = new BinaryDataSource(newpath, conf); s.to_be_merged = false; return new DataSet(s, c, MapReducePlan.outputRecords(job)); }
From source file:org.apache.mrql.JoinOperation.java
License:Apache License
/** The MapReduce2 physical operator (a reduce-side join) * @param mx left mapper function * @param my right mapper function * @param combine_fnc optional in-mapper combiner function * @param reduce_fnc reducer function * @param acc_fnc optional accumulator function * @param zero optional the zero value for the accumulator * @param X left data set// w ww .j a v a 2 s . c om * @param Y right data set * @param num_reduces number of reducers * @param stop_counter optional counter used in repeat operation * @param orderp does the result need to be ordered? * @return a new data source that contains the result */ public final static DataSet mapReduce2(Tree mx, // left mapper function Tree my, // right mapper function Tree combine_fnc, // optional in-mapper combiner function Tree reduce_fnc, // reducer function Tree acc_fnc, // optional accumulator function Tree zero, // optional the zero value for the accumulator DataSet X, // left data set DataSet Y, // right data set int num_reduces, // number of reducers String stop_counter, // optional counter used in repeat operation boolean orderp) // does the result need to be ordered? 
throws Exception { conf = MapReduceEvaluator.clear_configuration(conf); String newpath = new_path(conf); conf.set("mrql.mapper.left", mx.toString()); conf.set("mrql.mapper.right", my.toString()); if (combine_fnc != null) conf.set("mrql.combiner", combine_fnc.toString()); conf.set("mrql.reducer", reduce_fnc.toString()); if (zero != null) { conf.set("mrql.accumulator", acc_fnc.toString()); conf.set("mrql.zero", zero.toString()); } else conf.set("mrql.zero", ""); conf.set("mrql.counter", stop_counter); setupSplits(new DataSet[] { X, Y }, conf); Job job = new Job(conf, newpath); distribute_compiled_arguments(job.getConfiguration()); job.setMapOutputKeyClass(JoinKey.class); job.setJarByClass(MapReducePlan.class); job.setOutputKeyClass(MRContainer.class); job.setOutputValueClass(MRContainer.class); job.setPartitionerClass(MRContainerJoinPartitioner.class); job.setSortComparatorClass(MRContainerSortComparator.class); job.setGroupingComparatorClass(MRContainerGroupingComparator.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(newpath)); for (DataSource p : X.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class); for (DataSource p : Y.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class); if (Config.trace && PlanGeneration.streamed_MapReduce2_reducer(reduce_fnc)) System.out.println("Streamed MapReduce2 reducer"); job.setReducerClass(JoinReducer.class); if (num_reduces > 0) job.setNumReduceTasks(num_reduces); job.waitForCompletion(true); long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue(); DataSource s = new BinaryDataSource(newpath, conf); s.to_be_merged = orderp; return new DataSet(s, c, outputRecords(job)); }
From source file:org.apache.mrql.MapReduceOperation.java
License:Apache License
/** * The MapReduce physical operator// w w w. ja va 2s . com * @param map_fnc the mapper function * @param combine_fnc optional in-mapper combiner function * @param reduce_fnc the reducer function * @param acc_fnc optional accumulator function * @param zero optional the zero value for the accumulator * @param source the input data source * @param num_reduces number of reducers * @param stop_counter optional counter used in repeat operation * @param orderp does the result need to be ordered? * @return a new data source that contains the result */ public final static DataSet mapReduce(Tree map_fnc, // mapper function Tree combine_fnc, // optional in-mapper combiner function Tree reduce_fnc, // reducer function Tree acc_fnc, // optional accumulator function Tree zero, // optional the zero value for the accumulator DataSet source, // input data source int num_reduces, // number of reducers String stop_counter, // optional counter used in repeat operation boolean orderp) // does the result need to be ordered? 
throws Exception { conf = MapReduceEvaluator.clear_configuration(conf); String newpath = new_path(conf); conf.set("mrql.mapper", map_fnc.toString()); if (combine_fnc != null) conf.set("mrql.combiner", combine_fnc.toString()); conf.set("mrql.reducer", reduce_fnc.toString()); if (zero != null) { // will use in-mapper combiner conf.set("mrql.accumulator", acc_fnc.toString()); conf.set("mrql.zero", zero.toString()); } else conf.set("mrql.zero", ""); conf.set("mrql.counter", stop_counter); setupSplits(source, conf); Job job = new Job(conf, newpath); distribute_compiled_arguments(job.getConfiguration()); job.setJarByClass(MapReducePlan.class); job.setOutputKeyClass(MRContainer.class); job.setOutputValueClass(MRContainer.class); job.setPartitionerClass(MRContainerPartitioner.class); job.setSortComparatorClass(MRContainerKeyComparator.class); job.setGroupingComparatorClass(MRContainerKeyComparator.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); for (DataSource p : source.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MRMapper.class); FileOutputFormat.setOutputPath(job, new Path(newpath)); job.setReducerClass(MRReducer.class); if (Config.trace && PlanGeneration.streamed_MapReduce_reducer(reduce_fnc)) System.out.println("Streamed MapReduce reducer"); if (num_reduces > 0) job.setNumReduceTasks(num_reduces); job.waitForCompletion(true); long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue(); DataSource s = new BinaryDataSource(newpath, conf); s.to_be_merged = orderp; return new DataSet(s, c, outputRecords(job)); }