Example usage for org.apache.hadoop.mapreduce Job setGroupingComparatorClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job#setGroupingComparatorClass.

Prototype

public void setGroupingComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException 

Document

Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce(Object, Iterable, org.apache.hadoop.mapreduce.Reducer.Context).
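
To make the contract concrete, here is a minimal sketch of the classic secondary-sort pattern: the full key ordering drives the sort within each group, while the grouping comparator decides which keys share a single reduce() call. CompositeKey and its fields are illustrative inventions, not taken from any of the sources below.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/** Illustrative composite key: a natural key plus a secondary-sort field. */
public class CompositeKey implements WritableComparable<CompositeKey> {
    private String group; // natural key: decides which reduce() call sees the record
    private int order; // secondary field: decides the ordering within that group

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(group);
        out.writeInt(order);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        group = in.readUTF();
        order = in.readInt();
    }

    /** Full sort order: natural key first, then the secondary field. */
    @Override
    public int compareTo(CompositeKey other) {
        int cmp = group.compareTo(other.group);
        return cmp != 0 ? cmp : Integer.compare(order, other.order);
    }

    /**
     * Grouping comparator: compares the natural key only, so all records
     * sharing it arrive in one reduce() call, already sorted by the
     * secondary field.
     */
    public static class GroupingComparator extends WritableComparator {
        public GroupingComparator() {
            super(CompositeKey.class, true); // true = deserialize keys before comparing
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return ((CompositeKey) a).group.compareTo(((CompositeKey) b).group);
        }
    }
}

Wiring it up mirrors every example on this page: job.setGroupingComparatorClass(CompositeKey.GroupingComparator.class). Note that the partitioner must also route on the natural key alone, or records sharing a group could land on different reducers; that is why several examples below set a partitioner alongside the grouping comparator.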

Usage

From source file: org.apache.mahout.cf.taste.hadoop.als.PredictionJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    addOption("pairs", "p", "path containing the test ratings, each line must be: userID,itemID", true);
    addOption("userFeatures", "u", "path to the user feature matrix", true);
    addOption("itemFeatures", "i", "path to the item feature matrix", true);
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path pairs = new Path(parsedArgs.get("--pairs"));
    Path userFeatures = new Path(parsedArgs.get("--userFeatures"));
    Path itemFeatures = new Path(parsedArgs.get("--itemFeatures"));

    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path convertedPairs = new Path(tempDirPath, "convertedPairs");
    Path convertedUserFeatures = new Path(tempDirPath, "convertedUserFeatures");
    Path convertedItemFeatures = new Path(tempDirPath, "convertedItemFeatures");

    Path pairsJoinedWithItemFeatures = new Path(tempDirPath, "pairsJoinedWithItemFeatures");

    Job convertPairs = prepareJob(pairs, convertedPairs, TextInputFormat.class, PairsMapper.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    convertPairs.waitForCompletion(true);

    Job convertUserFeatures = prepareJob(userFeatures, convertedUserFeatures, SequenceFileInputFormat.class,
            FeaturesMapper.class, TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    convertUserFeatures.waitForCompletion(true);

    Job convertItemFeatures = prepareJob(itemFeatures, convertedItemFeatures, SequenceFileInputFormat.class,
            FeaturesMapper.class, TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    convertItemFeatures.waitForCompletion(true);

    Job joinPairsWithItemFeatures = prepareJob(new Path(convertedPairs + "," + convertedItemFeatures),
            pairsJoinedWithItemFeatures, SequenceFileInputFormat.class, Mapper.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, JoinProbesWithItemFeaturesReducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    joinPairsWithItemFeatures.setPartitionerClass(HashPartitioner.class);
    joinPairsWithItemFeatures.setGroupingComparatorClass(TaggedVarIntWritable.GroupingComparator.class);
    joinPairsWithItemFeatures.waitForCompletion(true);

    Job predictRatings = prepareJob(new Path(pairsJoinedWithItemFeatures + "," + convertedUserFeatures),
            getOutputPath(), SequenceFileInputFormat.class, Mapper.class, TaggedVarIntWritable.class,
            VectorWithIndexWritable.class, PredictRatingReducer.class, Text.class, NullWritable.class,
            TextOutputFormat.class);
    predictRatings.setPartitionerClass(HashPartitioner.class);
    predictRatings.setGroupingComparatorClass(TaggedVarIntWritable.GroupingComparator.class);
    predictRatings.waitForCompletion(true);

    return 0;
}
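
The job above is a reduce-side join: TaggedVarIntWritable evidently pairs a VarInt ID with a tag, the full sort order puts one tagged side's records ahead of the other's, and the grouping comparator collapses the tag so both sides of an ID meet in one reduce() call. A hedged sketch of what that comparator plausibly does follows; getId() is an assumed accessor, not necessarily Mahout's real API.

// Sketch only: group TaggedVarIntWritable keys by ID, ignoring the tag.
public static class GroupingComparator extends WritableComparator {
    public GroupingComparator() {
        super(TaggedVarIntWritable.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // The tag still takes part in the sort comparator, so within one
        // group the reducer sees the two tagged sides in a fixed order.
        return Integer.compare(((TaggedVarIntWritable) a).getId(), ((TaggedVarIntWritable) b).getId());
    }
}

Since HashPartitioner hashes the whole key, this pattern also presumes TaggedVarIntWritable.hashCode() depends on the ID alone; otherwise the tagged records for one ID could be routed to different reducers.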

From source file: org.apache.mahout.graph.common.EnumerateTrianglesJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("text", "t", "output in textformat?", String.valueOf(Boolean.FALSE));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Class<? extends FileOutputFormat> outputFormat = Boolean.parseBoolean(parsedArgs.get("--text"))
            ? TextOutputFormat.class
            : SequenceFileOutputFormat.class;

    /* scatter the edges to each of the vertices and count degree */
    Job scatter = prepareJob(getInputPath(), getTempPath(TMP_AUGMENTED_EDGES), ScatterEdgesMapper.class,
            Vertex.class, Vertex.class, SumDegreesReducer.class, UndirectedEdge.class, VertexWithDegree.class);
    scatter.waitForCompletion(true);

    /* join augmented edges with partial degree information to form complete records */
    Job join = prepareJob(getTempPath(TMP_AUGMENTED_EDGES), getTempPath(TMP_EDGES_WITH_DEGREES), Mapper.class,
            UndirectedEdge.class, VertexWithDegree.class, JoinDegreesReducer.class,
            UndirectedEdgeWithDegrees.class, NullWritable.class);
    join.waitForCompletion(true);

    /* scatter the edges to the lower-degree vertex and build open triads */
    Job scatterToLower = prepareJob(getTempPath(TMP_EDGES_WITH_DEGREES), getTempPath(TMP_OPEN_TRIADS),
            ScatterEdgesToLowerDegreeVertexMapper.class, Vertex.class, Vertex.class,
            BuildOpenTriadsReducer.class, JoinableUndirectedEdge.class, VertexOrMarker.class);
    scatterToLower.waitForCompletion(true);

    /* necessary as long as we don't have access to a non-deprecated MultipleInputs */
    Job prepareInput = prepareJob(getTempPath(TMP_EDGES_WITH_DEGREES), getTempPath(TMP_CLOSING_EDGES),
            PrepareInputMapper.class, JoinableUndirectedEdge.class, VertexOrMarker.class, Reducer.class,
            JoinableUndirectedEdge.class, VertexOrMarker.class);
    prepareInput.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    prepareInput.waitForCompletion(true);

    /* join open triads and edges pairwise to get all triangles */
    Job joinTriads = prepareJob(getCombinedTempPath(TMP_OPEN_TRIADS, TMP_CLOSING_EDGES), getOutputPath(),
            SequenceFileInputFormat.class, Mapper.class, JoinableUndirectedEdge.class, VertexOrMarker.class,
            JoinTrianglesReducer.class, Triangle.class, NullWritable.class, outputFormat);
    joinTriads.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    joinTriads.waitForCompletion(true);

    return 0;
}

From source file: org.apache.mahout.graph.components.FindComponentsJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();

    AtomicInteger currentPhase = new AtomicInteger();

    Path edgesPath = inputPath;
    Path zoneAssignmentsPath = new Path(tempDirPath, String.valueOf(System.currentTimeMillis()));

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /*
         * Prepare Input
         */
        Job prepareAssignments = prepareJob(edgesPath, zoneAssignmentsPath, SequenceFileInputFormat.class,
                PrepareAssignmentsFileMapper.class, Vertex.class, Vertex.class,
                PrepareAssignmentsFileReducer.class, Vertex.class, FlaggedVertex.class,
                SequenceFileOutputFormat.class);

        prepareAssignments.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {

        /*
         * As long as there may be zones connected
         */
        while (true) {

            Path scatterEdgesAndAssignZoneOutputPath = new Path(tempDirPath,
                    String.valueOf(System.currentTimeMillis()));

            /*
             * Scatter edges and forward zone assignments,
             * assign one zone to edges
             */
            Job scatterEdgesAndAssignZone = prepareJob(
                    new Path(zoneAssignmentsPath.toString() + "," + edgesPath.toString()),
                    scatterEdgesAndAssignZoneOutputPath, SequenceFileInputFormat.class,
                    ScatterEdgesAndForwardZoneAssignmentsMapper.class, JoinableVertex.class,
                    FlaggedVertex.class, AssignOneZoneToEdgesReducer.class, UndirectedEdge.class, Vertex.class,
                    SequenceFileOutputFormat.class);
            scatterEdgesAndAssignZone.setGroupingComparatorClass(JoinableVertex.GroupingComparator.class);
            scatterEdgesAndAssignZone.waitForCompletion(true);

            Path findInterzoneEdgesOutputPath = new Path(tempDirPath,
                    String.valueOf(System.currentTimeMillis()));

            /*
             * Find interzone edges
             */
            Job findInterzoneEdges = prepareJob(scatterEdgesAndAssignZoneOutputPath,
                    findInterzoneEdgesOutputPath, SequenceFileInputFormat.class, Mapper.class,
                    UndirectedEdge.class, Vertex.class, FindInterzoneEdgesReducer.class, Vertex.class,
                    FlaggedVertex.class, SequenceFileOutputFormat.class);

            findInterzoneEdges.waitForCompletion(true);

            /*
             * Break if there are no new interzone edges
             */
            if (findInterzoneEdges.getCounters().findCounter(Counter.ZONES_CONNECTED).getValue() == 0L) {
                break;
            }

            Path assignNewZonesOutputPath = new Path(tempDirPath, String.valueOf(System.currentTimeMillis()));

            /*
             * Assign new zones
             */
            Job assignNewZones = prepareJob(
                    new Path(zoneAssignmentsPath.toString() + "," + findInterzoneEdgesOutputPath.toString()),
                    assignNewZonesOutputPath, SequenceFileInputFormat.class,
                    BinZoneAssignmentsAndInterzoneEdgesMapper.class, JoinableVertex.class, FlaggedVertex.class,
                    AssignNewZonesToVerticesReducer.class, Vertex.class, FlaggedVertex.class,
                    SequenceFileOutputFormat.class);

            assignNewZones.setGroupingComparatorClass(JoinableVertex.GroupingComparator.class);
            assignNewZones.waitForCompletion(true);

            zoneAssignmentsPath = assignNewZonesOutputPath;
        }
    }
    FileSystem system = FileSystem.get(getConf());
    FileUtil.copy(system, zoneAssignmentsPath, system, outputPath, false, getConf());
    return 0;
}

From source file: org.apache.mahout.graph.triangles.EnumerateTrianglesJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    // scatter the edges to the lower-degree vertex and build open triads
    Job scatter = prepareJob(getInputPath(), getTempPath(TMP_OPEN_TRIADS), SequenceFileInputFormat.class,
            ScatterEdgesToLowerDegreeVertexMapper.class, Vertex.class, Vertex.class,
            BuildOpenTriadsReducer.class, JoinableUndirectedEdge.class, VertexOrMarker.class,
            SequenceFileOutputFormat.class);
    scatter.waitForCompletion(true);

    // necessary as long as we don't have access to a non-deprecated MultipleInputs
    Job prepareInput = prepareJob(getInputPath(), getTempPath(TMP_CLOSING_EDGES), SequenceFileInputFormat.class,
            PrepareInputMapper.class, JoinableUndirectedEdge.class, VertexOrMarker.class, Reducer.class,
            JoinableUndirectedEdge.class, VertexOrMarker.class, SequenceFileOutputFormat.class);
    prepareInput.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    prepareInput.waitForCompletion(true);

    // join open triads and edges pairwise to get all triangles
    Job joinTriads = prepareJob(getCombinedTempPath(TMP_OPEN_TRIADS, TMP_CLOSING_EDGES), getOutputPath(),
            SequenceFileInputFormat.class, Mapper.class, JoinableUndirectedEdge.class, VertexOrMarker.class,
            JoinTrianglesReducer.class, Triangle.class, NullWritable.class, SequenceFileOutputFormat.class);
    joinTriads.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    joinTriads.waitForCompletion(true);

    return 0;
}

From source file: org.apache.mahout.math.hadoop.similarity.RowSimilarityJob.java

License: Apache License

@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix");
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerRow", "m",
            "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
    String similarityClassnameArg = parsedArgs.get("--similarityClassname");
    String distributedSimilarityClassname;
    try {
        distributedSimilarityClassname = SimilarityType.valueOf(similarityClassnameArg)
                .getSimilarityImplementationClassName();
    } catch (IllegalArgumentException iae) {
        distributedSimilarityClassname = similarityClassnameArg;
    }

    int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path weightsPath = new Path(tempDirPath, "weights");
    Path pairwiseSimilarityPath = new Path(tempDirPath, "pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job weights = prepareJob(inputPath, weightsPath, SequenceFileInputFormat.class, RowWeightMapper.class,
                VarIntWritable.class, WeightedOccurrence.class, WeightedOccurrencesPerColumnReducer.class,
                VarIntWritable.class, WeightedOccurrenceArray.class, SequenceFileOutputFormat.class);

        weights.getConfiguration().set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        weights.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, SequenceFileInputFormat.class,
                CooccurrencesMapper.class, WeightedRowPair.class, Cooccurrence.class, SimilarityReducer.class,
                SimilarityMatrixEntryKey.class, MatrixEntryWritable.class, SequenceFileOutputFormat.class);

        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        pairwiseSimilarity.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, outputPath, SequenceFileInputFormat.class,
                Mapper.class, SimilarityMatrixEntryKey.class, MatrixEntryWritable.class,
                EntriesToVectorsReducer.class, IntWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        asMatrix.setPartitionerClass(HashPartitioner.class);
        asMatrix.setGroupingComparatorClass(
                SimilarityMatrixEntryKey.SimilarityMatrixEntryKeyGroupingComparator.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        asMatrix.waitForCompletion(true);
    }

    return 0;
}

From source file: org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java

License: Apache License

/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    job.waitForCompletion(true);

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}

From source file: org.apache.mahout.vectorizer.collocations.llr.CollocDriver.java

License: Apache License

/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}

From source file: org.apache.mrql.GroupByJoinPlan.java

License: Apache License

/** the GroupByJoin operation:
 *      an equi-join combined with a group-by implemented using hashing
 * @param left_join_key_fnc   left join key function from a to k
 * @param right_join_key_fnc  right join key function from b to k
 * @param left_groupby_fnc    left group-by function from a to k1
 * @param right_groupby_fnc   right group-by function from b to k2
 * @param accumulator_fnc     accumulator function from (c,(a,b)) to c
 * @param zero                the left zero of accumulator of type c
 * @param reduce_fnc          reduce function from ((k1,k2),c) to d
 * @param X                   left data set of type {a}
 * @param Y                   right data set of type {b}
 * @param num_reducers        number of reducers
 * @param n                   left dimension of the reducer grid
 * @param m                   right dimension of the reducer grid
 * @param stop_counter        optional counter used in repeat operation
 * @return a DataSet that contains the result of type {d}
 */
public final static DataSet groupByJoin(Tree left_join_key_fnc, // left join key function
        Tree right_join_key_fnc, // right join key function
        Tree left_groupby_fnc, // left group-by function
        Tree right_groupby_fnc, // right group-by function
        Tree accumulator_fnc, // accumulator function
        Tree zero, // the left zero of accumulator
        Tree reduce_fnc, // reduce function
        DataSet X, // left data set
        DataSet Y, // right data set
        int num_reducers, // number of reducers
        int n, int m, // dimensions of the reducer grid
        String stop_counter) // optional counter used in repeat operation
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.join.key.left", left_join_key_fnc.toString());
    conf.set("mrql.join.key.right", right_join_key_fnc.toString());
    conf.set("mrql.groupby.left", left_groupby_fnc.toString());
    conf.set("mrql.groupby.right", right_groupby_fnc.toString());
    conf.setInt("mrql.m", m);
    conf.setInt("mrql.n", n);
    conf.set("mrql.accumulator", accumulator_fnc.toString());
    conf.set("mrql.zero", zero.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setMapOutputKeyClass(GroupByJoinKey.class);
    job.setJarByClass(GroupByJoinPlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(GroupByJoinPartitioner.class);
    job.setSortComparatorClass(GroupByJoinSortComparator.class);
    job.setGroupingComparatorClass(GroupByJoinGroupingComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class);
    for (DataSource p : Y.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class);
    job.setReducerClass(JoinReducer.class);
    if (num_reducers > 0)
        job.setNumReduceTasks(num_reducers);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = false;
    return new DataSet(s, c, MapReducePlan.outputRecords(job));
}

From source file: org.apache.mrql.JoinOperation.java

License: Apache License

/** The MapReduce2 physical operator (a reduce-side join)
 * @param mx             left mapper function
 * @param my             right mapper function
 * @param combine_fnc    optional in-mapper combiner function
 * @param reduce_fnc     reducer function
 * @param acc_fnc        optional accumulator function
 * @param zero           optional the zero value for the accumulator
 * @param X              left data set
 * @param Y              right data set
 * @param num_reduces    number of reducers
 * @param stop_counter   optional counter used in repeat operation
 * @param orderp         does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce2(Tree mx, // left mapper function
        Tree my, // right mapper function
        Tree combine_fnc, // optional in-mapper combiner function
        Tree reduce_fnc, // reducer function
        Tree acc_fnc, // optional accumulator function
        Tree zero, // optional the zero value for the accumulator
        DataSet X, // left data set
        DataSet Y, // right data set
        int num_reduces, // number of reducers
        String stop_counter, // optional counter used in repeat operation
        boolean orderp) // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper.left", mx.toString());
    conf.set("mrql.mapper.right", my.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setMapOutputKeyClass(JoinKey.class);
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerJoinPartitioner.class);
    job.setSortComparatorClass(MRContainerSortComparator.class);
    job.setGroupingComparatorClass(MRContainerGroupingComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class);
    for (DataSource p : Y.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce2_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce2 reducer");
    job.setReducerClass(JoinReducer.class);
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}

From source file: org.apache.mrql.MapReduceOperation.java

License: Apache License

/**
 * The MapReduce physical operator
 * @param map_fnc          the mapper function
 * @param combine_fnc      optional in-mapper combiner function
 * @param reduce_fnc       the reducer function
 * @param acc_fnc          optional accumulator function
 * @param zero             optional the zero value for the accumulator
 * @param source           the input data source
 * @param num_reduces      number of reducers
 * @param stop_counter     optional counter used in repeat operation
 * @param orderp           does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce(Tree map_fnc, // mapper function
        Tree combine_fnc, // optional in-mapper combiner function
        Tree reduce_fnc, // reducer function
        Tree acc_fnc, // optional accumulator function
        Tree zero, // optional the zero value for the accumulator
        DataSet source, // input data source
        int num_reduces, // number of reducers
        String stop_counter, // optional counter used in repeat operation
        boolean orderp) // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper", map_fnc.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) { // will use in-mapper combiner
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(source, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerPartitioner.class);
    job.setSortComparatorClass(MRContainerKeyComparator.class);
    job.setGroupingComparatorClass(MRContainerKeyComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    for (DataSource p : source.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MRMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setReducerClass(MRReducer.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce reducer");
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}