Example usage for org.apache.hadoop.mapreduce Job setGroupingComparatorClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setGroupingComparatorClass.

Prototype

public void setGroupingComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException

Source Link

Document

Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce(Object,Iterable,org.apache.hadoop.mapreduce.Reducer.Context)

Usage

From source file:org.apache.mahout.cf.taste.hadoop.als.PredictionJob.java

License:Apache License

/**
 * Predicts ratings for user/item pairs from a user and an item feature matrix.
 *
 * <p>Five MapReduce jobs are run one after another: three conversion jobs that write the pairs,
 * the user features and the item features as sequence files keyed by
 * {@code TaggedVarIntWritable}, a join of the converted pairs with the converted item features,
 * and a final join of that result with the converted user features whose output is written as
 * text to the job's output path.</p>
 *
 * @param args command line arguments; {@code --pairs}, {@code --userFeatures} and
 *             {@code --itemFeatures} are registered here, {@code --tempDir} is read as well
 *             (presumably registered by the base class; verify)
 * @return 0 if all jobs succeed, -1 if the arguments cannot be parsed or any job fails
 * @throws Exception propagated from argument parsing, job setup or job execution
 */
@Override
public int run(String[] args) throws Exception {

    addOption("pairs", "p", "path containing the test ratings, each line must be: userID,itemID", true);
    addOption("userFeatures", "u", "path to the user feature matrix", true);
    addOption("itemFeatures", "i", "path to the item feature matrix", true);
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path pairs = new Path(parsedArgs.get("--pairs"));
    Path userFeatures = new Path(parsedArgs.get("--userFeatures"));
    Path itemFeatures = new Path(parsedArgs.get("--itemFeatures"));

    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path convertedPairs = new Path(tempDirPath, "convertedPairs");
    Path convertedUserFeatures = new Path(tempDirPath, "convertedUserFeatures");
    Path convertedItemFeatures = new Path(tempDirPath, "convertedItemFeatures");

    Path pairsJoinedWithItemFeatures = new Path(tempDirPath, "pairsJoinedWithItemFeatures");

    // Every job feeds the next one, so stop at the first failure instead of
    // running downstream jobs on missing or partial input.
    Job convertPairs = prepareJob(pairs, convertedPairs, TextInputFormat.class, PairsMapper.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    if (!convertPairs.waitForCompletion(true)) {
        return -1;
    }

    Job convertUserFeatures = prepareJob(userFeatures, convertedUserFeatures, SequenceFileInputFormat.class,
            FeaturesMapper.class, TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    if (!convertUserFeatures.waitForCompletion(true)) {
        return -1;
    }

    Job convertItemFeatures = prepareJob(itemFeatures, convertedItemFeatures, SequenceFileInputFormat.class,
            FeaturesMapper.class, TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    if (!convertItemFeatures.waitForCompletion(true)) {
        return -1;
    }

    // Both inputs are passed as a single comma-separated path string.
    Job joinPairsWithItemFeatures = prepareJob(new Path(convertedPairs + "," + convertedItemFeatures),
            pairsJoinedWithItemFeatures, SequenceFileInputFormat.class, Mapper.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, JoinProbesWithItemFeaturesReducer.class,
            TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    joinPairsWithItemFeatures.setPartitionerClass(HashPartitioner.class);
    joinPairsWithItemFeatures.setGroupingComparatorClass(TaggedVarIntWritable.GroupingComparator.class);
    if (!joinPairsWithItemFeatures.waitForCompletion(true)) {
        return -1;
    }

    Job predictRatings = prepareJob(new Path(pairsJoinedWithItemFeatures + "," + convertedUserFeatures),
            getOutputPath(), SequenceFileInputFormat.class, Mapper.class, TaggedVarIntWritable.class,
            VectorWithIndexWritable.class, PredictRatingReducer.class, Text.class, NullWritable.class,
            TextOutputFormat.class);
    predictRatings.setPartitionerClass(HashPartitioner.class);
    predictRatings.setGroupingComparatorClass(TaggedVarIntWritable.GroupingComparator.class);
    if (!predictRatings.waitForCompletion(true)) {
        return -1;
    }

    return 0;
}

From source file:org.apache.mahout.graph.common.EnumerateTrianglesJob.java

License:Apache License

/**
 * Enumerates the triangles of the input graph with a chain of five MapReduce jobs.
 *
 * <p>The jobs scatter the edges and sum vertex degrees, join the degree information onto the
 * edges, build open triads at the lower degree vertex, prepare the edges for joining, and
 * finally join open triads with edges to produce the triangles.</p>
 *
 * @param args command line arguments; {@code --text} selects text output instead of sequence
 *             file output for the final job
 * @return 0 if all jobs succeed, -1 if the arguments cannot be parsed or any job fails
 * @throws Exception propagated from argument parsing, job setup or job execution
 */
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("text", "t", "output in textformat?", String.valueOf(Boolean.FALSE));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Class<? extends FileOutputFormat> outputFormat = Boolean.parseBoolean(parsedArgs.get("--text"))
            ? TextOutputFormat.class
            : SequenceFileOutputFormat.class;

    // Each job consumes the output of an earlier one, so abort on the first failure.

    /* scatter the edges to each of the vertices and count degree */
    Job scatter = prepareJob(getInputPath(), getTempPath(TMP_AUGMENTED_EDGES), ScatterEdgesMapper.class,
            Vertex.class, Vertex.class, SumDegreesReducer.class, UndirectedEdge.class, VertexWithDegree.class);
    if (!scatter.waitForCompletion(true)) {
        return -1;
    }

    /* join augmented edges with partial degree information to complete records */
    Job join = prepareJob(getTempPath(TMP_AUGMENTED_EDGES), getTempPath(TMP_EDGES_WITH_DEGREES), Mapper.class,
            UndirectedEdge.class, VertexWithDegree.class, JoinDegreesReducer.class,
            UndirectedEdgeWithDegrees.class, NullWritable.class);
    if (!join.waitForCompletion(true)) {
        return -1;
    }

    /* scatter the edges to lower degree vertex and build open triads */
    Job scatterToLower = prepareJob(getTempPath(TMP_EDGES_WITH_DEGREES), getTempPath(TMP_OPEN_TRIADS),
            ScatterEdgesToLowerDegreeVertexMapper.class, Vertex.class, Vertex.class,
            BuildOpenTriadsReducer.class, JoinableUndirectedEdge.class, VertexOrMarker.class);
    if (!scatterToLower.waitForCompletion(true)) {
        return -1;
    }

    /* necessary as long as we don't have access to an undeprecated MultipleInputs  */
    Job prepareInput = prepareJob(getTempPath(TMP_EDGES_WITH_DEGREES), getTempPath(TMP_CLOSING_EDGES),
            PrepareInputMapper.class, JoinableUndirectedEdge.class, VertexOrMarker.class, Reducer.class,
            JoinableUndirectedEdge.class, VertexOrMarker.class);
    prepareInput.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    if (!prepareInput.waitForCompletion(true)) {
        return -1;
    }

    /* join opentriads and edges pairwise to get all triangles */
    Job joinTriads = prepareJob(getCombinedTempPath(TMP_OPEN_TRIADS, TMP_CLOSING_EDGES), getOutputPath(),
            SequenceFileInputFormat.class, Mapper.class, JoinableUndirectedEdge.class, VertexOrMarker.class,
            JoinTrianglesReducer.class, Triangle.class, NullWritable.class, outputFormat);
    joinTriads.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    if (!joinTriads.waitForCompletion(true)) {
        return -1;
    }

    return 0;
}

From source file:org.apache.mahout.graph.components.FindComponentsJob.java

License:Apache License

/**
 * Computes zone assignments for the vertices of the input graph and copies the final
 * assignments to the output path.
 *
 * <p>Phase one prepares an initial zone assignment file. Phase two repeats three steps
 * (assign one zone to each edge, find interzone edges, assign new zones to vertices) until a
 * round reports no connected zones through the {@code Counter.ZONES_CONNECTED} counter.</p>
 *
 * @param args command line arguments; {@code --tempDir} is read for intermediate output
 * @return 0 on success, -1 if the arguments cannot be parsed or any job fails
 * @throws Exception propagated from argument parsing, job execution or the final copy
 */
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();

    AtomicInteger currentPhase = new AtomicInteger();

    Path edgesPath = inputPath;
    Path zoneAssignmentsPath = new Path(tempDirPath, String.valueOf(System.currentTimeMillis()));

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /*
         * Prepare Input
         */
        Job prepareAssignments = prepareJob(edgesPath, zoneAssignmentsPath, SequenceFileInputFormat.class,
                PrepareAssignmentsFileMapper.class, Vertex.class, Vertex.class,
                PrepareAssignmentsFileReducer.class, Vertex.class, FlaggedVertex.class,
                SequenceFileOutputFormat.class);

        // Later jobs read this output; do not continue after a failure.
        if (!prepareAssignments.waitForCompletion(true)) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {

        /*
         * As long as there may be zones connected
         */
        while (true) {

            Path scatterEdgesAndAssignZoneOutputPath = new Path(tempDirPath,
                    String.valueOf(System.currentTimeMillis()));

            /*
             * Scatter edges and forward zone assignments,
             * assign one zone to edges
             */
            Job scatterEdgesAndAssignZone = prepareJob(
                    new Path(zoneAssignmentsPath.toString() + "," + edgesPath.toString()),
                    scatterEdgesAndAssignZoneOutputPath, SequenceFileInputFormat.class,
                    ScatterEdgesAndForwardZoneAssignmentsMapper.class, JoinableVertex.class,
                    FlaggedVertex.class, AssignOneZoneToEdgesReducer.class, UndirectedEdge.class, Vertex.class,
                    SequenceFileOutputFormat.class);
            scatterEdgesAndAssignZone.setGroupingComparatorClass(JoinableVertex.GroupingComparator.class);
            if (!scatterEdgesAndAssignZone.waitForCompletion(true)) {
                return -1;
            }

            Path findInterzoneEdgesOutputPath = new Path(tempDirPath,
                    String.valueOf(System.currentTimeMillis()));

            /*
             * Find interzone edges
             */
            Job findInterzoneEdges = prepareJob(scatterEdgesAndAssignZoneOutputPath,
                    findInterzoneEdgesOutputPath, SequenceFileInputFormat.class, Mapper.class,
                    UndirectedEdge.class, Vertex.class, FindInterzoneEdgesReducer.class, Vertex.class,
                    FlaggedVertex.class, SequenceFileOutputFormat.class);

            // The counter read below is only meaningful for a successful job.
            if (!findInterzoneEdges.waitForCompletion(true)) {
                return -1;
            }

            /*
             * Break if there are no new interzone edges
             */
            if (findInterzoneEdges.getCounters().findCounter(Counter.ZONES_CONNECTED).getValue() == 0L) {
                break;
            }

            Path assignNewZonesOutputPath = new Path(tempDirPath, String.valueOf(System.currentTimeMillis()));

            /*
             * Assign new zones
             */
            Job assignNewZones = prepareJob(
                    new Path(zoneAssignmentsPath.toString() + "," + findInterzoneEdgesOutputPath.toString()),
                    assignNewZonesOutputPath, SequenceFileInputFormat.class,
                    BinZoneAssignmentsAndInterzoneEdgesMapper.class, JoinableVertex.class, FlaggedVertex.class,
                    AssignNewZonesToVerticesReducer.class, Vertex.class, FlaggedVertex.class,
                    SequenceFileOutputFormat.class);

            assignNewZones.setGroupingComparatorClass(JoinableVertex.GroupingComparator.class);
            if (!assignNewZones.waitForCompletion(true)) {
                return -1;
            }

            // The next round joins against the freshly written assignments.
            zoneAssignmentsPath = assignNewZonesOutputPath;
        }
    }
    FileSystem system = FileSystem.get(getConf());
    FileUtil.copy(system, zoneAssignmentsPath, system, outputPath, false, getConf());
    return 0;
}

From source file:org.apache.mahout.graph.triangles.EnumerateTrianglesJob.java

License:Apache License

/**
 * Enumerates triangles from the input with three MapReduce jobs: build open triads at the
 * lower degree vertex, prepare the edges for joining, and join open triads with edges.
 *
 * @param args command line arguments (input and output options)
 * @return 0 if all jobs succeed, -1 if the arguments cannot be parsed or any job fails
 * @throws Exception propagated from argument parsing, job setup or job execution
 */
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    // The final join reads the output of both earlier jobs, so abort on the first failure.

    // scatter the edges to lower degree vertex and build open triads
    Job scatter = prepareJob(getInputPath(), getTempPath(TMP_OPEN_TRIADS), SequenceFileInputFormat.class,
            ScatterEdgesToLowerDegreeVertexMapper.class, Vertex.class, Vertex.class,
            BuildOpenTriadsReducer.class, JoinableUndirectedEdge.class, VertexOrMarker.class,
            SequenceFileOutputFormat.class);
    if (!scatter.waitForCompletion(true)) {
        return -1;
    }

    // necessary as long as we don't have access to an undeprecated MultipleInputs
    Job prepareInput = prepareJob(getInputPath(), getTempPath(TMP_CLOSING_EDGES), SequenceFileInputFormat.class,
            PrepareInputMapper.class, JoinableUndirectedEdge.class, VertexOrMarker.class, Reducer.class,
            JoinableUndirectedEdge.class, VertexOrMarker.class, SequenceFileOutputFormat.class);
    prepareInput.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    if (!prepareInput.waitForCompletion(true)) {
        return -1;
    }

    // join open triads and edges pairwise to get all triangles
    Job joinTriads = prepareJob(getCombinedTempPath(TMP_OPEN_TRIADS, TMP_CLOSING_EDGES), getOutputPath(),
            SequenceFileInputFormat.class, Mapper.class, JoinableUndirectedEdge.class, VertexOrMarker.class,
            JoinTrianglesReducer.class, Triangle.class, NullWritable.class, SequenceFileOutputFormat.class);
    joinTriads.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    if (!joinTriads.waitForCompletion(true)) {
        return -1;
    }

    return 0;
}

From source file:org.apache.mahout.math.hadoop.similarity.RowSimilarityJob.java

License:Apache License

/**
 * Computes row similarities of the input matrix in up to three phases: per-column weighted
 * occurrences, pairwise similarities, and conversion of the similarity entries into vectors.
 *
 * <p>Each phase only runs if {@code shouldRunNextPhase} allows it.</p>
 *
 * @param args command line arguments; {@code --numberOfColumns}, {@code --similarityClassname}
 *             and {@code --maxSimilaritiesPerRow} are registered here, {@code --tempDir} is
 *             read as well
 * @return 0 on success, -1 if the arguments cannot be parsed or any executed job fails
 * @throws IOException propagated from job setup or execution
 * @throws ClassNotFoundException propagated from job execution
 * @throws InterruptedException if waiting for a job is interrupted
 */
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix");
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerRow", "m",
            "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
    String similarityClassnameArg = parsedArgs.get("--similarityClassname");
    String distributedSimilarityClassname;
    try {
        distributedSimilarityClassname = SimilarityType.valueOf(similarityClassnameArg)
                .getSimilarityImplementationClassName();
    } catch (IllegalArgumentException iae) {
        // Not one of the predefined similarity names: treat the argument as a class name.
        distributedSimilarityClassname = similarityClassnameArg;
    }

    int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path weightsPath = new Path(tempDirPath, "weights");
    Path pairwiseSimilarityPath = new Path(tempDirPath, "pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    // Each phase reads the previous phase's output, so a failed job ends the run with -1.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job weights = prepareJob(inputPath, weightsPath, SequenceFileInputFormat.class, RowWeightMapper.class,
                VarIntWritable.class, WeightedOccurrence.class, WeightedOccurrencesPerColumnReducer.class,
                VarIntWritable.class, WeightedOccurrenceArray.class, SequenceFileOutputFormat.class);

        weights.getConfiguration().set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        if (!weights.waitForCompletion(true)) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, SequenceFileInputFormat.class,
                CooccurrencesMapper.class, WeightedRowPair.class, Cooccurrence.class, SimilarityReducer.class,
                SimilarityMatrixEntryKey.class, MatrixEntryWritable.class, SequenceFileOutputFormat.class);

        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        if (!pairwiseSimilarity.waitForCompletion(true)) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, outputPath, SequenceFileInputFormat.class,
                Mapper.class, SimilarityMatrixEntryKey.class, MatrixEntryWritable.class,
                EntriesToVectorsReducer.class, IntWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        asMatrix.setPartitionerClass(HashPartitioner.class);
        asMatrix.setGroupingComparatorClass(
                SimilarityMatrixEntryKey.SimilarityMatrixEntryKeyGroupingComparator.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        if (!asMatrix.waitForCompletion(true)) {
            return -1;
        }
    }

    return 0;
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java

License:Apache License

/**
 * pass1: generate collocations, ngrams/* ww w .j  a v a 2  s  .c  o m*/
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    job.waitForCompletion(true);

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}

From source file:org.apache.mahout.vectorizer.collocations.llr.CollocDriver.java

License:Apache License

/**
 * pass1: generate collocations, ngrams/*from w w w .  j  a v  a 2 s  .c o m*/
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}

From source file:org.apache.mrql.GroupByJoinPlan.java

License:Apache License

/**
 * The GroupByJoin operation: an equi-join combined with a group-by, implemented using hashing.
 *
 * @param left_join_key_fnc   left join key function from a to k
 * @param right_join_key_fnc  right join key function from b to k
 * @param left_groupby_fnc    left group-by function from a to k1
 * @param right_groupby_fnc   right group-by function from b to k2
 * @param accumulator_fnc     accumulator function from (c,(a,b)) to c
 * @param zero                the left zero of accumulator of type c
 * @param reduce_fnc          reduce function from ((k1,k2),c) to d
 * @param X                   left data set of type {a}
 * @param Y                   right data set of type {b}
 * @param num_reducers        number of reducers; only applied when positive
 * @param n                   left dimension of the reducer grid
 * @param m                   right dimension of the reducer grid
 * @param stop_counter        counter used in repeat operation; "-" means no counter is read
 * @return a DataSet that contains the result of type {d}
 * @throws Exception propagated from job setup or execution
 */
public final static DataSet groupByJoin(Tree left_join_key_fnc,
        Tree right_join_key_fnc,
        Tree left_groupby_fnc,
        Tree right_groupby_fnc,
        Tree accumulator_fnc,
        Tree zero,
        Tree reduce_fnc,
        DataSet X,
        DataSet Y,
        int num_reducers,
        int n, int m,
        String stop_counter)
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    final String resultPath = new_path(conf);

    // Store the string form of every plan function and the grid dimensions in the configuration.
    conf.set("mrql.join.key.left", left_join_key_fnc.toString());
    conf.set("mrql.join.key.right", right_join_key_fnc.toString());
    conf.set("mrql.groupby.left", left_groupby_fnc.toString());
    conf.set("mrql.groupby.right", right_groupby_fnc.toString());
    conf.setInt("mrql.m", m);
    conf.setInt("mrql.n", n);
    conf.set("mrql.accumulator", accumulator_fnc.toString());
    conf.set("mrql.zero", zero.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);

    final Job joinJob = new Job(conf, resultPath);
    distribute_compiled_arguments(joinJob.getConfiguration());

    // Key/value types, shuffle behaviour and output format.
    joinJob.setMapOutputKeyClass(GroupByJoinKey.class);
    joinJob.setJarByClass(GroupByJoinPlan.class);
    joinJob.setOutputKeyClass(MRContainer.class);
    joinJob.setOutputValueClass(MRContainer.class);
    joinJob.setPartitionerClass(GroupByJoinPartitioner.class);
    joinJob.setSortComparatorClass(GroupByJoinSortComparator.class);
    joinJob.setGroupingComparatorClass(GroupByJoinGroupingComparator.class);
    joinJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(joinJob, new Path(resultPath));

    // Left sources are mapped by MapperLeft, right sources by MapperRight.
    for (DataSource leftSource : X.source) {
        MultipleInputs.addInputPath(joinJob, new Path(leftSource.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) leftSource.inputFormat, MapperLeft.class);
    }
    for (DataSource rightSource : Y.source) {
        MultipleInputs.addInputPath(joinJob, new Path(rightSource.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) rightSource.inputFormat, MapperRight.class);
    }

    joinJob.setReducerClass(JoinReducer.class);
    if (num_reducers > 0) {
        joinJob.setNumReduceTasks(num_reducers);
    }
    joinJob.waitForCompletion(true);

    final long counterValue;
    if (stop_counter.equals("-")) {
        counterValue = 0;
    } else {
        counterValue = joinJob.getCounters().findCounter("mrql", stop_counter).getValue();
    }

    DataSource result = new BinaryDataSource(resultPath, conf);
    result.to_be_merged = false;
    return new DataSet(result, counterValue, MapReducePlan.outputRecords(joinJob));
}

From source file:org.apache.mrql.JoinOperation.java

License:Apache License

/**
 * The MapReduce2 physical operator (a reduce-side join).
 *
 * @param mx             left mapper function
 * @param my             right mapper function
 * @param combine_fnc    optional in-mapper combiner function (may be null)
 * @param reduce_fnc     reducer function
 * @param acc_fnc        optional accumulator function; only read when zero is not null
 * @param zero           optional zero value for the accumulator (may be null)
 * @param X              left data set
 * @param Y              right data set
 * @param num_reduces    number of reducers; only applied when positive
 * @param stop_counter   counter used in repeat operation; "-" means no counter is read
 * @param orderp         does the result need to be ordered?
 * @return a new data source that contains the result
 * @throws Exception propagated from job setup or execution
 */
public final static DataSet mapReduce2(Tree mx,
        Tree my,
        Tree combine_fnc,
        Tree reduce_fnc,
        Tree acc_fnc,
        Tree zero,
        DataSet X,
        DataSet Y,
        int num_reduces,
        String stop_counter,
        boolean orderp)
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    final String resultPath = new_path(conf);

    // Store the string form of the plan functions in the configuration.
    conf.set("mrql.mapper.left", mx.toString());
    conf.set("mrql.mapper.right", my.toString());
    if (combine_fnc != null) {
        conf.set("mrql.combiner", combine_fnc.toString());
    }
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero == null) {
        // No accumulator: the zero is stored as an empty string.
        conf.set("mrql.zero", "");
    } else {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    }
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);

    final Job joinJob = new Job(conf, resultPath);
    distribute_compiled_arguments(joinJob.getConfiguration());

    // Key/value types, shuffle behaviour and output format.
    joinJob.setMapOutputKeyClass(JoinKey.class);
    joinJob.setJarByClass(MapReducePlan.class);
    joinJob.setOutputKeyClass(MRContainer.class);
    joinJob.setOutputValueClass(MRContainer.class);
    joinJob.setPartitionerClass(MRContainerJoinPartitioner.class);
    joinJob.setSortComparatorClass(MRContainerSortComparator.class);
    joinJob.setGroupingComparatorClass(MRContainerGroupingComparator.class);
    joinJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(joinJob, new Path(resultPath));

    // Left sources are mapped by MapperLeft, right sources by MapperRight.
    for (DataSource leftSource : X.source) {
        MultipleInputs.addInputPath(joinJob, new Path(leftSource.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) leftSource.inputFormat, MapperLeft.class);
    }
    for (DataSource rightSource : Y.source) {
        MultipleInputs.addInputPath(joinJob, new Path(rightSource.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) rightSource.inputFormat, MapperRight.class);
    }

    if (Config.trace && PlanGeneration.streamed_MapReduce2_reducer(reduce_fnc)) {
        System.out.println("Streamed MapReduce2 reducer");
    }
    joinJob.setReducerClass(JoinReducer.class);
    if (num_reduces > 0) {
        joinJob.setNumReduceTasks(num_reduces);
    }
    joinJob.waitForCompletion(true);

    final long counterValue;
    if (stop_counter.equals("-")) {
        counterValue = 0;
    } else {
        counterValue = joinJob.getCounters().findCounter("mrql", stop_counter).getValue();
    }

    DataSource result = new BinaryDataSource(resultPath, conf);
    result.to_be_merged = orderp;
    return new DataSet(result, counterValue, outputRecords(joinJob));
}

From source file:org.apache.mrql.MapReduceOperation.java

License:Apache License

/**
 * The MapReduce physical operator.
 *
 * @param map_fnc          the mapper function
 * @param combine_fnc      optional in-mapper combiner function (may be null)
 * @param reduce_fnc       the reducer function
 * @param acc_fnc          optional accumulator function; only read when zero is not null
 * @param zero             optional zero value for the accumulator (may be null)
 * @param source           the input data source
 * @param num_reduces      number of reducers; only applied when positive
 * @param stop_counter     counter used in repeat operation; "-" means no counter is read
 * @param orderp           does the result need to be ordered?
 * @return a new data source that contains the result
 * @throws Exception propagated from job setup or execution
 */
public final static DataSet mapReduce(Tree map_fnc,
        Tree combine_fnc,
        Tree reduce_fnc,
        Tree acc_fnc,
        Tree zero,
        DataSet source,
        int num_reduces,
        String stop_counter,
        boolean orderp)
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    final String resultPath = new_path(conf);

    // Store the string form of the plan functions in the configuration.
    conf.set("mrql.mapper", map_fnc.toString());
    if (combine_fnc != null) {
        conf.set("mrql.combiner", combine_fnc.toString());
    }
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero == null) {
        // No accumulator: the zero is stored as an empty string.
        conf.set("mrql.zero", "");
    } else {
        // will use in-mapper combiner
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    }
    conf.set("mrql.counter", stop_counter);
    setupSplits(source, conf);

    final Job mrJob = new Job(conf, resultPath);
    distribute_compiled_arguments(mrJob.getConfiguration());

    // Output types, shuffle behaviour and output format; the same comparator sorts and groups.
    mrJob.setJarByClass(MapReducePlan.class);
    mrJob.setOutputKeyClass(MRContainer.class);
    mrJob.setOutputValueClass(MRContainer.class);
    mrJob.setPartitionerClass(MRContainerPartitioner.class);
    mrJob.setSortComparatorClass(MRContainerKeyComparator.class);
    mrJob.setGroupingComparatorClass(MRContainerKeyComparator.class);
    mrJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    for (DataSource input : source.source) {
        MultipleInputs.addInputPath(mrJob, new Path(input.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) input.inputFormat, MRMapper.class);
    }
    FileOutputFormat.setOutputPath(mrJob, new Path(resultPath));

    mrJob.setReducerClass(MRReducer.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce_reducer(reduce_fnc)) {
        System.out.println("Streamed MapReduce reducer");
    }
    if (num_reduces > 0) {
        mrJob.setNumReduceTasks(num_reduces);
    }
    mrJob.waitForCompletion(true);

    final long counterValue;
    if (stop_counter.equals("-")) {
        counterValue = 0;
    } else {
        counterValue = mrJob.getCounters().findCounter("mrql", stop_counter).getValue();
    }

    DataSource result = new BinaryDataSource(resultPath, conf);
    result.to_be_merged = orderp;
    return new DataSet(result, counterValue, outputRecords(mrJob));
}