Example usage for org.apache.hadoop.mapred JobConf setOutputValueGroupingComparator

List of usage examples for org.apache.hadoop.mapred JobConf setOutputValueGroupingComparator

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setOutputValueGroupingComparator.

Prototype

public void setOutputValueGroupingComparator(Class<? extends RawComparator> theClass) 

Source Link

Document

Set the user defined RawComparator comparator for grouping keys in the input to the reduce.

Usage

From source file:NaivePageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    int iteration = -1;
    String inputPath = args[0];//www.  j  av a 2 s  . com
    String outputPath = args[1];
    int specIteration = 0;
    if (args.length > 2) {
        specIteration = Integer.parseInt(args[2]);
    }
    int numNodes = 100000;
    if (args.length > 3) {
        numNodes = Integer.parseInt(args[3]);
    }
    int numReducers = 32;
    if (args.length > 4) {
        numReducers = Integer.parseInt(args[4]);
    }
    System.out.println("specified iteration: " + specIteration);
    long start = System.currentTimeMillis();

    /**
     * job to count out-going links for each url
     */
    JobConf conf = new JobConf(NaivePageRank.class);
    conf.setJobName("PageRank-Count");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(CountMapper.class);
    conf.setReducerClass(CountReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/count"));
    conf.setNumReduceTasks(numReducers);
    JobClient.runJob(conf);

    /******************** Initial Rank Assignment Job ***********************/
    conf = new JobConf(NaivePageRank.class);
    conf.setJobName("PageRank-Initialize");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(InitialRankAssignmentMapper.class);
    conf.setReducerClass(InitialRankAssignmentReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
    conf.setNumReduceTasks(numReducers);
    // conf.setIterative(false);
    JobClient.runJob(conf);
    iteration++;

    do {
        /****************** Join Job ********************************/
        conf = new JobConf(NaivePageRank.class);
        conf.setJobName("PageRank-Join");
        conf.setOutputKeyClass(Text.class);
        // conf.setOutputValueClass(Text.class);
        conf.setMapperClass(ComputeRankMap.class);
        conf.setReducerClass(ComputeRankReduce.class);
        conf.setMapOutputKeyClass(TextPair.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setPartitionerClass(FirstPartitioner.class);
        conf.setOutputKeyComparatorClass(KeyComparator.class);
        conf.setOutputValueGroupingComparator(GroupComparator.class);

        // relation table
        FileInputFormat.setInputPaths(conf, new Path(inputPath));
        // rank table
        FileInputFormat.addInputPath(conf, new Path(outputPath + "/i" + (iteration - 1)));
        // count table
        FileInputFormat.addInputPath(conf, new Path(outputPath + "/count"));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
        conf.setNumReduceTasks(numReducers);
        JobClient.runJob(conf);
        iteration++;

        /******************** Rank Aggregate Job ***********************/
        conf = new JobConf(NaivePageRank.class);
        conf.setJobName("PageRank-Aggregate");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapperClass(RankAggregateMapper.class);
        conf.setReducerClass(RankAggregateReducer.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(outputPath + "/i" + (iteration - 1)));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
        conf.setNumReduceTasks(numReducers);
        conf.setInt("haloop.num.nodes", numNodes);
        JobClient.runJob(conf);
        iteration++;
    } while (iteration < 2 * specIteration);

    long end = System.currentTimeMillis();
    System.out.println("running time " + (end - start) / 1000 + "s");
}

From source file:cascading.flow.FlowStep.java

License:Open Source License

protected JobConf getJobConf(JobConf parentConf) throws IOException {
    JobConf conf = parentConf == null ? new JobConf() : new JobConf(parentConf);

    // set values first so they can't break things downstream
    if (hasProperties()) {
        for (Map.Entry entry : getProperties().entrySet())
            conf.set(entry.getKey().toString(), entry.getValue().toString());
    }/*from   w w  w . j a  v  a 2s  .c o  m*/

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepName());

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapperClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(conf);

    initFromSink(conf);

    initFromTraps(conf);

    if (sink.getScheme().getNumSinkParts() != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(sink.getScheme().getNumSinkParts());
        else
            conf.setNumMapTasks(sink.getScheme().getNumSinkParts());
    }

    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getGroupingSelectors());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors());

        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    conf.setInt("cascading.flow.step.id", id);
    conf.set("cascading.flow.step", Util.serializeBase64(this));

    return conf;
}

From source file:cascading.flow.hadoop.HadoopFlowStep.java

License:Open Source License

public JobConf createInitializedConfig(FlowProcess<JobConf> flowProcess, JobConf parentConfig) {
    JobConf conf = parentConfig == null ? new JobConf() : HadoopUtil.copyJobConf(parentConfig);

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepDisplayName(conf.getInt("cascading.display.id.truncate", Util.ID_LENGTH)));

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapRunnerClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(flowProcess, conf);/*from  www  .jav a 2 s  .c  o  m*/

    initFromSink(flowProcess, conf);

    initFromTraps(flowProcess, conf);

    initFromStepConfigDef(conf);

    int numSinkParts = getSink().getScheme().getNumSinkParts();

    if (numSinkParts != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(numSinkParts);
        else
            conf.setNumMapTasks(numSinkParts);
    } else if (getGroup() != null) {
        int gatherPartitions = conf.getNumReduceTasks();

        if (gatherPartitions == 0)
            gatherPartitions = conf.getInt(FlowRuntimeProps.GATHER_PARTITIONS, 0);

        if (gatherPartitions == 0)
            throw new FlowException(getName(),
                    "a default number of gather partitions must be set, see FlowRuntimeProps");

        conf.setNumReduceTasks(gatherPartitions);
    }

    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);
        conf.setPartitionerClass(GroupingPartitioner.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getKeySelectors(), this, getGroup());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors(), this,
                    getGroup());

        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingSortingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    String versionString = Version.getRelease();

    if (versionString != null)
        conf.set("cascading.version", versionString);

    conf.set(CASCADING_FLOW_STEP_ID, getID());
    conf.set("cascading.flow.step.num", Integer.toString(getOrdinal()));

    HadoopUtil.setIsInflow(conf);

    Iterator<FlowNode> iterator = getFlowNodeGraph().getTopologicalIterator();

    String mapState = pack(iterator.next(), conf);
    String reduceState = pack(iterator.hasNext() ? iterator.next() : null, conf);

    // hadoop 20.2 doesn't like dist cache when using local mode
    int maxSize = Short.MAX_VALUE;

    int length = mapState.length() + reduceState.length();

    if (isHadoopLocalMode(conf) || length < maxSize) // seems safe
    {
        conf.set("cascading.flow.step.node.map", mapState);

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce", reduceState);
    } else {
        conf.set("cascading.flow.step.node.map.path",
                HadoopMRUtil.writeStateToDistCache(conf, getID(), "map", mapState));

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce.path",
                    HadoopMRUtil.writeStateToDistCache(conf, getID(), "reduce", reduceState));
    }

    return conf;
}

From source file:com.datasalt.pangool.benchmark.urlresolution.HadoopUrlResolution.java

License:Apache License

public final static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: urlresolution <url-map> <url-register> <out>");
        System.exit(2);/*from w w  w  . j  a  v  a2s . c  om*/
    }
    JobConf job = new JobConf(conf);
    FileSystem fS = FileSystem.get(conf);
    fS.delete(new Path(otherArgs[2]), true);

    MultipleInputs.addInputPath(job, new Path(otherArgs[0]), TextInputFormat.class, UrlMapClass.class);
    MultipleInputs.addInputPath(job, new Path(otherArgs[1]), TextInputFormat.class, UrlRegisterMapClass.class);

    job.setJarByClass(HadoopUrlResolution.class);

    job.setPartitionerClass(KeyPartitioner.class);
    job.setOutputValueGroupingComparator(GroupingComparator.class);

    job.setMapOutputKeyClass(UrlRegJoinUrlMap.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

    Job j = new Job(job);
    j.setReducerClass(Reduce.class);
    j.waitForCompletion(true);
}

From source file:com.hadoop.secondarysort.SecondarySort_MapRed.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysrot <in> <out>");
        System.exit(2);//from  www. j  a v a  2  s  . co  m
    }

    JobConf jobConf = new JobConf(conf);
    jobConf.setMapperClass(MapClass.class);
    jobConf.setReducerClass(Reduce.class);

    jobConf.setPartitionerClass(FirstPartitioner.class);
    jobConf.setOutputValueGroupingComparator(FirstGroupingComparator.class);

    jobConf.setMapOutputKeyClass(IntPair.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(IntWritable.class);

    //
    // Job job = new Job(conf, "secondary sort");
    // job.setJarByClass(SecondarySort_MapRed.class);
    // job.setMapperClass(MapClass.class);
    // job.setReducerClass(Reduce.class);
    //
    // // group and partition by the first int in the pair
    // job.setPartitionerClass(FirstPartitioner.class);
    // job.setGroupingComparatorClass(FirstGroupingComparator.class);
    // conf.setClass("mapred.output.key.comparator.class",
    // KeyComparator.class, RawComparator.class);
    // // job.setSortComparatorClass(SecondGroupingComparator.class);
    // // the map output is IntPair, IntWritable
    // job.setMapOutputKeyClass(IntPair.class);
    // job.setMapOutputValueClass(IntWritable.class);
    //
    // // the reduce output is Text, IntWritable
    // job.setOutputKeyClass(Text.class);
    // job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(jobConf, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs[1]));

}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMR.java

License:Open Source License

/**
 * /*from w w w  .  jav  a 2  s .  c o m*/
 * @param fname    null if no comparison required
 * @param fnameNew
 * @param srcFnames
 * @param ii
 * @param oi
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
@SuppressWarnings({ "unused", "deprecation" })
protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi,
        long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-RMMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(ResultMergeRemoteMR.class);
    job.setJobName(jobname + _pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    //warning for textcell/binarycell without compare
    boolean withCompare = (fname != null);
    if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare
            && ResultMergeLocalFile.ALLOW_COPY_CELLFILES)
        LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi)
                + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR.");

    try {
        Path pathCompare = null;
        Path pathNew = new Path(fnameNew);

        /////
        //configure the MR job
        if (withCompare) {
            pathCompare = new Path(fname).makeQualified(FileSystem.get(job));
            MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen,
                    bclen);
        } else
            MRJobConfiguration.setResultMergeInfo(job, "null", ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, bclen,
                    bclen);

        //set mappers, reducers, combiners
        job.setMapperClass(ResultMergeRemoteMapper.class);
        job.setReducerClass(ResultMergeRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            //setup partitioning, grouping, sorting for composite key (old API)
            job.setPartitionerClass(ResultMergeRemotePartitioning.class); //partitioning
            job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class); //grouping
            job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class); //sorting

            job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixBlock.class);
        }

        //set input format 
        job.setInputFormat(ii.inputFormatClass);

        //set the input path 
        Path[] paths = null;
        if (withCompare) {
            paths = new Path[srcFnames.length + 1];
            paths[0] = pathCompare;
            for (int i = 1; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i - 1]);
        } else {
            paths = new Path[srcFnames.length];
            for (int i = 0; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i]);
        }
        FileInputFormat.setInputPaths(job, paths);

        //set output format
        job.setOutputFormat(oi.outputFormatClass);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        FileOutputFormat.setOutputPath(job, pathNew);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = _numReducers;
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1);
        else //textcell/binarycell
            reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1);
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //use FLEX scheduler configuration properties
        if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", _numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", _numMappers);
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set("mapred.compress.map.output", "true");
        //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt("dfs.replication", _replication);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job   

        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}

From source file:com.manning.hip.ch4.joins.improved.impl.OptimizedDataJoinJob.java

License:Apache License

public static JobConf createDataJoinJob(String args[]) throws IOException {

    String inputDir = args[0];//from w ww. ja  v  a2  s  .  c o  m
    String outputDir = args[1];
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
        System.out.println("Using TextInputFormat: " + args[2]);
        inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileOutputFormat: " + args[7]);
        outputFormat = SequenceFileOutputFormat.class;
        outputValueClass = getClassByName(args[7]);
    } else {
        System.out.println("Using TextOutputFormat: " + args[7]);
    }
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
        maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
        jobName = args[9];
    }
    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, OptimizedDataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);

    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir));
    FileInputFormat.setInputPaths(job, inputDir);

    job.setInputFormat(inputFormat);

    job.setMapperClass(mapper);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);

    job.setPartitionerClass(CompositeKeyPartitioner.class);
    job.setOutputKeyComparatorClass(CompositeKeyComparator.class);
    job.setOutputValueGroupingComparator(CompositeKeyOnlyComparator.class);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 3) {
            JobBuilder.printUsage(this, "<ncdc input> <station input> <output>");
            return -1;
        }//  w  w  w.ja v  a2s.com

        JobConf conf = new JobConf(getConf(), getClass());
        conf.setJobName("Join record with station name");

        Path ncdcInputPath = new Path(args[0]);
        Path stationInputPath = new Path(args[1]);
        Path outputPath = new Path(args[2]);

        MultipleInputs.addInputPath(conf, ncdcInputPath, TextInputFormat.class, JoinRecordMapper.class);
        MultipleInputs.addInputPath(conf, stationInputPath, TextInputFormat.class, JoinStationMapper.class);
        FileOutputFormat.setOutputPath(conf, outputPath);

        /*[*/conf.setPartitionerClass(KeyPartitioner.class);
        conf.setOutputValueGroupingComparator(TextPair.FirstComparator.class);/*]*/

        conf.setMapOutputKeyClass(TextPair.class);

        conf.setReducerClass(JoinReducer.class);

        conf.setOutputKeyClass(Text.class);

        JobClient.runJob(conf);
        return 0;
    }

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
    public int run(String[] args) throws IOException {
        JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
        if (conf == null) {
            return -1;
        }/*from   w w w . j  ava 2 s.c  om*/

        conf.setMapperClass(MaxTemperatureMapper.class);
        /*[*/conf.setPartitionerClass(FirstPartitioner.class);
        /*]*/
        /*[*/conf.setOutputKeyComparatorClass(KeyComparator.class);
        /*]*/
        /*[*/conf.setOutputValueGroupingComparator(GroupComparator.class);/*]*/
        conf.setReducerClass(MaxTemperatureReducer.class);
        conf.setOutputKeyClass(IntPair.class);
        conf.setOutputValueClass(NullWritable.class);

        JobClient.runJob(conf);
        return 0;
    }

From source file:de.l3s.streamcorpus.mapreduce.TerrierIndexing.java

License:Mozilla Public License

/** Starts the MapReduce indexing.
 * @param args//from  w w  w.  j a v  a 2  s  . c o m
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    // For the moment: Hard-code the terrier home to quick test
    System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer");

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    if (args.length == 2 && args[0].equals("-p")) {
        logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return 0;
    } else if (args.length == 0) {
        logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    }

    /*else
    {
       logger.fatal(usage());
       return 0;
    }*/

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return 0;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");
    final JobConf conf = jf.newJob();
    conf.setJarByClass(TerrierIndexing.class);
    conf.setJobName("StreamCorpusIndexer: Terrier Indexing");
    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 0;
    }

    // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    boolean blockIndexing = true;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }
    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);
    //parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

    // not sure if this is effective in YARN
    conf.setNumMapTasks(2000);

    // increase the heap usage
    conf.set("mapreduce.map.memory.mb", "6100");
    conf.set("mapred.job.map.memory.mb", "6100");
    conf.set("mapreduce.reduce.memory.mb", "6144");
    conf.set("mapred.job.reduce.memory.mb", "6144");

    conf.set("mapreduce.map.java.opts", "-Xmx6100m");
    conf.set("mapred.map.child.java.opts", "-Xmx6100m");
    conf.set("mapreduce.reduce.java.opts", "-Xmx6144m");
    conf.set("mapred.reduce.child.opts", "-Xmx6144m");

    //conf.setBoolean("mapred.used.genericoptionsparser", true) ;

    // This is the nasty thing in MapReduce v2 and YARN: They always prefer their ancient jars first. Set this on to say you don't like it
    conf.set("mapreduce.job.user.classpath.first", "true");

    // increase the yarn memory to 10 GB
    conf.set("yarn.nodemanager.resource.memory-mb", "12288");
    conf.set("yarn.nodemanager.resource.cpu-vcores", "16");
    conf.set("yarn.scheduler.minimum-allocation-mb", "4096");

    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        //for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    /*JobID jobId = null;
    boolean ranOK = true;
    try{
       RunningJob rj = JobClient.runJob(conf);
       jobId = rj.getID();
       HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) { 
       logger.error("Problem running job", e);
       e.printStackTrace();
       ranOK = false;
    }
    if (jobId != null)
    {
       deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }  */

    //if (ranOK)
    //{
    System.out.println("Merging indices");
    if (!docPartitioned) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
    }

    Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
            docPartitioned ? numberOfReducers : 1, jf);
    //}
    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
    return 0;
}