Example usage for org.apache.hadoop.mapred JobConf setCombinerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf.setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> theClass) 

Document

Set the user-defined combiner class used to combine map-outputs before being sent to the reducers.
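
As a quick orientation before the full examples, here is a minimal, self-contained sketch of the usual pattern: when the reduce operation is associative and commutative, the reducer class can double as the combiner so that partial results are merged on the map side before the shuffle. This sketch is not taken from any of the sources below; the class name TokenCountWithCombiner and the argument-based input/output paths are placeholders, while TokenCountMapper and LongSumReducer are stock helpers from org.apache.hadoop.mapred.lib.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.LongSumReducer;
import org.apache.hadoop.mapred.lib.TokenCountMapper;

public class TokenCountWithCombiner {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(TokenCountWithCombiner.class);
        conf.setJobName("token-count-with-combiner");

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(LongWritable.class);

        // TokenCountMapper emits (token, 1) for every token in the input.
        conf.setMapperClass(TokenCountMapper.class);

        // Summing counts is associative and commutative, so the reducer can
        // also serve as the combiner: partial counts are merged on the map
        // side before the map outputs are sent to the reducers.
        conf.setCombinerClass(LongSumReducer.class);
        conf.setReducerClass(LongSumReducer.class);

        // args[0] and args[1] are placeholder input and output paths.
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

Keep in mind that the framework may invoke the combiner zero, one, or several times for a given key, so it must produce the same key/value types as the map output and must not change the final result when applied repeatedly.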

Usage

From source file: edu.umd.cloud9.io.benchmark.HadoopSortRandomPairsOfInts.java

License: Apache License

/**
 * Runs this benchmark.
 */
public static void main(String[] args) throws IOException {
    String inputPath = "random-pairs.seq";
    String outputPath = "random-pairs.sorted";
    int numMapTasks = 1;
    int numReduceTasks = 1;

    JobConf conf = new JobConf(HadoopSortRandomPairsOfInts.class);
    conf.setJobName("SortRandomPairsOfInts");

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(PairOfInts.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(IdentityReducer.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime;
    double duration;

    startTime = System.currentTimeMillis();

    JobClient.runJob(conf);

    duration = (System.currentTimeMillis() - startTime) / 1000.0;
    System.out.println("Job took " + duration + " seconds");
}

From source file: edu.umd.cloud9.pagerank.RunPageRankBasic.java

License: Apache License

private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    // we need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log)
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    sLogger.info("PageRank: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    conf.setInt("NodeCount", n);

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    JobClient.runJob(conf);

    float mass = Float.NEGATIVE_INFINITY;
    FileSystem fs = FileSystem.get(conf);
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}

From source file: edu.umd.cloud9.pagerank.RunPageRankBasic.java

License: Apache License

private void phase2(String path, int i, int j, int n, float missing) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    sLogger.info("missing PageRank mass: " + missing);
    sLogger.info("number of nodes: " + n);

    String in = path + "/iter" + sFormat.format(j) + "t";
    String out = path + "/iter" + sFormat.format(j);

    sLogger.info("PageRank: iteration " + j + ": Phase2");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);

    int numMapTasks = FileSystem.get(conf).listStatus(new Path(in)).length;
    int numReduceTasks = 0;

    conf.setJobName("PageRank:Basic:iteration" + j + ":Phase2");
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.setFloat("MissingMass", (float) missing);
    conf.setInt("NodeCount", n);

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    conf.setMapperClass(MapPageRankMassDistributionClass.class);
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(IdentityReducer.class);

    FileSystem.get(conf).delete(new Path(out), true);

    JobClient.runJob(conf);
}

From source file: edu.umd.cloud9.pagerank.RunPageRankSchimmy.java

License: Apache License

private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // we need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log)
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    conf.setInt("NodeCount", n);

    Partitioner p = null;

    if (useRange) {
        p = new RangePartitioner<IntWritable, Writable>();
        p.configure(conf);
    } else {
        p = new HashPartitioner<WritableComparable, Writable>();
    }

    // this is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so we need to open up each
    // partition and peek inside to find out which partition it holds.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        if (f.getPath().getName().contains("_logs"))
            continue;

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);

        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        sLogger.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + "\t");
    }

    sLogger.info(sb.toString().trim());

    sLogger.info("PageRankSchimmy: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - numPartitions: " + numPartitions);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(FloatWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    JobClient.runJob(conf);

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}

From source file: edu.umd.cloud9.pagerank.RunPageRankSchimmy.java

License: Apache License

private void phase2(String path, int i, int j, int n, float missing) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    sLogger.info("missing PageRank mass: " + missing);
    sLogger.info("number of nodes: " + n);

    String in = path + "/iter" + sFormat.format(j) + "t";
    String out = path + "/iter" + sFormat.format(j);

    sLogger.info("PageRankSchimmy: iteration " + j + ": Phase2");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);

    int numMapTasks = FileSystem.get(conf).listStatus(new Path(in)).length;
    int numReduceTasks = 0;

    conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase2");
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    conf.setMapperClass(MapPageRankMassDistributionClass.class);
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setFloat("MissingMass", (float) missing);
    conf.setInt("NodeCount", n);

    FileSystem.get(conf).delete(new Path(out), true);

    JobClient.runJob(conf);
}

From source file: edu.umd.cloud9.webgraph.ClueExtractLinks.java

License: Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), ClueExtractLinks.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");
    String mappingFile = conf.get("Cloud9.DocnoMappingFile");

    if (!fs.exists(new Path(mappingFile))) {
        throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");
    }

    DistributedCache.addCacheFile(new URI(mappingFile), conf);

    conf.setJobName("ClueExtractLinks");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    conf.set("mapreduce.task.timeout", "60000000");

    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);
    // TODO: to read!!
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    LOG.info("ClueExtractLinks");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - mapping file: " + mappingFile);
    LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        LOG.info(outputPath + " already exists! Skipping this step...");
    }
    return 0;
}

From source file: edu.umd.cloud9.webgraph.ExtractLinks.java

License: Apache License

public int runTool() throws Exception {

    JobConf conf = new JobConf(getConf(), ExtractLinks.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");
    String mappingFile = conf.get("Cloud9.DocnoMappingFile");

    if (!fs.exists(new Path(mappingFile)))
        throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");

    DistributedCache.addCacheFile(new URI(mappingFile), conf);

    conf.setJobName("ExtractLinks");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);

    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ArrayListWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(conf, inputPath);

    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    LOG.info("ExtractLinks");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - mapping file: " + mappingFile);
    LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        LOG.info(outputPath + " already exists! Skipping this step...");
    }

    return 0;
}

From source file: edu.umn.cs.spatialHadoop.operations.Aggregate.java

License: Open Source License

/**
 * Computes the minimum and maximum values over the input files by issuing
 * a MapReduce job that aggregates them, reusing the reducer as the combiner.
 * @param files input files to aggregate
 * @param params operation parameters (may include a "rect" filter range)
 * @return the aggregated minimum and maximum values
 * @throws IOException
 */
public static MinMax aggregateMapReduce(Path[] files, OperationsParams params) throws IOException {
    Shape plotRange = params.getShape("rect");
    JobConf job = new JobConf(params, Aggregate.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path("agg_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("Aggregate");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(MinMax.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeInputFormat.class);
    job.setClass("shape", NASAPoint.class, Shape.class);
    if (plotRange != null) {
        job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
    }

    job.setOutputFormat(TextOutputFormat.class);

    ShapeInputFormat.setInputPaths(job, files);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);

    // Read job result
    FileStatus[] results = outFs.listStatus(outputPath);
    MinMax minMax = new MinMax();
    for (FileStatus status : results) {
        if (status.getLen() > 0 && status.getPath().getName().startsWith("part-")) {
            BufferedReader reader = new BufferedReader(new InputStreamReader(outFs.open(status.getPath())));
            String line;
            MinMax value = new MinMax();
            while ((line = reader.readLine()) != null) {
                value.fromText(new Text(line));
                minMax.expand(value);
            }
            reader.close();
        }
    }

    outFs.delete(outputPath, true);

    return minMax;
}

From source file: edu.umn.cs.spatialHadoop.operations.CatUnion.java

License: Open Source License

/**
 * Calculates the union of a set of shapes categorized by some user-defined
 * category.
 * @param shapeFile - Input file that contains shapes
 * @param categoryFile - Category file that contains the category of each
 *  shape. Shapes not appearing in this file are not generated in the output.
 * @param output - An output file that contains each category and the union
 *  of all shapes in it. Each line contains the category, then a comma,
 *  then the union represented as text.
 * @param params - Operation parameters
 * @throws IOException
 */
public static void unionMapReduce(Path shapeFile, Path categoryFile, Path output, OperationsParams params)
        throws IOException {

    JobConf job = new JobConf(params, CatUnion.class);
    job.setJobName("Union");

    // Check output file existence
    FileSystem outFs = output.getFileSystem(job);
    if (outFs.exists(output)) {
        if (params.getBoolean("overwrite", false)) {
            outFs.delete(output, true);
        } else {
            throw new RuntimeException("Output path already exists and -overwrite flag is not set");
        }
    }

    // Set map and reduce
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));

    job.setMapperClass(UnionMapper.class);
    job.setCombinerClass(UnionReducer.class);
    job.setReducerClass(UnionReducer.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Set input and output
    job.setInputFormat(ShapeLineInputFormat.class);
    TextInputFormat.addInputPath(job, shapeFile);
    DistributedCache.addCacheFile(categoryFile.toUri(), job);

    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, output);

    // Start job
    JobClient.runJob(job);
}

From source file: edu.umn.cs.spatialHadoop.operations.ConvexHull.java

License: Open Source License

public static void convexHullMapReduce(Path inFile, Path userOutPath, OperationsParams params)
        throws IOException {
    JobConf job = new JobConf(params, ConvexHull.class);
    Path outPath = userOutPath;
    FileSystem outFs = (userOutPath == null ? inFile : userOutPath).getFileSystem(job);
    Shape shape = params.getShape("shape");

    if (outPath == null) {
        do {
            outPath = new Path(inFile.toUri().getPath() + ".convex_hull_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outPath));
    } else {
        if (outFs.exists(outPath)) {
            if (params.getBoolean("overwrite", false)) {
                outFs.delete(outPath, true);
            } else {
                throw new RuntimeException("Output path already exists and -overwrite flag is not set");
            }
        }
    }

    job.setJobName("ConvexHull");
    job.setClass(SpatialSite.FilterClass, ConvexHullFilter.class, BlockFilter.class);
    job.setMapperClass(IdentityMapper.class);
    job.setCombinerClass(ConvexHullReducer.class);
    job.setReducerClass(ConvexHullReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(shape.getClass());
    job.setInputFormat(ShapeInputFormat.class);
    ShapeInputFormat.addInputPath(job, inFile);
    job.setOutputFormat(GridOutputFormat2.class);
    GridOutputFormat2.setOutputPath(job, outPath);

    JobClient.runJob(job);

    // If outputPath not set by user, automatically delete it
    if (userOutPath == null)
        outFs.delete(outPath, true);
}