Usage examples for org.apache.hadoop.mapred.JobConf.setCombinerClass
public void setCombinerClass(Class<? extends Reducer> theClass)
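setCombinerClass sets the user-defined combiner class used to combine map outputs locally before they are sent to the reducers; the class passed in must itself implement Reducer. As a minimal sketch before the real-world examples below, the classic word count shows the common pattern of reusing the Reducer as the combiner (the WordCountWithCombiner class and its Map/Reduce inner classes are hypothetical, made up for illustration):

    import java.io.IOException;
    import java.util.Iterator;
    import java.util.StringTokenizer;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reducer;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapred.TextInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;

    public class WordCountWithCombiner {

        // Emits (word, 1) for every token in the input line.
        public static class Map extends MapReduceBase
                implements Mapper<LongWritable, Text, Text, IntWritable> {
            private static final IntWritable ONE = new IntWritable(1);
            private final Text word = new Text();

            public void map(LongWritable key, Text value,
                    OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
                StringTokenizer tokenizer = new StringTokenizer(value.toString());
                while (tokenizer.hasMoreTokens()) {
                    word.set(tokenizer.nextToken());
                    output.collect(word, ONE);
                }
            }
        }

        // Sums the counts for a word; usable as both combiner and reducer.
        public static class Reduce extends MapReduceBase
                implements Reducer<Text, IntWritable, Text, IntWritable> {
            public void reduce(Text key, Iterator<IntWritable> values,
                    OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
                int sum = 0;
                while (values.hasNext()) {
                    sum += values.next().get();
                }
                output.collect(key, new IntWritable(sum));
            }
        }

        public static void main(String[] args) throws IOException {
            JobConf conf = new JobConf(WordCountWithCombiner.class);
            conf.setJobName("wordcount");

            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(IntWritable.class);

            conf.setMapperClass(Map.class);
            // Because the reduce function is a commutative, associative sum,
            // the same class can combine map outputs locally on each map node.
            conf.setCombinerClass(Reduce.class);
            conf.setReducerClass(Reduce.class);

            conf.setInputFormat(TextInputFormat.class);
            conf.setOutputFormat(TextOutputFormat.class);

            FileInputFormat.setInputPaths(conf, new Path(args[0]));
            FileOutputFormat.setOutputPath(conf, new Path(args[1]));

            JobClient.runJob(conf);
        }
    }

Reusing the reducer as a combiner is only safe when the reduce function is commutative and associative and its input and output key/value types match, which is why several of the examples below pass their Reduce class to both setCombinerClass and setReducerClass, while others (for example TrackStatisticsProgram) register a dedicated combiner class.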
From source file:edu.umn.cs.spatialHadoop.operations.FileMBR.java
License:Open Source License
/**
 * Computes the MBR of the input files using an aggregate MapReduce job.
 *
 * @param inFiles - Paths to the input files
 * @param params - Additional operation parameters
 * @return
 * @throws IOException
 * @throws InterruptedException
 */
private static <S extends Shape> Partition fileMBRMapReduce(Path[] inFiles, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, FileMBR.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(inFiles[0].getName() + ".mbr_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("FileMBR");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Partition.class);

    job.setMapperClass(FileMBRMapper.class);
    job.setReducerClass(Reduce.class);
    job.setCombinerClass(Combine.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeInputFormat.setInputPaths(job, inFiles);
    TextOutputFormat.setOutputPath(job, outputPath);
    job.setOutputCommitter(MBROutputCommitter.class);

    // Submit the job
    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
        // Use multithreading too
        job.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors());
    }
    if (params.getBoolean("background", false)) {
        JobClient jc = new JobClient(job);
        lastSubmittedJob = jc.submitJob(job);
        return null;
    } else {
        lastSubmittedJob = JobClient.runJob(job);
        Counters counters = lastSubmittedJob.getCounters();
        Counter outputSizeCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES);
        sizeOfLastProcessedFile = outputSizeCounter.getCounter();

        FileStatus[] outFiles = outFs.listStatus(outputPath, SpatialSite.NonHiddenFileFilter);
        Partition mbr = new Partition();
        mbr.set(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
        OperationsParams localMBRParams = new OperationsParams(params);
        localMBRParams.setBoolean("local", true); // Enforce local execution
        localMBRParams.setClass("shape", Partition.class, Shape.class);
        for (FileStatus outFile : outFiles) {
            if (outFile.isDir())
                continue;
            ShapeRecordReader<Partition> reader = new ShapeRecordReader<Partition>(localMBRParams,
                    new FileSplit(outFile.getPath(), 0, outFile.getLen(), new String[0]));
            Rectangle key = reader.createKey();
            Partition p = reader.createValue();
            while (reader.next(key, p)) {
                mbr.expand(p);
            }
            reader.close();
        }

        outFs.delete(outputPath, true);
        return mbr;
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.RecordCount.java
License:Open Source License
/**
 * Counts the exact number of lines in a file by issuing a MapReduce job
 * that performs the counting.
 * @param fs
 * @param inFile
 * @return
 * @throws IOException
 * @throws InterruptedException
 */
public static long recordCountMapReduce(FileSystem fs, Path inFile) throws IOException, InterruptedException {
    JobConf job = new JobConf(RecordCount.class);

    Path outputPath = new Path(inFile.toUri().getPath() + ".linecount");
    FileSystem outFs = outputPath.getFileSystem(job);
    outFs.delete(outputPath, true);

    job.setJobName("LineCount");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setCombinerClass(Reduce.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(1);

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeLineInputFormat.setInputPaths(job, inFile);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);

    // Read job result
    if (OperationsParams.isLocal(job, inFile)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
        // Use multithreading too
        job.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors());
    }
    long lineCount = 0;
    FileStatus[] results = outFs.listStatus(outputPath);
    for (FileStatus fileStatus : results) {
        if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
            LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
            Text text = new Text();
            if (lineReader.readLine(text) > 0) {
                lineCount = Long.parseLong(text.toString());
            }
            lineReader.close();
        }
    }

    outFs.delete(outputPath, true);

    return lineCount;
}
From source file:edu.umn.cs.spatialHadoop.operations.Skyline.java
License:Open Source License
private static void skylineMapReduce(Path inFile, Path userOutPath, OperationsParams params) throws IOException {
    JobConf job = new JobConf(params, Skyline.class);
    Path outPath = userOutPath;
    FileSystem outFs = (userOutPath == null ? inFile : userOutPath).getFileSystem(job);
    Shape shape = params.getShape("shape");

    if (outPath == null) {
        do {
            outPath = new Path(inFile.toUri().getPath() + ".skyline_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outPath));
    }

    job.setJobName("Skyline");
    job.setClass(SpatialSite.FilterClass, SkylineFilter.class, BlockFilter.class);
    job.setMapperClass(IdentityMapper.class);
    job.setCombinerClass(SkylineReducer.class);
    job.setReducerClass(SkylineReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(shape.getClass());
    job.setInputFormat(ShapeIterInputFormat.class);
    ShapeInputFormat.addInputPath(job, inFile);
    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outPath);

    JobClient.runJob(job);

    // If outputPath not set by user, automatically delete it
    if (userOutPath == null)
        outFs.delete(outPath, true);
}
From source file:edu.yale.cs.hadoopdb.benchmark.AggTaskLargeHDFS.java
License:Apache License
@Override
protected JobConf configureJob(String... args) throws IOException {

    JobConf conf = new JobConf(getConf(), this.getClass());
    conf.setJobName("aggregation_hdfs_large");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(AggTaskLargeHDFS.Map.class);
    conf.setCombinerClass(AggTaskLargeHDFS.Reduce.class);
    conf.setReducerClass(AggTaskLargeHDFS.Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    if (args.length < 2) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }

    FileInputFormat.setInputPaths(conf, new Path(args[0]));

    // OUTPUT properties
    Path outputPath = new Path(args[1]);
    HDFSUtil.deletePath(outputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    return conf;
}
From source file:edu.yale.cs.hadoopdb.benchmark.AggTaskSmallHDFS.java
License:Apache License
@Override
protected JobConf configureJob(String... args) throws IOException {

    JobConf conf = new JobConf(this.getClass());
    conf.setJobName("aggregation_hdfs_small");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(AggTaskSmallHDFS.Map.class);
    conf.setCombinerClass(AggTaskSmallHDFS.Reduce.class);
    conf.setReducerClass(AggTaskSmallHDFS.Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    if (args.length < 2) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }

    FileInputFormat.setInputPaths(conf, new Path(args[0]));

    // OUTPUT properties
    Path outputPath = new Path(args[1]);
    HDFSUtil.deletePath(outputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    return conf;
}
From source file:edu.yale.cs.hadoopdb.benchmark.UDFAggTaskDB.java
License:Apache License
@Override
protected JobConf configureJob(String... args) throws IOException {

    JobConf conf = new JobConf(this.getClass());
    conf.setJobName("udf_agg_db");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(LongWritable.class);

    conf.setMapperClass(Query4Map.class);
    conf.setCombinerClass(LongSumReducer.class);
    conf.setReducerClass(LongSumReducer.class);

    conf.setOutputFormat(TextOutputFormat.class);

    if (args.length < 1) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }

    // OUTPUT properties
    Path outputPath = new Path(args[0]);
    HDFSUtil.deletePath(outputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.set(DBConst.DB_RELATION_ID, "Documents");
    conf.set(DBConst.DB_RECORD_READER, DocumentRecord.class.getName());
    conf.set(DBConst.DB_SQL_QUERY, "SELECT url, contents FROM Documents;");

    return conf;
}
From source file:edu.yale.cs.hadoopdb.benchmark.UDFAggTaskHDFS.java
License:Apache License
@Override
protected JobConf configureJob(String... args) throws IOException {

    JobConf conf = new JobConf(this.getClass());
    conf.setJobName("udf_agg_hdfs");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(LongWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(LongSumReducer.class);
    conf.setReducerClass(LongSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    if (args.length < 2) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }

    FileInputFormat.setInputPaths(conf, new Path(args[0]));

    // OUTPUT properties
    Path outputPath = new Path(args[1]);
    HDFSUtil.deletePath(outputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    return conf;
}
From source file:fm.last.hadoop.programs.labs.trackstats.TrackStatisticsProgram.java
License:Apache License
/**
 * Create a JobConf for a Job that will calculate the number of unique listeners per track.
 *
 * @param inputDir The path to the folder containing the raw listening data files.
 * @return The unique listeners JobConf.
 */
private JobConf getUniqueListenersJobConf(Path inputDir) {
    log.info("Creating configuration for unique listeners Job");

    // output results to a temporary intermediate folder, this will get deleted by start() method
    Path uniqueListenersOutput = new Path("uniqueListeners");

    JobConf conf = new JobConf(TrackStatisticsProgram.class);
    conf.setOutputKeyClass(IntWritable.class); // track id
    conf.setOutputValueClass(IntWritable.class); // number of unique listeners
    conf.setInputFormat(TextInputFormat.class); // raw listening data
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapperClass(UniqueListenersMapper.class);
    conf.setCombinerClass(UniqueListenersCombiner.class);
    conf.setReducerClass(UniqueListenersReducer.class);

    FileInputFormat.addInputPath(conf, inputDir);
    FileOutputFormat.setOutputPath(conf, uniqueListenersOutput);
    conf.setJobName("uniqueListeners");
    return conf;
}
From source file:fm.last.hadoop.programs.labs.trackstats.TrackStatisticsProgram.java
License:Apache License
/**
 * Creates a JobConf for a Job that will sum up the TrackStatistics per track.
 *
 * @param inputDir The path to the folder containing the raw input data files.
 * @return The sum JobConf.
 */
private JobConf getSumJobConf(Path inputDir) {
    log.info("Creating configuration for sum job");

    // output results to a temporary intermediate folder, this will get deleted by start() method
    Path playsOutput = new Path("sum");

    JobConf conf = new JobConf(TrackStatisticsProgram.class);
    conf.setOutputKeyClass(IntWritable.class); // track id
    conf.setOutputValueClass(TrackStats.class); // statistics for a track
    conf.setInputFormat(TextInputFormat.class); // raw listening data
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapperClass(SumMapper.class);
    conf.setCombinerClass(SumReducer.class);
    conf.setReducerClass(SumReducer.class);

    FileInputFormat.addInputPath(conf, inputDir);
    FileOutputFormat.setOutputPath(conf, playsOutput);
    conf.setJobName("sum");
    return conf;
}
From source file:fm.last.hadoop.programs.labs.trackstats.TrackStatisticsProgram.java
License:Apache License
/**
 * Creates a JobConf for a Job that will merge the unique listeners and track statistics.
 *
 * @param outputPath The path for the results to be output to.
 * @param sumInputDir The path containing the data from the sum Job.
 * @param listenersInputDir The path containing the data from the unique listeners job.
 * @return The merge JobConf.
 */
private JobConf getMergeConf(Path outputPath, Path sumInputDir, Path listenersInputDir) {
    log.info("Creating configuration for merge job");

    JobConf conf = new JobConf(TrackStatisticsProgram.class);
    conf.setOutputKeyClass(IntWritable.class); // track id
    conf.setOutputValueClass(TrackStats.class); // overall track statistics
    conf.setCombinerClass(SumReducer.class); // safe to re-use reducer as a combiner here
    conf.setReducerClass(SumReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileOutputFormat.setOutputPath(conf, outputPath);
    MultipleInputs.addInputPath(conf, sumInputDir, SequenceFileInputFormat.class, IdentityMapper.class);
    MultipleInputs.addInputPath(conf, listenersInputDir, SequenceFileInputFormat.class,
            MergeListenersMapper.class);
    conf.setJobName("merge");
    return conf;
}