List of usage examples for org.apache.hadoop.mapred JobConf setReducerClass
public void setReducerClass(Class<? extends Reducer> theClass)
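Before the real-world examples, here is a minimal, self-contained sketch of where setReducerClass fits in the classic org.apache.hadoop.mapred API. The WordCount class, its TokenizerMapper, and its SumReducer are hypothetical placeholders invented for illustration; only the JobConf/JobClient calls are the actual Hadoop API.

    import java.io.IOException;
    import java.util.Iterator;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.*;

    public class WordCount {

        // Hypothetical map function: emit (word, 1) for every token in the line
        public static class TokenizerMapper extends MapReduceBase
                implements Mapper<LongWritable, Text, Text, IntWritable> {
            private static final IntWritable ONE = new IntWritable(1);

            public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> out,
                    Reporter reporter) throws IOException {
                for (String token : value.toString().split("\\s+"))
                    out.collect(new Text(token), ONE);
            }
        }

        // Hypothetical reduce function: sum the counts emitted for each word
        public static class SumReducer extends MapReduceBase
                implements Reducer<Text, IntWritable, Text, IntWritable> {
            public void reduce(Text key, Iterator<IntWritable> values,
                    OutputCollector<Text, IntWritable> out, Reporter reporter) throws IOException {
                int sum = 0;
                while (values.hasNext())
                    sum += values.next().get();
                out.collect(key, new IntWritable(sum));
            }
        }

        public static void main(String[] args) throws IOException {
            JobConf job = new JobConf(WordCount.class);
            job.setJobName("WordCount");
            job.setMapperClass(TokenizerMapper.class);
            job.setReducerClass(SumReducer.class);  // the method this page documents
            job.setCombinerClass(SumReducer.class); // a Reducer can often double as a combiner
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            JobClient.runJob(job);
        }
    }

As the examples below show, production jobs typically pair setReducerClass with setNumReduceTasks, sized from the cluster's getMaxReduceTasks().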
From source file: edu.umn.cs.spatialHadoop.operations.Repartition.java
License: Open Source License
public static void repartitionMapReduce(Path inFile, Path outPath, CellInfo[] cellInfos,
        OperationsParams params) throws IOException, InterruptedException {
    String sindex = params.get("sindex");
    boolean overwrite = params.getBoolean("overwrite", false);
    Shape stockShape = params.getShape("shape");

    FileSystem outFs = outPath.getFileSystem(params);

    // Calculate number of partitions in output file
    // Copy blocksize from source file if it's globally indexed
    @SuppressWarnings("deprecation")
    final long blockSize = outFs.getDefaultBlockSize();

    // Calculate the dimensions of each partition based on gindex type
    if (cellInfos == null) {
        if (sindex.equals("grid")) {
            Rectangle input_mbr = FileMBR.fileMBR(inFile, params);
            long inFileSize = FileMBR.sizeOfLastProcessedFile;
            int num_partitions = calculateNumberOfPartitions(new Configuration(), inFileSize,
                    outFs, outPath, blockSize);
            GridInfo gridInfo = new GridInfo(input_mbr.x1, input_mbr.y1, input_mbr.x2, input_mbr.y2);
            gridInfo.calculateCellDimensions(num_partitions);
            cellInfos = gridInfo.getAllCells();
        } else if (sindex.equals("rtree") || sindex.equals("r+tree") || sindex.equals("str")
                || sindex.equals("str+")) {
            // Pack in rectangles using an RTree
            cellInfos = packInRectangles(inFile, outPath, params);
        } else {
            throw new RuntimeException("Unsupported spatial index: " + sindex);
        }
    }

    JobConf job = new JobConf(params, Repartition.class);
    job.setJobName("Repartition");

    // Overwrite output file
    if (outFs.exists(outPath)) {
        if (overwrite)
            outFs.delete(outPath, true);
        else
            throw new RuntimeException(
                    "Output file '" + outPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global index
    if (sindex.equals("rtree") || sindex.equals("str")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid, str+, and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    ShapeInputFormat.setInputPaths(job, inFile);
    job.setInputFormat(ShapeInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    FileOutputFormat.setOutputPath(job, outPath);
    if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    JobClient.runJob(job);
}
From source file: edu.umn.cs.spatialHadoop.operations.Repartition.java
License: Open Source License
/**
 * Repartitions an input file according to the given list of cells.
 * @param inFile The input raw file that needs to be indexed.
 * @param outPath The output path where the index will be written.
 * @param stockShape An instance of the shapes stored in the input file.
 * @param blockSize The block size for the constructed index.
 * @param cellInfos A predefined set of cells to use as a global index.
 * @param sindex The type of index to build.
 * @param overwrite Whether to overwrite the output or not.
 * @throws IOException If an exception happens while preparing the job.
 */
public static void repartitionMapReduce(Path inFile, Path outPath, Shape stockShape,
        long blockSize, CellInfo[] cellInfos, String sindex, boolean overwrite) throws IOException {
    JobConf job = new JobConf(Repartition.class);
    job.setJobName("Repartition");

    FileSystem outFs = outPath.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(outPath)) {
        if (overwrite)
            outFs.delete(outPath, true);
        else
            throw new RuntimeException(
                    "Output file '" + outPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global index
    if (sindex.equals("rtree") || sindex.equals("str")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    ShapeInputFormat.setInputPaths(job, inFile);
    job.setInputFormat(ShapeInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    FileOutputFormat.setOutputPath(job, outPath);
    if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    if (blockSize != 0) {
        job.setLong("dfs.block.size", blockSize);
        job.setLong("fs.local.block.size", blockSize);
    }

    JobClient.runJob(job);
}
From source file: edu.umn.cs.spatialHadoop.operations.Sampler.java
License: Open Source License
private static <T extends TextSerializable> int sampleMapReduceWithRatio(Path[] files,
        final ResultCollector<T> output, OperationsParams params) throws IOException {
    JobConf job = new JobConf(params, Sampler.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(files[0].toUri().getPath() + ".sample_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("Sample");
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    // Number of reduces can be set to zero. However, setting it to a reasonable
    // number ensures that the number of output files is limited to that number
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeLineInputFormat.setInputPaths(job, files);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    RunningJob run_job = JobClient.runJob(job);

    Counters counters = run_job.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    Counter outputSizeConter = counters.findCounter(Task.Counter.MAP_OUTPUT_BYTES);
    final long sampleSize = outputSizeConter.getValue();

    LOG.info("resultSize: " + sampleSize);
    LOG.info("resultCount: " + resultCount);

    Counter inputBytesCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES);
    Sampler.sizeOfLastProcessedFile = inputBytesCounter.getValue();

    // Ratio of records to return from output based on the threshold
    // Note that any number greater than or equal to one will cause all
    // elements to be returned
    long desiredSampleSize = job.getLong("size", 0);
    // Fraction of drawn sample to return
    float selectRatio = desiredSampleSize <= 0 ? 2.0f : (float) desiredSampleSize / sampleSize;

    // Read job result
    int result_size = 0;
    if (selectRatio > 1.0f) {
        // Return all records from the output
        ShapeLineInputFormat inputFormat = new ShapeLineInputFormat();
        ShapeLineInputFormat.setInputPaths(job, outputPath);
        InputSplit[] splits = inputFormat.getSplits(job, 1);
        for (InputSplit split : splits) {
            RecordReader<Rectangle, Text> reader = inputFormat.getRecordReader(split, job, null);
            Rectangle key = reader.createKey();
            Text value = reader.createValue();
            T outObj = (T) OperationsParams.getTextSerializable(params, "outshape", new Text2());
            while (reader.next(key, value)) {
                outObj.fromText(value);
                output.collect(outObj);
            }
            reader.close();
        }
    } else {
        if (output != null) {
            OperationsParams params2 = new OperationsParams(params);
            params2.setFloat("ratio", selectRatio);
            params2.set("shape", params.get("outshape"));
            params2.set("outshape", params.get("outshape"));
            if (selectRatio > 0.1) {
                LOG.info("Local return " + selectRatio + " of " + resultCount + " records");
                // Keep a copy of sizeOfLastProcessedFile because we don't want it changed
                long tempSize = sizeOfLastProcessedFile;
                // Return this (relatively large) ratio of the result by reading
                // the sample output back locally
                result_size = sampleLocalWithRatio(new Path[] { outputPath }, output, params2);
                sizeOfLastProcessedFile = tempSize;
            } else {
                LOG.info("MapReduce return " + selectRatio + " of " + resultCount + " records");
                // Keep a copy of sizeOfLastProcessedFile because we don't want it changed
                long tempSize = sizeOfLastProcessedFile;
                // Return a (small) ratio of the result using a MapReduce job
                // In this case, the files are very big and we need just a small ratio
                // of them. It is better to do it in parallel
                result_size = sampleMapReduceWithRatio(new Path[] { outputPath }, output, params2);
                sizeOfLastProcessedFile = tempSize;
            }
        }
    }

    outFs.delete(outputPath, true);

    return result_size;
}
From source file: edu.umn.cs.spatialHadoop.operations.Shuffle.java
License: Open Source License
/**
 * Randomly shuffles the lines of an input file by issuing a MapReduce job
 * that redistributes them across partitions.
 * @param infile
 * @param outfile
 * @param params
 * @throws IOException
 */
public static void randomizerMapReduce(Path infile, Path outfile, OperationsParams params)
        throws IOException {
    JobConf job = new JobConf(Shuffle.class);
    job.setJobName("Randomizer");
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setReducerClass(Reduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));
    job.setInt(NumOfPartitions, Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(TextInputFormat.class);
    TextInputFormat.setInputPaths(job, infile);
    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outfile);

    // Submit the job
    JobClient.runJob(job);
}
From source file: edu.umn.cs.spatialHadoop.operations.SJMR.java
License: Open Source License
public static <S extends Shape> long sjmr(Path[] inFiles, Path userOutputPath,
        OperationsParams params) throws IOException, InterruptedException {
    JobConf job = new JobConf(params, SJMR.class);

    LOG.info("SJMR journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("SJMR");
    job.setMapperClass(SJMRMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(SJMRReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE,
            -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.operations.Skyline.java
License: Open Source License
private static void skylineMapReduce(Path inFile, Path userOutPath, OperationsParams params)
        throws IOException {
    JobConf job = new JobConf(params, Skyline.class);
    Path outPath = userOutPath;
    FileSystem outFs = (userOutPath == null ? inFile : userOutPath).getFileSystem(job);
    Shape shape = params.getShape("shape");

    if (outPath == null) {
        do {
            outPath = new Path(inFile.toUri().getPath() + ".skyline_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outPath));
    }

    job.setJobName("Skyline");
    job.setClass(SpatialSite.FilterClass, SkylineFilter.class, BlockFilter.class);
    job.setMapperClass(IdentityMapper.class);
    job.setCombinerClass(SkylineReducer.class);
    job.setReducerClass(SkylineReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(shape.getClass());
    job.setInputFormat(ShapeIterInputFormat.class);
    ShapeInputFormat.addInputPath(job, inFile);
    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outPath);

    JobClient.runJob(job);

    // If outputPath not set by user, automatically delete it
    if (userOutPath == null)
        outFs.delete(outPath, true);
}
From source file: edu.umn.cs.spatialHadoop.operations.Touches.java
License: Open Source License
public static <S extends Shape> long touches(Path[] inFiles, Path userOutputPath,
        OperationsParams params) throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Touches.class);

    LOG.info("Touches journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Touches");
    job.setMapperClass(TouchesMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(TouchesReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE,
            -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.operations.Within.java
License: Open Source License
public static <S extends Shape> long within(Path[] inFiles, Path userOutputPath,
        OperationsParams params) throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Within.class);

    LOG.info("Within journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Within");
    job.setMapperClass(WithinMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(WithinReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE,
            -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.RandomSpatialGenerator.java
License: Open Source License
private static void generateMapReduce(Path outFile, OperationsParams params) throws IOException {
    JobConf job = new JobConf(params, RandomSpatialGenerator.class);
    job.setJobName("Generator");
    Shape shape = params.getShape("shape");

    FileSystem outFs = outFile.getFileSystem(job);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();

    // Set input format and map class
    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Repartition.RepartitionMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    String sindex = params.get("sindex");
    Rectangle mbr = params.getShape("mbr").getMBR();

    CellInfo[] cells;
    if (sindex == null) {
        cells = new CellInfo[] { new CellInfo(1, mbr) };
    } else if (sindex.equals("grid")) {
        GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
        FileSystem fs = outFile.getFileSystem(job);
        long blocksize = fs.getDefaultBlockSize(outFile);
        long size = params.getSize("size");
        int numOfCells = Repartition.calculateNumberOfPartitions(job, size, fs, outFile, blocksize);
        gridInfo.calculateCellDimensions(numOfCells);
        cells = gridInfo.getAllCells();
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cells);

    // Do not set a reduce function. Use the default identity reduce function
    if (cells.length == 1) {
        // All objects are in one partition. No need for a reduce phase
        job.setNumReduceTasks(0);
    } else {
        // More than one partition. Need a reduce phase to group shapes of the
        // same partition together
        job.setReducerClass(RepartitionReduce.class);
        job.setNumReduceTasks(
                Math.max(1, Math.min(cells.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, outFile);
    if (sindex == null || sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    JobClient.runJob(job);

    // TODO move the following part to OutputCommitter
    // Concatenate all master files into one file
    FileStatus[] resultFiles = outFs.listStatus(outFile, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    String ext = resultFiles[0].getPath().getName()
            .substring(resultFiles[0].getPath().getName().lastIndexOf('.'));
    Path masterPath = new Path(outFile, "_master" + ext);
    OutputStream destOut = outFs.create(masterPath);
    byte[] buffer = new byte[4096];
    for (FileStatus f : resultFiles) {
        InputStream in = outFs.open(f.getPath());
        int bytes_read;
        do {
            bytes_read = in.read(buffer);
            if (bytes_read > 0)
                destOut.write(buffer, 0, bytes_read);
        } while (bytes_read > 0);
        in.close();
        outFs.delete(f.getPath(), false);
    }
    destOut.close();
}
From source file: edu.umn.cs.spatialHadoop.temporal.RepartitionTemporal.java
License: Apache License
public static void repartitionMapReduce(Path[] inputPaths, Path outputPath, OperationsParams params)
        throws IOException, InterruptedException {
    String sindex = params.get("sindex");
    boolean overwrite = params.getBoolean("overwrite", false);
    Shape stockShape = params.getShape("shape");

    FileSystem outFs = outputPath.getFileSystem(params);

    @SuppressWarnings("deprecation")
    final long blockSize = outFs.getDefaultBlockSize();

    // Calculate the dimensions of each partition based on gindex type
    CellInfo[] cellInfos;
    if (sindex.equals("grid")) {
        Rectangle inputMBR = FileMBR.fileMBR(inputPaths[0], params);
        long inputFileSize = FileMBR.sizeOfLastProcessedFile;
        for (int i = 1; i < inputPaths.length; i++) {
            Rectangle currentInputMBR = FileMBR.fileMBR(inputPaths[i], params);
            inputMBR.expand(currentInputMBR);
            inputFileSize = inputFileSize + FileMBR.sizeOfLastProcessedFile;
        }

        int num_partitions = calculateNumberOfPartitions(new Configuration(), inputFileSize, outFs,
                outputPath, blockSize);
        GridInfo gridInfo = new GridInfo(inputMBR.x1, inputMBR.y1, inputMBR.x2, inputMBR.y2);
        gridInfo.calculateCellDimensions(num_partitions);
        cellInfos = gridInfo.getAllCells();
    } else if (sindex.equals("rtree") || sindex.equals("r+tree") || sindex.equals("str")
            || sindex.equals("str+")) {
        // Pack in rectangles using an RTree
        cellInfos = packInRectangles(inputPaths, outputPath, params, null);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    JobConf job = new JobConf(params, RepartitionTemporal.class);
    job.setJobName("RepartitionTemporal");

    // Overwrite output file
    if (outFs.exists(outputPath)) {
        if (overwrite)
            outFs.delete(outputPath, true);
        else
            throw new RuntimeException(
                    "Output file '" + outputPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global index
    if (sindex.equals("rtree") || sindex.equals("str")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid, str+, and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    CombinedSpatialInputFormat.setInputPaths(job, inputPaths);
    job.setInputFormat(CombinedSpatialInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    FileOutputFormat.setOutputPath(job, outputPath);
    if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    JobClient.runJob(job);
}