List of usage examples for org.apache.hadoop.mapred JobConf setNumReduceTasks
public void setNumReduceTasks(int n)
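Before the source listings below, here is a minimal, self-contained sketch of the pattern most of them share: query the cluster for its reduce capacity and size the reduce phase from it, keeping at least one reducer. The driver class, job name, and command-line paths are illustrative placeholders, not taken from any of the source files below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class SetNumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SetNumReduceTasksExample.class);
        job.setJobName("ReducerSizingExample");

        // Identity mapper/reducer are the defaults in the old mapred API, so this
        // job simply copies text records; that is enough to demonstrate reducer sizing.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // The pattern shared by the listings below: size the reduce phase from the
        // cluster's reduce capacity, but never request fewer than one reducer.
        ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
        job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));

        JobClient.runJob(job);
    }
}

Passing 0 instead (as DistributedCopy and DistributedJoin do below) disables the reduce phase entirely and makes the job map-only.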
From source file:edu.umn.cs.spatialHadoop.operations.CatUnion.java
License:Open Source License
/**
 * Calculates the union of a set of shapes categorized by some user defined category.
 * @param shapeFile - Input file that contains shapes
 * @param categoryFile - Category file that contains the category of each shape.
 *   Shapes not appearing in this file are not generated in output.
 * @param output - An output file that contains each category and the union of all
 *   shapes in it. Each line contains the category, then a comma, then the union
 *   represented as text.
 * @throws IOException
 */
public static void unionMapReduce(Path shapeFile, Path categoryFile, Path output, OperationsParams params)
        throws IOException {
    JobConf job = new JobConf(params, CatUnion.class);
    job.setJobName("Union");

    // Check output file existence
    FileSystem outFs = output.getFileSystem(job);
    if (outFs.exists(output)) {
        if (params.getBoolean("overwrite", false)) {
            outFs.delete(output, true);
        } else {
            throw new RuntimeException("Output path already exists and -overwrite flag is not set");
        }
    }

    // Set map and reduce
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));

    job.setMapperClass(UnionMapper.class);
    job.setCombinerClass(UnionReducer.class);
    job.setReducerClass(UnionReducer.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Set input and output
    job.setInputFormat(ShapeLineInputFormat.class);
    TextInputFormat.addInputPath(job, shapeFile);
    DistributedCache.addCacheFile(categoryFile.toUri(), job);

    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, output);

    // Start job
    JobClient.runJob(job);
}
From source file:edu.umn.cs.spatialHadoop.operations.Contains.java
License:Open Source License
public static <S extends Shape> long contains(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Contains.class);
    LOG.info("Contains journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Within");
    job.setMapperClass(ContainsMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size",
            Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));
    job.setReducerClass(ContainsReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));
    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);
    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();
    return resultCount;
}
From source file:edu.umn.cs.spatialHadoop.operations.Crosses.java
License:Open Source License
public static <S extends Shape> long crosses(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Crosses.class);
    LOG.info("Crosses journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Crosses");
    job.setMapperClass(CrossesMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size",
            Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));
    job.setReducerClass(CrossesReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));
    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);
    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();
    return resultCount;
}
From source file:edu.umn.cs.spatialHadoop.operations.Disjoint.java
License:Open Source License
public static <S extends Shape> long disjoint(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Disjoint.class);
    LOG.info("Touches journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Disjoint");
    job.setMapperClass(DisjointMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size",
            Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));
    job.setReducerClass(DisjointReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));
    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);
    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();
    return resultCount;
}
From source file:edu.umn.cs.spatialHadoop.operations.DistributedCopy.java
License:Open Source License
private static void distributedCopy(Path inputPath, Path outputPath, OperationsParams params) throws IOException {
    JobConf job = new JobConf(params, DistributedCopy.class);
    job.setJobName("distcp3");

    // Set input
    job.setInputFormat(BlockInputFormat.class);
    BlockInputFormat.addInputPath(job, inputPath);

    // Set output
    job.setOutputFormat(BlockOutputFormat.class);
    BlockOutputFormat.setOutputPath(job, outputPath);
    job.setOutputCommitter(BlockOutputCommitter.class);

    // Set number of mappers/reducers; zero reducers makes this a map-only job
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(0);

    // Run the job
    JobClient.runJob(job);
}
From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java
License:Open Source License
/**
 * Performs a redistribute join between the given files using the redistribute
 * join algorithm. Currently, we only support a pair of files.
 * @param inFiles
 * @param userOutputPath
 * @param params
 * @return
 * @throws IOException
 */
public static <S extends Shape> long joinStep(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException {
    long t1 = System.currentTimeMillis();
    JobConf job = new JobConf(params, DistributedJoin.class);

    FileSystem fs[] = new FileSystem[inFiles.length];
    for (int i_file = 0; i_file < inFiles.length; i_file++)
        fs[i_file] = inFiles[i_file].getFileSystem(job);

    Path outputPath = userOutputPath;
    if (outputPath == null) {
        do {
            outputPath = new Path(inFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (fs[0].exists(outputPath));
    }

    job.setJobName("DistributedJoin");
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    GlobalIndex<Partition> gindex1 = SpatialSite.getGlobalIndex(fs[0], inFiles[0]);
    GlobalIndex<Partition> gindex2 = SpatialSite.getGlobalIndex(fs[1], inFiles[1]);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    LOG.info("Joining " + inFiles[0] + " X " + inFiles[1]);

    if (SpatialSite.isRTree(fs[0], inFiles[0]) && SpatialSite.isRTree(fs[1], inFiles[1])) {
        job.setInputFormat(DJInputFormatRTree.class);
    } else {
        if (isOneShotReadMode) {
            // Ensure all objects are read in one shot
            job.setInt(SpatialSite.MaxBytesInOneRead, -1);
            job.setInt(SpatialSite.MaxShapesInOneRead, -1);
        } else {
            job.setInt(SpatialSite.MaxBytesInOneRead, maxBytesInOneRead);
            job.setInt(SpatialSite.MaxShapesInOneRead, maxShapesInOneRead);
        }
        job.setInputFormat(DJInputFormatArray.class);
    }

    // Set input paths and map function
    if (inFiles[0].equals(inFiles[1])) {
        // Self join
        job.setInputFormat(ShapeArrayInputFormat.class);
        // Remove the spatial filter to ensure all partitions are loaded
        FileInputFormat.setInputPaths(job, inFiles[0]);
        if (gindex1 != null && gindex1.isReplicated())
            job.setMapperClass(RedistributeJoinMap.class);
        else
            job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class);
    } else {
        // Binary version of spatial join (two different input files)
        job.setClass(SpatialSite.FilterClass, SpatialJoinFilter.class, BlockFilter.class);
        FileInputFormat.setInputPaths(job, inFiles);
        if ((gindex1 != null && gindex1.isReplicated()) || (gindex2 != null && gindex2.isReplicated())) {
            // Need the map function with duplicate avoidance step
            job.setMapperClass(RedistributeJoinMap.class);
        } else {
            // No replication in both indexes, use map function with no dup avoidance
            job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class);
        }
    }

    Shape shape = params.getShape("shape");
    job.setMapOutputKeyClass(shape.getClass());
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setNumReduceTasks(0); // No reduce needed for this task

    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputPath);

    if (!params.getBoolean("background", false)) {
        LOG.info("Submit job in sync mode");
        RunningJob runningJob = JobClient.runJob(job);
        Counters counters = runningJob.getCounters();
        Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
        final long resultCount = outputRecordCounter.getValue();

        // Output number of running map tasks
        Counter mapTaskCountCounter = counters.findCounter(JobInProgress.Counter.TOTAL_LAUNCHED_MAPS);
        System.out.println("Number of map tasks " + mapTaskCountCounter.getValue());

        // Delete output directory if not explicitly set by user
        if (userOutputPath == null)
            fs[0].delete(outputPath, true);

        long t2 = System.currentTimeMillis();
        System.out.println("Join time " + (t2 - t1) + " millis");
        return resultCount;
    } else {
        JobClient jc = new JobClient(job);
        LOG.info("Submit job in async mode");
        lastRunningJob = jc.submitJob(job);
        LOG.info("Job " + lastRunningJob + " submitted successfully");
        return -1;
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java
License:Open Source License
/**
 * Spatially joins two datasets by repartitioning the smaller dataset based on
 * the larger one, then applying one-to-one joining for each partition.
 *
 * @author Ibrahim Sabek
 * @param inputFiles
 *          Input datasets to be spatially joined
 * @param fileToRepartition
 *          Index of which file will be repartitioned
 * @param outputFile
 *          Output file that contains the joining results
 * @param params
 *          Job configurations
 * @return
 * @throws IOException
 */
protected static long repartitionJoinStep(final Path[] inputFiles, int fileToRepartition, Path outputFile,
        OperationsParams params) throws IOException {
    boolean overwrite = params.getBoolean("overwrite", false);
    Shape stockShape = params.getShape("shape");

    // Do the repartition step
    long t1 = System.currentTimeMillis();
    JobConf repartitionJoinJob = new JobConf(params, DistributedJoin.class);
    repartitionJoinJob.setJobName("RepartitionJoin");

    FileSystem fs = inputFiles[fileToRepartition].getFileSystem(params);
    Path outputPath = outputFile;
    if (outputPath == null) {
        do {
            outputPath = new Path(inputFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (fs.exists(outputPath));
    }

    LOG.info("Repartition - Joining " + inputFiles[0] + " X " + inputFiles[1]);

    // Get the cells to use for repartitioning
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inputFiles[1 - fileToRepartition]);
    OperationsParams.setRepartitionJoinIndexPath(repartitionJoinJob, RepartitionJoinIndexPath,
            inputFiles[1 - fileToRepartition]);
    OperationsParams.setInactiveModeFlag(repartitionJoinJob, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(repartitionJoinJob, JoiningThresholdPerOnce,
            joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(repartitionJoinJob, isFilterOnlyMode, isFilterOnly);
    CellInfo[] cellsInfo = SpatialSite.cellsOf(fs, inputFiles[1 - fileToRepartition]);

    // Repartition the file to match the other file
    boolean isReplicated = gindex.isReplicated();
    boolean isCompact = gindex.isCompact();
    String sindex;
    if (isReplicated && !isCompact)
        sindex = "grid";
    else if (isReplicated && isCompact)
        sindex = "r+tree";
    else if (!isReplicated && isCompact)
        sindex = "rtree";
    else
        throw new RuntimeException("Unknown index at: " + inputFiles[1 - fileToRepartition]);
    params.set("sindex", sindex);

    // Decide which map function to use based on the type of global index
    if (sindex.equals("rtree") || sindex.equals("str")) {
        // Repartition without replication
        repartitionJoinJob.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        repartitionJoinJob.setMapperClass(RepartitionMap.class);
    }
    repartitionJoinJob.setMapOutputKeyClass(IntWritable.class);
    repartitionJoinJob.setMapOutputValueClass(stockShape.getClass());
    ShapeInputFormat.setInputPaths(repartitionJoinJob, inputFiles[fileToRepartition]);
    repartitionJoinJob.setInputFormat(ShapeInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(repartitionJoinJob).getClusterStatus();
    repartitionJoinJob.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));
    SpatialSite.setCells(repartitionJoinJob, cellsInfo);
    repartitionJoinJob.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    repartitionJoinJob.setReducerClass(RepartitionJoinReduce.class);
    repartitionJoinJob.setNumReduceTasks(
            Math.max(1, Math.min(cellsInfo.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    repartitionJoinJob.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(repartitionJoinJob, outputPath);

    RunningJob runningJob = JobClient.runJob(repartitionJoinJob);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    // Output number of running map tasks
    Counter mapTaskCountCounter = counters.findCounter(JobInProgress.Counter.TOTAL_LAUNCHED_MAPS);
    System.out.println("Number of map tasks " + mapTaskCountCounter.getValue());

    // Delete output directory if not explicitly set by user
    if (outputFile == null)
        fs.delete(outputPath, true);

    long t2 = System.currentTimeMillis();
    System.out.println("Repartitioning and Joining time " + (t2 - t1) + " millis");
    return resultCount;
}
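Note how this example sizes the reduce phase: (getMaxReduceTasks() * 9 + 5) / 10 rounds 90% of the cluster's reduce capacity to the nearest integer, Math.min caps it at the number of repartitioning cells, and the outer Math.max keeps at least one reducer. With illustrative numbers (not from the source), a cluster with 12 reduce slots and 30 cells gives (12 * 9 + 5) / 10 = 11, so the job runs with min(30, 11) = 11 reducers.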
From source file:edu.umn.cs.spatialHadoop.operations.Equals.java
License:Open Source License
public static <S extends Shape> long equals(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Equals.class);
    LOG.info("Equals journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Equals");
    job.setMapperClass(EqualsMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size",
            Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));
    job.setReducerClass(EqualsReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));
    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);
    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();
    return resultCount;
}
From source file:edu.umn.cs.spatialHadoop.operations.Indexer.java
License:Open Source License
private static RunningJob indexMapReduce(Path inPath, Path outPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Indexer.class);
    job.setJobName("Indexer");

    // Set input file MBR if not already set
    Rectangle inputMBR = (Rectangle) params.getShape("mbr");
    if (inputMBR == null)
        inputMBR = FileMBR.fileMBR(inPath, params);
    OperationsParams.setShape(job, "mbr", inputMBR);

    // Set input and output
    job.setInputFormat(ShapeIterInputFormat.class);
    ShapeIterInputFormat.setInputPaths(job, inPath);
    job.setOutputFormat(IndexOutputFormat.class);
    GridOutputFormat.setOutputPath(job, outPath);

    // Set the correct partitioner according to index type
    String index = job.get("sindex");
    if (index == null)
        throw new RuntimeException("Index type is not set");
    long t1 = System.currentTimeMillis();
    Partitioner partitioner = createPartitioner(inPath, outPath, job, index);
    Partitioner.setPartitioner(job, partitioner);
    long t2 = System.currentTimeMillis();
    System.out.println("Total time for space subdivision in millis: " + (t2 - t1));

    // Set mapper and reducer
    Shape shape = params.getShape("shape");
    job.setMapperClass(IndexMethods.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(shape.getClass());
    job.setReducerClass(IndexMethods.class);
    job.setOutputCommitter(IndexerOutputCommitter.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    // Use multithreading in case the job is running locally
    job.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors());

    // Start the job
    if (params.getBoolean("background", false)) {
        // Run in background
        JobClient jc = new JobClient(job);
        return jc.submitJob(job);
    } else {
        // Run and block until it is finished
        return JobClient.runJob(job);
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.Intersects.java
License:Open Source License
public static <S extends Shape> long intersects(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Intersects.class);
    LOG.info("Intersects journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Intersects");
    job.setMapperClass(IntersectsMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size",
            Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));
    job.setReducerClass(IntersectsReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));
    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);
    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();
    return resultCount;
}