Example usage for org.apache.hadoop.mapred JobConf setNumMapTasks

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.JobConf.setNumMapTasks.

Prototype

public void setNumMapTasks(int n) 

Document

Set the number of map tasks for this job.
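Note that setNumMapTasks only provides a hint to the framework: the actual number of spawned map tasks ultimately depends on how the job's InputFormat splits the input. The examples below typically request a multiple of the cluster's map-slot capacity obtained from ClusterStatus. The following is a minimal, self-contained sketch of that pattern; the class name, input/output paths, and the 5x factor are illustrative assumptions, not taken from any of the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetNumMapTasksExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SetNumMapTasksExample.class);
        job.setJobName("SetNumMapTasksExample");

        // Hypothetical sizing: request five map tasks per available map slot.
        // This is only a hint; the real count is driven by the input splits.
        ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
        job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

        // Identity map/reduce over plain text input (illustrative paths).
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}

The SpatialHadoop examples below use the same ClusterStatus-based sizing before each call to setNumMapTasks, usually together with setNumReduceTasks to size the reduce phase.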

Usage

From source file:edu.umn.cs.spatialHadoop.nasa.DistributedAggregateSpatioTemporalIndexer.java

License:Open Source License

/**
 * Build a bunch of AggregateQuadTrees using a Map-Reduce job
 * @param inputPathsDictionaryPath
 * @param params
 * @throws IOException
 */
public static void aggregateQuadTreeMapReduce(Path inputPathsDictionaryPath, OperationsParams params)
        throws IOException {

    // configure a map-reduce job
    JobConf job = new JobConf(params, DistributedAggregateSpatioTemporalIndexer.class);

    Path outputPath;
    String outputPathPrefix = "aggQuadTree_";
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(outputPathPrefix + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("AggregateQuadTree");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(AggregateQuadTreeMaper.class);
    job.set(HDFSIndexPath, hdfsIndexPath.toString());

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    TextInputFormat.setInputPaths(job, inputPathsDictionaryPath);
    TextOutputFormat.setOutputPath(job, outputPath);

    if (job.getBoolean("local", false)) {
        // Enforce local execution if explicitly set by user or for small
        // files
        job.set("mapred.job.tracker", "local");
        // Use multithreading too
        job.setInt(LocalJobRunner.LOCAL_MAX_MAPS, 16);
    }
    job.setNumReduceTasks(0);

    // Submit the job
    JobClient.runJob(job);

    outFs.delete(outputPath, true);
}

From source file:edu.umn.cs.spatialHadoop.operations.Aggregate.java

License:Open Source License

/**
 * Computes the minimum and maximum values over the input files by
 * issuing a MapReduce job.
 * @param files
 * @param params
 * @return the aggregated minimum and maximum values
 * @throws IOException
 */
public static MinMax aggregateMapReduce(Path[] files, OperationsParams params) throws IOException {
    Shape plotRange = params.getShape("rect");
    JobConf job = new JobConf(params, Aggregate.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path("agg_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("Aggregate");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(MinMax.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeInputFormat.class);
    job.setClass("shape", NASAPoint.class, Shape.class);
    if (plotRange != null) {
        job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
    }

    job.setOutputFormat(TextOutputFormat.class);

    ShapeInputFormat.setInputPaths(job, files);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);

    // Read job result
    FileStatus[] results = outFs.listStatus(outputPath);
    MinMax minMax = new MinMax();
    for (FileStatus status : results) {
        if (status.getLen() > 0 && status.getPath().getName().startsWith("part-")) {
            BufferedReader reader = new BufferedReader(new InputStreamReader(outFs.open(status.getPath())));
            String line;
            MinMax value = new MinMax();
            while ((line = reader.readLine()) != null) {
                value.fromText(new Text(line));
                minMax.expand(value);
            }
            reader.close();
        }
    }

    outFs.delete(outputPath, true);

    return minMax;
}

From source file:edu.umn.cs.spatialHadoop.operations.CatUnion.java

License:Open Source License

/**
 * Calculates the union of a set of shapes categorized by some user defined
 * category.
 * @param shapeFile - Input file that contains shapes
 * @param categoryFile - Category file that contains the category of each
 *  shape. Shapes not appearing in this file are not generated in the output.
 * @param output - An output file that contains each category and the union
 *  of all shapes in it. Each line contains the category, then a comma,
 *   then the union represented as text.
 * @throws IOException
 */
public static void unionMapReduce(Path shapeFile, Path categoryFile, Path output, OperationsParams params)
        throws IOException {

    JobConf job = new JobConf(params, CatUnion.class);
    job.setJobName("Union");

    // Check output file existence
    FileSystem outFs = output.getFileSystem(job);
    if (outFs.exists(output)) {
        if (params.getBoolean("overwrite", false)) {
            outFs.delete(output, true);
        } else {
            throw new RuntimeException("Output path already exists and -overwrite flag is not set");
        }
    }

    // Set map and reduce
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));

    job.setMapperClass(UnionMapper.class);
    job.setCombinerClass(UnionReducer.class);
    job.setReducerClass(UnionReducer.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Set input and output
    job.setInputFormat(ShapeLineInputFormat.class);
    TextInputFormat.addInputPath(job, shapeFile);
    DistributedCache.addCacheFile(categoryFile.toUri(), job);

    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, output);

    // Start job
    JobClient.runJob(job);
}

From source file:edu.umn.cs.spatialHadoop.operations.ClosestPairHadoop.java

License:Open Source License

/**
 * Computes the closest pair of points in a file by issuing two rounds of
 * MapReduce jobs.
 * @param file
 * @param params
 * @throws IOException 
 */
public static <S extends Shape> void cloesetPair(Path file, OperationsParams params) throws IOException {
    // Try to get file MBR from the MBRs of blocks
    JobConf job = new JobConf(params, ClosestPairHadoop.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(file.getName() + ".closest_pair_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));
    outFs.delete(outputPath, true);

    job.setJobName("ClosestPair");
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Point.class);

    job.setMapperClass(Map0.class);
    job.setReducerClass(Reduce0.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeArrayInputFormat.class);
    //      job.setInputFormat(ShapeInputFormat.class);
    ShapeInputFormat.setInputPaths(job, file);

    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);
    //////////////////////////////////////////////////////////////////////////

    System.out.println("Begin second round!");
    // 2nd Round
    job = new JobConf(params, ClosestPairHadoop.class);
    job.setJobName("Second Round");
    job.setOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Point.class);

    job.setMapperClass(Map1.class);
    job.setReducerClass(Reduce1.class);
    clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeArrayInputFormat.class);
    //      job.setInputFormat(ShapeInputFormat.class);
    ShapeInputFormat.setInputPaths(job, outputPath); // The previous output is the current input

    Path newPath = new Path(outputPath.getName() + "_result");
    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, newPath);

    JobClient.runJob(job);
}

From source file:edu.umn.cs.spatialHadoop.operations.Contains.java

License:Open Source License

public static <S extends Shape> long contains(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Contains.class);

    LOG.info("Contains journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Within");
    job.setMapperClass(ContainsMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(ContainsReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file:edu.umn.cs.spatialHadoop.operations.Crosses.java

License:Open Source License

public static <S extends Shape> long crosses(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Crosses.class);

    LOG.info("Crosses journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Crosses");
    job.setMapperClass(CrossesMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(CrossesReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file:edu.umn.cs.spatialHadoop.operations.Disjoint.java

License:Open Source License

public static <S extends Shape> long disjoint(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Disjoint.class);

    LOG.info("Touches journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Disjoint");
    job.setMapperClass(DisjointMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(DisjointReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file:edu.umn.cs.spatialHadoop.operations.DistributedCopy.java

License:Open Source License

private static void distributedCopy(Path inputPath, Path outputPath, OperationsParams params)
        throws IOException {
    JobConf job = new JobConf(params, DistributedCopy.class);
    job.setJobName("distcp3");
    // Set input
    job.setInputFormat(BlockInputFormat.class);
    BlockInputFormat.addInputPath(job, inputPath);

    // Set output
    job.setOutputFormat(BlockOutputFormat.class);
    BlockOutputFormat.setOutputPath(job, outputPath);
    job.setOutputCommitter(BlockOutputCommitter.class);

    // Set number of mappers/reducers
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(0);

    // Run the job
    JobClient.runJob(job);
}

From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java

License:Open Source License

/**
 * Performs a redistribute join between the given files using the
 * redistribute join algorithm. Currently, we only support a pair of files.
 * @param inFiles
 * @param userOutputPath
 * @param params
 * @return
 * @throws IOException
 */
public static <S extends Shape> long joinStep(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException {
    long t1 = System.currentTimeMillis();

    JobConf job = new JobConf(params, DistributedJoin.class);

    FileSystem fs[] = new FileSystem[inFiles.length];
    for (int i_file = 0; i_file < inFiles.length; i_file++)
        fs[i_file] = inFiles[i_file].getFileSystem(job);

    Path outputPath = userOutputPath;
    if (outputPath == null) {
        do {
            outputPath = new Path(inFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (fs[0].exists(outputPath));
    }

    job.setJobName("DistributedJoin");
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    GlobalIndex<Partition> gindex1 = SpatialSite.getGlobalIndex(fs[0], inFiles[0]);
    GlobalIndex<Partition> gindex2 = SpatialSite.getGlobalIndex(fs[1], inFiles[1]);

    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    LOG.info("Joining " + inFiles[0] + " X " + inFiles[1]);

    if (SpatialSite.isRTree(fs[0], inFiles[0]) && SpatialSite.isRTree(fs[1], inFiles[1])) {
        job.setInputFormat(DJInputFormatRTree.class);
    } else {
        if (isOneShotReadMode) {
            // Ensure all objects are read in one shot
            job.setInt(SpatialSite.MaxBytesInOneRead, -1);
            job.setInt(SpatialSite.MaxShapesInOneRead, -1);
        } else {
            job.setInt(SpatialSite.MaxBytesInOneRead, maxBytesInOneRead);
            job.setInt(SpatialSite.MaxShapesInOneRead, maxShapesInOneRead);
        }
        job.setInputFormat(DJInputFormatArray.class);
    }

    // Set input paths and map function
    if (inFiles[0].equals(inFiles[1])) {
        // Self join
        job.setInputFormat(ShapeArrayInputFormat.class);
        // Remove the spatial filter to ensure all partitions are loaded
        FileInputFormat.setInputPaths(job, inFiles[0]);
        if (gindex1 != null && gindex1.isReplicated())
            job.setMapperClass(RedistributeJoinMap.class);
        else
            job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class);
    } else {
        // Binary version of spatial join (two different input files)
        job.setClass(SpatialSite.FilterClass, SpatialJoinFilter.class, BlockFilter.class);
        FileInputFormat.setInputPaths(job, inFiles);
        if ((gindex1 != null && gindex1.isReplicated()) || (gindex2 != null && gindex2.isReplicated())) {
            // Need the map function with duplicate avoidance step.
            job.setMapperClass(RedistributeJoinMap.class);
        } else {
            // No replication in both indexes, use map function with no dup
            // avoidance
            job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class);
        }
    }

    Shape shape = params.getShape("shape");
    job.setMapOutputKeyClass(shape.getClass());
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setNumReduceTasks(0); // No reduce needed for this task

    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (!params.getBoolean("background", false)) {
        LOG.info("Submit job in sync mode");
        RunningJob runningJob = JobClient.runJob(job);
        Counters counters = runningJob.getCounters();
        Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
        final long resultCount = outputRecordCounter.getValue();

        // Output number of running map tasks
        Counter mapTaskCountCounter = counters.findCounter(JobInProgress.Counter.TOTAL_LAUNCHED_MAPS);
        System.out.println("Number of map tasks " + mapTaskCountCounter.getValue());

        // Delete output directory if not explicitly set by user
        if (userOutputPath == null)
            fs[0].delete(outputPath, true);
        long t2 = System.currentTimeMillis();
        System.out.println("Join time " + (t2 - t1) + " millis");

        return resultCount;
    } else {
        JobClient jc = new JobClient(job);
        LOG.info("Submit job in async mode");
        lastRunningJob = jc.submitJob(job);
        LOG.info("Job " + lastRunningJob + " submitted successfully");
        return -1;
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java

License:Open Source License

/**
 * Spatially joins two datasets by repartitioning the smaller dataset based
 * on the larger one, then applying one-to-one joining for each partition.
 *
 * @author Ibrahim Sabek
 * @param inputFiles
 *            Input datasets to be spatially joined
 * @param fileToRepartition
 *            Index of which file will be repartitioned
 * @param outputFile
 *            Output file contains the joining results
 * @param params
 *            Job configurations
 * @return
 * @throws IOException
 */
protected static long repartitionJoinStep(final Path[] inputFiles, int fileToRepartition, Path outputFile,
        OperationsParams params) throws IOException {

    boolean overwrite = params.getBoolean("overwrite", false);
    Shape stockShape = params.getShape("shape");

    // Do the repartition step
    long t1 = System.currentTimeMillis();

    JobConf repartitionJoinJob = new JobConf(params, DistributedJoin.class);
    repartitionJoinJob.setJobName("RepartitionJoin");

    FileSystem fs = inputFiles[fileToRepartition].getFileSystem(params);

    Path outputPath = outputFile;
    if (outputPath == null) {
        do {
            outputPath = new Path(inputFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (fs.exists(outputPath));
    }

    LOG.info("Repartition - Joining " + inputFiles[0] + " X " + inputFiles[1]);

    // Get the cells to use for repartitioning
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inputFiles[1 - fileToRepartition]);
    OperationsParams.setRepartitionJoinIndexPath(repartitionJoinJob, RepartitionJoinIndexPath,
            inputFiles[1 - fileToRepartition]);
    OperationsParams.setInactiveModeFlag(repartitionJoinJob, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(repartitionJoinJob, JoiningThresholdPerOnce,
            joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(repartitionJoinJob, isFilterOnlyMode, isFilterOnly);
    CellInfo[] cellsInfo = SpatialSite.cellsOf(fs, inputFiles[1 - fileToRepartition]);

    // Repartition the file to match the other file
    boolean isReplicated = gindex.isReplicated();
    boolean isCompact = gindex.isCompact();
    String sindex;
    if (isReplicated && !isCompact)
        sindex = "grid";
    else if (isReplicated && isCompact)
        sindex = "r+tree";
    else if (!isReplicated && isCompact)
        sindex = "rtree";
    else
        throw new RuntimeException("Unknown index at: " + inputFiles[1 - fileToRepartition]);
    params.set("sindex", sindex);

    // Decide which map function to use based on the type of global index
    if (sindex.equals("rtree") || sindex.equals("str")) {
        // Repartition without replication
        repartitionJoinJob.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        repartitionJoinJob.setMapperClass(RepartitionMap.class);
    }
    repartitionJoinJob.setMapOutputKeyClass(IntWritable.class);
    repartitionJoinJob.setMapOutputValueClass(stockShape.getClass());
    ShapeInputFormat.setInputPaths(repartitionJoinJob, inputFiles[fileToRepartition]);
    repartitionJoinJob.setInputFormat(ShapeInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(repartitionJoinJob).getClusterStatus();
    repartitionJoinJob.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    SpatialSite.setCells(repartitionJoinJob, cellsInfo);
    repartitionJoinJob.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // set reduce function
    repartitionJoinJob.setReducerClass(RepartitionJoinReduce.class);
    repartitionJoinJob.setNumReduceTasks(
            Math.max(1, Math.min(cellsInfo.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    repartitionJoinJob.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(repartitionJoinJob, outputPath);

    RunningJob runningJob = JobClient.runJob(repartitionJoinJob);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    // Output number of running map tasks
    Counter mapTaskCountCounter = counters.findCounter(JobInProgress.Counter.TOTAL_LAUNCHED_MAPS);
    System.out.println("Number of map tasks " + mapTaskCountCounter.getValue());

    // Delete output directory if not explicitly set by user
    if (outputFile == null)
        fs.delete(outputPath, true);
    long t2 = System.currentTimeMillis();
    System.out.println("Repartitioning and Joining time " + (t2 - t1) + " millis");

    return resultCount;
}