Example usage for org.apache.hadoop.mapred JobConf setClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapred JobConf setClass.

Prototype

public void setClass(String name, Class<?> theClass, Class<?> xface) 

Document

Sets the value of the name property to the name of theClass, which must implement the given interface xface.
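
The class stored with setClass is typically read back with JobConf.getClass (inherited from Configuration) and instantiated through ReflectionUtils; setClass also verifies at call time that theClass is assignable to xface. The following minimal sketch illustrates that round trip. The property name and the Validator interface and its implementation are made up purely for illustration.

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;

public class SetClassRoundTrip {
    // Hypothetical interface and implementation, used only to demonstrate setClass/getClass.
    public interface Validator {
        boolean accept(String record);
    }

    public static class NonEmptyValidator implements Validator {
        public boolean accept(String record) {
            return record != null && !record.isEmpty();
        }
    }

    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Store the implementation class under a made-up property name.
        conf.setClass("example.validator.class", NonEmptyValidator.class, Validator.class);

        // Later (typically on the task side), read the class back and instantiate it.
        Class<? extends Validator> validatorClass =
                conf.getClass("example.validator.class", NonEmptyValidator.class, Validator.class);
        Validator validator = ReflectionUtils.newInstance(validatorClass, conf);
        System.out.println(validator.accept("sample line")); // prints "true"
    }
}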

Usage

From source file:com.ricemap.spateDB.operations.Sampler.java

License:Apache License

/**
 * Sample a ratio of the input files through a MapReduce job
 * @param fs
 * @param files
 * @param ratio
 * @param threshold - Maximum number of elements to be sampled
 * @param seed - Random seed passed to the sampling job
 * @param output
 * @param inObj
 * @param outObj
 * @return the number of records collected into the output
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleMapReduceWithRatio(
        FileSystem fs, Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output,
        T inObj, O outObj) throws IOException {
    JobConf job = new JobConf(FileMBR.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(files[0].toUri().getPath() + ".sample_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("Sample");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setClass(InClass, inObj.getClass(), TextSerializable.class);
    job.setClass(OutClass, outObj.getClass(), TextSerializable.class);

    job.setMapperClass(Map.class);
    job.setLong(RANDOM_SEED, seed);
    job.setFloat(SAMPLE_RATIO, (float) ratio);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(0);

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeLineInputFormat.setInputPaths(job, files);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    RunningJob run_job = JobClient.runJob(job);

    Counters counters = run_job.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    Counter inputBytesCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES);
    Sampler.sizeOfLastProcessedFile = inputBytesCounter.getValue();

    // Ratio of records to return from output based on the threshold
    // Note that any number greater than or equal to one will cause all
    // elements to be returned
    final double selectRatio = (double) threshold / resultCount;

    // Read job result
    int result_size = 0;
    if (output != null) {
        Text line = new Text();
        FileStatus[] results = outFs.listStatus(outputPath);

        for (FileStatus fileStatus : results) {
            if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
                LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
                try {
                    while (lineReader.readLine(line) > 0) {
                        if (Math.random() < selectRatio) {
                            if (output != null) {
                                outObj.fromText(line);
                                output.collect(outObj);
                            }
                            result_size++;
                        }
                    }
                } catch (RuntimeException e) {
                    e.printStackTrace();
                }
                lineReader.close();
            }
        }
    }

    outFs.delete(outputPath, true);

    return result_size;
}
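
On the task side, the classes stored above under InClass and OutClass are normally read back in the mapper's configure() method and instantiated with org.apache.hadoop.util.ReflectionUtils. The fragment below illustrates that pattern as if it lived inside the Sampler's Map class (so that InClass, OutClass, and TextSerializable resolve); it is a sketch, not the actual Sampler.Map implementation.

private TextSerializable inObj;
private TextSerializable outObj;

public void configure(JobConf job) {
    // Recover the concrete classes stored by sampleMapReduceWithRatio() via setClass.
    Class<? extends TextSerializable> inClass = job.getClass(InClass, null, TextSerializable.class);
    Class<? extends TextSerializable> outClass = job.getClass(OutClass, null, TextSerializable.class);
    // Instantiate them reflectively, passing the job configuration along.
    inObj = ReflectionUtils.newInstance(inClass, job);
    outObj = ReflectionUtils.newInstance(outClass, job);
}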

From source file:com.scaleoutsoftware.soss.hserver.NamedMapInputFormatMapred.java

License:Apache License

/**
 * Sets {@link com.scaleoutsoftware.soss.client.map.NamedMap} as an input source for the job.
 *
 * @param configuration job to modify
 * @param map           the named map to be used as the job input
 * @param <K>           the type of the key
 * @param <V>           the type of the value
 */
public static <K, V> void setNamedMap(JobConf configuration, NamedMap<K, V> map) {
    configuration.setInt(inputAppIdProperty, map.getMapId());
    CustomSerializer<K> keySerializer = map.getKeySerializer();
    CustomSerializer<V> valueSerializer = map.getValueSerializer();
    configuration.setClass(inputNamedMapKeySerializerProperty, keySerializer.getClass(), Object.class);
    configuration.setClass(inputNamedMapValueSerializerProperty, valueSerializer.getClass(), Object.class);
    configuration.setInt(SERIALIZATION_MODE, map.getSerializationMode().ordinal());
    if (keySerializer.getObjectClass() != null) {
        configuration.setClass(inputNamedMapKeyProperty, keySerializer.getObjectClass(), Object.class);
    }
    if (valueSerializer.getObjectClass() != null) {
        configuration.setClass(inputNamedMapValueProperty, valueSerializer.getObjectClass(), Object.class);
    }
}
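
A hypothetical driver-side helper showing how setNamedMap is typically combined with setting the input format; it assumes NamedMapInputFormatMapred itself implements org.apache.hadoop.mapred.InputFormat, which its name and this helper suggest but which is not shown in the snippet above.

import com.scaleoutsoftware.soss.client.map.NamedMap;
import com.scaleoutsoftware.soss.hserver.NamedMapInputFormatMapred;
import org.apache.hadoop.mapred.JobConf;

public class NamedMapInputSetup {
    // Illustration only: wire an already-created NamedMap in as the job input.
    public static <K, V> void configureInput(JobConf job, NamedMap<K, V> inputMap) {
        NamedMapInputFormatMapred.setNamedMap(job, inputMap);
        // Assumption: NamedMapInputFormatMapred is the matching mapred InputFormat.
        job.setInputFormat(NamedMapInputFormatMapred.class);
    }
}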

From source file:com.scaleoutsoftware.soss.hserver.NamedMapOutputFormatMapred.java

License:Apache License

/**
 * Sets the {@link NamedMap} to direct output to.
 *
 * @param configuration job to modify
 * @param map named map to be used for output
 */
public static void setNamedMap(JobConf configuration, NamedMap map) {
    configuration.setStrings(outputNamedMapProperty, map.getMapName());
    CustomSerializer keySerializer = map.getKeySerializer();
    CustomSerializer valueSerializer = map.getValueSerializer();
    configuration.setClass(outputNamedMapKeySerializerProperty, keySerializer.getClass(), Object.class);
    configuration.setClass(outputNamedMapValueSerializerProperty, valueSerializer.getClass(), Object.class);
    if (keySerializer.getObjectClass() != null) {
        configuration.setClass(outputNamedMapKeyProperty, keySerializer.getObjectClass(), Object.class);
    }
    if (valueSerializer.getObjectClass() != null) {
        configuration.setClass(outputNamedMapValueProperty, valueSerializer.getObjectClass(), Object.class);
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.Aggregate.java

License:Open Source License

/**
 * Computes the minimum and maximum values over the given input files by
 * issuing a MapReduce job.
 * @param files
 * @param params
 * @return the aggregated minimum and maximum values
 * @throws IOException
 */
public static MinMax aggregateMapReduce(Path[] files, OperationsParams params) throws IOException {
    Shape plotRange = params.getShape("rect");
    JobConf job = new JobConf(params, Aggregate.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path("agg_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("Aggregate");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(MinMax.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeInputFormat.class);
    job.setClass("shape", NASAPoint.class, Shape.class);
    if (plotRange != null) {
        job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
    }

    job.setOutputFormat(TextOutputFormat.class);

    ShapeInputFormat.setInputPaths(job, files);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);

    // Read job result
    FileStatus[] results = outFs.listStatus(outputPath);
    MinMax minMax = new MinMax();
    for (FileStatus status : results) {
        if (status.getLen() > 0 && status.getPath().getName().startsWith("part-")) {
            BufferedReader reader = new BufferedReader(new InputStreamReader(outFs.open(status.getPath())));
            String line;
            MinMax value = new MinMax();
            while ((line = reader.readLine()) != null) {
                value.fromText(new Text(line));
                minMax.expand(value);
            }
            reader.close();
        }
    }

    outFs.delete(outputPath, true);

    return minMax;
}

From source file:edu.umn.cs.spatialHadoop.operations.ConvexHull.java

License:Open Source License

public static void convexHullMapReduce(Path inFile, Path userOutPath, OperationsParams params)
        throws IOException {
    JobConf job = new JobConf(params, ConvexHull.class);
    Path outPath = userOutPath;
    FileSystem outFs = (userOutPath == null ? inFile : userOutPath).getFileSystem(job);
    Shape shape = params.getShape("shape");

    if (outPath == null) {
        do {
            outPath = new Path(inFile.toUri().getPath() + ".convex_hull_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outPath));
    } else {
        if (outFs.exists(outPath)) {
            if (params.getBoolean("overwrite", false)) {
                outFs.delete(outPath, true);
            } else {
                throw new RuntimeException("Output path already exists and -overwrite flag is not set");
            }
        }
    }

    job.setJobName("ConvexHull");
    job.setClass(SpatialSite.FilterClass, ConvexHullFilter.class, BlockFilter.class);
    job.setMapperClass(IdentityMapper.class);
    job.setCombinerClass(ConvexHullReducer.class);
    job.setReducerClass(ConvexHullReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(shape.getClass());
    job.setInputFormat(ShapeInputFormat.class);
    ShapeInputFormat.addInputPath(job, inFile);
    job.setOutputFormat(GridOutputFormat2.class);
    GridOutputFormat2.setOutputPath(job, outPath);

    JobClient.runJob(job);

    // If outputPath not set by user, automatically delete it
    if (userOutPath == null)
        outFs.delete(outPath, true);
}

From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java

License:Open Source License

/**
 * Performs a redistribute join between the given files using the
 * redistribute join algorithm. Currently, we only support a pair of files.
 * @param inFiles
 * @param userOutputPath
 * @param params
 * @return
 * @throws IOException
 */
public static <S extends Shape> long joinStep(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException {
    long t1 = System.currentTimeMillis();

    JobConf job = new JobConf(params, DistributedJoin.class);

    FileSystem fs[] = new FileSystem[inFiles.length];
    for (int i_file = 0; i_file < inFiles.length; i_file++)
        fs[i_file] = inFiles[i_file].getFileSystem(job);

    Path outputPath = userOutputPath;
    if (outputPath == null) {
        do {
            outputPath = new Path(inFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (fs[0].exists(outputPath));
    }

    job.setJobName("DistributedJoin");
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    GlobalIndex<Partition> gindex1 = SpatialSite.getGlobalIndex(fs[0], inFiles[0]);
    GlobalIndex<Partition> gindex2 = SpatialSite.getGlobalIndex(fs[1], inFiles[1]);

    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    LOG.info("Joining " + inFiles[0] + " X " + inFiles[1]);

    if (SpatialSite.isRTree(fs[0], inFiles[0]) && SpatialSite.isRTree(fs[1], inFiles[1])) {
        job.setInputFormat(DJInputFormatRTree.class);
    } else {
        if (isOneShotReadMode) {
            // Ensure all objects are read in one shot
            job.setInt(SpatialSite.MaxBytesInOneRead, -1);
            job.setInt(SpatialSite.MaxShapesInOneRead, -1);
        } else {
            job.setInt(SpatialSite.MaxBytesInOneRead, maxBytesInOneRead);
            job.setInt(SpatialSite.MaxShapesInOneRead, maxShapesInOneRead);
        }
        job.setInputFormat(DJInputFormatArray.class);
    }

    // Set input paths and map function
    if (inFiles[0].equals(inFiles[1])) {
        // Self join
        job.setInputFormat(ShapeArrayInputFormat.class);
        // Remove the spatial filter to ensure all partitions are loaded
        FileInputFormat.setInputPaths(job, inFiles[0]);
        if (gindex1 != null && gindex1.isReplicated())
            job.setMapperClass(RedistributeJoinMap.class);
        else
            job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class);
    } else {
        // Binary version of spatial join (two different input files)
        job.setClass(SpatialSite.FilterClass, SpatialJoinFilter.class, BlockFilter.class);
        FileInputFormat.setInputPaths(job, inFiles);
        if ((gindex1 != null && gindex1.isReplicated()) || (gindex2 != null && gindex2.isReplicated())) {
            // Need the map function with duplicate avoidance step.
            job.setMapperClass(RedistributeJoinMap.class);
        } else {
            // No replication in both indexes, use map function with no dup
            // avoidance
            job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class);
        }
    }

    Shape shape = params.getShape("shape");
    job.setMapOutputKeyClass(shape.getClass());
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setNumReduceTasks(0); // No reduce needed for this task

    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (!params.getBoolean("background", false)) {
        LOG.info("Submit job in sync mode");
        RunningJob runningJob = JobClient.runJob(job);
        Counters counters = runningJob.getCounters();
        Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
        final long resultCount = outputRecordCounter.getValue();

        // Output number of running map tasks
        Counter mapTaskCountCounter = counters.findCounter(JobInProgress.Counter.TOTAL_LAUNCHED_MAPS);
        System.out.println("Number of map tasks " + mapTaskCountCounter.getValue());

        // Delete output directory if not explicitly set by user
        if (userOutputPath == null)
            fs[0].delete(outputPath, true);
        long t2 = System.currentTimeMillis();
        System.out.println("Join time " + (t2 - t1) + " millis");

        return resultCount;
    } else {
        JobClient jc = new JobClient(job);
        LOG.info("Submit job in async mode");
        lastRunningJob = jc.submitJob(job);
        LOG.info("Job " + lastRunningJob + " submitted successfully");
        return -1;
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.PyramidPlot.java

License:Apache License

/**
 * Plot a file to a set of images in different zoom levels using a MapReduce
 * program.
 * @param <S> type of shapes stored in file
 * @param inFile - Path to the input file(s)
 * @param outFile - Path to the output file (image)
 * @param shape - A sample object to be used for parsing input file
 * @param tileWidth - Width of each tile
 * @param tileHeight - Height of each tile
 * @param vflip - Set to <code>true</code> to flip the whole image vertically
 * @param color - Color used to draw single shapes
 * @param numLevels - Number of zoom levels to plot
 * @throws IOException
 */
private static <S extends Shape> RunningJob plotMapReduce(Path inFile, Path outFile, OperationsParams params)
        throws IOException {
    Color color = params.getColor("color", Color.BLACK);

    String hdfDataset = (String) params.get("dataset");
    Shape shape = hdfDataset != null ? new NASARectangle() : params.getShape("shape");
    Shape plotRange = params.getShape("rect");

    boolean background = params.is("background");

    JobConf job = new JobConf(params, PyramidPlot.class);
    job.setJobName("PlotPyramid");

    String partition = job.get("partition", "space").toLowerCase();
    if (partition.equals("space")) {
        job.setMapperClass(SpacePartitionMap.class);
        job.setReducerClass(SpacePartitionReduce.class);
        job.setMapOutputKeyClass(TileIndex.class);
        job.setMapOutputValueClass(shape.getClass());
        job.setInputFormat(ShapeInputFormat.class);
    } else {
        job.setMapperClass(DataPartitionMap.class);
        job.setReducerClass(DataPartitionReduce.class);
        job.setMapOutputKeyClass(TileIndex.class);
        job.setMapOutputValueClass(ImageWritable.class);
        job.setInputFormat(ShapeArrayInputFormat.class);
    }

    job.setInt("color", color.getRGB());
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    if (shape instanceof Point && job.getBoolean("sample", false)) {
        // Enable adaptive sampling
        int imageWidthRoot = job.getInt("tilewidth", 256);
        int imageHeightRoot = job.getInt("tileheight", 256);
        long recordCount = FileMBR.fileMBR(inFile, params).recordCount;
        float sampleRatio = params.getFloat(GeometricPlot.AdaptiveSampleFactor, 1.0f) * imageWidthRoot
                * imageHeightRoot / recordCount;
        job.setFloat(GeometricPlot.AdaptiveSampleRatio, sampleRatio);
    }

    Rectangle fileMBR;
    if (hdfDataset != null) {
        // Input is HDF
        job.set(HDFRecordReader.DatasetName, hdfDataset);
        job.setBoolean(HDFRecordReader.SkipFillValue, true);
        job.setClass("shape", NASARectangle.class, Shape.class);
        // Determine the range of values by opening one of the HDF files
        Aggregate.MinMax minMax = Aggregate.aggregate(new Path[] { inFile }, params);
        job.setInt(MinValue, minMax.minValue);
        job.setInt(MaxValue, minMax.maxValue);
        //fileMBR = new Rectangle(-180, -90, 180, 90);
        fileMBR = plotRange != null ? plotRange.getMBR() : new Rectangle(-180, -140, 180, 169);
        //      job.setClass(HDFRecordReader.ProjectorClass, MercatorProjector.class,
        //          GeoProjector.class);
    } else {
        fileMBR = FileMBR.fileMBR(inFile, params);
    }

    boolean keepAspectRatio = params.is("keep-ratio", true);
    if (keepAspectRatio) {
        // Expand input file to a rectangle for compatibility with the pyramid
        // structure
        if (fileMBR.getWidth() > fileMBR.getHeight()) {
            fileMBR.y1 -= (fileMBR.getWidth() - fileMBR.getHeight()) / 2;
            fileMBR.y2 = fileMBR.y1 + fileMBR.getWidth();
        } else {
            fileMBR.x1 -= (fileMBR.getHeight() - fileMBR.getWidth()) / 2;
            fileMBR.x2 = fileMBR.x1 + fileMBR.getHeight();
        }
    }

    SpatialSite.setRectangle(job, InputMBR, fileMBR);

    // Set input and output
    ShapeInputFormat.addInputPath(job, inFile);
    if (plotRange != null) {
        job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
    }

    job.setOutputFormat(PyramidOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outFile);
    job.setOutputCommitter(PlotPyramidOutputCommitter.class);

    if (background) {
        JobClient jc = new JobClient(job);
        return lastSubmittedJob = jc.submitJob(job);
    } else {
        return lastSubmittedJob = JobClient.runJob(job);
    }

}

From source file:edu.umn.cs.spatialHadoop.operations.Skyline.java

License:Open Source License

private static void skylineMapReduce(Path inFile, Path userOutPath, OperationsParams params)
        throws IOException {
    JobConf job = new JobConf(params, Skyline.class);
    Path outPath = userOutPath;
    FileSystem outFs = (userOutPath == null ? inFile : userOutPath).getFileSystem(job);
    Shape shape = params.getShape("shape");

    if (outPath == null) {
        do {
            outPath = new Path(inFile.toUri().getPath() + ".skyline_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outPath));
    }

    job.setJobName("Skyline");
    job.setClass(SpatialSite.FilterClass, SkylineFilter.class, BlockFilter.class);
    job.setMapperClass(IdentityMapper.class);
    job.setCombinerClass(SkylineReducer.class);
    job.setReducerClass(SkylineReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(shape.getClass());
    job.setInputFormat(ShapeIterInputFormat.class);
    ShapeInputFormat.addInputPath(job, inFile);
    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outPath);

    JobClient.runJob(job);

    // If outputPath not set by user, automatically delete it
    if (userOutPath == null)
        outFs.delete(outPath, true);
}

From source file:it.crs4.pydoop.pipes.Submitter.java

License:Apache License

private static void setupPipesJob(JobConf conf) throws IOException {
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        conf.setMapRunnerClass(PipesMapRunner.class);
        // Save the user's partitioner and hook in ours.
        setJavaPartitioner(conf, conf.getPartitionerClass());
        conf.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        conf.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            conf.setOutputFormat(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, conf.getInputFormat().getClass(), InputFormat.class);
        conf.setInputFormat(PipesNonJavaInputFormat.class);
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        throw new IllegalArgumentException("No application program defined.");
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce task 
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        IOException ie = new IOException("Problem parsing execable URI " + exec);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}
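
The setClass call above stashes the user's original InputFormat under Submitter.INPUT_FORMAT before PipesNonJavaInputFormat is swapped in, so the pipes layer can still delegate record reading to it. The sketch below shows one way the saved class can be recovered; the helper method and the TextInputFormat default are illustrative assumptions, not the actual PipesNonJavaInputFormat code.

import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class SavedInputFormatLookup {
    // Recover and instantiate the InputFormat class saved with setClass. The property
    // key is passed in because the real constant lives in Submitter.INPUT_FORMAT.
    @SuppressWarnings("rawtypes")
    public static InputFormat loadSavedFormat(JobConf conf, String inputFormatKey) {
        Class<? extends InputFormat> formatClass =
                conf.getClass(inputFormatKey, TextInputFormat.class, InputFormat.class);
        return ReflectionUtils.newInstance(formatClass, conf);
    }
}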

From source file:org.apache.avro.mapred.AvroJob.java

License:Apache License

/** Configure a job's data model implementation class. */
public static void setDataModelClass(JobConf job, Class<? extends GenericData> modelClass) {
    job.setClass(CONF_DATA_MODEL, modelClass, GenericData.class);
}
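
A possible driver-side use of the helper above: ReflectData is one GenericData subclass (for reflection-based records) that can be plugged in this way. The choice of ReflectData here is illustrative, not something mandated by AvroJob.

import org.apache.avro.mapred.AvroJob;
import org.apache.avro.reflect.ReflectData;
import org.apache.hadoop.mapred.JobConf;

public class AvroDataModelSetup {
    public static void main(String[] args) {
        JobConf job = new JobConf();
        // ReflectData extends GenericData, so it satisfies setClass's assignability check.
        AvroJob.setDataModelClass(job, ReflectData.class);
    }
}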