Example usage for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf#setMapOutputKeyClass, drawn from the source files listed under Usage.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) 

Document

Set the key class for the map output data. This allows the user to specify the map output key class to be different than the final output key class.
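
The call matters whenever the mapper emits a key type different from the job's final output key: if it is not set, Hadoop assumes the map output key class equals the one set with setOutputKeyClass, and the job fails at runtime with a type mismatch when they differ. Below is a minimal, self-contained sketch; the WordLengthJob class and its mapper and reducer are hypothetical, written only to illustrate the call.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class WordLengthJob {

    // Groups words by their length: the map output key is IntWritable even
    // though the final output key is Text, so setMapOutputKeyClass is required.
    public static class LengthMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, IntWritable, Text> {
        public void map(LongWritable offset, Text line, OutputCollector<IntWritable, Text> out,
                Reporter reporter) throws IOException {
            for (String word : line.toString().split("\\s+")) {
                if (!word.isEmpty())
                    out.collect(new IntWritable(word.length()), new Text(word));
            }
        }
    }

    // Emits one (Text, IntWritable) pair per length: how many words have it.
    public static class LengthReducer extends MapReduceBase
            implements Reducer<IntWritable, Text, Text, IntWritable> {
        public void reduce(IntWritable length, Iterator<Text> words,
                OutputCollector<Text, IntWritable> out, Reporter reporter) throws IOException {
            int count = 0;
            while (words.hasNext()) {
                words.next();
                count++;
            }
            out.collect(new Text("length=" + length.get()), new IntWritable(count));
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf(WordLengthJob.class);
        job.setJobName("WordLength");

        // Intermediate (map output) types differ from the final output types,
        // so both pairs must be declared explicitly.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(LengthMapper.class);
        job.setReducerClass(LengthReducer.class);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}

The real-world examples that follow use the same call with a variety of key types: NullWritable for a single global aggregate, IntWritable for partition numbers, Text for plain keys, Prism for spatial keys, and even a key class chosen by reflection.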

Usage

From source file:com.ricemap.spateDB.operations.FileMBR.java

License:Apache License

/**
 * Computes the minimum bounding Prism (MBR) of a file by issuing a
 * MapReduce job, unless the file is globally indexed, in which case the
 * MBR is read directly from the global index.
 * @param fs - the file system that contains the file
 * @param file - path to the input file
 * @param stockShape - an instance of the shape stored in the file
 * @param background - when true, the job is submitted without waiting for it to finish
 * @return the MBR of the file, or null when the job runs in the background
 * @throws IOException 
 */
public static <S extends Shape> Prism fileMBRMapReduce(FileSystem fs, Path file, S stockShape,
        boolean background) throws IOException {
    // Quickly get file MBR if it is globally indexed
    GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(fs, file);
    if (globalIndex != null) {
        // Return the MBR of the global index.
        // Compute file size by adding up sizes of all files assuming they are
        // not compressed
        long totalLength = 0;
        for (Partition p : globalIndex) {
            Path filePath = new Path(file, p.filename);
            if (fs.exists(filePath))
                totalLength += fs.getFileStatus(filePath).getLen();
        }
        sizeOfLastProcessedFile = totalLength;
        return globalIndex.getMBR();
    }
    JobConf job = new JobConf(FileMBR.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(file.toUri().getPath() + ".mbr_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("FileMBR");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Prism.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setCombinerClass(Reduce.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeInputFormat.class);
    SpatialSite.setShapeClass(job, stockShape.getClass());
    job.setOutputFormat(TextOutputFormat.class);

    ShapeInputFormat.setInputPaths(job, file);
    TextOutputFormat.setOutputPath(job, outputPath);
    job.setOutputCommitter(MBROutputCommitter.class);

    // Submit the job
    if (background) {
        JobClient jc = new JobClient(job);
        lastSubmittedJob = jc.submitJob(job);
        return null;
    } else {
        lastSubmittedJob = JobClient.runJob(job);
        Counters counters = lastSubmittedJob.getCounters();
        Counter inputBytesCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES);
        FileMBR.sizeOfLastProcessedFile = inputBytesCounter.getValue();

        // Read job result
        FileStatus[] results = outFs.listStatus(outputPath);
        Prism mbr = new Prism();
        for (FileStatus fileStatus : results) {
            if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
                LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
                Text text = new Text();
                if (lineReader.readLine(text) > 0) {
                    mbr.fromText(text);
                }
                lineReader.close();
            }
        }

        outFs.delete(outputPath, true);

        return mbr;
    }
}

From source file:com.ricemap.spateDB.operations.LineRandomizer.java

License:Apache License

/**
 * Randomizes the order of lines in a file by issuing a MapReduce job
 * that shuffles lines across a computed number of partitions.
 * @param infile - path to the input file
 * @param outfile - path to the output file
 * @param overwrite - when true, an existing output file is deleted first
 * @throws IOException 
 */
public static void randomizerMapReduce(Path infile, Path outfile, boolean overwrite) throws IOException {
    JobConf job = new JobConf(LineRandomizer.class);

    FileSystem outfs = outfile.getFileSystem(job);

    if (overwrite)
        outfs.delete(outfile, true);

    job.setJobName("Randomizer");
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setReducerClass(Reduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    FileSystem infs = infile.getFileSystem(job);
    int numOfPartitions = (int) Math
            .ceil((double) infs.getFileStatus(infile).getLen() / infs.getDefaultBlockSize(outfile));
    job.setInt(NumOfPartitions, numOfPartitions);

    job.setInputFormat(TextInputFormat.class);
    TextInputFormat.setInputPaths(job, infile);

    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outfile);

    // Submit the job
    JobClient.runJob(job);
}

From source file:com.ricemap.spateDB.operations.Plot.java

License:Apache License

public static <S extends Shape> void plotMapReduce(Path inFile, Path outFile, Shape shape, int width,
        int height, Color color, boolean showBorders, boolean showBlockCount, boolean showRecordCount,
        boolean background) throws IOException {
    JobConf job = new JobConf(Plot.class);
    job.setJobName("Plot");

    job.setMapperClass(PlotMap.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setReducerClass(PlotReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));
    job.setMapOutputKeyClass(Prism.class);
    SpatialSite.setShapeClass(job, shape.getClass());
    job.setMapOutputValueClass(shape.getClass());

    FileSystem inFs = inFile.getFileSystem(job);
    Prism fileMbr = FileMBR.fileMBRMapReduce(inFs, inFile, shape, false);
    FileStatus inFileStatus = inFs.getFileStatus(inFile);

    CellInfo[] cellInfos;
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(inFs, inFile);
    if (gindex == null) {
        // A heap file. The map function should partition the file
        GridInfo gridInfo = new GridInfo(fileMbr.t1, fileMbr.x1, fileMbr.y1, fileMbr.t2, fileMbr.x2,
                fileMbr.y2);
        gridInfo.calculateCellDimensions(inFileStatus.getLen(), inFileStatus.getBlockSize());
        cellInfos = gridInfo.getAllCells();
        // Doesn't make sense to show any partition information in a heap file
        showBorders = showBlockCount = showRecordCount = false;
    } else {
        cellInfos = SpatialSite.cellsOf(inFs, inFile);
    }

    // Set cell information in the job configuration to be used by the mapper
    SpatialSite.setCells(job, cellInfos);

    // Adjust width and height to maintain aspect ratio
    if ((fileMbr.x2 - fileMbr.x1) / (fileMbr.y2 - fileMbr.y1) > (double) width / height) {
        // Fix width and change height
        height = (int) ((fileMbr.y2 - fileMbr.y1) * width / (fileMbr.x2 - fileMbr.x1));
    } else {
        width = (int) ((fileMbr.x2 - fileMbr.x1) * height / (fileMbr.y2 - fileMbr.y1));
    }
    LOG.info("Creating an image of size " + width + "x" + height);
    ImageOutputFormat.setFileMBR(job, fileMbr);
    ImageOutputFormat.setImageWidth(job, width);
    ImageOutputFormat.setImageHeight(job, height);
    job.setBoolean(ShowBorders, showBorders);
    job.setBoolean(ShowBlockCount, showBlockCount);
    job.setBoolean(ShowRecordCount, showRecordCount);
    job.setInt(StrokeColor, color.getRGB());

    // Set input and output
    job.setInputFormat(ShapeInputFormat.class);
    ShapeInputFormat.addInputPath(job, inFile);
    // Set output committer which will stitch images together after all reducers
    // finish
    job.setOutputCommitter(PlotOutputCommitter.class);

    job.setOutputFormat(ImageOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outFile);

    if (background) {
        JobClient jc = new JobClient(job);
        lastSubmittedJob = jc.submitJob(job);
    } else {
        lastSubmittedJob = JobClient.runJob(job);
    }
}

From source file:com.ricemap.spateDB.operations.RangeQuery.java

License:Apache License

/**
 * Performs a range query using MapReduce
 * @param fs - the file system that contains the input file
 * @param inputFile - path to the input file
 * @param userOutputPath - path to the output, or null to use a temporary path
 * @param queryShape - the shape that defines the query range
 * @param shape - an instance of the shape stored in the input file
 * @param overwrite - when true, an existing output path is deleted first
 * @param background - when true, the job is submitted without waiting for it to finish
 * @param query - the query to run (e.g. distinct or distribution)
 * @return the number of matching records, or -1 when the job runs in the background
 * @throws IOException
 */
public static long rangeQueryMapReduce(FileSystem fs, Path inputFile, Path userOutputPath, Shape queryShape,
        Shape shape, boolean overwrite, boolean background, QueryInput query) throws IOException {
    JobConf job = new JobConf(FileMBR.class);

    FileSystem outFs = inputFile.getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        do {
            outputPath = new Path(
                    inputFile.toUri().getPath() + ".rangequery_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    } else {
        if (outFs.exists(outputPath)) {
            if (overwrite) {
                outFs.delete(outputPath, true);
            } else {
                throw new RuntimeException("Output path already exists and -overwrite flag is not set");
            }
        }
    }

    job.setJobName("RangeQuery");
    job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
    RangeFilter.setQueryRange(job, queryShape); // Set query range for the filter

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(3);

    // Decide which map function to use depending on how blocks are indexed
    // And also which input format to use
    if (SpatialSite.isRTree(fs, inputFile)) {
        // RTree indexed file
        LOG.info("Searching an RTree indexed file");
        job.setInputFormat(RTreeInputFormat.class);
    } else {
        // A file with no local index
        LOG.info("Searching a non local-indexed file");
        job.setInputFormat(ShapeInputFormat.class);
    }

    GlobalIndex<Partition> gIndex = SpatialSite.getGlobalIndex(fs, inputFile);
    // if (gIndex != null && gIndex.isReplicated()){
    // job.setMapperClass(RangeQueryMap.class);

    // Pick a map output key class that matches the type of the queried field
    Class<?> outputKey = NullWritable.class;
    try {
        Class<?> c = shape.getClass();
        Field f = c.getDeclaredField(query.field);
        f.setAccessible(true);
        if (f.getType().equals(Integer.TYPE)) {
            outputKey = IntWritable.class;
        } else if (f.getType().equals(Double.TYPE)) {
            outputKey = DoubleWritable.class;
        } else if (f.getType().equals(Long.TYPE)) {
            outputKey = LongWritable.class;
        }
    } catch (SecurityException e) {
        e.printStackTrace();
    } catch (NoSuchFieldException e) {
        // Field not found in the shape class; fall back to NullWritable
        e.printStackTrace();
    }

    job.setMapOutputKeyClass(outputKey);
    switch (query.type) {
    case Distinct:
        job.setMapperClass(DistinctQueryMap.class);
        job.setReducerClass(DistinctQueryReduce.class);
        job.setMapOutputValueClass(NullWritable.class);
        break;
    case Distribution:
        job.setMapperClass(DistributionQueryMap.class);
        job.setReducerClass(DistributionQueryReduce.class);
        job.setMapOutputValueClass(IntWritable.class);
        break;
    default:
        break;
    }
    // }
    // else
    // job.setMapperClass(RangeQueryMapNoDupAvoidance.class);

    // Set query range for the map function
    job.set(QUERY_SHAPE_CLASS, queryShape.getClass().getName());
    job.set(QUERY_SHAPE, queryShape.toText(new Text()).toString());
    job.set(QUERY_FIELD, query.field);

    // Set shape class for the SpatialInputFormat
    SpatialSite.setShapeClass(job, shape.getClass());

    job.setOutputFormat(TextOutputFormat.class);

    ShapeInputFormat.setInputPaths(job, inputFile);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    if (!background) {
        RunningJob runningJob = JobClient.runJob(job);
        Counters counters = runningJob.getCounters();
        Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
        final long resultCount = outputRecordCounter.getValue();

        // If outputPath not set by user, automatically delete it
        if (userOutputPath == null)
            outFs.delete(outputPath, true);

        return resultCount;
    } else {
        JobClient jc = new JobClient(job);
        lastRunningJob = jc.submitJob(job);
        return -1;
    }
}

From source file:com.ricemap.spateDB.operations.RecordCount.java

License:Apache License

/**
 * Counts the exact number of records in a file by issuing a MapReduce job
 * @param fs - the file system that contains the file
 * @param file - path to the input file
 * @return the total number of records in the file
 * @throws IOException 
 */
public static long recordCountMapReduce(FileSystem fs, Path file) throws IOException {
    JobConf job = new JobConf(RecordCount.class);

    Path outputPath = new Path(file.toUri().getPath() + ".linecount");
    FileSystem outFs = outputPath.getFileSystem(job);
    outFs.delete(outputPath, true);

    job.setJobName("LineCount");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setCombinerClass(Reduce.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(1);

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeLineInputFormat.setInputPaths(job, file);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);

    // Read job result
    long lineCount = 0;
    FileStatus[] results = outFs.listStatus(outputPath);
    for (FileStatus fileStatus : results) {
        if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
            LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
            Text text = new Text();
            if (lineReader.readLine(text) > 0) {
                lineCount = Long.parseLong(text.toString());
            }
            lineReader.close();
        }
    }

    outFs.delete(outputPath, true);

    return lineCount;
}

From source file:com.ricemap.spateDB.operations.Repartition.java

License:Apache License

/**
 * Repartitions an input file according to the given list of cells.
 * @param inFile - path to the input file
 * @param outPath - path to the output directory
 * @param stockShape - an instance of the shape stored in the input file
 * @param blockSize - block size of the output file, or 0 to copy it from a globally indexed input
 * @param cellInfos - the cells to partition the data into
 * @param sindex - type of spatial index ("grid", "rtree" or "r+tree")
 * @param overwrite - when true, an existing output path is deleted first
 * @param columnar - when true, data is stored in columnar format
 * @throws IOException
 */
public static void repartitionMapReduce(Path inFile, Path outPath, Shape stockShape, long blockSize,
        CellInfo[] cellInfos, String sindex, boolean overwrite, boolean columnar) throws IOException {
    JobConf job = new JobConf(Repartition.class);
    job.setJobName("Repartition");
    FileSystem outFs = outPath.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(outPath)) {
        if (overwrite)
            outFs.delete(outPath, true);
        else
            throw new RuntimeException(
                    "Output file '" + outPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global index
    if (sindex.equals("rtree")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    ShapeInputFormat.setInputPaths(job, inFile);
    job.setInputFormat(ShapeInputFormat.class);
    boolean pack = sindex.equals("r+tree");
    boolean expand = sindex.equals("rtree");
    job.setBoolean(SpatialSite.PACK_CELLS, pack);
    job.setBoolean(SpatialSite.EXPAND_CELLS, expand);
    job.setStrings(SpatialSite.STORAGE_MODE, columnar ? "columnar" : "normal");

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    // Set default parameters for reading input file
    SpatialSite.setShapeClass(job, stockShape.getClass());

    FileOutputFormat.setOutputPath(job, outPath);
    if (sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }
    // Copy block size from source file if it's globally indexed
    FileSystem inFs = inFile.getFileSystem(job);

    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
        if (globalIndex != null) {
            blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
                    .getBlockSize();
            LOG.info("Automatically setting block size to " + blockSize);
        }
    }

    if (blockSize != 0)
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blockSize);
    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    JobClient.runJob(job);
}

From source file:com.ricemap.spateDB.operations.Sampler.java

License:Apache License

/**
 * Samples a ratio of the file through a MapReduce job
 * @param fs - the file system that contains the input files
 * @param files - paths to the input files
 * @param ratio - fraction of input records to sample
 * @param threshold - maximum number of elements to be sampled
 * @param seed - seed for the random number generator
 * @param output - collector that receives the sampled records
 * @param inObj - an instance of the record type as stored in the input
 * @param outObj - an instance of the record type to be returned
 * @return the number of records actually sampled
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleMapReduceWithRatio(
        FileSystem fs, Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output,
        T inObj, O outObj) throws IOException {
    JobConf job = new JobConf(FileMBR.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(files[0].toUri().getPath() + ".sample_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("Sample");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setClass(InClass, inObj.getClass(), TextSerializable.class);
    job.setClass(OutClass, outObj.getClass(), TextSerializable.class);

    job.setMapperClass(Map.class);
    job.setLong(RANDOM_SEED, seed);
    job.setFloat(SAMPLE_RATIO, (float) ratio);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(0);

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeLineInputFormat.setInputPaths(job, files);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    RunningJob run_job = JobClient.runJob(job);

    Counters counters = run_job.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    Counter inputBytesCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES);
    Sampler.sizeOfLastProcessedFile = inputBytesCounter.getValue();

    // Ratio of records to return from output based on the threshold
    // Note that any number greater than or equal to one will cause all
    // elements to be returned
    final double selectRatio = (double) threshold / resultCount;

    // Read job result
    int result_size = 0;
    if (output != null) {
        Text line = new Text();
        FileStatus[] results = outFs.listStatus(outputPath);

        for (FileStatus fileStatus : results) {
            if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
                LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
                try {
                    while (lineReader.readLine(line) > 0) {
                        if (Math.random() < selectRatio) {
                            if (output != null) {
                                outObj.fromText(line);
                                output.collect(outObj);
                            }
                            result_size++;
                        }
                    }
                } catch (RuntimeException e) {
                    e.printStackTrace();
                }
                lineReader.close();
            }
        }
    }

    outFs.delete(outputPath, true);

    return result_size;
}

From source file:com.ricemap.spateDB.util.RandomSpatialGenerator.java

License:Apache License

public static void generateMapReduce(Path file, Prism mbr, long size, long blocksize, Shape shape,
        String sindex, long seed, int rectsize, RandomShapeGenerator.DistributionType type, boolean overwrite)
        throws IOException {
    JobConf job = new JobConf(RandomSpatialGenerator.class);

    job.setJobName("Generator");
    FileSystem outFs = file.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(file)) {
        if (overwrite)
            outFs.delete(file, true);
        else
            throw new RuntimeException(
                    "Output file '" + file + "' already exists and overwrite flag is not set");
    }

    // Set generation parameters in job
    job.setLong(RandomShapeGenerator.GenerationSize, size);
    SpatialSite.setPrism(job, RandomShapeGenerator.GenerationMBR, mbr);
    if (seed != 0)
        job.setLong(RandomShapeGenerator.GenerationSeed, seed);
    if (rectsize != 0)
        job.setInt(RandomShapeGenerator.GenerationRectSize, rectsize);
    if (type != null)
        job.set(RandomShapeGenerator.GenerationType, type.toString());

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    // Set input format and map class
    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Repartition.RepartitionMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    SpatialSite.setShapeClass(job, shape.getClass());

    if (blocksize != 0) {
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blocksize);
    }

    CellInfo[] cells;
    if (sindex == null) {
        cells = new CellInfo[] { new CellInfo(1, mbr) };
    } else if (sindex.equals("grid")) {
        GridInfo gridInfo = new GridInfo(mbr.t1, mbr.x1, mbr.y1, mbr.t2, mbr.x2, mbr.y2);
        FileSystem fs = file.getFileSystem(job);
        if (blocksize == 0) {
            blocksize = fs.getDefaultBlockSize(file);
        }
        int numOfCells = Repartition.calculateNumberOfPartitions(job, size, fs, file, blocksize);
        gridInfo.calculateCellDimensions(numOfCells);
        cells = gridInfo.getAllCells();
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cells);

    // Do not set a reduce function. Use the default identity reduce function
    if (cells.length == 1) {
        // All objects are in one partition. No need for a reduce phase
        job.setNumReduceTasks(0);
    } else {
        // More than one partition. Need a reduce phase to group shapes of the
        // same partition together
        job.setReducerClass(RepartitionReduce.class);
        job.setNumReduceTasks(
                Math.max(1, Math.min(cells.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, file);
    if (sindex == null || sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    JobClient.runJob(job);

    // Concatenate all master files into one file
    FileStatus[] resultFiles = outFs.listStatus(file, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    String ext = resultFiles[0].getPath().getName()
            .substring(resultFiles[0].getPath().getName().lastIndexOf('.'));
    Path masterPath = new Path(file, "_master" + ext);
    OutputStream destOut = outFs.create(masterPath);
    byte[] buffer = new byte[4096];
    for (FileStatus f : resultFiles) {
        InputStream in = outFs.open(f.getPath());
        int bytes_read;
        do {
            bytes_read = in.read(buffer);
            if (bytes_read > 0)
                destOut.write(buffer, 0, bytes_read);
        } while (bytes_read > 0);
        in.close();
        outFs.delete(f.getPath(), false);
    }
    destOut.close();

    // Plot an image for the partitions used in file
    Path imagePath = new Path(file, "_partitions.png");
    int imageSize = (int) (Math.sqrt(cells.length) * 300);
    Plot.plotLocal(masterPath, imagePath, new Partition(), imageSize, imageSize, Color.BLACK, false, false,
            false);
}

From source file:com.scaleoutsoftware.soss.hserver.Test_MapToMapCopyMapred.java

License:Apache License

public int run(String[] args) throws Exception {
    final NamedMap<IntWritable, Text> inputMap = NamedMapFactory.getMap("mapr-i",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    final NamedMap<IntWritable, Text> outputMap = NamedMapFactory.getMap("mapr-o",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    inputMap.clear();
    outputMap.clear();
    Thread.sleep(15000);
    BulkLoader<IntWritable, Text> put = inputMap.getBulkLoader();
    String content = "xcccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
    Text contentW = new Text(content);
    IntWritable count = new IntWritable();
    int expectedSize = 10000;

    for (int i = 0; i < expectedSize; i++) {
        count.set(i);
        put.put(count, contentW);
    }
    put.close();
    InvocationGrid grid = HServerJob.getInvocationGridBuilder("MyGrid" + System.currentTimeMillis())
            .addClass(Test_MapToMapCopyMapred.class).load();

    JobConf configuration = new JobConf(getConf(), Test_MapToMapCopyMapred.class);
    configuration.setInt("mapred.hserver.setting.reducer.usememorymappedfiles", 0);
    configuration.setMapOutputKeyClass(IntWritable.class);
    configuration.setMapOutputValueClass(Text.class);
    configuration.setOutputKeyClass(IntWritable.class);
    configuration.setOutputValueClass(Text.class);
    configuration.setInputFormat(NamedMapInputFormatMapred.class);
    configuration.setOutputFormat(NamedMapOutputFormatMapred.class);
    NamedMapInputFormatMapred.setNamedMap(configuration, inputMap);
    NamedMapOutputFormatMapred.setNamedMap(configuration, outputMap);
    assertEquals(inputMap.size(), outputMap.size() + expectedSize); // should be 0 + expected
    HServerJobClient.runJob(configuration, false, grid);
    assertEquals(inputMap.size(), outputMap.size());
    inputMap.clear();
    outputMap.clear();
    grid.unload();
    return 1;
}

From source file:com.spotify.hdfs2cass.BulkLoader.java

License:Apache License

public int run(String[] args) throws Exception {
    CommandLine cmdLine = parseOptions(args);

    String[] inputPaths = cmdLine.getOptionValues('i');
    String seedNodeHost = cmdLine.getOptionValue('h');
    String seedNodePort = cmdLine.getOptionValue('p', "9160");
    String keyspace = cmdLine.getOptionValue('k');
    String colfamily = cmdLine.getOptionValue('c');
    int mappers = Integer.parseInt(cmdLine.getOptionValue('m', "0"));
    Integer copiers = Integer.parseInt(cmdLine.getOptionValue('P', "0"));
    String poolName = cmdLine.getOptionValue("pool");

    ClusterInfo clusterInfo = new ClusterInfo(seedNodeHost, seedNodePort);
    clusterInfo.init(keyspace);

    final String partitionerClass = clusterInfo.getPartitionerClass();
    final int reducers = adjustReducers(Integer.parseInt(cmdLine.getOptionValue('r', "0")),
            clusterInfo.getNumClusterNodes());

    Configuration conf = new Configuration();
    ConfigHelper.setOutputColumnFamily(conf, keyspace, colfamily);
    ConfigHelper.setOutputInitialAddress(conf, seedNodeHost);
    ConfigHelper.setOutputRpcPort(conf, seedNodePort);
    ConfigHelper.setOutputPartitioner(conf, partitionerClass);

    if (cmdLine.hasOption('s')) {
        conf.set("mapreduce.output.bulkoutputformat.buffersize", cmdLine.getOptionValue('s', "32"));
    }

    if (cmdLine.hasOption('M')) {
        conf.set("mapreduce.output.bulkoutputformat.streamthrottlembits", cmdLine.getOptionValue('M'));
    }

    if (cmdLine.hasOption('C')) {
        ConfigHelper.setOutputCompressionClass(conf, cmdLine.getOptionValue('C'));
    }

    if (cmdLine.hasOption('b')) {
        conf.setBoolean("com.spotify.hdfs2cass.base64", true);
    }

    JobConf job = new JobConf(conf);

    if (mappers > 0)
        job.setNumMapTasks(mappers);
    if (reducers > 0)
        job.setNumReduceTasks(reducers);
    if (copiers > 0)
        job.set("mapred.reduce.parallel.copies", copiers.toString());

    if (poolName != null)
        job.set("mapred.fairscheduler.pool", poolName);

    // set the nodes as a param for the other hadoop nodes
    clusterInfo.setConf(job);

    String jobName = "bulkloader-hdfs-to-cassandra";
    if (cmdLine.hasOption('n'))
        jobName += "-" + cmdLine.getOptionValue('n');
    job.setJobName(jobName);
    job.setJarByClass(BulkLoader.class);

    job.setInputFormat(AvroAsTextInputFormat.class);

    for (String inputPath : inputPaths) {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    }

    //map just outputs text, reduce sends to cassandra
    job.setMapperClass(MapToText.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(CassandraPartitioner.class);

    job.setReducerClass(ReduceTextToCassandra.class);
    job.setOutputKeyClass(ByteBuffer.class);
    job.setOutputValueClass(List.class);

    if (cmdLine.hasOption('s'))
        job.setOutputFormat(BulkOutputFormat.class);
    else
        job.setOutputFormat(ColumnFamilyOutputFormat.class);

    JobClient.runJob(job);
    return 0;
}