Example usage for org.apache.hadoop.mapreduce RecordReader close

Introduction

This page collects example usages of the org.apache.hadoop.mapreduce RecordReader close() method from open-source projects.

Prototype

public abstract void close() throws IOException;

Document

Close the record reader.
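
Every example below follows the same lifecycle: create the reader from an InputFormat, initialize it with the split and task context, iterate with nextKeyValue(), and call close() when done so the underlying streams are released. The sketch below shows that lifecycle with close() guarded by a finally block; it assumes a plain TextInputFormat, and countRecords, split, and context are illustrative names rather than anything defined on this page.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class ReadAndClose {
    // Counts the records in one split and always closes the reader,
    // even if nextKeyValue() or initialize() throws.
    static long countRecords(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        RecordReader<LongWritable, Text> reader = new TextInputFormat().createRecordReader(split, context);
        try {
            reader.initialize(split, context);
            long records = 0;
            while (reader.nextKeyValue()) {
                records++;
            }
            return records;
        } finally {
            reader.close(); // releases open file handles and decompressors
        }
    }
}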

Usage

From source file:edu.umn.cs.spatialHadoop.visualization.MultilevelPlot.java

License:Open Source License

private static void plotLocal(Path[] inFiles, final Path outPath, final Class<? extends Plotter> plotterClass,
        final OperationsParams params) throws IOException, InterruptedException, ClassNotFoundException {
    final boolean vflip = params.getBoolean("vflip", true);

    OperationsParams mbrParams = new OperationsParams(params);
    mbrParams.setBoolean("background", false);
    final Rectangle inputMBR = params.get("mbr") != null ? params.getShape("mbr").getMBR()
            : FileMBR.fileMBR(inFiles, mbrParams);
    OperationsParams.setShape(params, InputMBR, inputMBR);

    // Retrieve desired output image size and keep aspect ratio if needed
    int tileWidth = params.getInt("tilewidth", 256);
    int tileHeight = params.getInt("tileheight", 256);
    // Adjust width and height if aspect ratio is to be kept
    if (params.getBoolean("keepratio", true)) {
        // Expand input file to a rectangle for compatibility with the pyramid
        // structure
        if (inputMBR.getWidth() > inputMBR.getHeight()) {
            inputMBR.y1 -= (inputMBR.getWidth() - inputMBR.getHeight()) / 2;
            inputMBR.y2 = inputMBR.y1 + inputMBR.getWidth();
        } else {
            inputMBR.x1 -= (inputMBR.getHeight() - inputMBR.getWidth()) / 2;
            inputMBR.x2 = inputMBR.x1 + inputMBR.getHeight();
        }
    }

    String outFName = outPath.getName();
    int extensionStart = outFName.lastIndexOf('.');
    final String extension = extensionStart == -1 ? ".png" : outFName.substring(extensionStart);

    // Start reading input file
    Vector<InputSplit> splits = new Vector<InputSplit>();
    final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    for (Path inFile : inFiles) {
        FileSystem inFs = inFile.getFileSystem(params);
        if (!OperationsParams.isWildcard(inFile) && inFs.exists(inFile) && !inFs.isDirectory(inFile)) {
            if (SpatialSite.NonHiddenFileFilter.accept(inFile)) {
                // Use the normal input format splitter to add this non-hidden file
                Job job = Job.getInstance(params);
                SpatialInputFormat3.addInputPath(job, inFile);
                splits.addAll(inputFormat.getSplits(job));
            } else {
                // A hidden file, add it immediately as one split
                // This is useful if the input is a hidden file which is automatically
                // skipped by FileInputFormat. We need to plot a hidden file for the case
                // of plotting partition boundaries of a spatial index
                splits.add(new FileSplit(inFile, 0, inFs.getFileStatus(inFile).getLen(), new String[0]));
            }
        } else {
            Job job = Job.getInstance(params);
            SpatialInputFormat3.addInputPath(job, inFile);
            splits.addAll(inputFormat.getSplits(job));
        }
    }

    try {
        Plotter plotter = plotterClass.newInstance();
        plotter.configure(params);

        String[] strLevels = params.get("levels", "7").split("\\.\\.");
        int minLevel, maxLevel;
        if (strLevels.length == 1) {
            minLevel = 0;
            maxLevel = Integer.parseInt(strLevels[0]);
        } else {
            minLevel = Integer.parseInt(strLevels[0]);
            maxLevel = Integer.parseInt(strLevels[1]);
        }

        GridInfo bottomGrid = new GridInfo(inputMBR.x1, inputMBR.y1, inputMBR.x2, inputMBR.y2);
        bottomGrid.rows = bottomGrid.columns = 1 << maxLevel;

        TileIndex key = new TileIndex();

        // All canvases in the pyramid, one per tile
        Map<TileIndex, Canvas> canvases = new HashMap<TileIndex, Canvas>();
        for (InputSplit split : splits) {
            FileSplit fsplit = (FileSplit) split;
            RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplit, null);
            if (reader instanceof SpatialRecordReader3) {
                ((SpatialRecordReader3) reader).initialize(fsplit, params);
            } else if (reader instanceof RTreeRecordReader3) {
                ((RTreeRecordReader3) reader).initialize(fsplit, params);
            } else if (reader instanceof HDFRecordReader) {
                ((HDFRecordReader) reader).initialize(fsplit, params);
            } else {
                throw new RuntimeException("Unknown record reader");
            }

            while (reader.nextKeyValue()) {
                Rectangle partition = reader.getCurrentKey();
                if (!partition.isValid())
                    partition.set(inputMBR);

                Iterable<Shape> shapes = reader.getCurrentValue();

                for (Shape shape : shapes) {
                    Rectangle shapeMBR = shape.getMBR();
                    if (shapeMBR == null)
                        continue;
                    java.awt.Rectangle overlappingCells = bottomGrid.getOverlappingCells(shapeMBR);
                    // Iterate over levels from bottom up
                    for (key.level = maxLevel; key.level >= minLevel; key.level--) {
                        for (key.x = overlappingCells.x; key.x < overlappingCells.x
                                + overlappingCells.width; key.x++) {
                            for (key.y = overlappingCells.y; key.y < overlappingCells.y
                                    + overlappingCells.height; key.y++) {
                                Canvas canvas = canvases.get(key);
                                if (canvas == null) {
                                    Rectangle tileMBR = new Rectangle();
                                    int gridSize = 1 << key.level;
                                    tileMBR.x1 = (inputMBR.x1 * (gridSize - key.x) + inputMBR.x2 * key.x)
                                            / gridSize;
                                    tileMBR.x2 = (inputMBR.x1 * (gridSize - (key.x + 1))
                                            + inputMBR.x2 * (key.x + 1)) / gridSize;
                                    tileMBR.y1 = (inputMBR.y1 * (gridSize - key.y) + inputMBR.y2 * key.y)
                                            / gridSize;
                                    tileMBR.y2 = (inputMBR.y1 * (gridSize - (key.y + 1))
                                            + inputMBR.y2 * (key.y + 1)) / gridSize;
                                    canvas = plotter.createCanvas(tileWidth, tileHeight, tileMBR);
                                    canvases.put(key.clone(), canvas);
                                }
                                plotter.plot(canvas, shape);
                            }
                        }
                        // Update overlappingCells for the higher level
                        int updatedX1 = overlappingCells.x / 2;
                        int updatedY1 = overlappingCells.y / 2;
                        int updatedX2 = (overlappingCells.x + overlappingCells.width - 1) / 2;
                        int updatedY2 = (overlappingCells.y + overlappingCells.height - 1) / 2;
                        overlappingCells.x = updatedX1;
                        overlappingCells.y = updatedY1;
                        overlappingCells.width = updatedX2 - updatedX1 + 1;
                        overlappingCells.height = updatedY2 - updatedY1 + 1;
                    }
                }
            }
            reader.close();
        }

        // Done with all splits. Write output to disk
        LOG.info("Done with plotting. Now writing the output");
        final FileSystem outFS = outPath.getFileSystem(params);

        LOG.info("Writing default empty image");
        // Write a default empty image to be displayed for non-generated tiles
        BufferedImage emptyImg = new BufferedImage(tileWidth, tileHeight, BufferedImage.TYPE_INT_ARGB);
        Graphics2D g = new SimpleGraphics(emptyImg);
        g.setBackground(new Color(0, 0, 0, 0));
        g.clearRect(0, 0, tileWidth, tileHeight);
        g.dispose();

        // Write HTML file to browse the multilevel image
        OutputStream out = outFS.create(new Path(outPath, "default.png"));
        ImageIO.write(emptyImg, "png", out);
        out.close();

        // Add an HTML file that visualizes the result using Google Maps
        LOG.info("Writing the HTML viewer file");
        LineReader templateFileReader = new LineReader(
                MultilevelPlot.class.getResourceAsStream("/zoom_view.html"));
        PrintStream htmlOut = new PrintStream(outFS.create(new Path(outPath, "index.html")));
        Text line = new Text();
        while (templateFileReader.readLine(line) > 0) {
            String lineStr = line.toString();
            lineStr = lineStr.replace("#{TILE_WIDTH}", Integer.toString(tileWidth));
            lineStr = lineStr.replace("#{TILE_HEIGHT}", Integer.toString(tileHeight));
            lineStr = lineStr.replace("#{MAX_ZOOM}", Integer.toString(maxLevel));
            lineStr = lineStr.replace("#{MIN_ZOOM}", Integer.toString(minLevel));
            lineStr = lineStr.replace("#{TILE_URL}",
                    "'tile-' + zoom + '-' + coord.x + '-' + coord.y + '" + extension + "'");

            htmlOut.println(lineStr);
        }
        templateFileReader.close();
        htmlOut.close();

        // Write the tiles
        final Entry<TileIndex, Canvas>[] entries = canvases.entrySet().toArray(new Map.Entry[canvases.size()]);
        // Clear the hash map to save memory as it is no longer needed
        canvases.clear();
        int parallelism = params.getInt("parallel", Runtime.getRuntime().availableProcessors());
        Parallel.forEach(entries.length, new RunnableRange<Object>() {
            @Override
            public Object run(int i1, int i2) {
                boolean output = params.getBoolean("output", true);
                try {
                    Plotter plotter = plotterClass.newInstance();
                    plotter.configure(params);
                    for (int i = i1; i < i2; i++) {
                        Map.Entry<TileIndex, Canvas> entry = entries[i];
                        TileIndex key = entry.getKey();
                        if (vflip)
                            key.y = ((1 << key.level) - 1) - key.y;

                        Path imagePath = new Path(outPath, key.getImageFileName() + extension);
                        // Write this tile to an image
                        DataOutputStream outFile = output ? outFS.create(imagePath)
                                : new DataOutputStream(new NullOutputStream());
                        plotter.writeImage(entry.getValue(), outFile, vflip);
                        outFile.close();

                        // Remove entry to allow GC to collect it
                        entries[i] = null;
                    }
                    return null;
                } catch (InstantiationException e) {
                    e.printStackTrace();
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                return null;
            }
        }, parallelism);
    } catch (InstantiationException e) {
        throw new RuntimeException("Error creating rastierizer", e);
    } catch (IllegalAccessException e) {
        throw new RuntimeException("Error creating rastierizer", e);
    }
}

From source file:edu.umn.cs.spatialHadoop.visualization.SingleLevelPlot.java

License:Open Source License

public static void plotLocal(Path[] inFiles, Path outFile, final Class<? extends Plotter> plotterClass,
        final OperationsParams params) throws IOException, InterruptedException {
    OperationsParams mbrParams = new OperationsParams(params);
    mbrParams.setBoolean("background", false);
    final Rectangle inputMBR = params.get(InputMBR) != null ? params.getShape("mbr").getMBR()
            : FileMBR.fileMBR(inFiles, mbrParams);
    if (params.get(InputMBR) == null)
        OperationsParams.setShape(params, InputMBR, inputMBR);

    // Retrieve desired output image size and keep aspect ratio if needed
    int width = params.getInt("width", 1000);
    int height = params.getInt("height", 1000);
    if (params.getBoolean("keepratio", true)) {
        // Adjust width and height to maintain aspect ratio and store the adjusted
        // values back in params in case the caller needs to retrieve them
        if (inputMBR.getWidth() / inputMBR.getHeight() > (double) width / height)
            params.setInt("height", height = (int) (inputMBR.getHeight() * width / inputMBR.getWidth()));
        else
            params.setInt("width", width = (int) (inputMBR.getWidth() * height / inputMBR.getHeight()));
    }
    // Store width and height in final variables to make them accessible in parallel
    final int fwidth = width, fheight = height;

    // Start reading input file
    List<InputSplit> splits = new ArrayList<InputSplit>();
    final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    for (Path inFile : inFiles) {
        FileSystem inFs = inFile.getFileSystem(params);
        if (!OperationsParams.isWildcard(inFile) && inFs.exists(inFile) && !inFs.isDirectory(inFile)) {
            if (SpatialSite.NonHiddenFileFilter.accept(inFile)) {
                // Use the normal input format splitter to add this non-hidden file
                Job job = Job.getInstance(params);
                SpatialInputFormat3.addInputPath(job, inFile);
                splits.addAll(inputFormat.getSplits(job));
            } else {
                // A hidden file, add it immediately as one split
                // This is useful if the input is a hidden file which is automatically
                // skipped by FileInputFormat. We need to plot a hidden file for the case
                // of plotting partition boundaries of a spatial index
                splits.add(new FileSplit(inFile, 0, inFs.getFileStatus(inFile).getLen(), new String[0]));
            }
        } else {
            // Use the normal input format splitter to add this non-hidden file
            Job job = Job.getInstance(params);
            SpatialInputFormat3.addInputPath(job, inFile);
            splits.addAll(inputFormat.getSplits(job));
        }
    }

    // Copy splits to a final array to be used in parallel
    final FileSplit[] fsplits = splits.toArray(new FileSplit[splits.size()]);
    int parallelism = params.getInt("parallel", Runtime.getRuntime().availableProcessors());
    List<Canvas> partialCanvases = Parallel.forEach(fsplits.length, new RunnableRange<Canvas>() {
        @Override
        public Canvas run(int i1, int i2) {
            Plotter plotter;
            try {
                plotter = plotterClass.newInstance();
            } catch (InstantiationException e) {
                throw new RuntimeException("Error creating rastierizer", e);
            } catch (IllegalAccessException e) {
                throw new RuntimeException("Error creating rastierizer", e);
            }
            plotter.configure(params);
            // Create the partial layer that will contain the plot of the assigned partitions
            Canvas partialCanvas = plotter.createCanvas(fwidth, fheight, inputMBR);

            for (int i = i1; i < i2; i++) {
                try {
                    RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplits[i],
                            null);
                    if (reader instanceof SpatialRecordReader3) {
                        ((SpatialRecordReader3) reader).initialize(fsplits[i], params);
                    } else if (reader instanceof RTreeRecordReader3) {
                        ((RTreeRecordReader3) reader).initialize(fsplits[i], params);
                    } else if (reader instanceof HDFRecordReader) {
                        ((HDFRecordReader) reader).initialize(fsplits[i], params);
                    } else {
                        throw new RuntimeException("Unknown record reader");
                    }

                    while (reader.nextKeyValue()) {
                        Rectangle partition = reader.getCurrentKey();
                        if (!partition.isValid())
                            partition.set(inputMBR);

                        Iterable<Shape> shapes = reader.getCurrentValue();
                        // Run the plot step
                        plotter.plot(partialCanvas, plotter.isSmooth() ? plotter.smooth(shapes) : shapes);
                    }
                    reader.close();
                } catch (IOException e) {
                    throw new RuntimeException("Error reading the file ", e);
                } catch (InterruptedException e) {
                    throw new RuntimeException("Interrupt error ", e);
                }
            }
            return partialCanvas;
        }
    }, parallelism);
    boolean merge = params.getBoolean("merge", true);
    Plotter plotter;
    try {
        plotter = plotterClass.newInstance();
        plotter.configure(params);
    } catch (InstantiationException e) {
        throw new RuntimeException("Error creating plotter", e);
    } catch (IllegalAccessException e) {
        throw new RuntimeException("Error creating plotter", e);
    }

    // Whether we should vertically flip the final image or not
    boolean vflip = params.getBoolean("vflip", true);
    if (merge) {
        LOG.info("Merging " + partialCanvases.size() + " partial canvases");
        // Create the final canvas that will contain the final image
        Canvas finalCanvas = plotter.createCanvas(fwidth, fheight, inputMBR);
        for (Canvas partialCanvas : partialCanvases)
            plotter.merge(finalCanvas, partialCanvas);

        // Finally, write the resulting image to the given output path
        LOG.info("Writing final image");
        FileSystem outFs = outFile.getFileSystem(params);
        FSDataOutputStream outputFile = outFs.create(outFile);

        plotter.writeImage(finalCanvas, outputFile, vflip);
        outputFile.close();
    } else {
        // No merge
        LOG.info("Writing partial images");
        FileSystem outFs = outFile.getFileSystem(params);
        for (int i = 0; i < partialCanvases.size(); i++) {
            Path filename = new Path(outFile, String.format("part-%05d.png", i));
            FSDataOutputStream outputFile = outFs.create(filename);

            plotter.writeImage(partialCanvases.get(i), outputFile, vflip);
            outputFile.close();
        }
    }
}

From source file:edu.umn.cs.sthadoop.operations.HSPKNNQ.java

License:Open Source License

private static <S extends Shape> long knnLocal(Path inFile, Path outPath, OperationsParams params)
        throws IOException, InterruptedException {
    int iterations = 0;
    FileSystem fs = inFile.getFileSystem(params);
    Point queryPoint = (Point) OperationsParams.getShape(params, "point");
    int k = params.getInt("k", 1);
    // Top-k objects are retained in this object
    PriorityQueue<ShapeWithDistance<S>> knn = new KNNObjects<ShapeWithDistance<S>>(k);

    SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();

    final GlobalIndex<Partition> gIndex = SpatialSite.getGlobalIndex(fs, inFile);
    double kthDistance = Double.MAX_VALUE;
    if (gIndex != null) {
        // There is a global index, use it
        PriorityQueue<ShapeWithDistance<Partition>> partitionsToProcess = new PriorityQueue<HSPKNNQ.ShapeWithDistance<Partition>>() {
            {
                initialize(gIndex.size());
            }

            @Override
            protected boolean lessThan(Object a, Object b) {
                return ((ShapeWithDistance<Partition>) a).distance < ((ShapeWithDistance<Partition>) b).distance;
            }
        };
        for (Partition p : gIndex) {
            double distance = p.getMinDistanceTo(queryPoint.x, queryPoint.y);
            partitionsToProcess.insert(new ShapeWithDistance<Partition>(p.clone(), distance));
        }

        while (partitionsToProcess.size() > 0 && partitionsToProcess.top().distance <= kthDistance) {

            ShapeWithDistance<Partition> partitionToProcess = partitionsToProcess.pop();
            // Process this partition
            Path partitionPath = new Path(inFile, partitionToProcess.shape.filename);
            long length = fs.getFileStatus(partitionPath).getLen();
            FileSplit fsplit = new FileSplit(partitionPath, 0, length, new String[0]);
            RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplit, null);
            if (reader instanceof SpatialRecordReader3) {
                ((SpatialRecordReader3) reader).initialize(fsplit, params);
            } else if (reader instanceof RTreeRecordReader3) {
                ((RTreeRecordReader3) reader).initialize(fsplit, params);
            } else if (reader instanceof HDFRecordReader) {
                ((HDFRecordReader) reader).initialize(fsplit, params);
            } else {
                throw new RuntimeException("Unknown record reader");
            }
            iterations++;

            while (reader.nextKeyValue()) {
                Iterable<Shape> shapes = reader.getCurrentValue();
                for (Shape shape : shapes) {
                    double distance = shape.distanceTo(queryPoint.x, queryPoint.y);
                    if (distance <= kthDistance)
                        knn.insert(new ShapeWithDistance<S>((S) shape.clone(), distance));
                }
            }
            reader.close();

            if (knn.size() >= k)
                kthDistance = knn.top().distance;
        }
    } else {
        // No global index, have to scan the whole file
        Job job = new Job(params);
        SpatialInputFormat3.addInputPath(job, inFile);
        List<InputSplit> splits = inputFormat.getSplits(job);

        for (InputSplit split : splits) {
            RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(split, null);
            if (reader instanceof SpatialRecordReader3) {
                ((SpatialRecordReader3) reader).initialize(split, params);
            } else if (reader instanceof RTreeRecordReader3) {
                ((RTreeRecordReader3) reader).initialize(split, params);
            } else if (reader instanceof HDFRecordReader) {
                ((HDFRecordReader) reader).initialize(split, params);
            } else {
                throw new RuntimeException("Unknown record reader");
            }
            iterations++;

            while (reader.nextKeyValue()) {
                Iterable<Shape> shapes = reader.getCurrentValue();
                for (Shape shape : shapes) {
                    double distance = shape.distanceTo(queryPoint.x, queryPoint.y);
                    knn.insert(new ShapeWithDistance<S>((S) shape.clone(), distance));
                }
            }

            reader.close();
        }
        if (knn.size() >= k)
            kthDistance = knn.top().distance;
    }
    long resultCount = knn.size();
    if (outPath != null && params.getBoolean("output", true)) {
        FileSystem outFS = outPath.getFileSystem(params);
        PrintStream ps = new PrintStream(outFS.create(outPath));
        Vector<ShapeWithDistance<S>> resultsOrdered = new Vector<ShapeWithDistance<S>>((int) resultCount);
        resultsOrdered.setSize((int) resultCount);
        while (knn.size() > 0) {
            ShapeWithDistance<S> nextAnswer = knn.pop();
            resultsOrdered.set(knn.size(), nextAnswer);
        }

        Text text = new Text();
        for (ShapeWithDistance<S> answer : resultsOrdered) {
            text.clear();
            TextSerializerHelper.serializeDouble(answer.distance, text, ',');
            answer.shape.toText(text);
            ps.println(text);
        }
        ps.close();
    }
    TotalIterations.addAndGet(iterations);
    return resultCount;

}

From source file:edu.umn.cs.sthadoop.operations.STRangeQuery.java

License:Open Source License

/**
 * Runs a range query on the local machine (no MapReduce) and the output is
 * streamed to the provided result collector. The query might run in
 * parallel which makes it necessary to design the result collector to
 * accept parallel calls to the method
 * {@link ResultCollector#collect(Object)}. You can use
 * {@link ResultCollectorSynchronizer} to synchronize calls to your
 * ResultCollector if you cannot design yours to be thread safe.
 * 
 * @param inPath
 * @param queryRange
 * @param shape
 * @param params
 * @param output
 * @return
 * @throws IOException
 * @throws InterruptedException
 */
public static <S extends Shape> long rangeQueryLocal(Path inPath, final Shape queryRange, final S shape,
        final OperationsParams params, final ResultCollector<S> output)
        throws IOException, InterruptedException {
    // Set MBR of query shape in job configuration to work with the spatial
    // filter
    OperationsParams.setShape(params, SpatialInputFormat3.InputQueryRange, queryRange.getMBR());
    // 1- Split the input path/file to get splits that can be processed
    // independently
    final SpatialInputFormat3<Rectangle, S> inputFormat = new SpatialInputFormat3<Rectangle, S>();
    Job job = Job.getInstance(params);
    SpatialInputFormat3.setInputPaths(job, inPath);
    final List<InputSplit> splits = inputFormat.getSplits(job);

    // 2- Process splits in parallel
    List<Long> results = Parallel.forEach(splits.size(), new RunnableRange<Long>() {
        @Override
        public Long run(int i1, int i2) {
            long results = 0;
            for (int i = i1; i < i2; i++) {
                try {
                    FileSplit fsplit = (FileSplit) splits.get(i);
                    final RecordReader<Rectangle, Iterable<S>> reader = inputFormat.createRecordReader(fsplit,
                            null);
                    if (reader instanceof SpatialRecordReader3) {
                        ((SpatialRecordReader3) reader).initialize(fsplit, params);
                    } else if (reader instanceof RTreeRecordReader3) {
                        ((RTreeRecordReader3) reader).initialize(fsplit, params);
                    } else if (reader instanceof HDFRecordReader) {
                        ((HDFRecordReader) reader).initialize(fsplit, params);
                    } else {
                        throw new RuntimeException("Unknown record reader");
                    }
                    while (reader.nextKeyValue()) {
                        Iterable<S> shapes = reader.getCurrentValue();
                        for (Shape s : shapes) {
                            results++;
                            if (output != null)
                                output.collect((S) s);
                        }
                    }
                    reader.close();
                } catch (IOException e) {
                    LOG.error("Error processing split " + splits.get(i), e);
                } catch (InterruptedException e) {
                    LOG.error("Error processing split " + splits.get(i), e);
                }
            }
            return results;
        }
    });
    long totalResultSize = 0;
    for (long result : results)
        totalResultSize += result;
    return totalResultSize;
}
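
The Javadoc above warns that rangeQueryLocal may invoke the result collector from several threads at once. Besides the ResultCollectorSynchronizer it mentions, a collector can also be wrapped so that concurrent collect calls are serialized. A minimal sketch, assuming ResultCollector<S> declares only the collect method referenced in that Javadoc:

// Hypothetical wrapper that serializes collect() calls arriving from the
// parallel split readers, so the wrapped collector sees one call at a time.
class SynchronizedResultCollector<S> implements ResultCollector<S> {
    private final ResultCollector<S> delegate;

    SynchronizedResultCollector(ResultCollector<S> delegate) {
        this.delegate = delegate;
    }

    @Override
    public synchronized void collect(S result) {
        delegate.collect(result);
    }
}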

From source file:edu.umn.cs.sthadoop.trajectory.KNNDTW.java

License:Open Source License

private static <S extends Shape> long knnLocal(Path inFile, Path outPath, OperationsParams params)
        throws IOException, InterruptedException {
    int iterations = 0;
    FileSystem fs = inFile.getFileSystem(params);
    Point queryPoint = (Point) OperationsParams.getShape(params, "point");
    int k = params.getInt("k", 1);
    // Top-k objects are retained in this object
    PriorityQueue<ShapeWithDistance<S>> knn = new KNNObjects<ShapeWithDistance<S>>(k);

    SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();

    final GlobalIndex<Partition> gIndex = SpatialSite.getGlobalIndex(fs, inFile);
    double kthDistance = Double.MAX_VALUE;
    if (gIndex != null) {
        // There is a global index, use it
        PriorityQueue<ShapeWithDistance<Partition>> partitionsToProcess = new PriorityQueue<KNNDTW.ShapeWithDistance<Partition>>() {
            {
                initialize(gIndex.size());
            }

            @Override
            protected boolean lessThan(Object a, Object b) {
                return ((ShapeWithDistance<Partition>) a).distance < ((ShapeWithDistance<Partition>) b).distance;
            }
        };
        for (Partition p : gIndex) {
            double distance = p.getMinDistanceTo(queryPoint.x, queryPoint.y);
            partitionsToProcess.insert(new ShapeWithDistance<Partition>(p.clone(), distance));
        }

        while (partitionsToProcess.size() > 0 && partitionsToProcess.top().distance <= kthDistance) {

            ShapeWithDistance<Partition> partitionToProcess = partitionsToProcess.pop();
            // Process this partition
            Path partitionPath = new Path(inFile, partitionToProcess.shape.filename);
            long length = fs.getFileStatus(partitionPath).getLen();
            FileSplit fsplit = new FileSplit(partitionPath, 0, length, new String[0]);
            RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplit, null);
            if (reader instanceof SpatialRecordReader3) {
                ((SpatialRecordReader3) reader).initialize(fsplit, params);
            } else if (reader instanceof RTreeRecordReader3) {
                ((RTreeRecordReader3) reader).initialize(fsplit, params);
            } else if (reader instanceof HDFRecordReader) {
                ((HDFRecordReader) reader).initialize(fsplit, params);
            } else {
                throw new RuntimeException("Unknown record reader");
            }
            iterations++;

            while (reader.nextKeyValue()) {
                Iterable<Shape> shapes = reader.getCurrentValue();
                for (Shape shape : shapes) {
                    double distance = shape.distanceTo(queryPoint.x, queryPoint.y);
                    if (distance <= kthDistance)
                        knn.insert(new ShapeWithDistance<S>((S) shape.clone(), distance));
                }
            }
            reader.close();

            if (knn.size() >= k)
                kthDistance = knn.top().distance;
        }
    } else {
        // No global index, have to scan the whole file
        Job job = new Job(params);
        SpatialInputFormat3.addInputPath(job, inFile);
        List<InputSplit> splits = inputFormat.getSplits(job);

        for (InputSplit split : splits) {
            RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(split, null);
            if (reader instanceof SpatialRecordReader3) {
                ((SpatialRecordReader3) reader).initialize(split, params);
            } else if (reader instanceof RTreeRecordReader3) {
                ((RTreeRecordReader3) reader).initialize(split, params);
            } else if (reader instanceof HDFRecordReader) {
                ((HDFRecordReader) reader).initialize(split, params);
            } else {
                throw new RuntimeException("Unknown record reader");
            }
            iterations++;

            while (reader.nextKeyValue()) {
                Iterable<Shape> shapes = reader.getCurrentValue();
                for (Shape shape : shapes) {
                    double distance = shape.distanceTo(queryPoint.x, queryPoint.y);
                    knn.insert(new ShapeWithDistance<S>((S) shape.clone(), distance));
                }
            }

            reader.close();
        }
        if (knn.size() >= k)
            kthDistance = knn.top().distance;
    }
    long resultCount = knn.size();
    if (outPath != null && params.getBoolean("output", true)) {
        FileSystem outFS = outPath.getFileSystem(params);
        PrintStream ps = new PrintStream(outFS.create(outPath));
        Vector<ShapeWithDistance<S>> resultsOrdered = new Vector<ShapeWithDistance<S>>((int) resultCount);
        resultsOrdered.setSize((int) resultCount);
        while (knn.size() > 0) {
            ShapeWithDistance<S> nextAnswer = knn.pop();
            resultsOrdered.set(knn.size(), nextAnswer);
        }

        Text text = new Text();
        for (ShapeWithDistance<S> answer : resultsOrdered) {
            text.clear();
            TextSerializerHelper.serializeDouble(answer.distance, text, ',');
            answer.shape.toText(text);
            ps.println(text);
        }
        ps.close();
    }
    TotalIterations.addAndGet(iterations);
    return resultCount;

}

From source file:io.druid.data.input.orc.DruidOrcInputFormatTest.java

License:Apache License

@Test
public void testRead() throws IOException, InterruptedException {
    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());

    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    OrcHadoopInputRowParser parser = (OrcHadoopInputRowParser) config.getParser();

    reader.initialize(split, context);

    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();

    MapBasedInputRow row = (MapBasedInputRow) parser.parse(data);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(new DateTime(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}

From source file:io.druid.data.input.parquet.DruidParquetInputFormatTest.java

License:Apache License

@Test
public void test() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig
            .fromFile(new File("example/wikipedia_hadoop_parquet_job.json"));

    config.intoConfiguration(job);

    File testFile = new File("example/wikipedia_list.parquet");
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(DruidParquetInputFormat.class,
            job.getConfiguration());

    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);

    reader.initialize(split, context);

    reader.nextKeyValue();

    GenericRecord data = (GenericRecord) reader.getCurrentValue();

    // field not read, should return null
    assertEquals(data.get("added"), null);

    assertEquals(data.get("page"), new Utf8("Gypsy Danger"));

    reader.close();
}

From source file:io.ssc.trackthetrackers.extraction.hadoop.util.Compaction.java

License:Open Source License

public static void main(String[] args) throws IOException, InterruptedException {

    if (args.length != 2) {
        System.out.println("Usage: <input folder> <output file>");
        System.exit(-1);
    }

    String inputPath = args[0];
    String outputFile = args[1];

    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(conf);

    FileStatus[] input = fs.listStatus(new Path(inputPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.toString().endsWith(".parquet");
        }
    });

    Path output = new Path(outputFile);

    fs.delete(output, true);

    ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder> inputFormat = new ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder>();
    inputFormat.setReadSupportClass(new JobConf(conf), ProtoReadSupport.class);

    Job job = new Job(conf);
    ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage> outputFormat = new ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage>(
            ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setProtobufClass(job, ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ProtoParquetOutputFormat.setEnableDictionary(job, true);

    RecordWriter<Void, ParsedPageProtos.ParsedPage> recordWriter = outputFormat.getRecordWriter(conf, output,
            CompressionCodecName.SNAPPY);

    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();

    for (FileStatus fileStatus : input) {
        System.out.println(fileStatus.getPath().toString());
        splits.addAll(inputFormat.getSplits(conf, ParquetFileReader.readFooters(conf, fileStatus)));
    }

    int splitIndex = 0;
    for (ParquetInputSplit split : splits) {

        System.out.println("Processing split: " + split.getPath().toString() + "(" + splitIndex + " of "
                + splits.size() + ")");

        TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", splitIndex, true, splitIndex),
                splitIndex);
        TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

        RecordReader<Void, ParsedPageProtos.ParsedPageOrBuilder> reader = inputFormat.createRecordReader(split,
                ctx);
        reader.initialize(split, ctx);

        while (reader.nextKeyValue()) {

            ParsedPageProtos.ParsedPageOrBuilder record = reader.getCurrentValue();

            ParsedPageProtos.ParsedPage.Builder builder = ParsedPageProtos.ParsedPage.newBuilder();

            builder.setUrl(record.getUrl());
            builder.setArchiveTime(record.getArchiveTime());

            builder.addAllScripts(record.getScriptsList());
            builder.addAllIframes(record.getIframesList());
            builder.addAllLinks(record.getLinksList());
            builder.addAllImages(record.getImagesList());

            recordWriter.write(null, builder.build());
        }

        if (reader != null) {
            reader.close();
        }

        splitIndex++;
    }

    TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", 1, true, 1), 1);
    TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

    if (recordWriter != null) {
        recordWriter.close(ctx);
    }

}

From source file:org.apache.avro.mapreduce.TestAvroKeyInputFormat.java

License:Apache License

/**
 * Verifies that a non-null record reader can be created, and the key/value types are
 * as expected.
 */
@Test
public void testCreateRecordReader() throws IOException, InterruptedException {
    // Set up the job configuration.
    Job job = new Job();
    AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.STRING));
    Configuration conf = job.getConfiguration();

    FileSplit inputSplit = createMock(FileSplit.class);
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    replay(inputSplit);
    replay(context);

    AvroKeyInputFormat inputFormat = new AvroKeyInputFormat();
    @SuppressWarnings("unchecked")
    RecordReader<AvroKey<Object>, NullWritable> recordReader = inputFormat.createRecordReader(inputSplit,
            context);
    assertNotNull(recordReader);
    recordReader.close();

    verify(inputSplit);
    verify(context);
}

From source file:org.apache.avro.mapreduce.TestAvroKeyRecordReader.java

License:Apache License

/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
    // Create the test avro file input with two records:
    //   1. "first"
    //   2. "second"
    final SeekableInput avroFileInput = new SeekableFileInput(
            AvroFiles.createFile(new File(mTempDir.getRoot(), "myStringfile.avro"),
                    Schema.create(Schema.Type.STRING), "first", "second"));

    // Create the record reader.
    Schema readerSchema = Schema.create(Schema.Type.STRING);
    RecordReader<AvroKey<CharSequence>, NullWritable> recordReader = new AvroKeyRecordReader<CharSequence>(
            readerSchema) {
        @Override
        protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
            return avroFileInput;
        }
    };

    // Set up the job configuration.
    Configuration conf = new Configuration();

    // Create a mock input split for this record reader.
    FileSplit inputSplit = createMock(FileSplit.class);
    expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
    expect(inputSplit.getStart()).andReturn(0L).anyTimes();
    expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

    // Create a mock task attempt context for this record reader.
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    // Initialize the record reader.
    replay(inputSplit);
    replay(context);
    recordReader.initialize(inputSplit, context);

    assertEquals("Progress should be zero before any records are read", 0.0f, recordReader.getProgress(), 0.0f);

    // Some variables to hold the records.
    AvroKey<CharSequence> key;
    NullWritable value;

    // Read the first record.
    assertTrue("Expected at least one record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("First record had null key", key);
    assertNotNull("First record had null value", value);

    CharSequence firstString = key.datum();
    assertEquals("first", firstString.toString());

    assertTrue("getCurrentKey() returned different keys for the same record",
            key == recordReader.getCurrentKey());
    assertTrue("getCurrentValue() returned different values for the same record",
            value == recordReader.getCurrentValue());

    // Read the second record.
    assertTrue("Expected to read a second record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("Second record had null key", key);
    assertNotNull("Second record had null value", value);

    CharSequence secondString = key.datum();
    assertEquals("second", secondString.toString());

    assertEquals("Progress should be complete (2 out of 2 records processed)", 1.0f, recordReader.getProgress(),
            0.0f);

    // There should be no more records.
    assertFalse("Expected only 2 records", recordReader.nextKeyValue());

    // Close the record reader.
    recordReader.close();

    // Verify the expected calls on the mocks.
    verify(inputSplit);
    verify(context);
}