Java tutorial: SpatialHadoop's SpatialRecordReader3 (edu.umn.cs.spatialHadoop.mapreduce)
/***************************************************************************
 * Copyright (c) 2015 by Regents of the University of Minnesota.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Apache License, Version 2.0 which
 * accompanies this distribution and is available at
 * http://www.opensource.org/licenses/apache2.0.php.
 *
 *************************************************************************/
package edu.umn.cs.spatialHadoop.mapreduce;

import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapred.Task;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.MapContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

import edu.umn.cs.spatialHadoop.OperationsParams;
import edu.umn.cs.spatialHadoop.core.Rectangle;
import edu.umn.cs.spatialHadoop.core.Shape;
import edu.umn.cs.spatialHadoop.core.SpatialSite;
import edu.umn.cs.spatialHadoop.indexing.GlobalIndex;
import edu.umn.cs.spatialHadoop.indexing.Partition;

/**
 * @author Ahmed Eldawy
 *
 */
public class SpatialRecordReader3<V extends Shape>
    extends RecordReader<Partition, Iterable<V>> {
  private static final Log LOG = LogFactory.getLog(SpatialRecordReader3.class);

  /**The codec used with the input file*/
  private CompressionCodec codec;
  /**The decompressor (instance) used to decompress the input file*/
  private Decompressor decompressor;
  /** File system of the file being parsed */
  private FileSystem fs;
  /**The path of the input file to read*/
  private Path path;
  /**The offset to start reading the raw (uncompressed) file*/
  private long start;
  /**The last byte to read in the raw (uncompressed) file*/
  private long end;
  /** The boundary of the partition currently being read */
  protected Partition cellMBR;
  /**
   * The input stream that reads directly from the input file.
   * If the file is not compressed, this stream is the same as #in.
   * Otherwise, this is the raw (compressed) input stream. This stream is used
   * only to calculate the progress of the input file.
   */
  private FSDataInputStream directIn;
  /** Input stream that reads data from input file */
  private InputStream in;
  /**Determine current position to report progress*/
  private Seekable progressPosition;
  /**Used to read text lines from the input*/
  private LineReader lineReader;
  /**The shape used to parse input lines*/
  private V stockShape;
  private Text tempLine;
  /**Input query range if specified in the job configuration*/
  private Shape inputQueryRange;
  /**The MBR of the input query.
   * Used to apply duplicate avoidance technique*/
  private Rectangle inputQueryMBR;
  private CompressionCodecFactory compressionCodecFactory;
  private ShapeIterator<V> value;

  /**
   * Number of bytes read from the input so far. This is used to determine when
   * to stop when reading from the input directly. We cannot simply rely on the
   * position of the input file because LineReader might buffer some data in
   * memory without actually processing it.
   */
  private long bytesRead;

  /**
   * Context of the map task. Used to increment counters.
   */
  private Counter inputRecordsCounter;

  @Override
  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    Configuration conf = context != null ? context.getConfiguration()
        : new Configuration();
    if (context != null && context instanceof MapContext)
      inputRecordsCounter = ((MapContext) context)
          .getCounter(Task.Counter.MAP_INPUT_RECORDS);
    initialize(split, conf);
  }

  public void initialize(InputSplit split, Configuration conf)
      throws IOException, InterruptedException {
    FileSplit fsplit = (FileSplit) split;
    if (compressionCodecFactory == null)
      compressionCodecFactory = new CompressionCodecFactory(conf);
    LOG.info("Open a SpatialRecordReader to split: " + split);
    this.path = fsplit.getPath();
    this.start = fsplit.getStart();
    this.end = this.start + split.getLength();
    this.fs = this.path.getFileSystem(conf);
    this.directIn = fs.open(this.path);
    codec = compressionCodecFactory.getCodec(this.path);

    if (codec != null) {
      // Input is compressed, create a decompressor to decompress it
      decompressor = CodecPool.getDecompressor(codec);
      if (codec instanceof SplittableCompressionCodec) {
        // A splittable compression codec, can seek to the desired input pos
        final SplitCompressionInputStream cIn =
            ((SplittableCompressionCodec) codec).createInputStream(
                directIn, decompressor, start, end,
                SplittableCompressionCodec.READ_MODE.BYBLOCK);
        in = cIn;
        start = cIn.getAdjustedStart();
        end = cIn.getAdjustedEnd();
        // take pos from compressed stream as we adjusted both start and end
        // to match with the compressed file
        progressPosition = cIn;
      } else {
        // Non-splittable input, need to start from the beginning
        CompressionInputStream cIn = codec.createInputStream(directIn, decompressor);
        in = cIn;
        progressPosition = cIn;
      }
    } else {
      // Non-compressed file, seek to the desired position and use this stream
      // to get the progress and position
      directIn.seek(start);
      in = directIn;
      progressPosition = directIn;
    }

    this.stockShape = (V) OperationsParams.getShape(conf, "shape");
    this.tempLine = new Text();
    this.lineReader = new LineReader(in);
    bytesRead = 0;

    if (this.start != 0) {
      // Skip until first end-of-line reached
      bytesRead += lineReader.readLine(tempLine);
    }
    if (conf.get(SpatialInputFormat3.InputQueryRange) != null) {
      // Retrieve the input query range to apply on all records
      this.inputQueryRange = OperationsParams.getShape(conf,
          SpatialInputFormat3.InputQueryRange);
      this.inputQueryMBR = this.inputQueryRange.getMBR();
    }

    // Check if there is an associated global index to read cell boundaries
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, path.getParent());
    if (gindex == null) {
      cellMBR = new Partition();
      cellMBR.filename = path.getName();
      cellMBR.invalidate();
    } else {
      // Set from the associated partition in the global index
      for (Partition p : gindex) {
        if (p.filename.equals(this.path.getName()))
          cellMBR = p;
      }
    }

    this.value = new ShapeIterator<V>();
    value.setShape(stockShape);
  }

  public long getPos() throws IOException {
    if (codec != null) {
      // Input is compressed. Report the progress as indicated by the
      // decompressor
      return progressPosition.getPos();
    } else {
      // Input is not compressed. Report the progress as indicated by number
      // of bytes read from the input
      return start + bytesRead;
    }
  }

  /**
   * Reads the next line from the input and returns true if a line was read.
   * If no more lines are available in this split, false is returned.
   * @param value The text object to fill with the next line
   * @return <code>true</code> if a line was read; <code>false</code> otherwise.
   * @throws IOException If an error occurs while reading from disk.
   */
  protected boolean nextLine(Text value) throws IOException {
    while (getPos() <= end) {
      value.clear();
      int lineLength;
      // Read the first line from stream
      if ((lineLength = lineReader.readLine(value)) <= 0) {
        // Indicates an end of stream
        return false;
      }
      // Append the part read from stream to the part extracted from buffer
      bytesRead += lineLength;

      if (value.getLength() > 1) {
        // Read a non-empty line. Note that end-of-line character is included
        return true;
      }
    }
    // Reached end of file
    return false;
  }

  protected boolean isMatched(Shape shape) {
    // Match with the query
    if (inputQueryRange != null && (shape == null || !shape.isIntersected(inputQueryRange)))
      return false;
    // Check if we need to apply a duplicate avoidance step or not
    if (!cellMBR.isValid() || inputQueryMBR == null)
      return true;
    // Apply reference point duplicate avoidance technique
    Rectangle shapeMBR = shape.getMBR();
    double reference_x = Math.max(inputQueryMBR.x1, shapeMBR.x1);
    double reference_y = Math.max(inputQueryMBR.y1, shapeMBR.y1);
    return cellMBR.contains(reference_x, reference_y);
  }

  /**
   * Reads the next shape from the input and returns true. If no more shapes
   * are left in the split, false is returned. This function first reads a line
   * by calling the method {@link #nextLine(Text)} then parses the returned
   * line by calling {@link Shape#fromText(Text)} on that line. If no stock
   * shape is set, a {@link NullPointerException} is thrown.
   * @param s A mutable shape object to update with the next value
   * @return <code>true</code> if an object was read; <code>false</code> if end-of-file was reached.
   * @throws IOException If an error happens while reading from disk
   */
  protected boolean nextShape(V s) throws IOException {
    do {
      if (!nextLine(tempLine))
        return false;
      s.fromText(tempLine);
    } while (!isMatched(s));
    return true;
  }

  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    value.setSpatialRecordReader(this);
    return value.hasNext();
  }

  @Override
  public Partition getCurrentKey() throws IOException, InterruptedException {
    return cellMBR;
  }

  @Override
  public Iterable<V> getCurrentValue() throws IOException, InterruptedException {
    return value;
  }

  @Override
  public float getProgress() throws IOException, InterruptedException {
    if (start == end) {
      return 0.0f;
    } else {
      return Math.min(1.0f, (directIn.getPos() - start) / (float) (end - start));
    }
  }

  @Override
  public void close() throws IOException {
    try {
      if (lineReader != null) {
        lineReader.close();
      } else if (in != null) {
        in.close();
      }
      lineReader = null;
      in = null;
    } finally {
      if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
      }
    }
  }

  /**
   * An iterator that iterates over all shapes in the input file
   * @author Eldawy
   */
  public static class ShapeIterator<V extends Shape> implements Iterator<V>, Iterable<V> {
    protected V shape;
    protected V nextShape;
    private SpatialRecordReader3<V> srr;

    public ShapeIterator() {
    }

    public void setSpatialRecordReader(SpatialRecordReader3<V> srr) {
      this.srr = srr;
      try {
        if (shape != null)
          nextShape = (V) shape.clone();
        if (nextShape != null && !srr.nextShape(nextShape))
          nextShape = null;
      } catch (IOException e) {
        throw new RuntimeException("Error reading from file", e);
      }
    }

    public void setShape(V shape) {
      this.shape = shape;
      this.nextShape = (V) shape.clone();
      try {
        if (srr != null && !srr.nextShape(nextShape))
          nextShape = null;
      } catch (IOException e) {
        throw new RuntimeException("Error reading from file", e);
      }
    }

    public boolean hasNext() {
      return nextShape != null;
    }

    @Override
    public V next() {
      try {
        if (nextShape == null)
          return null;
        // Swap shape and nextShape and read next
        V temp = shape;
        shape = nextShape;
        nextShape = temp;
        if (!srr.nextShape(nextShape))
          nextShape = null;
        if (srr.inputRecordsCounter != null)
          srr.inputRecordsCounter.increment(1);
        return shape;
      } catch (IOException e) {
        throw new RuntimeException("Error reading from file", e);
      }
    }

    @Override
    public Iterator<V> iterator() {
      return this;
    }

    @Override
    public void remove() {
      throw new RuntimeException("Unsupported method ShapeIterator#remove");
    }
  }
}
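
In a normal job this reader is created for each split by SpatialInputFormat3, but it can also be driven directly, which is handy for quick local testing. The following is a minimal sketch, not part of the file above: the class SpatialRecordReader3Demo is hypothetical, and it assumes the input path names a plain text file of points and that OperationsParams.getShape(conf, "shape") resolves the name "point" to edu.umn.cs.spatialHadoop.core.Point; adjust both to your data and SpatialHadoop version.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import edu.umn.cs.spatialHadoop.core.Point;
import edu.umn.cs.spatialHadoop.indexing.Partition;
import edu.umn.cs.spatialHadoop.mapreduce.SpatialRecordReader3;

public class SpatialRecordReader3Demo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Assumption: the shape name "point" is understood by
    // OperationsParams.getShape(conf, "shape"); otherwise configure the
    // shape the way your SpatialHadoop version expects.
    conf.set("shape", "point");

    Path file = new Path(args[0]);          // e.g. a plain text file of points
    FileSystem fs = file.getFileSystem(conf);
    long length = fs.getFileStatus(file).getLen();

    // One split covering the whole file; a real job would obtain its splits
    // from SpatialInputFormat3 instead of building them by hand.
    FileSplit split = new FileSplit(file, 0, length, new String[0]);

    SpatialRecordReader3<Point> reader = new SpatialRecordReader3<Point>();
    reader.initialize(split, conf);         // the (InputSplit, Configuration) overload
    try {
      while (reader.nextKeyValue()) {
        Partition partition = reader.getCurrentKey();
        for (Point p : reader.getCurrentValue()) {
          System.out.println(partition.filename + "\t" + p);
        }
      }
    } finally {
      reader.close();
    }
  }
}

Note that the reader reuses two shape objects internally (ShapeIterator swaps shape and nextShape on every call to next()), so copy a shape if you need to keep it beyond one loop iteration.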
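
The isMatched() method applies the reference-point duplicate-avoidance technique: a shape that spans several partitions is stored in each of them, so only the partition that contains the reference point, i.e. (max(query.x1, shape.x1), max(query.y1, shape.y1)), reports it, and each shape appears exactly once in the query answer. Below is a standalone sketch of that test; the Rect class and its edge convention (closed on the lower/left edges, open on the upper/right edges, so a point on a shared border belongs to exactly one partition) are illustrative assumptions, not part of SpatialHadoop.

// Standalone illustration of the reference-point test used in isMatched().
public class ReferencePointDemo {
  static class Rect {
    final double x1, y1, x2, y2;   // lower-left (x1,y1) and upper-right (x2,y2)
    Rect(double x1, double y1, double x2, double y2) {
      this.x1 = x1; this.y1 = y1; this.x2 = x2; this.y2 = y2;
    }
    // Assumed convention: closed lower/left edges, open upper/right edges.
    boolean contains(double x, double y) {
      return x >= x1 && x < x2 && y >= y1 && y < y2;
    }
  }

  /** Report the shape from this partition only if it contains the reference point. */
  static boolean reportFromPartition(Rect partition, Rect queryMBR, Rect shapeMBR) {
    double refX = Math.max(queryMBR.x1, shapeMBR.x1);
    double refY = Math.max(queryMBR.y1, shapeMBR.y1);
    return partition.contains(refX, refY);
  }

  public static void main(String[] args) {
    Rect query = new Rect(0, 0, 10, 10);
    Rect shape = new Rect(4, 4, 6, 6);      // overlaps both partitions below
    Rect left  = new Rect(0, 0, 5, 10);
    Rect right = new Rect(5, 0, 10, 10);
    // Only the left partition contains the reference point (4, 4), so the
    // shape is reported exactly once even though both partitions store it.
    System.out.println(reportFromPartition(left,  query, shape));   // true
    System.out.println(reportFromPartition(right, query, shape));   // false
  }
}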