Example usage for org.apache.hadoop.fs FSDataInputStream seek

List of usage examples for org.apache.hadoop.fs FSDataInputStream seek

Introduction

This page collects usage examples for org.apache.hadoop.fs.FSDataInputStream.seek(long).

Prototype

@Override
public void seek(long desired) throws IOException 

Document

Seek to the given offset.
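Before the full examples below, here is a minimal, self-contained sketch of the typical pattern: open a stream with FileSystem.open(), seek() to an absolute byte offset, and read from there. The file path and offsets are illustrative placeholders, not taken from the examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class SeekSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example.dat"); // hypothetical file
        FSDataInputStream in = fs.open(path);
        try {
            in.seek(128);                 // seek() takes an absolute byte offset, not a relative one
            byte[] buffer = new byte[64];
            in.readFully(buffer);         // read 64 bytes starting at offset 128
            System.out.println("position after read: " + in.getPos());
        } finally {
            IOUtils.closeStream(in);
        }
    }
}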

Usage

From source file:diamondmapreduce.NLineRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;

    final Path file = split.getPath();
    Configuration conf = context.getConfiguration();
    this.maxLineLength = conf.getInt("mapreduce.input.linerecordreader.line.maxlength", Integer.MAX_VALUE);
    FileSystem fs = file.getFileSystem(conf);
    start = split.getStart();
    end = start + split.getLength();
    boolean skipFirstLine = false;
    FSDataInputStream filein = fs.open(split.getPath());

    if (start != 0) {
        skipFirstLine = true;
        --start;
        filein.seek(start);
    }
    in = new LineReader(filein, conf);
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:dz.lab.hdfs.SeekReadFile.java

/**
 * @param args
 */
public static void main(String[] args) throws IOException {
    Path fileToRead = new Path("/tmp/quotes.csv");
    // read configuration from core-site.xml available in the classpath (under /resources)
    FileSystem fs = FileSystem.get(new Configuration());

    FSDataInputStream input = null;
    try {
        // start at position 0
        input = fs.open(fileToRead);
        System.out.print("start position=" + input.getPos() + ":");
        IOUtils.copyBytes(input, System.out, 4096, false);

        // seek to position 11
        input.seek(11);
        System.out.print("start position=" + input.getPos() + ":");
        IOUtils.copyBytes(input, System.out, 4096, false);

        // seek back to position 0
        input.seek(0);
        System.out.print("start position=" + input.getPos() + ":");
        IOUtils.copyBytes(input, System.out, 4096, false);
    } finally {
        IOUtils.closeStream(input);
    }
}
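As a side note (not part of the original example), seek() moves the stream's single read position; when only a few scattered byte ranges are needed, FSDataInputStream also offers positioned reads that do not disturb getPos(). A minimal sketch, reusing the input stream from the example above:

    // Positioned read: fetch bytes at offset 11 without moving the stream position.
    byte[] buf = new byte[16];
    int bytesRead = input.read(11, buf, 0, buf.length);
    System.out.println("read " + bytesRead + " bytes; getPos() unchanged: " + input.getPos());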

From source file:edu.ecnu.idse.TrajStore.util.FileUtil.java

License:Open Source License

/**
 * Copies a part of a file from a remote file system (e.g., HDFS) to a local
 * file. Returns a path to a local temporary file.
 *
 * @param conf
 * @param split
 * @return
 * @throws IOException
 */
public static String copyFileSplit(Configuration conf, FileSplit split) throws IOException {
    FileSystem fs = split.getPath().getFileSystem(conf);

    // Special case of a local file. Skip copying the file
    if (fs instanceof LocalFileSystem && split.getStart() == 0)
        return split.getPath().toUri().getPath();

    // Length of the input file. We do not depend on split.getLength() because it is
    // not set by the input format, for performance reasons: setting it in the input
    // format would cost a lot of time because it runs on the client machine, while
    // the record reader runs on the slave nodes in parallel.
    long length = fs.getFileStatus(split.getPath()).getLen();

    FSDataInputStream in = fs.open(split.getPath());
    in.seek(split.getStart());
    ReadableByteChannel rbc = Channels.newChannel(in);

    // Prepare output file for write
    File tempFile = File.createTempFile(split.getPath().getName(), "tmp");
    FileOutputStream out = new FileOutputStream(tempFile);

    out.getChannel().transferFrom(rbc, 0, length);

    rbc.close();
    out.close();
    return tempFile.getAbsolutePath();
}
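A hypothetical call site for the method above (the FileSplit would normally come from the job's InputFormat; the names here are illustrative, not from the original source):

    // Copy the split's byte range to a local temporary file, process it, then clean up.
    String localPath = FileUtil.copyFileSplit(conf, split);
    File localCopy = new File(localPath);
    // ... read localCopy with regular java.io APIs ...
    localCopy.delete();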

From source file:edu.umn.cs.spatialHadoop.hdf.HDFFile.java

License:Open Source License

/**
 * Initializes a new HDF file from an input stream. This stream should not
 * be closed as long as the HDF file is used. Closing this HDFFile will
 * also close the underlying stream.
 * @throws IOException 
 * 
 */
public HDFFile(FSDataInputStream inStream) throws IOException {
    this.inStream = inStream;
    byte[] signature = new byte[4];
    inStream.readFully(signature);
    if (!Arrays.equals(signature, HDFMagicNumber))
        throw new RuntimeException("Not a valid HDF file");
    int blockSize = inStream.readUnsignedShort();
    int nextBlock;
    do {
        // Keep track of the location of the next block
        nextBlock = inStream.readInt();
        this.readBlock(blockSize);
        if (nextBlock != 0) {
            inStream.seek(nextBlock);
            blockSize = inStream.readShort();
        }
    } while (nextBlock > 0);
}
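A sketch of how this constructor might be used; the path is a placeholder, and closing the HDFFile is assumed to close the underlying stream as the comment above states:

    // Open an HDF file through its FSDataInputStream constructor (illustrative only).
    Path hdfPath = new Path("/data/sample.hdf");
    FileSystem fs = hdfPath.getFileSystem(conf);
    FSDataInputStream in = fs.open(hdfPath);
    HDFFile hdfFile = new HDFFile(in);   // the constructor seeks between header blocks
    // ... read data descriptors from hdfFile ...
    hdfFile.close();                     // also closes 'in'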

From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java

License:Open Source License

/**
 * Perform a selection query that retrieves all points in the given range.
 * The range is specified in the two-dimensional array positions.
 * @param in
 * @param query_mbr
 * @param output
 * @return number of matched records
 * @throws IOException
 */
public static int selectionQuery(FSDataInputStream in, Rectangle query_mbr, ResultCollector<PointValue> output)
        throws IOException {
    long treeStartPosition = in.getPos();
    int numOfResults = 0;
    int resolution = in.readInt();
    short fillValue = in.readShort();
    int cardinality = in.readInt();
    long[] timestamps = new long[cardinality];
    for (int i = 0; i < cardinality; i++)
        timestamps[i] = in.readLong();
    Vector<Integer> selectedStarts = new Vector<Integer>();
    Vector<Integer> selectedEnds = new Vector<Integer>();
    StockQuadTree stockQuadTree = getOrCreateStockQuadTree(resolution);
    // Nodes to be searched. Contains node positions in the array of nodes
    Stack<Integer> nodes_2b_searched = new Stack<Integer>();
    nodes_2b_searched.add(0); // Root node (ID=1)
    Rectangle node_mbr = new Rectangle();
    while (!nodes_2b_searched.isEmpty()) {
        int node_pos = nodes_2b_searched.pop();
        stockQuadTree.getNodeMBR(node_pos, node_mbr);
        if (query_mbr.contains(node_mbr)) {
            // Add this node to the selection list and stop this branch
            if (!selectedEnds.isEmpty()
                    && selectedEnds.lastElement() == stockQuadTree.nodesStartPosition[node_pos]) {
                // Merge with an adjacent range
                selectedEnds.set(selectedEnds.size() - 1, stockQuadTree.nodesEndPosition[node_pos]);
            } else {
                // add a new range
                selectedStarts.add(stockQuadTree.nodesStartPosition[node_pos]);
                selectedEnds.add(stockQuadTree.nodesEndPosition[node_pos]);
            }
            numOfResults += stockQuadTree.nodesEndPosition[node_pos]
                    - stockQuadTree.nodesStartPosition[node_pos];
        } else if (query_mbr.intersects(node_mbr)) {
            int first_child_id = stockQuadTree.nodesID[node_pos] * 4 + 0;
            int first_child_pos = Arrays.binarySearch(stockQuadTree.nodesID, first_child_id);
            if (first_child_pos < 0) {
                // No children. Hit a leaf node
                // Scan and add matching points only
                java.awt.Point record_coords = new Point();
                for (int record_pos = stockQuadTree.nodesStartPosition[node_pos]; record_pos < stockQuadTree.nodesEndPosition[node_pos]; record_pos++) {
                    stockQuadTree.getRecordCoords(record_pos, record_coords);
                    if (query_mbr.contains(record_coords)) {
                        // matched a record.
                        if (!selectedEnds.isEmpty() && selectedEnds.lastElement() == record_pos) {
                            // Merge with an adjacent range
                            selectedEnds.set(selectedEnds.size() - 1, record_pos + 1);
                        } else {
                            // Add a new range of unit width
                            selectedStarts.add(record_pos);
                            selectedEnds.add(record_pos + 1);
                        }
                        numOfResults++;
                    }
                }
            } else {
                // Non-leaf node. Add all children to the list of nodes to search
                // Add in reverse order to the stack so that results come in sorted order
                nodes_2b_searched.add(first_child_pos + 3);
                nodes_2b_searched.add(first_child_pos + 2);
                nodes_2b_searched.add(first_child_pos + 1);
                nodes_2b_searched.add(first_child_pos + 0);
            }
        }
    }
    if (output != null) {
        PointValue returnValue = new PointValue();
        long dataStartPosition = treeStartPosition + getValuesStartOffset(cardinality);
        // Return all values in the selected ranges
        for (int iRange = 0; iRange < selectedStarts.size(); iRange++) {
            int treeStart = selectedStarts.get(iRange);
            int treeEnd = selectedEnds.get(iRange);
            long startPosition = dataStartPosition + selectedStarts.get(iRange) * cardinality * 2;
            in.seek(startPosition);
            for (int treePos = treeStart; treePos < treeEnd; treePos++) {
                // Retrieve the coords for the point at treePos
                stockQuadTree.getRecordCoords(treePos, returnValue);
                // Read all entries at current position
                for (int iValue = 0; iValue < cardinality; iValue++) {
                    short value = in.readShort();
                    if (value != fillValue) {
                        returnValue.value = value;
                        returnValue.timestamp = timestamps[iValue];
                        output.collect(returnValue);
                    }
                }
            }
        }
    }
    return numOfResults;
}
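An illustrative invocation of the method above: seek the stream to the byte offset where a serialized tree begins, then run the query. Rectangle and ResultCollector are SpatialHadoop types; the stream, offset, and query window below are placeholders:

    // Position the stream at the start of a serialized quad tree, then query it.
    FSDataInputStream in = fs.open(aggregateFilePath);
    in.seek(treeOffset);   // selectionQuery() reads relative to the current position
    int matches = StockQuadTree.selectionQuery(in, new Rectangle(10, 10, 20, 20),
            new ResultCollector<PointValue>() {
                @Override
                public void collect(PointValue p) {
                    System.out.println("value=" + p.value + " timestamp=" + p.timestamp);
                }
            });
    System.out.println(matches + " records matched");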

From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java

License:Open Source License

/**
 * Perform a selection query that retrieves all points in the given range.
 * The range is specified in the two-dimensional array positions. 
 * @param in
 * @param query_mbr
 * @return
 * @throws IOException
 */
public static Node aggregateQuery(FSDataInputStream in, Rectangle query_mbr) throws IOException {
    long treeStartPosition = in.getPos();
    Node result = new Node();
    int numOfSelectedRecords = 0;
    int resolution = in.readInt();
    short fillValue = in.readShort();
    int cardinality = in.readInt();
    final Vector<Integer> selectedNodesPos = new Vector<Integer>();
    final Vector<Integer> selectedStarts = new Vector<Integer>();
    final Vector<Integer> selectedEnds = new Vector<Integer>();
    StockQuadTree stockQuadTree = getOrCreateStockQuadTree(resolution);
    // Nodes to be searched. Contains node positions in the array of nodes
    Stack<Integer> nodes_2b_searched = new Stack<Integer>();
    nodes_2b_searched.add(0); // Root node (ID=1)
    Rectangle node_mbr = new Rectangle();
    while (!nodes_2b_searched.isEmpty()) {
        int node_pos = nodes_2b_searched.pop();
        stockQuadTree.getNodeMBR(node_pos, node_mbr);
        if (query_mbr.contains(node_mbr)) {
            // Add this node to the selection list and stop this branch
            selectedNodesPos.add(node_pos);
        } else if (query_mbr.intersects(node_mbr)) {
            int first_child_id = stockQuadTree.nodesID[node_pos] * 4 + 0;
            int first_child_pos = Arrays.binarySearch(stockQuadTree.nodesID, first_child_id);
            if (first_child_pos < 0) {
                // No children. Hit a leaf node
                // Scan and add matching points only
                java.awt.Point record_coords = new Point();
                for (int record_pos = stockQuadTree.nodesStartPosition[node_pos]; record_pos < stockQuadTree.nodesEndPosition[node_pos]; record_pos++) {
                    stockQuadTree.getRecordCoords(record_pos, record_coords);
                    if (query_mbr.contains(record_coords)) {
                        // matched a record.
                        if (!selectedEnds.isEmpty() && selectedEnds.lastElement() == record_pos) {
                            // Merge with an adjacent range
                            selectedEnds.set(selectedEnds.size() - 1, record_pos + 1);
                        } else {
                            // Add a new range of unit width
                            selectedStarts.add(record_pos);
                            selectedEnds.add(record_pos + 1);
                        }
                        numOfSelectedRecords++;
                    }
                }
            } else {
                // Non-leaf node. Add all children to the list of nodes to search
                // Add in reverse order to the stack so that results come in sorted order
                nodes_2b_searched.add(first_child_pos + 3);
                nodes_2b_searched.add(first_child_pos + 2);
                nodes_2b_searched.add(first_child_pos + 1);
                nodes_2b_searched.add(first_child_pos + 0);
            }
        }
    }
    // Result 1: Accumulate all values
    // Sort disk offsets to eliminate backward seeks
    if (!selectedStarts.isEmpty()) {
        LOG.debug("Aggregate query selected " + selectedNodesPos.size() + " nodes and " + numOfSelectedRecords
                + " records");

        final IndexedSortable sortable = new IndexedSortable() {
            @Override
            public int compare(int i, int j) {
                return selectedStarts.get(i) - selectedStarts.get(j);
            }

            @Override
            public void swap(int i, int j) {
                int temp = selectedStarts.get(i);
                selectedStarts.set(i, selectedStarts.get(j));
                selectedStarts.set(j, temp);

                temp = selectedEnds.get(i);
                selectedEnds.set(i, selectedEnds.get(j));
                selectedEnds.set(j, temp);
            }
        };
        new QuickSort().sort(sortable, 0, selectedStarts.size());

        long dataStartPosition = getValuesStartOffset(cardinality);
        Point resultCoords = new Point();
        // Return all values in the selected ranges
        for (int iRange = 0; iRange < selectedStarts.size(); iRange++) {
            int treeStart = selectedStarts.get(iRange);
            int treeEnd = selectedEnds.get(iRange);
            long startPosition = dataStartPosition + selectedStarts.get(iRange) * cardinality * 2;
            in.seek(startPosition);
            for (int treePos = treeStart; treePos < treeEnd; treePos++) {
                // Retrieve the coords for the point at treePos
                stockQuadTree.getRecordCoords(treePos, resultCoords);
                // Read all entries at current position
                for (int iValue = 0; iValue < cardinality; iValue++) {
                    short value = in.readShort();
                    if (value != fillValue)
                        result.accumulate(value);
                }
            }
        }

    }

    // Result 2: Accumulate all nodes
    if (!selectedNodesPos.isEmpty()) {
        long nodesStartPosition = treeStartPosition + getNodesStartOffset(resolution, cardinality);
        // Sort node positions to eliminate backward seeks
        IndexedSortable nodeSortable = new IndexedSortable() {
            @Override
            public int compare(int i, int j) {
                return selectedNodesPos.get(i) - selectedNodesPos.get(j);
            }

            @Override
            public void swap(int i, int j) {
                int temp = selectedNodesPos.get(i);
                selectedNodesPos.set(i, selectedNodesPos.get(j));
                selectedNodesPos.set(j, temp);
            }
        };
        new QuickSort().sort(nodeSortable, 0, selectedNodesPos.size());

        Node selectedNode = new Node();
        for (int node_pos : selectedNodesPos) {
            long nodePosition = nodesStartPosition + node_pos * NodeSize;
            in.seek(nodePosition);
            selectedNode.readFields(in);
            result.accumulate(selectedNode);
        }
    }
    return result;
}
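And a similar illustrative call for the aggregate variant, assuming the same placeholder stream and offset as in the previous sketch:

    // Aggregate (rather than enumerate) all values inside the query window.
    in.seek(treeOffset);   // aggregateQuery() also reads relative to the current position
    Node aggregate = StockQuadTree.aggregateQuery(in, new Rectangle(10, 10, 20, 20));
    System.out.println("aggregate result: " + aggregate);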

From source file:edu.umn.cs.spatialHadoop.util.FileUtil.java

License:Open Source License

/**
 * Copies a part of a file from a remote file system (e.g., HDFS) to a local
 * file. Returns a path to a local temporary file.
 *
 * @param conf
 * @param split
 * @return
 * @throws IOException
 */
public static String copyFileSplit(Configuration conf, FileSplit split) throws IOException {
    FileSystem fs = split.getPath().getFileSystem(conf);

    // Special case of a local file. Skip copying the file
    if (fs instanceof LocalFileSystem && split.getStart() == 0)
        return split.getPath().toUri().getPath();

    File destFile = File.createTempFile(split.getPath().getName(), "tmp");
    // Special handling for HTTP files for more efficiency
    /*if (fs instanceof HTTPFileSystem && split.getStart() == 0) {
      URL website = split.getPath().toUri().toURL();
      ReadableByteChannel rbc = Channels.newChannel(website.openStream());
      FileOutputStream fos = new FileOutputStream(destFile);
      fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
      fos.close();
      return destFile.getAbsolutePath();
    }*/

    // Length of the input file. We do not depend on split.getLength() because it is
    // not set by the input format, for performance reasons: setting it in the input
    // format would cost a lot of time because it runs on the client machine, while
    // the record reader runs on the slave nodes in parallel.
    long length = fs.getFileStatus(split.getPath()).getLen();

    FSDataInputStream in = fs.open(split.getPath());
    in.seek(split.getStart());
    ReadableByteChannel rbc = Channels.newChannel(in);

    // Prepare output file for write
    FileOutputStream out = new FileOutputStream(destFile);

    out.getChannel().transferFrom(rbc, 0, length);

    in.close();
    out.close();
    return destFile.getAbsolutePath();
}

From source file:fi.tkk.ics.hadoop.bam.BAMRecordReader.java

License:Open Source License

@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // This method should only be called once (see Hadoop API). However,
    // there seems to be disagreement between implementations that call
    // initialize() and Hadoop-BAM's own code that relies on
    // {@link BAMInputFormat} to call initialize() when the reader is
    // created. Therefore we add this check for the time being. 
    if (isInitialized)
        close();
    isInitialized = true;

    final Configuration conf = ContextUtil.getConfiguration(ctx);

    final FileVirtualSplit split = (FileVirtualSplit) spl;
    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(conf);

    this.stringency = SAMHeaderReader.getValidationStringency(conf);

    final FSDataInputStream in = fs.open(file);

    codec = new BAMRecordCodec(SAMHeaderReader.readSAMHeaderFrom(in, conf));

    in.seek(0);
    bci = new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

    final long virtualStart = split.getStartVirtualOffset();

    fileStart = virtualStart >>> 16;
    virtualEnd = split.getEndVirtualOffset();

    bci.seek(virtualStart);
    codec.setInputStream(bci);

    if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
        final long recordStart = virtualStart & 0xffff;
        System.err.println(
                "XXX inizialized BAMRecordReader byte offset: " + fileStart + " record offset: " + recordStart);
    }
}

From source file:fi.tkk.ics.hadoop.bam.BCFRecordReader.java

License:Open Source License

@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    isBGZF = spl instanceof FileVirtualSplit;
    if (isBGZF) {
        final FileVirtualSplit split = (FileVirtualSplit) spl;

        final Path file = split.getPath();
        final FileSystem fs = file.getFileSystem(ContextUtil.getConfiguration(ctx));

        final FSDataInputStream inFile = fs.open(file);

        bci = new BlockCompressedInputStream(inFile);
        in = new PositionalBufferedStream(bci);
        initContigDict();

        inFile.seek(0);
        bci = new BlockCompressedInputStream(
                new WrapSeekable<FSDataInputStream>(inFile, fs.getFileStatus(file).getLen(), file));

        final long virtualStart = split.getStartVirtualOffset(), virtualEnd = split.getEndVirtualOffset();

        this.fileStart = virtualStart >>> 16;
        this.length = (virtualEnd >>> 16) - fileStart;

        bci.seek(virtualStart);

        // Since PositionalBufferedStream does its own buffering, we have to
        // prevent it from going too far by using a BGZFLimitingStream. It
        // also allows nextKeyValue() to simply check for EOF instead of
        // looking at virtualEnd.
        in = new PositionalBufferedStream(new BGZFLimitingStream(bci, virtualEnd));
    } else {
        final FileSplit split = (FileSplit) spl;

        this.fileStart = split.getStart();
        this.length = split.getLength();

        final Path file = split.getPath();

        in = new PositionalBufferedStream(file.getFileSystem(ContextUtil.getConfiguration(ctx)).open(file));

        initContigDict();

        in.skip(fileStart - in.getPosition());
    }
}

From source file:fi.tkk.ics.hadoop.bam.VCFRecordReader.java

License:Open Source License

@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    final FileSplit split = (FileSplit) spl;

    this.length = split.getLength();

    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(ContextUtil.getConfiguration(ctx));

    final FSDataInputStream ins = fs.open(file);

    reader = new AsciiLineReader(ins);
    it = new AsciiLineReaderIterator(reader);

    final Object h = codec.readHeader(it);
    if (!(h instanceof FeatureCodecHeader) || !(((FeatureCodecHeader) h).getHeaderValue() instanceof VCFHeader))
        throw new IOException("No VCF header found in " + file);

    final VCFHeader header = (VCFHeader) ((FeatureCodecHeader) h).getHeaderValue();

    contigDict.clear();
    int i = 0;
    for (final VCFContigHeaderLine contig : header.getContigLines())
        contigDict.put(contig.getID(), i++);

    // Note that we create a new reader here, so reader.getPosition() is 0 at
    // start regardless of the value of start. Hence getProgress() and
    // nextKeyValue() don't need to use start at all.
    final long start = split.getStart();
    if (start != 0) {
        ins.seek(start - 1);
        reader = new AsciiLineReader(ins);
        reader.readLine(); // NOTE: skip incomplete line!
        it = new AsciiLineReaderIterator(reader);
    } else { // it seems that newer versions of the reader peek ahead one more line from the input
        long current_pos = it.getPosition();
        ins.seek(0);
        reader = new AsciiLineReader(ins);
        it = new AsciiLineReaderIterator(reader);
        while (it.hasNext() && it.getPosition() <= current_pos && it.peek().startsWith("#")) {
            it.next();
        }
        if (!it.hasNext() || it.getPosition() > current_pos)
            throw new IOException("Empty VCF file " + file);
    }
}