Example usage for org.apache.hadoop.fs FSDataInputStream seek

List of usage examples for org.apache.hadoop.fs FSDataInputStream seek

Introduction

On this page you can find example usage for org.apache.hadoop.fs.FSDataInputStream.seek.

Prototype

@Override
public void seek(long desired) throws IOException 

Source Link

Document

Seek to the given offset.
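
Below is a minimal, self-contained sketch of calling seek on a stream opened from a FileSystem. The path /tmp/example.dat and the class name SeekExample are hypothetical placeholders; substitute any readable file. After seek returns, getPos() reports the new offset and the next read starts there.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekExample {
    public static void main(String[] args) throws IOException {
        Path path = new Path("/tmp/example.dat"); // hypothetical file; use one that exists
        FileSystem fs = path.getFileSystem(new Configuration());
        try (FSDataInputStream in = fs.open(path)) {
            in.seek(128);                      // jump to absolute byte offset 128
            System.out.println(in.getPos());   // prints 128
            int b = in.read();                 // the next read starts at the new offset
            System.out.println(b);
        }
    }
}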

Usage

From source file:org.apache.mrql.JsonSplitter.java

License:Apache License

JsonSplitter(String[] tags, FSDataInputStream fsin, long start, long end, DataOutputBuffer buffer) {
    in_memory = false;
    this.tags = tags;
    this.fsin = fsin;
    this.end = end;
    this.buffer = buffer;
    try {
        fsin.seek(start);
        this.start = (start == 0) ? start : sync(start);
        fsin.seek(this.start);
        scanner = new JSONLex(fsin);
    } catch (IOException e) {
        System.err.println("*** Cannot parse the data split: " + fsin);
    }
}

From source file:org.apache.mrql.XMLSplitter.java

License:Apache License

XMLSplitter(String[] tags, FSDataInputStream fsin, long start, long end, DataOutputBuffer buffer) {
    in_memory = false;
    this.tags = tags;
    this.fsin = fsin;
    this.start = start;
    this.end = end;
    this.buffer = buffer;
    try {
        fsin.seek(start);
    } catch (IOException e) {
        System.err.println("*** Cannot parse the data split: " + fsin);
    }
}

From source file:org.apache.orc.impl.RecordReaderUtils.java

License:Apache License

/**
 * Read the list of ranges from the file.
 * @param file the file to read
 * @param base the base of the stripe
 * @param range the disk ranges within the stripe to read
 * @return the bytes read for each disk range, which is the same length as
 *    ranges
 * @throws IOException
 */
static DiskRangeList readDiskRanges(FSDataInputStream file, HadoopShims.ZeroCopyReaderShim zcr, long base,
        DiskRangeList range, boolean doForceDirect) throws IOException {
    if (range == null)
        return null;
    DiskRangeList prev = range.prev;
    if (prev == null) {
        prev = new MutateHelper(range);
    }
    while (range != null) {
        if (range.hasData()) {
            range = range.next;
            continue;
        }
        int len = (int) (range.getEnd() - range.getOffset());
        long off = range.getOffset();
        if (zcr != null) {
            file.seek(base + off);
            boolean hasReplaced = false;
            while (len > 0) {
                ByteBuffer partial = zcr.readBuffer(len, false);
                BufferChunk bc = new BufferChunk(partial, off);
                if (!hasReplaced) {
                    range.replaceSelfWith(bc);
                    hasReplaced = true;
                } else {
                    range.insertAfter(bc);
                }
                range = bc;
                int read = partial.remaining();
                len -= read;
                off += read;
            }
        } else {
            // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless.
            byte[] buffer = new byte[len];
            file.readFully((base + off), buffer, 0, buffer.length);
            ByteBuffer bb = null;
            if (doForceDirect) {
                bb = ByteBuffer.allocateDirect(len);
                bb.put(buffer);
                bb.position(0);
                bb.limit(len);
            } else {
                bb = ByteBuffer.wrap(buffer);
            }
            range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset()));
        }
        range = range.next;
    }
    return prev.next;
}

From source file:org.apache.parquet.hadoop.ColumnChunkIncReadStore.java

License:Apache License

public void addColumn(ColumnDescriptor descriptor, ColumnChunkMetaData metaData) throws IOException {
    FSDataInputStream in = fs.open(path);
    streams.add(in);
    in.seek(metaData.getStartingPos());
    ColumnChunkIncPageReader reader = new ColumnChunkIncPageReader(metaData, descriptor, in);

    columns.put(descriptor, reader);
}

From source file:org.apache.parquet.hadoop.ParquetFileReader.java

License:Apache License

/**
 * Reads the meta data block in the footer of the file
 * @param configuration
 * @param file the parquet File
 * @param filter the filter to apply to row groups
 * @return the metadata blocks in the footer
 * @throws IOException if an error occurs while reading the file
 */
public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file,
        MetadataFilter filter) throws IOException {
    FileSystem fileSystem = file.getPath().getFileSystem(configuration);
    FSDataInputStream f = fileSystem.open(file.getPath());
    try {
        long l = file.getLen();
        if (Log.DEBUG)
            LOG.debug("File length " + l);
        int FOOTER_LENGTH_SIZE = 4;
        if (l < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
            throw new RuntimeException(file.getPath() + " is not a Parquet file (too small)");
        }
        long footerLengthIndex = l - FOOTER_LENGTH_SIZE - MAGIC.length;
        if (Log.DEBUG)
            LOG.debug("reading footer index at " + footerLengthIndex);

        f.seek(footerLengthIndex);
        int footerLength = readIntLittleEndian(f);
        byte[] magic = new byte[MAGIC.length];
        f.readFully(magic);
        if (!Arrays.equals(MAGIC, magic)) {
            throw new RuntimeException(file.getPath() + " is not a Parquet file. expected magic number at tail "
                    + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
        }
        long footerIndex = footerLengthIndex - footerLength;
        if (Log.DEBUG)
            LOG.debug("read footer length: " + footerLength + ", footer index: " + footerIndex);
        if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
            throw new RuntimeException("corrupted file: the footer index is not within the file");
        }
        f.seek(footerIndex);
        return converter.readParquetMetadata(f, filter);
    } finally {
        f.close();
    }
}

From source file:org.apache.parquet.hadoop.TestParquetFileWriter.java

License:Apache License

@Test
public void testAlignmentWithPadding() throws Exception {
    File testFile = temp.newFile();

    Path path = new Path(testFile.toURI());
    Configuration conf = new Configuration();

    // uses the test constructor
    ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 120, 60);

    w.start();
    w.startBlock(3);
    w.startColumn(C1, 5, CODEC);
    long c1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();

    long firstRowGroupEnds = w.getPos(); // should be 109

    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();

    long secondRowGroupEnds = w.getPos();

    w.end(new HashMap<String, String>());

    FileSystem fs = path.getFileSystem(conf);
    long fileLen = fs.getFileStatus(path).getLen();

    FSDataInputStream data = fs.open(path);
    data.seek(fileLen - 8); // 4-byte offset + "PAR1"
    long footerLen = BytesUtils.readIntLittleEndian(data);
    long startFooter = fileLen - footerLen - 8;

    assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);

    ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
    assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
    assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
    assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
    assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
    HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
    expectedEncoding.add(PLAIN);
    expectedEncoding.add(BIT_PACKED);
    assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());

    // verify block starting positions with padding
    assertEquals("First row group should start after magic", 4, readFooter.getBlocks().get(0).getStartingPos());
    assertTrue("First row group should end before the block size (120)", firstRowGroupEnds < 120);
    assertEquals("Second row group should start at the block size", 120,
            readFooter.getBlocks().get(1).getStartingPos());

    { // read first block of col #1
        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
                Arrays.asList(readFooter.getBlocks().get(0)),
                Arrays.asList(SCHEMA.getColumnDescription(PATH1)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        assertNull(r.readNextRowGroup());
    }

    { // read all blocks of col #1 and #2

        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
                readFooter.getBlocks(),
                Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)));

        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));

        pages = r.readNextRowGroup();
        assertEquals(4, pages.getRowCount());

        validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
        validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));

        assertNull(r.readNextRowGroup());
    }
    PrintFooter.main(new String[] { path.toString() });
}

From source file:org.apache.parquet.hadoop.TestParquetFileWriter.java

License:Apache License

@Test
public void testAlignmentWithNoPaddingNeeded() throws Exception {
    File testFile = temp.newFile();

    Path path = new Path(testFile.toURI());
    Configuration conf = new Configuration();

    // uses the test constructor
    ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 100, 50);

    w.start();
    w.startBlock(3);
    w.startColumn(C1, 5, CODEC);
    long c1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();

    long firstRowGroupEnds = w.getPos(); // should be 109

    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();

    long secondRowGroupEnds = w.getPos();

    w.end(new HashMap<String, String>());

    FileSystem fs = path.getFileSystem(conf);
    long fileLen = fs.getFileStatus(path).getLen();

    FSDataInputStream data = fs.open(path);
    data.seek(fileLen - 8); // 4-byte offset + "PAR1"
    long footerLen = BytesUtils.readIntLittleEndian(data);
    long startFooter = fileLen - footerLen - 8;

    assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);

    ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
    assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
    assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
    assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
    assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
    HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
    expectedEncoding.add(PLAIN);
    expectedEncoding.add(BIT_PACKED);
    assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());

    // verify block starting positions without padding
    assertEquals("First row group should start after magic", 4, readFooter.getBlocks().get(0).getStartingPos());
    assertTrue("First row group should end after the block size (100)", firstRowGroupEnds > 100);
    assertEquals("Second row group should start after no padding", 109,
            readFooter.getBlocks().get(1).getStartingPos());

    { // read first block of col #1
        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
                Arrays.asList(readFooter.getBlocks().get(0)),
                Arrays.asList(SCHEMA.getColumnDescription(PATH1)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        assertNull(r.readNextRowGroup());
    }

    { // read all blocks of col #1 and #2

        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
                readFooter.getBlocks(),
                Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)));

        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));

        pages = r.readNextRowGroup();
        assertEquals(4, pages.getRowCount());

        validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
        validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));

        assertNull(r.readNextRowGroup());
    }
    PrintFooter.main(new String[] { path.toString() });
}

From source file:org.apache.parquet.hadoop.VecParquetReader.java

License:Apache License

public static byte[] readFooterAsBytes(Vec vec) {
    FSDataInputStream f = null;
    try {
        f = new FSDataInputStream(new VecDataInputStream(vec));
        final int FOOTER_LENGTH_SIZE = 4;
        if (vec.length() < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
            throw new RuntimeException("Vec doesn't represent a Parquet data (too short)");
        }
        long footerLengthIndex = vec.length() - FOOTER_LENGTH_SIZE - MAGIC.length;
        f.seek(footerLengthIndex);
        int footerLength = readIntLittleEndian(f);
        byte[] magic = new byte[MAGIC.length];
        f.readFully(magic);
        if (!Arrays.equals(MAGIC, magic)) {
            throw new RuntimeException("Vec is not a Parquet file. expected magic number at tail "
                    + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
        }
        long footerIndex = footerLengthIndex - footerLength;
        if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
            throw new RuntimeException("corrupted file: the footer index is not within the Vec");
        }
        f.seek(footerIndex);
        byte[] metadataBytes = new byte[footerLength];
        f.readFully(metadataBytes);
        return metadataBytes;
    } catch (IOException e) {
        throw new RuntimeException("Failed to read Parquet metadata", e);
    } finally {
        try {
            if (f != null)
                f.close();
        } catch (Exception e) {
            Log.warn("Failed to close Vec data input stream", e);
        }
    }
}

From source file:org.apache.pig.impl.io.BinStorageRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (start != 0) {
        fileIn.seek(start);
    }
    in = new BufferedPositionedInputStream(fileIn, start);
    inData = new DataInputStream(in);
}

From source file:org.apache.slider.test.ContractTestUtils.java

License:Apache License

/**
 * Verify that the read at a specific offset in a stream
 * matches that expected
 * @param stm stream
 * @param fileContents original file contents
 * @param seekOff seek offset
 * @param toRead number of bytes to read
 * @throws IOException IO problems
 */
public static void verifyRead(FSDataInputStream stm, byte[] fileContents, int seekOff, int toRead)
        throws IOException {
    byte[] out = new byte[toRead];
    stm.seek(seekOff);
    stm.readFully(out);
    byte[] expected = Arrays.copyOfRange(fileContents, seekOff, seekOff + toRead);
    compareByteArrays(expected, out, toRead);
}