List of usage examples for org.apache.hadoop.fs.FSDataInputStream.seek
@Override public void seek(long desired) throws IOException
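seek(long desired) positions the stream at an absolute byte offset from the start of the file, so the next read begins there and getPos() reports the new position. Before the project-specific examples below, a minimal sketch of the basic open-seek-read pattern (the method name and arguments are ours, not from any of the sources that follow):

static byte[] readAtOffset(Configuration conf, Path file, long offset, int len) throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    try (FSDataInputStream in = fs.open(file)) {
        in.seek(offset);              // absolute offset, not relative to the current position
        byte[] buf = new byte[len];
        in.readFully(buf);            // throws EOFException if fewer than len bytes remain
        return buf;                   // in.getPos() would now be offset + len
    }
}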
From source file: org.apache.mrql.JsonSplitter.java
License: Apache License
JsonSplitter(String[] tags, FSDataInputStream fsin, long start, long end, DataOutputBuffer buffer) {
    in_memory = false;
    this.tags = tags;
    this.fsin = fsin;
    this.end = end;
    this.buffer = buffer;
    try {
        fsin.seek(start);
        this.start = (start == 0) ? start : sync(start);
        fsin.seek(this.start);
        scanner = new JSONLex(fsin);
    } catch (IOException e) {
        System.err.println("*** Cannot parse the data split: " + fsin);
    }
}
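The sync(start) call above (its body is not shown in this snippet) scans forward from the seek target to the next record boundary, so a split that begins mid-record starts parsing at the first whole record. A hypothetical helper (ours, not MRQL's sync) illustrating the same idea for newline-delimited data:

static long syncToNextRecord(FSDataInputStream in, long start) throws IOException {
    in.seek(start);
    int b;
    while ((b = in.read()) != -1 && b != '\n') {
        // discard the tail of the record that began in the previous split
    }
    return in.getPos(); // offset of the first whole record in this split
}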
From source file: org.apache.mrql.XMLSplitter.java
License: Apache License
XMLSplitter(String[] tags, FSDataInputStream fsin, long start, long end, DataOutputBuffer buffer) {
    in_memory = false;
    this.tags = tags;
    this.fsin = fsin;
    this.start = start;
    this.end = end;
    this.buffer = buffer;
    try {
        fsin.seek(start);
    } catch (IOException e) {
        System.err.println("*** Cannot parse the data split: " + fsin);
    }
}
From source file: org.apache.orc.impl.RecordReaderUtils.java
License: Apache License
/**
 * Read the list of ranges from the file.
 * @param file the file to read
 * @param base the base of the stripe
 * @param range the disk ranges within the stripe to read
 * @return the bytes read for each disk range, which is the same length as ranges
 * @throws IOException
 */
static DiskRangeList readDiskRanges(FSDataInputStream file, HadoopShims.ZeroCopyReaderShim zcr, long base,
        DiskRangeList range, boolean doForceDirect) throws IOException {
    if (range == null)
        return null;
    DiskRangeList prev = range.prev;
    if (prev == null) {
        prev = new MutateHelper(range);
    }
    while (range != null) {
        if (range.hasData()) {
            range = range.next;
            continue;
        }
        int len = (int) (range.getEnd() - range.getOffset());
        long off = range.getOffset();
        if (zcr != null) {
            file.seek(base + off);
            boolean hasReplaced = false;
            while (len > 0) {
                ByteBuffer partial = zcr.readBuffer(len, false);
                BufferChunk bc = new BufferChunk(partial, off);
                if (!hasReplaced) {
                    range.replaceSelfWith(bc);
                    hasReplaced = true;
                } else {
                    range.insertAfter(bc);
                }
                range = bc;
                int read = partial.remaining();
                len -= read;
                off += read;
            }
        } else {
            // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless.
            byte[] buffer = new byte[len];
            file.readFully((base + off), buffer, 0, buffer.length);
            ByteBuffer bb = null;
            if (doForceDirect) {
                bb = ByteBuffer.allocateDirect(len);
                bb.put(buffer);
                bb.position(0);
                bb.limit(len);
            } else {
                bb = ByteBuffer.wrap(buffer);
            }
            range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset()));
        }
        range = range.next;
    }
    return prev.next;
}
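Note the two read styles above: the zero-copy branch uses seek() followed by sequential reads, while the fallback uses the positional readFully(position, buffer, offset, length) from PositionedReadable, which reads at an absolute offset without moving the stream's current position. A small sketch of the contrast (the method name is ours):

static void readBothWays(FSDataInputStream in, long offset, byte[] buf) throws IOException {
    in.seek(offset);                          // stateful: repositions the stream
    in.readFully(buf);                        // position advances to offset + buf.length

    in.readFully(offset, buf, 0, buf.length); // positional: absolute offset,
                                              // current position is left untouched
}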
From source file: org.apache.parquet.hadoop.ColumnChunkIncReadStore.java
License: Apache License
public void addColumn(ColumnDescriptor descriptor, ColumnChunkMetaData metaData) throws IOException {
    FSDataInputStream in = fs.open(path);
    streams.add(in);
    in.seek(metaData.getStartingPos());
    ColumnChunkIncPageReader reader = new ColumnChunkIncPageReader(metaData, descriptor, in);
    columns.put(descriptor, reader);
}
From source file: org.apache.parquet.hadoop.ParquetFileReader.java
License: Apache License
/**
 * Reads the meta data block in the footer of the file
 * @param configuration
 * @param file the parquet File
 * @param filter the filter to apply to row groups
 * @return the metadata blocks in the footer
 * @throws IOException if an error occurs while reading the file
 */
public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file,
        MetadataFilter filter) throws IOException {
    FileSystem fileSystem = file.getPath().getFileSystem(configuration);
    FSDataInputStream f = fileSystem.open(file.getPath());
    try {
        long l = file.getLen();
        if (Log.DEBUG)
            LOG.debug("File length " + l);
        int FOOTER_LENGTH_SIZE = 4;
        if (l < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
            throw new RuntimeException(file.getPath() + " is not a Parquet file (too small)");
        }
        long footerLengthIndex = l - FOOTER_LENGTH_SIZE - MAGIC.length;
        if (Log.DEBUG)
            LOG.debug("reading footer index at " + footerLengthIndex);
        f.seek(footerLengthIndex);
        int footerLength = readIntLittleEndian(f);
        byte[] magic = new byte[MAGIC.length];
        f.readFully(magic);
        if (!Arrays.equals(MAGIC, magic)) {
            throw new RuntimeException(file.getPath() + " is not a Parquet file. expected magic number at tail "
                    + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
        }
        long footerIndex = footerLengthIndex - footerLength;
        if (Log.DEBUG)
            LOG.debug("read footer length: " + footerLength + ", footer index: " + footerIndex);
        if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
            throw new RuntimeException("corrupted file: the footer index is not within the file");
        }
        f.seek(footerIndex);
        return converter.readParquetMetadata(f, filter);
    } finally {
        f.close();
    }
}
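The two seeks above follow the Parquet tail layout: file data, then the footer bytes, then a 4-byte little-endian footer length, then the 4-byte magic "PAR1". A minimal sketch that locates the footer's first byte (the helper name is ours; the real code reads the length via readIntLittleEndian):

static long footerStart(FSDataInputStream f, long fileLen) throws IOException {
    final int MAGIC_LEN = 4;          // "PAR1"
    final int FOOTER_LENGTH_SIZE = 4; // 4-byte little-endian int
    long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC_LEN;
    f.seek(footerLengthIndex);
    int b0 = f.read(), b1 = f.read(), b2 = f.read(), b3 = f.read();
    if ((b0 | b1 | b2 | b3) < 0)
        throw new EOFException("truncated footer length");
    int footerLength = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
    return footerLengthIndex - footerLength; // offset of the footer's first byte
}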
From source file: org.apache.parquet.hadoop.TestParquetFileWriter.java
License: Apache License
@Test
public void testAlignmentWithPadding() throws Exception {
    File testFile = temp.newFile();
    Path path = new Path(testFile.toURI());
    Configuration conf = new Configuration();
    // uses the test constructor
    ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 120, 60);
    w.start();
    w.startBlock(3);
    w.startColumn(C1, 5, CODEC);
    long c1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();
    long firstRowGroupEnds = w.getPos(); // should be 109
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    long secondRowGroupEnds = w.getPos();
    w.end(new HashMap<String, String>());

    FileSystem fs = path.getFileSystem(conf);
    long fileLen = fs.getFileStatus(path).getLen();
    FSDataInputStream data = fs.open(path);
    data.seek(fileLen - 8); // 4-byte offset + "PAR1"
    long footerLen = BytesUtils.readIntLittleEndian(data);
    long startFooter = fileLen - footerLen - 8;
    assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);

    ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
    assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
    assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
    assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
    assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
    HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
    expectedEncoding.add(PLAIN);
    expectedEncoding.add(BIT_PACKED);
    assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());

    // verify block starting positions with padding
    assertEquals("First row group should start after magic", 4,
            readFooter.getBlocks().get(0).getStartingPos());
    assertTrue("First row group should end before the block size (120)", firstRowGroupEnds < 120);
    assertEquals("Second row group should start at the block size", 120,
            readFooter.getBlocks().get(1).getStartingPos());

    { // read first block of col #1
        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
                Arrays.asList(readFooter.getBlocks().get(0)),
                Arrays.asList(SCHEMA.getColumnDescription(PATH1)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        assertNull(r.readNextRowGroup());
    }

    { // read all blocks of col #1 and #2
        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
                readFooter.getBlocks(),
                Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));
        pages = r.readNextRowGroup();
        assertEquals(4, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
        validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
        assertNull(r.readNextRowGroup());
    }

    PrintFooter.main(new String[] { path.toString() });
}
From source file: org.apache.parquet.hadoop.TestParquetFileWriter.java
License: Apache License
@Test
public void testAlignmentWithNoPaddingNeeded() throws Exception {
    File testFile = temp.newFile();
    Path path = new Path(testFile.toURI());
    Configuration conf = new Configuration();
    // uses the test constructor
    ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 100, 50);
    w.start();
    w.startBlock(3);
    w.startColumn(C1, 5, CODEC);
    long c1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();
    long firstRowGroupEnds = w.getPos(); // should be 109
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    long secondRowGroupEnds = w.getPos();
    w.end(new HashMap<String, String>());

    FileSystem fs = path.getFileSystem(conf);
    long fileLen = fs.getFileStatus(path).getLen();
    FSDataInputStream data = fs.open(path);
    data.seek(fileLen - 8); // 4-byte offset + "PAR1"
    long footerLen = BytesUtils.readIntLittleEndian(data);
    long startFooter = fileLen - footerLen - 8;
    assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);

    ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
    assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
    assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
    assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
    assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
    HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
    expectedEncoding.add(PLAIN);
    expectedEncoding.add(BIT_PACKED);
    assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());

    // verify block starting positions without padding
    assertEquals("First row group should start after magic", 4,
            readFooter.getBlocks().get(0).getStartingPos());
    assertTrue("First row group should end after the block size (100)", firstRowGroupEnds > 100);
    assertEquals("Second row group should start after no padding", 109,
            readFooter.getBlocks().get(1).getStartingPos());

    { // read first block of col #1
        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
                Arrays.asList(readFooter.getBlocks().get(0)),
                Arrays.asList(SCHEMA.getColumnDescription(PATH1)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        assertNull(r.readNextRowGroup());
    }

    { // read all blocks of col #1 and #2
        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
                readFooter.getBlocks(),
                Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));
        pages = r.readNextRowGroup();
        assertEquals(4, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
        validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
        assertNull(r.readNextRowGroup());
    }

    PrintFooter.main(new String[] { path.toString() });
}
From source file: org.apache.parquet.hadoop.VecParquetReader.java
License: Apache License
public static byte[] readFooterAsBytes(Vec vec) {
    FSDataInputStream f = null;
    try {
        f = new FSDataInputStream(new VecDataInputStream(vec));
        final int FOOTER_LENGTH_SIZE = 4;
        if (vec.length() < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
            throw new RuntimeException("Vec doesn't represent a Parquet data (too short)");
        }
        long footerLengthIndex = vec.length() - FOOTER_LENGTH_SIZE - MAGIC.length;
        f.seek(footerLengthIndex);
        int footerLength = readIntLittleEndian(f);
        byte[] magic = new byte[MAGIC.length];
        f.readFully(magic);
        if (!Arrays.equals(MAGIC, magic)) {
            throw new RuntimeException("Vec is not a Parquet file. expected magic number at tail "
                    + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
        }
        long footerIndex = footerLengthIndex - footerLength;
        if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
            throw new RuntimeException("corrupted file: the footer index is not within the Vec");
        }
        f.seek(footerIndex);
        byte[] metadataBytes = new byte[footerLength];
        f.readFully(metadataBytes);
        return metadataBytes;
    } catch (IOException e) {
        throw new RuntimeException("Failed to read Parquet metadata", e);
    } finally {
        try {
            if (f != null)
                f.close();
        } catch (Exception e) {
            Log.warn("Failed to close Vec data input stream", e);
        }
    }
}
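What makes the wrapping above legal: FSDataInputStream accepts any InputStream that also implements Seekable and PositionedReadable, and rejects plain streams. A hypothetical byte-array-backed sketch (ours, not H2O's VecDataInputStream) of the minimum surface such a wrapped stream must provide:

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;

class ByteArraySeekableStream extends InputStream implements Seekable, PositionedReadable {
    private final byte[] data;
    private int pos;

    ByteArraySeekableStream(byte[] data) {
        this.data = data;
    }

    @Override
    public int read() {
        return pos < data.length ? (data[pos++] & 0xff) : -1;
    }

    @Override
    public void seek(long desired) throws IOException {
        if (desired < 0 || desired > data.length)
            throw new IOException("cannot seek to " + desired);
        pos = (int) desired;
    }

    @Override
    public long getPos() {
        return pos;
    }

    @Override
    public boolean seekToNewSource(long targetPos) {
        return false; // single copy of the data, nothing to fail over to
    }

    @Override
    public int read(long position, byte[] buffer, int offset, int length) {
        int n = Math.min(length, data.length - (int) position);
        if (n <= 0)
            return -1;
        System.arraycopy(data, (int) position, buffer, offset, n);
        return n;
    }

    @Override
    public void readFully(long position, byte[] buffer, int offset, int length) throws IOException {
        if (position + length > data.length)
            throw new EOFException("not enough bytes at " + position);
        System.arraycopy(data, (int) position, buffer, offset, length);
    }

    @Override
    public void readFully(long position, byte[] buffer) throws IOException {
        readFully(position, buffer, 0, buffer.length);
    }
}

Wrapping is then just new FSDataInputStream(new ByteArraySeekableStream(bytes)), after which seek, getPos, and the positional reads all work as in the examples on this page.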
From source file: org.apache.pig.impl.io.BinStorageRecordReader.java
License: Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (start != 0) {
        fileIn.seek(start);
    }
    in = new BufferedPositionedInputStream(fileIn, start);
    inData = new DataInputStream(in);
}
From source file: org.apache.slider.test.ContractTestUtils.java
License: Apache License
/**
 * Verify that the read at a specific offset in a stream
 * matches that expected
 * @param stm stream
 * @param fileContents original file contents
 * @param seekOff seek offset
 * @param toRead number of bytes to read
 * @throws IOException IO problems
 */
public static void verifyRead(FSDataInputStream stm, byte[] fileContents, int seekOff, int toRead)
        throws IOException {
    byte[] out = new byte[toRead];
    stm.seek(seekOff);
    stm.readFully(out);
    byte[] expected = Arrays.copyOfRange(fileContents, seekOff, seekOff + toRead);
    compareByteArrays(expected, out, toRead);
}
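A hypothetical usage sketch (the path, seed, and sizes are ours): write known bytes to a local file, then spot-check a windowed read through verifyRead.

static void verifyReadDemo() throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path p = new Path(System.getProperty("java.io.tmpdir"), "verify-read.bin");
    byte[] contents = new byte[1024];
    new java.util.Random(0).nextBytes(contents);
    try (FSDataOutputStream out = fs.create(p, true)) {
        out.write(contents);
    }
    try (FSDataInputStream in = fs.open(p)) {
        verifyRead(in, contents, 512, 256); // bytes 512..767 must match the original
    }
}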