List of usage examples for org.apache.hadoop.fs FSDataInputStream seek
@Override public void seek(long desired) throws IOException
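Before the project-specific examples below, here is a minimal, self-contained sketch of the call. The file path is a hypothetical placeholder (not taken from any example): open the stream with FileSystem.open(), seek() to an absolute byte offset, and subsequent reads start at that offset; getPos() reports the current position.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class SeekExample {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Path file = new Path("/tmp/example.txt"); // hypothetical path
            FileSystem fs = file.getFileSystem(conf);
            try (FSDataInputStream in = fs.open(file)) {
                in.seek(128);            // jump to an absolute byte offset
                byte[] buf = new byte[64];
                int read = in.read(buf); // read now starts at offset 128
                System.out.println("pos=" + in.getPos() + ", bytes read=" + read);
            }
        }
    }

Note that seek() throws an IOException (typically an EOFException on HDFS) if the requested offset is past the end of the file.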
From source file:com.cloudera.bigdata.analysis.dataload.mapreduce.SplitableRecordReader.java
License:Apache License
/**
 * Decide the start of the reader.
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    // if (codec instanceof CryptoCodec && job instanceof JobConf)
    //     CryptoContextHelper.resetInputCryptoContext((CryptoCodec) codec, (JobConf) job, file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    LOG.info("Read from " + split.getPath().toString());
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));

        // Read another line as previous.
        Text current = new Text();
        int newSize = in.readLine(previous, maxLineLength, maxBytesToConsume(start));
        LOG.info("Skip line " + previous + " for last split.");
        start += newSize;

        // Keep reading until a splitable point is found.
        while (start <= end) {
            newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
            if (canSplit(previous.getBytes(), current.getBytes())) {
                break;
            }
            start += newSize;
            previous.set(current.getBytes());
            LOG.info("Skip line " + previous + " for last split.");
        }

        // If exceed the end, still read one extra line.
        if (start > end) {
            if (isContinue) {
                newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
                if (!canSplit(previous.getBytes(), current.getBytes())) {
                    // Still not splitable. So skip the block.
                    start += newSize;
                    isContinue = false;
                }
            }
        }
        LOG.info("Split between: \n" + previous + "\n" + current);

        // Restart at the last read line.
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        this.pos = start;
    } else {
        Text skip = new Text();
        start += in.readLine(skip, maxLineLength, maxBytesToConsume(start));
        // start += in.readLine(skip, 0, maxBytesToConsume(start));
        LOG.info("Skip line " + skip + ". Start at " + start);
    }
    // Restart at the start index.
}
From source file:com.cloudera.hadoop.hdfs.nfs.nfs4.handlers.READHandler.java
License:Apache License
@Override
protected READResponse doHandle(NFS4Handler server, Session session, READRequest request)
        throws NFS4Exception, IOException {
    if (session.getCurrentFileHandle() == null) {
        throw new NFS4Exception(NFS4ERR_NOFILEHANDLE);
    }
    int size = Math.min(request.getCount(), NFS4_MAX_RWSIZE);
    if (size < 0) {
        throw new NFS4Exception(NFS4ERR_INVAL);
    }
    FileHandle fileHandle = session.getCurrentFileHandle();
    Path path = server.getPath(fileHandle);
    FileSystem fs = session.getFileSystem();
    FSDataInputStream inputStream = server.forRead(request.getStateID(), fs, fileHandle);
    synchronized (inputStream) {
        if (inputStream.getPos() != request.getOffset()) {
            try {
                inputStream.seek(request.getOffset());
            } catch (IOException e) {
                throw new IOException(e.getMessage() + ": " + inputStream.getPos() + ", "
                        + request.getOffset(), e);
            }
            server.incrementMetric("NFS_RANDOM_READS", 1);
        }
        READResponse response = createResponse();
        byte[] data = new byte[size];
        int count = inputStream.read(data);
        long fileLength = -1;
        if (count > 0 && count != data.length
                && (request.getOffset() + count) < (fileLength = fs.getFileStatus(path).getLen())) {
            LOGGER.info("Short read " + path + " at pos = " + request.getOffset() + ", wanted "
                    + data.length + " and read " + count + ", fileLength = " + fileLength);
            server.incrementMetric("NFS_SHORT_READS", 1);
        }
        boolean eof = count < 0;
        if (eof) {
            data = new byte[0];
            count = 0;
        }
        server.incrementMetric("HDFS_BYTES_READ", count);
        response.setData(data, 0, count);
        response.setEOF(eof);
        response.setStatus(NFS4_OK);
        return response;
    }
}
From source file:com.datatorrent.lib.io.AbstractHDFSInputOperator.java
License:Open Source License
@Override
public void seek(FSDataInputStream stream, long pos) {
    try {
        stream.seek(pos);
    } catch (IOException ex) {
        throw new RuntimeException(ex.getCause());
    }
}
From source file:com.ery.hadoop.mrddx.file.LineRecordReader.java
License:Apache License
void openFile() throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString()
            + " fileEncodeing=" + fileEncodeing + " " + split.getStart() + ":" + split.getLength());

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    if (file.getName().endsWith(".zip")) {
        LOG.info("use ZipInputStream read file " + split.getPath());
        ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing));
        in = new LineReader(zin, job);
        filePosition = fileIn;
        codec = new GzipCodec();
        return;
    }
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // tar.gz is wrapped in a TarInputStream:
            // new TarInputStream(codec.createInputStream(fileIn, decompressor))
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        String filename = file.getName();
        if (filename.endsWith(".tar")) {
            in = new LineReader(new TarInputStream(fileIn), job);
        } else {
            in = new LineReader(fileIn, job);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.ery.hadoop.mrddx.hFile.LineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // tar.gz is wrapped in a TarInputStream:
            // new TarInputStream(codec.createInputStream(fileIn, decompressor))
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.facebook.hive.orc.InStream.java
License:Open Source License
public static void read(FSDataInputStream file, long fileOffset, byte[] array, int arrayOffset, int length)
        throws IOException {
    file.seek(fileOffset);
    file.readFully(array, arrayOffset, length);
}
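As an aside (a sketch, not part of the ORC source above): FSDataInputStream also implements PositionedReadable, whose readFully(long position, byte[] buffer, int offset, int length) overload performs the same positioned read without changing the stream's current seek position, so the seek-then-readFully pair above could be expressed as a single call:

    // Positioned read: does not move the stream's current offset.
    file.readFully(fileOffset, array, arrayOffset, length);

The explicit seek() variant is preferable when subsequent sequential reads should continue from the new position; the positioned variant is convenient for one-off random reads.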
From source file:com.facebook.presto.parquet.reader.MetadataReader.java
License:Apache License
public static ParquetMetadata readFooter(FSDataInputStream inputStream, Path file, long fileSize)
        throws IOException {
    // Parquet File Layout:
    //
    // MAGIC
    // variable: Data
    // variable: Metadata
    // 4 bytes: MetadataLength
    // MAGIC
    validateParquet(fileSize >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length,
            "%s is not a valid Parquet File", file);
    long metadataLengthIndex = fileSize - PARQUET_METADATA_LENGTH - MAGIC.length;

    inputStream.seek(metadataLengthIndex);
    int metadataLength = readIntLittleEndian(inputStream);

    byte[] magic = new byte[MAGIC.length];
    inputStream.readFully(magic);
    validateParquet(Arrays.equals(MAGIC, magic),
            "Not valid Parquet file: %s expected magic number: %s got: %s",
            file, Arrays.toString(MAGIC), Arrays.toString(magic));

    long metadataIndex = metadataLengthIndex - metadataLength;
    validateParquet(metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
            "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex);
    inputStream.seek(metadataIndex);
    FileMetaData fileMetaData = readFileMetaData(inputStream);

    List<SchemaElement> schema = fileMetaData.getSchema();
    validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);
    MessageType messageType = readParquetSchema(schema);
    List<BlockMetaData> blocks = new ArrayList<>();
    List<RowGroup> rowGroups = fileMetaData.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                validateParquet(
                        (filePath == null && columnChunk.getFile_path() == null)
                                || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                        "all column chunks of the same row group must be in the same file");
                ColumnMetaData metaData = columnChunk.meta_data;
                String[] path = metaData.path_in_schema.stream()
                        .map(value -> value.toLowerCase(Locale.ENGLISH))
                        .toArray(String[]::new);
                ColumnPath columnPath = ColumnPath.get(path);
                PrimitiveTypeName primitiveTypeName = messageType.getType(columnPath.toArray())
                        .asPrimitiveType().getPrimitiveTypeName();
                ColumnChunkMetaData column = ColumnChunkMetaData.get(columnPath, primitiveTypeName,
                        CompressionCodecName.fromParquet(metaData.codec), readEncodings(metaData.encodings),
                        readStats(metaData.statistics, primitiveTypeName), metaData.data_page_offset,
                        metaData.dictionary_page_offset, metaData.num_values, metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }

    Map<String, String> keyValueMetaData = new HashMap<>();
    List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
    if (keyValueList != null) {
        for (KeyValue keyValue : keyValueList) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData,
            fileMetaData.getCreated_by()), blocks);
}
From source file:com.gruter.hadoop.customShell.CustomShell.java
License:Apache License
private InputStream forMagic(Path p, FileSystem srcFs) throws IOException {
    FSDataInputStream i = srcFs.open(p);
    switch (i.readShort()) {
    case 0x1f8b: // RFC 1952
        i.seek(0);
        return new GZIPInputStream(i);
    case 0x5345: // 'S' 'E'
        if (i.readByte() == 'Q') {
            i.close();
            return new TextRecordInputStream(srcFs.getFileStatus(p));
        }
        break;
    }
    i.seek(0);
    // snappy
    if (isSnappy(p.getName())) {
        return getSnappyCodec().createInputStream(i);
    }
    return i;
}
From source file:com.hadoop.compression.fourmc.FourMcInputStream.java
License:BSD License
/**
 * Reads blocks index at tail of file.
 *
 * @param fs   filesystem
 * @param file path to 4mc file
 * @return block index
 * @throws IOException
 */
public static FourMcBlockIndex readIndex(FileSystem fs, Path file) throws IOException {
    long fileSize = fs.getFileStatus(file).getLen();
    if (fileSize < (12 + 20)) { // file too small
        return new FourMcBlockIndex();
    }

    FSDataInputStream indexIn = fs.open(file);

    /*
       4mc Footer:
       Footer size:        4 bytes
       Footer version:     4 bytes (1)
       Block index offset: 4 bytes delta offset for each stored block, the delta between offset
                           between previous file position and next block
       Footer size:        4 bytes (repeated to be able to read from end of file)
       MAGIC SIGNATURE:    4 bytes: "4MC\0"
       Footer checksum:    4 bytes (always in XXHASH32)
    */

    /*
     * jump to file tail and read-ahead last 4KB of file which should be enough in most cases
     * Improvement: we could estimate a best case compression factor of 10% and calc forecast
     * based on filesize and blocksize, to see if better to read-head more.
     */
    int readTailSize = 4 * 1024;
    if (readTailSize > (fileSize - 12))
        readTailSize = (int) (fileSize - 12);

    indexIn.seek(fileSize - readTailSize);
    byte[] buf = new byte[readTailSize];
    readFully(indexIn, buf, 0, buf.length);

    int footerSize = getInt(buf, buf.length - 12);
    int magic = getInt(buf, buf.length - 8);
    int checksum = getInt(buf, buf.length - 4);

    if (magic != FourMcCodec.FOURMC_MAGIC) {
        throw new IOException("Invalid 4mc footer magic");
    }
    if (footerSize >= (fileSize - 12)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    // very rare case: read head was not enough! seek back and read it all
    if (footerSize > readTailSize) {
        readTailSize = footerSize;
        indexIn.seek(fileSize - readTailSize);
        buf = new byte[readTailSize];
        readFully(indexIn, buf, 0, buf.length);
    }
    indexIn.close();

    int startFooterOffset = readTailSize - footerSize;

    if (getInt(buf, startFooterOffset) != footerSize) { // size again
        throw new IOException("Invalid 4mc footer size");
    }
    if (getInt(buf, startFooterOffset + 4) != FourMcCodec.FOURMC_VERSION) { // version
        throw new IOException("Invalid 4mc footer version (" + getInt(buf, startFooterOffset + 4) + ")");
    }
    if (checksum != Lz4Decompressor.xxhash32(buf, startFooterOffset, footerSize - 4, 0)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    int totalBlocks = (footerSize - 20) / 4;
    FourMcBlockIndex index = new FourMcBlockIndex(totalBlocks);
    long curOffset = 0;
    for (int i = 0; i < totalBlocks; ++i) {
        curOffset += getInt(buf, startFooterOffset + 8 + (i * 4));
        index.set(i, curOffset);
    }
    return index;
}
From source file:com.hadoop.compression.fourmc.FourMzInputStream.java
License:BSD License
/**
 * Reads blocks index at tail of file.
 *
 * @param fs   filesystem
 * @param file path to 4mc file
 * @return block index
 * @throws IOException
 */
public static FourMzBlockIndex readIndex(FileSystem fs, Path file) throws IOException {
    long fileSize = fs.getFileStatus(file).getLen();
    if (fileSize < (12 + 20)) { // file too small
        return new FourMzBlockIndex();
    }

    FSDataInputStream indexIn = fs.open(file);

    /*
     * jump to file tail and read-ahead last 4KB of file which should be enough in most cases
     * Improvement: we could estimate a best case compression factor of 10% and calc forecast
     * based on filesize and blocksize, to see if better to read-head more.
     */
    int readTailSize = 4 * 1024;
    if (readTailSize > (fileSize - 12))
        readTailSize = (int) (fileSize - 12);

    indexIn.seek(fileSize - readTailSize);
    byte[] buf = new byte[readTailSize];
    readFully(indexIn, buf, 0, buf.length);

    int footerSize = getInt(buf, buf.length - 12);
    int magic = getInt(buf, buf.length - 8);
    int checksum = getInt(buf, buf.length - 4);

    if (magic != FourMzCodec.FOURMZ_MAGIC) {
        throw new IOException("Invalid 4mc footer magic");
    }
    if (footerSize >= (fileSize - 12)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    // very rare case: read head was not enough! seek back and read it all
    if (footerSize > readTailSize) {
        readTailSize = footerSize;
        indexIn.seek(fileSize - readTailSize);
        buf = new byte[readTailSize];
        readFully(indexIn, buf, 0, buf.length);
    }
    indexIn.close();

    int startFooterOffset = readTailSize - footerSize;

    if (getInt(buf, startFooterOffset) != footerSize) { // size again
        throw new IOException("Invalid 4mc footer size");
    }
    if (getInt(buf, startFooterOffset + 4) != FourMzCodec.FOURMZ_VERSION) { // version
        throw new IOException("Invalid 4mc footer version (" + getInt(buf, startFooterOffset + 4) + ")");
    }
    if (checksum != ZstdDecompressor.xxhash32(buf, startFooterOffset, footerSize - 4, 0)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    int totalBlocks = (footerSize - 20) / 4;
    FourMzBlockIndex index = new FourMzBlockIndex(totalBlocks);
    long curOffset = 0;
    for (int i = 0; i < totalBlocks; ++i) {
        curOffset += getInt(buf, startFooterOffset + 8 + (i * 4));
        index.set(i, curOffset);
    }
    return index;
}