List of usage examples for org.apache.hadoop.fs.FSDataInputStream.seek

@Override
public void seek(long desired) throws IOException
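FSDataInputStream implements Seekable, so seek(desired) repositions the stream at an absolute byte offset before the next read; seeking past the end of the file raises an IOException on most implementations. The examples below all follow the same open-seek-read pattern. As a quick orientation, here is a minimal, self-contained sketch of that pattern; the path, offset, and buffer size are illustrative assumptions, not taken from any of the source files below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.dat"); // hypothetical file
        long offset = 1024L;                      // hypothetical absolute offset
        FileSystem fs = path.getFileSystem(conf);
        try (FSDataInputStream in = fs.open(path)) {
            in.seek(offset);                      // reposition before reading
            byte[] buf = new byte[4096];
            int n = in.read(buf, 0, buf.length);  // read starts at 'offset'
            System.out.println("read " + n + " bytes; stream now at " + in.getPos());
        }
    }
}

Several of the record readers below seek to split.getStart() and then discard a partial first line, while the Parquet reader seeks backwards from the end of the file to locate the footer.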
From source file: io.hops.erasure_coding.FileStripeReader.java
License: Apache License

@Override
public InputStream buildOneInput(int locationIndex, long offsetInBlock, FileSystem srcFs, Path srcFile,
        FileStatus srcStat, FileSystem parityFs, Path parityFile, FileStatus parityStat) throws IOException {
    final long blockSize = srcStat.getBlockSize();
    LOG.info("buildOneInput srcfile " + srcFile + " srclen " + srcStat.getLen() + " parityfile " + parityFile
            + " paritylen " + parityStat.getLen() + " stripeindex " + stripeStartIdx + " locationindex "
            + locationIndex + " offsetinblock " + offsetInBlock);
    if (locationIndex < codec.parityLength) {
        return this.getParityFileInput(locationIndex, parityFile, parityFs, parityStat, offsetInBlock);
    } else {
        // Dealing with a src file here.
        int blockIdxInStripe = locationIndex - codec.parityLength;
        int blockIdx = (int) (codec.stripeLength * stripeStartIdx + blockIdxInStripe);
        long offset = blockSize * blockIdx + offsetInBlock;
        if (offset >= srcStat.getLen()) {
            LOG.info("Using zeros for " + srcFile + ":" + offset + " for location " + locationIndex);
            return new RaidUtils.ZeroInputStream(blockSize * (blockIdx + 1));
        } else {
            LOG.info("Opening " + srcFile + ":" + offset + " for location " + locationIndex);
            FSDataInputStream s = fs.open(srcFile, conf.getInt("io.file.buffer.size", 64 * 1024));
            s.seek(offset);
            return s;
        }
    }
}
From source file: io.hops.erasure_coding.ReedSolomonDecoder.java
License: Apache License

protected int[] buildInputs(FileSystem fs, Path srcFile, FileSystem parityFs, Path parityFile,
        boolean fixSource, long errorOffset, FSDataInputStream[] inputs) throws IOException {
    LOG.info("Building inputs to recover block starting at " + errorOffset);
    try {
        FileStatus srcStat = fs.getFileStatus(srcFile);
        FileStatus parityStat = fs.getFileStatus(parityFile);
        long blockSize = srcStat.getBlockSize();
        long blockIdx = (int) (errorOffset / blockSize);
        long stripeIdx;
        if (fixSource) {
            stripeIdx = blockIdx / stripeSize;
        } else {
            stripeIdx = blockIdx / paritySize;
        }
        LOG.info("FileSize = " + srcStat.getLen() + ", blockSize = " + blockSize + ", blockIdx = " + blockIdx
                + ", stripeIdx = " + stripeIdx);
        ArrayList<Integer> erasedLocations = new ArrayList<Integer>();
        // First open streams to the parity blocks.
        for (int i = 0; i < paritySize; i++) {
            long offset = blockSize * (stripeIdx * paritySize + i);
            if ((!fixSource) && offset == errorOffset) {
                LOG.info(parityFile + ":" + offset + " is known to have error, adding zeros as input " + i);
                inputs[i] = new FSDataInputStream(new RaidUtils.ZeroInputStream(offset + blockSize));
                erasedLocations.add(i);
            } else if (offset > parityStat.getLen()) {
                LOG.info(parityFile + ":" + offset + " is past file size, adding zeros as input " + i);
                inputs[i] = new FSDataInputStream(new RaidUtils.ZeroInputStream(offset + blockSize));
            } else {
                FSDataInputStream in = parityFs.open(parityFile, conf.getInt("io.file.buffer.size", 64 * 1024));
                in.seek(offset);
                LOG.info("Adding " + parityFile + ":" + offset + " as input " + i);
                inputs[i] = in;
            }
        }
        // Now open streams to the data blocks.
        for (int i = paritySize; i < paritySize + stripeSize; i++) {
            long offset = blockSize * (stripeIdx * stripeSize + i - paritySize);
            if (fixSource && offset == errorOffset) {
                LOG.info(srcFile + ":" + offset + " is known to have error, adding zeros as input " + i);
                inputs[i] = new FSDataInputStream(new RaidUtils.ZeroInputStream(offset + blockSize));
                erasedLocations.add(i);
            } else if (offset > srcStat.getLen()) {
                LOG.info(srcFile + ":" + offset + " is past file size, adding zeros as input " + i);
                inputs[i] = new FSDataInputStream(new RaidUtils.ZeroInputStream(offset + blockSize));
            } else {
                FSDataInputStream in = fs.open(srcFile, conf.getInt("io.file.buffer.size", 64 * 1024));
                in.seek(offset);
                LOG.info("Adding " + srcFile + ":" + offset + " as input " + i);
                inputs[i] = in;
            }
        }
        if (erasedLocations.size() > paritySize) {
            String msg = "Too many erased locations: " + erasedLocations.size();
            LOG.error(msg);
            throw new IOException(msg);
        }
        int[] locs = new int[erasedLocations.size()];
        for (int i = 0; i < locs.length; i++) {
            locs[i] = erasedLocations.get(i);
        }
        return locs;
    } catch (IOException e) {
        RaidUtils.closeStreams(inputs);
        throw e;
    }
}
From source file: io.hops.erasure_coding.StripeReader.java
License: Apache License

protected InputStream getParityFileInput(int locationIndex, Path parityFile, FileSystem parityFs,
        FileStatus parityStat, long offsetInBlock) throws IOException {
    // Dealing with a parity file here.
    int parityBlockIdx = (int) (codec.parityLength * stripeStartIdx + locationIndex);
    long offset = parityStat.getBlockSize() * parityBlockIdx + offsetInBlock;
    assert (offset < parityStat.getLen());
    LOG.info("Opening " + parityFile + ":" + offset + " for location " + locationIndex);
    FSDataInputStream s = parityFs.open(parityFile, conf.getInt("io.file.buffer.size", 64 * 1024));
    s.seek(offset);
    return s;
}
From source file: io.hops.erasure_coding.XORDecoder.java
License: Apache License

@Override
protected long fixErasedBlockImpl(FileSystem fs, Path srcFile, FileSystem parityFs, Path parityFile,
        boolean fixSource, long blockSize, long errorOffset, long limit, boolean partial, OutputStream out,
        Progressable reporter, CRC32 crc) throws IOException {
    if (partial) {
        throw new IOException("We don't support partial reconstruction");
    }
    LOG.info("Fixing block at " + srcFile + ":" + errorOffset + ", limit " + limit);
    if (crc != null) {
        crc.reset();
    }
    FileStatus srcStat = fs.getFileStatus(srcFile);
    FSDataInputStream[] inputs = new FSDataInputStream[stripeSize + this.codec.parityLength];
    try {
        long errorBlockOffset = (errorOffset / blockSize) * blockSize;
        long[] srcOffsets = stripeOffsets(errorOffset, blockSize, fixSource);
        for (int i = 0; i < srcOffsets.length; i++) {
            if (fixSource && srcOffsets[i] == errorBlockOffset) {
                inputs[i] = new FSDataInputStream(new RaidUtils.ZeroInputStream(blockSize));
                LOG.info("Using zeros at " + srcFile + ":" + errorBlockOffset);
                continue;
            }
            if (srcOffsets[i] < srcStat.getLen()) {
                FSDataInputStream in = fs.open(srcFile);
                in.seek(srcOffsets[i]);
                inputs[i] = in;
            } else {
                inputs[i] = new FSDataInputStream(new RaidUtils.ZeroInputStream(blockSize));
                LOG.info("Using zeros at " + srcFile + ":" + errorBlockOffset);
            }
        }
        if (fixSource) {
            FSDataInputStream parityFileIn = parityFs.open(parityFile);
            parityFileIn.seek(parityOffset(errorOffset, blockSize));
            inputs[inputs.length - 1] = parityFileIn;
        } else {
            inputs[inputs.length - 1] = new FSDataInputStream(new RaidUtils.ZeroInputStream(blockSize));
            LOG.info("Using zeros at " + parityFile + ":" + errorBlockOffset);
        }
    } catch (IOException e) {
        RaidUtils.closeStreams(inputs);
        throw e;
    }
    int boundedBufferCapacity = 1;
    ParallelStreamReader parallelReader = new ParallelStreamReader(reporter, inputs, bufSize, parallelism,
            boundedBufferCapacity, blockSize);
    parallelReader.start();
    try {
        // Loop while the number of skipped + written bytes is less than the max.
        long written;
        for (written = 0; written < limit;) {
            ParallelStreamReader.ReadResult readResult;
            try {
                readResult = parallelReader.getReadResult();
            } catch (InterruptedException e) {
                throw new IOException("Interrupted while waiting for read result");
            }
            // Cannot tolerate any IO errors.
            IOException readEx = readResult.getException();
            if (readEx != null) {
                throw readEx;
            }
            int toWrite = (int) Math.min((long) bufSize, limit - written);
            XOREncoder.xor(readResult.readBufs, writeBufs[0]);
            out.write(writeBufs[0], 0, toWrite);
            if (crc != null) {
                crc.update(writeBufs[0], 0, toWrite);
            }
            written += toWrite;
        }
        return written;
    } finally {
        // Inputs will be closed by parallelReader.shutdown().
        parallelReader.shutdown();
    }
}
From source file: io.prestosql.parquet.reader.MetadataReader.java
License: Apache License

public static ParquetMetadata readFooter(FSDataInputStream inputStream, Path file, long fileSize)
        throws IOException {
    // Parquet File Layout:
    //
    // MAGIC
    // variable: Data
    // variable: Metadata
    // 4 bytes: MetadataLength
    // MAGIC
    validateParquet(fileSize >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length,
            "%s is not a valid Parquet File", file);
    long metadataLengthIndex = fileSize - PARQUET_METADATA_LENGTH - MAGIC.length;

    inputStream.seek(metadataLengthIndex);
    int metadataLength = readIntLittleEndian(inputStream);

    byte[] magic = new byte[MAGIC.length];
    inputStream.readFully(magic);
    validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s",
            file, Arrays.toString(MAGIC), Arrays.toString(magic));

    long metadataIndex = metadataLengthIndex - metadataLength;
    validateParquet(metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
            "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex);
    inputStream.seek(metadataIndex);
    FileMetaData fileMetaData = readFileMetaData(inputStream);
    List<SchemaElement> schema = fileMetaData.getSchema();
    validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

    MessageType messageType = readParquetSchema(schema);
    List<BlockMetaData> blocks = new ArrayList<>();
    List<RowGroup> rowGroups = fileMetaData.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                validateParquet(
                        (filePath == null && columnChunk.getFile_path() == null)
                                || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                        "all column chunks of the same row group must be in the same file");
                ColumnMetaData metaData = columnChunk.meta_data;
                String[] path = metaData.path_in_schema.stream().map(value -> value.toLowerCase(Locale.ENGLISH))
                        .toArray(String[]::new);
                ColumnPath columnPath = ColumnPath.get(path);
                PrimitiveTypeName primitiveTypeName = messageType.getType(columnPath.toArray())
                        .asPrimitiveType().getPrimitiveTypeName();
                ColumnChunkMetaData column = ColumnChunkMetaData.get(columnPath, primitiveTypeName,
                        CompressionCodecName.fromParquet(metaData.codec), readEncodings(metaData.encodings),
                        readStats(metaData.statistics, primitiveTypeName), metaData.data_page_offset,
                        metaData.dictionary_page_offset, metaData.num_values, metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }

    Map<String, String> keyValueMetaData = new HashMap<>();
    List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
    if (keyValueList != null) {
        for (KeyValue keyValue : keyValueList) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    return new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType,
            keyValueMetaData, fileMetaData.getCreated_by()), blocks);
}
From source file: io.sanfran.wikiTrends.extraction.hadoop.FileNameLineRecordReader.java
License: Open Source License

public FileNameLineRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.LineRecordReader.maxlength", Integer.MAX_VALUE);
    fileName = split.getPath().getName();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file: kogiri.common.hadoop.io.reader.fasta.FastaRawReadReader.java
License: Open Source License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);

    this.filename = file.getName();
    this.firstRead = true;

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);

    // get uncompressed length
    if (codec instanceof GzipCodec) {
        this.isCompressed = true;
        FSDataInputStream fileInCheckSize = fs.open(file);
        byte[] len = new byte[4];
        try {
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            fileInCheckSize.skip(this.end - 4);
            IOUtils.readFully(fileInCheckSize, len, 0, len.length);
            this.uncompressedSize = (len[3] << 24) | (len[2] << 16) | (len[1] << 8) | len[0];
            if (this.uncompressedSize < 0) {
                this.uncompressedSize = this.end;
            }
            LOG.info("uncompressed file size : " + this.uncompressedSize);
        } finally {
            fileInCheckSize.close();
        }
        this.end = Long.MAX_VALUE;
    } else if (codec != null) {
        this.isCompressed = true;
        this.end = Long.MAX_VALUE;
        this.uncompressedSize = Long.MAX_VALUE;
    } else {
        this.isCompressed = false;
    }

    // get inputstream
    FSDataInputStream fileIn = fs.open(file);
    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), job);
    } else {
        if (this.start != 0) {
            fileIn.seek(this.start);
        }
        this.in = new LineReader(fileIn, job);
    }

    // skip lines until we meet new read start
    while (this.start < this.end) {
        Text skipText = new Text();
        long newSize = this.in.readLine(skipText, this.maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, this.end - this.start), this.maxLineLength));
        if (newSize == 0) {
            // EOF
            this.hasNextRead = false;
            this.pos = this.end;
            break;
        }
        if (skipText.getLength() > 0 && skipText.charAt(0) == READ_DELIMITER) {
            this.prevLine = skipText;
            this.prevSize = newSize;
            this.hasNextRead = true;
            this.pos = this.start;
            break;
        }
        this.start += newSize;
        if (this.start >= this.end) {
            // EOF
            this.hasNextRead = false;
            this.pos = this.end;
            break;
        }
    }
    this.key = null;
    this.value = null;
}
From source file: kogiri.common.hadoop.io.reader.fasta.FastaReadDescriptionReader.java
License: Open Source License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);

    this.filename = file.getName();
    this.firstRead = true;

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);

    // get uncompressed length
    if (codec instanceof GzipCodec) {
        this.isCompressed = true;
        FSDataInputStream fileInCheckSize = fs.open(file);
        byte[] len = new byte[4];
        try {
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            fileInCheckSize.skip(this.end - 4);
            IOUtils.readFully(fileInCheckSize, len, 0, len.length);
            this.uncompressedSize = (len[3] << 24) | (len[2] << 16) | (len[1] << 8) | len[0];
            if (this.uncompressedSize < 0) {
                this.uncompressedSize = this.end;
            }
            LOG.info("uncompressed file size : " + this.uncompressedSize);
        } finally {
            fileInCheckSize.close();
        }
        this.end = Long.MAX_VALUE;
    } else if (codec != null) {
        this.isCompressed = true;
        this.end = Long.MAX_VALUE;
        this.uncompressedSize = Long.MAX_VALUE;
    } else {
        this.isCompressed = false;
    }

    // get inputstream
    FSDataInputStream fileIn = fs.open(file);
    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), job);
    } else {
        if (this.start != 0) {
            fileIn.seek(this.start);
        }
        this.in = new LineReader(fileIn, job);
    }

    // skip lines until we meet new record start
    while (this.start < this.end) {
        Text skipText = new Text();
        long newSize = this.in.readLine(skipText, this.maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, this.end - this.start), this.maxLineLength));
        if (newSize == 0) {
            // EOF
            this.hasNextRecord = false;
            this.pos = this.end;
            break;
        }
        if (skipText.getLength() > 0 && skipText.charAt(0) == READ_DELIMITER) {
            this.prevLine = skipText;
            this.prevSize = newSize;
            this.hasNextRecord = true;
            this.pos = this.start;
            break;
        }
        this.start += newSize;
        if (this.start >= this.end) {
            // EOF
            this.hasNextRecord = false;
            this.pos = this.end;
            break;
        }
    }
    this.key = null;
    this.value = null;
}
From source file: lennard.PiRecordReader.java
License: Apache License

public PiRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file: libra.common.hadoop.io.reader.fasta.FastaKmerReader.java
License: Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    this.kmersize = FastaKmerInputFormat.getKmerSize(conf);
    this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);

    // get uncompressed length
    if (codec instanceof GzipCodec) {
        this.isCompressed = true;
        FSDataInputStream fileInCheckSize = fs.open(file);
        byte[] len = new byte[4];
        try {
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            fileInCheckSize.skip(this.end - 4);
            IOUtils.readFully(fileInCheckSize, len, 0, len.length);
            this.uncompressedSize = (len[3] << 24) | (len[2] << 16) | (len[1] << 8) | len[0];
            if (this.uncompressedSize < 0) {
                this.uncompressedSize = this.end;
            }
            LOG.info("uncompressed file size : " + this.uncompressedSize);
        } finally {
            fileInCheckSize.close();
        }
        this.end = Long.MAX_VALUE;
    } else if (codec != null) {
        this.isCompressed = true;
        this.end = Long.MAX_VALUE;
        this.uncompressedSize = Long.MAX_VALUE;
    } else {
        this.isCompressed = false;
    }

    // get inputstream
    FSDataInputStream fileIn = fs.open(file);
    boolean inTheMiddle = false;
    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), conf);
    } else {
        if (this.start != 0) {
            this.start--;
            fileIn.seek(this.start);
            inTheMiddle = true;
        }
        this.in = new LineReader(fileIn, conf);
    }

    this.buffer = new Text();

    if (inTheMiddle) {
        // find new start line
        this.start += this.in.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, this.end - this.start));
        // back off
        FSDataInputStream fileIn2 = fs.open(file);
        fileIn2.seek(this.start - 1000);
        LineReader in2 = new LineReader(fileIn2, conf);
        Text tempLine = new Text();
        long curpos = this.start - 1000;
        while (curpos < this.start) {
            curpos += in2.readLine(tempLine, 0, (int) (this.start - curpos));
        }
        if (tempLine.charAt(0) == READ_DELIMITER) {
            // clean start
            this.buffer.clear();
        } else {
            // leave k-1 seq in the buffer
            String seq = tempLine.toString().trim();
            String left = seq.substring(seq.length() - this.kmersize + 1);
            this.buffer.set(left);
        }
        in2.close();
    }

    this.pos = this.start;
    this.key = null;
    this.value = null;
}