Example usage for org.apache.hadoop.fs FSDataInputStream seek

List of usage examples for org.apache.hadoop.fs FSDataInputStream seek

Introduction

On this page you can find example usage for org.apache.hadoop.fs FSDataInputStream seek.

Prototype

@Override
public void seek(long desired) throws IOException 

Document

Seek to the given offset.
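
A minimal sketch of the call pattern the examples below share: open the file, seek to an absolute byte offset, then read from the new position. The path and offset here are placeholders, not taken from any of the projects below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.dat"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);
        try (FSDataInputStream in = fs.open(path)) {
            in.seek(128L);                // jump to byte offset 128
            byte[] buffer = new byte[64];
            in.readFully(buffer);         // read 64 bytes starting at the new position
            System.out.println("Position after read: " + in.getPos());
        }
    }
}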

Usage

From source file:com.cloudera.bigdata.analysis.dataload.mapreduce.SplitableRecordReader.java

License:Apache License

/**
 * Decide the start of the reader.
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // if (codec instanceof CryptoCodec && job instanceof JobConf)
    // CryptoContextHelper.resetInputCryptoContext((CryptoCodec) codec,
    // (JobConf) job, file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    LOG.info("Read from " + split.getPath().toString());
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));

        // Read another line as previous.

        Text current = new Text();

        int newSize = in.readLine(previous, maxLineLength, maxBytesToConsume(start));

        LOG.info("Skip line " + previous + " for last split.");

        start += newSize;

        // Keep reading until a splitable point is found.
        while (start <= end) {
            newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
            if (canSplit(previous.getBytes(), current.getBytes())) {
                break;
            }
            start += newSize;
            previous.set(current.getBytes());
            LOG.info("Skip line " + previous + " for last split.");
        }

        // If exceed the end, still read one extra line.
        if (start > end) {
            if (isContinue) {
                newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
                if (!canSplit(previous.getBytes(), current.getBytes())) {
                    // Still not splitable. So skip the block.
                    start += newSize;
                    isContinue = false;
                }
            }
        }
        LOG.info("Split between: \n" + previous + "\n" + current);

        // Restart at the last read line.
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        this.pos = start;
    } else {
        Text skip = new Text();
        start += in.readLine(skip, maxLineLength, maxBytesToConsume(start));
        // start += in.readLine(skip, 0, maxBytesToConsume(start));
        LOG.info("Skip line " + skip + ". Start at " + start);
    }

    // Restart at the start index.
}

From source file:com.cloudera.hadoop.hdfs.nfs.nfs4.handlers.READHandler.java

License:Apache License

@Override
protected READResponse doHandle(NFS4Handler server, Session session, READRequest request)
        throws NFS4Exception, IOException {
    if (session.getCurrentFileHandle() == null) {
        throw new NFS4Exception(NFS4ERR_NOFILEHANDLE);
    }
    int size = Math.min(request.getCount(), NFS4_MAX_RWSIZE);
    if (size < 0) {
        throw new NFS4Exception(NFS4ERR_INVAL);
    }
    FileHandle fileHandle = session.getCurrentFileHandle();
    Path path = server.getPath(fileHandle);
    FileSystem fs = session.getFileSystem();
    FSDataInputStream inputStream = server.forRead(request.getStateID(), fs, fileHandle);
    synchronized (inputStream) {
        if (inputStream.getPos() != request.getOffset()) {
            try {
                inputStream.seek(request.getOffset());
            } catch (IOException e) {
                throw new IOException(e.getMessage() + ": " + inputStream.getPos() + ", " + request.getOffset(),
                        e);
            }
            server.incrementMetric("NFS_RANDOM_READS", 1);
        }
        READResponse response = createResponse();
        byte[] data = new byte[size];
        int count = inputStream.read(data);
        long fileLength = -1;
        if (count > 0 && count != data.length
                && (request.getOffset() + count) < (fileLength = fs.getFileStatus(path).getLen())) {
            LOGGER.info("Short read " + path + " at pos = " + request.getOffset() + ", wanted " + data.length
                    + " and read " + count + ", fileLength = " + fileLength);
            server.incrementMetric("NFS_SHORT_READS", 1);
        }
        boolean eof = count < 0;
        if (eof) {
            data = new byte[0];
            count = 0;
        }
        server.incrementMetric("HDFS_BYTES_READ", count);
        response.setData(data, 0, count);
        response.setEOF(eof);
        response.setStatus(NFS4_OK);
        return response;
    }
}

From source file:com.datatorrent.lib.io.AbstractHDFSInputOperator.java

License:Open Source License

@Override
public void seek(FSDataInputStream stream, long pos) {
    try {
        stream.seek(pos);
    } catch (IOException ex) {
        throw new RuntimeException(ex.getCause());
    }
}

From source file:com.ery.hadoop.mrddx.file.LineRecordReader.java

License:Apache License

void openFile() throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString() + " fileEncodeing="
            + fileEncodeing + " " + split.getStart() + ":" + split.getLength());
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    if (file.getName().endsWith(".zip")) {
        LOG.info("use ZipInputStream read file " + split.getPath());
        ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing));
        in = new LineReader(zin, job);
        filePosition = fileIn;
        codec = new GzipCodec();
        return;
    }
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // For .tar.gz archives, wrap the compressed split stream in a TarInputStream
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        String filename = file.getName();
        if (filename.endsWith(".tar")) {
            in = new LineReader(new TarInputStream(fileIn), job);
        } else {
            in = new LineReader(fileIn, job);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:com.ery.hadoop.mrddx.hFile.LineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // For .tar.gz archives, wrap the compressed stream in a TarInputStream
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:com.facebook.hive.orc.InStream.java

License:Open Source License

public static void read(FSDataInputStream file, long fileOffset, byte[] array, int arrayOffset, int length)
        throws IOException {
    file.seek(fileOffset);
    file.readFully(array, arrayOffset, length);
}
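
FSDataInputStream also implements PositionedReadable, so a helper like the one above could instead use the positional readFully overload, which reads at an absolute offset without moving the stream's current position (getPos()). A minimal sketch under that assumption; the class name is illustrative:

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;

public final class PositionalRead {
    // Same effect as seek() followed by readFully(array, arrayOffset, length),
    // but the positional overload leaves the stream's current offset untouched,
    // so concurrent readers sharing one stream do not interfere with each other.
    public static void read(FSDataInputStream file, long fileOffset, byte[] array, int arrayOffset, int length)
            throws IOException {
        file.readFully(fileOffset, array, arrayOffset, length);
    }
}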

From source file:com.facebook.presto.parquet.reader.MetadataReader.java

License:Apache License

public static ParquetMetadata readFooter(FSDataInputStream inputStream, Path file, long fileSize)
        throws IOException
{
    // Parquet File Layout:
    //
    // MAGIC
    // variable: Data
    // variable: Metadata
    // 4 bytes: MetadataLength
    // MAGIC

    validateParquet(fileSize >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length,
            "%s is not a valid Parquet File", file);
    long metadataLengthIndex = fileSize - PARQUET_METADATA_LENGTH - MAGIC.length;

    inputStream.seek(metadataLengthIndex);
    int metadataLength = readIntLittleEndian(inputStream);

    byte[] magic = new byte[MAGIC.length];
    inputStream.readFully(magic);
    validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s",
            file, Arrays.toString(MAGIC), Arrays.toString(magic));

    long metadataIndex = metadataLengthIndex - metadataLength;
    validateParquet(metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
            "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex);
    inputStream.seek(metadataIndex);
    FileMetaData fileMetaData = readFileMetaData(inputStream);
    List<SchemaElement> schema = fileMetaData.getSchema();
    validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

    MessageType messageType = readParquetSchema(schema);
    List<BlockMetaData> blocks = new ArrayList<>();
    List<RowGroup> rowGroups = fileMetaData.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                validateParquet(
                        (filePath == null && columnChunk.getFile_path() == null)
                                || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                        "all column chunks of the same row group must be in the same file");
                ColumnMetaData metaData = columnChunk.meta_data;
                String[] path = metaData.path_in_schema.stream().map(value -> value.toLowerCase(Locale.ENGLISH))
                        .toArray(String[]::new);
                ColumnPath columnPath = ColumnPath.get(path);
                PrimitiveTypeName primitiveTypeName = messageType.getType(columnPath.toArray())
                        .asPrimitiveType().getPrimitiveTypeName();
                ColumnChunkMetaData column = ColumnChunkMetaData.get(columnPath, primitiveTypeName,
                        CompressionCodecName.fromParquet(metaData.codec), readEncodings(metaData.encodings),
                        readStats(metaData.statistics, primitiveTypeName), metaData.data_page_offset,
                        metaData.dictionary_page_offset, metaData.num_values, metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }

    Map<String, String> keyValueMetaData = new HashMap<>();
    List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
    if (keyValueList != null) {
        for (KeyValue keyValue : keyValueList) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData,
            fileMetaData.getCreated_by()), blocks);
}

From source file:com.gruter.hadoop.customShell.CustomShell.java

License:Apache License

private InputStream forMagic(Path p, FileSystem srcFs) throws IOException {
    FSDataInputStream i = srcFs.open(p);
    switch (i.readShort()) {
    case 0x1f8b: // RFC 1952
        i.seek(0);
        return new GZIPInputStream(i);
    case 0x5345: // 'S' 'E'
        if (i.readByte() == 'Q') {
            i.close();
            return new TextRecordInputStream(srcFs.getFileStatus(p));
        }
        break;
    }
    i.seek(0);
    /**
     * snappy
     */
    if (isSnappy(p.getName())) {
        return getSnappyCodec().createInputStream(i);
    }
    /**
     * end
     */
    return i;
}

From source file:com.hadoop.compression.fourmc.FourMcInputStream.java

License:BSD License

/**
 * Reads blocks index at tail of file.
 *
 * @param fs   filesystem
 * @param file path to 4mc file
 * @return block index
 * @throws IOException
 */
public static FourMcBlockIndex readIndex(FileSystem fs, Path file) throws IOException {

    long fileSize = fs.getFileStatus(file).getLen();
    if (fileSize < (12 + 20)) { // file too small
        return new FourMcBlockIndex();
    }

    FSDataInputStream indexIn = fs.open(file);

    /*
    4mc Footer:
     Footer size:        4 bytes
     Footer version:     4 bytes (1)
     Block index offset: 4 bytes per stored block; each entry is the delta from the previous block's file position to the next block
     Footer size:        4 bytes (repeated so it can be read from the end of the file)
     MAGIC SIGNATURE:    4 bytes: "4MC\0"
     Footer checksum:    4 bytes (always XXHASH32)
    */

    /**
     * Jump to the file tail and read ahead the last 4KB of the file, which should be enough in most cases.
     * Improvement: we could estimate a best-case compression factor of 10% and, based on file size and
     * block size, forecast whether it would be better to read ahead more.
     */

    int readTailSize = 4 * 1024;
    if (readTailSize > (fileSize - 12))
        readTailSize = (int) (fileSize - 12);

    indexIn.seek(fileSize - readTailSize);
    byte[] buf = new byte[readTailSize];
    readFully(indexIn, buf, 0, buf.length);
    int footerSize = getInt(buf, buf.length - 12);
    int magic = getInt(buf, buf.length - 8);
    int checksum = getInt(buf, buf.length - 4);

    if (magic != FourMcCodec.FOURMC_MAGIC) {
        throw new IOException("Invalid 4mc footer magic");
    }
    if (footerSize >= (fileSize - 12)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    // very rare case: the read-ahead was not enough; seek back and read it all
    if (footerSize > readTailSize) {
        readTailSize = footerSize;
        indexIn.seek(fileSize - readTailSize);
        buf = new byte[readTailSize];
        readFully(indexIn, buf, 0, buf.length);
    }
    indexIn.close();

    int startFooterOffset = readTailSize - footerSize;

    if (getInt(buf, startFooterOffset) != footerSize) { // size again
        throw new IOException("Invalid 4mc footer size");
    }

    if (getInt(buf, startFooterOffset + 4) != FourMcCodec.FOURMC_VERSION) { // version
        throw new IOException("Invalid 4mc footer version (" + getInt(buf, startFooterOffset + 4) + ")");
    }

    if (checksum != Lz4Decompressor.xxhash32(buf, startFooterOffset, footerSize - 4, 0)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    int totalBlocks = (footerSize - 20) / 4;
    FourMcBlockIndex index = new FourMcBlockIndex(totalBlocks);
    long curOffset = 0;
    for (int i = 0; i < totalBlocks; ++i) {
        curOffset += getInt(buf, startFooterOffset + 8 + (i * 4));
        index.set(i, curOffset);
    }

    return index;
}

From source file:com.hadoop.compression.fourmc.FourMzInputStream.java

License:BSD License

/**
 * Reads blocks index at tail of file.
 *
 * @param fs   filesystem
 * @param file path to 4mc file
 * @return block index
 * @throws IOException
 */
public static FourMzBlockIndex readIndex(FileSystem fs, Path file) throws IOException {

    long fileSize = fs.getFileStatus(file).getLen();
    if (fileSize < (12 + 20)) { // file too small
        return new FourMzBlockIndex();
    }

    FSDataInputStream indexIn = fs.open(file);

    /**
     * Jump to the file tail and read ahead the last 4KB of the file, which should be enough in most cases.
     * Improvement: we could estimate a best-case compression factor of 10% and, based on file size and
     * block size, forecast whether it would be better to read ahead more.
     */

    int readTailSize = 4 * 1024;
    if (readTailSize > (fileSize - 12))
        readTailSize = (int) (fileSize - 12);

    indexIn.seek(fileSize - readTailSize);
    byte[] buf = new byte[readTailSize];
    readFully(indexIn, buf, 0, buf.length);
    int footerSize = getInt(buf, buf.length - 12);
    int magic = getInt(buf, buf.length - 8);
    int checksum = getInt(buf, buf.length - 4);

    if (magic != FourMzCodec.FOURMZ_MAGIC) {
        throw new IOException("Invalid 4mc footer magic");
    }
    if (footerSize >= (fileSize - 12)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    // very rare case: the read-ahead was not enough; seek back and read it all
    if (footerSize > readTailSize) {
        readTailSize = footerSize;
        indexIn.seek(fileSize - readTailSize);
        buf = new byte[readTailSize];
        readFully(indexIn, buf, 0, buf.length);
    }
    indexIn.close();

    int startFooterOffset = readTailSize - footerSize;

    if (getInt(buf, startFooterOffset) != footerSize) { // size again
        throw new IOException("Invalid 4mc footer size");
    }

    if (getInt(buf, startFooterOffset + 4) != FourMzCodec.FOURMZ_VERSION) { // version
        throw new IOException("Invalid 4mc footer version (" + getInt(buf, startFooterOffset + 4) + ")");
    }

    if (checksum != ZstdDecompressor.xxhash32(buf, startFooterOffset, footerSize - 4, 0)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    int totalBlocks = (footerSize - 20) / 4;
    FourMzBlockIndex index = new FourMzBlockIndex(totalBlocks);
    long curOffset = 0;
    for (int i = 0; i < totalBlocks; ++i) {
        curOffset += getInt(buf, startFooterOffset + 8 + (i * 4));
        index.set(i, curOffset);
    }

    return index;
}