Example usage for org.apache.hadoop.fs FSDataInputStream readFully

List of usage examples for org.apache.hadoop.fs FSDataInputStream readFully


On this page you can find example usages of org.apache.hadoop.fs.FSDataInputStream.readFully.


public void readFully(long position, byte[] buffer, int offset, int length) throws IOException 

Source Link


Read bytes from the given position in the stream to the given buffer.


From source file:com.github.sadikovi.riff.FileFooter.java

License:Open Source License

/**
 * Read footer from input stream.
 * Footer is assumed to be placed at the end of the stream. Seek is performed inside the method.
 * Stream is not closed after operation is complete.
 * @param in input stream
 * @param maxSize maximum stream size
 * @throws IOException
 */
public static FileFooter readFrom(FSDataInputStream in, long maxSize) throws IOException {
    int tailOffset = 8;
    // stream size must be larger than magic + length
    if (maxSize < tailOffset) {
        throw new IOException("Invalid stream, cannot read footer: " + maxSize + " < " + tailOffset);
    // Read 8 bytes: magic 4 bytes and length of the header 4 bytes
    ByteBuffer buffer = ByteBuffer.allocate(tailOffset);
    in.readFully(maxSize - tailOffset, buffer.array(), buffer.arrayOffset(), tailOffset);

    // reconstruct magic and written bytes
    long meta = buffer.getLong();
    int magic = (int) (meta >>> 32);
    if (magic != Riff.MAGIC)
        throw new IOException("Wrong magic: " + magic + " != " + Riff.MAGIC);
    int len = (int) (meta & 0x7fffffff);
    LOG.debug("Read footer content of {} bytes", len);

    // read full footer bytes
    buffer = ByteBuffer.allocate(len);
    in.readFully(maxSize - tailOffset - len, buffer.array(), buffer.arrayOffset(), len);
    // no flip - we have not reset position
    long numRecords = buffer.getLong();
    // read file statistics
    Statistics[] fileStats = new Statistics[buffer.getInt()];
    int i = 0;
    while (i < fileStats.length) {
        fileStats[i] = Statistics.readExternal(buffer);
        LOG.debug("Read file statistics {}", fileStats[i]);
    return new FileFooter(fileStats, numRecords, buffer);

From source file:org.apache.flink.streaming.connectors.fs.bucketing.BucketingSinkFaultTolerance2ITCase.java

License:Apache License

public void postSubmit() throws Exception {
    // We read the files and verify that we have read all the strings. If a valid-length
    // file exists we only read the file to that point. (This test should work with
    // FileSystems that support truncate() and with others as well.)

    Pattern messageRegex = Pattern.compile("message (\\d*)");

    // Keep a set of the message IDs that we read. The size must equal the read count and
    // the NUM_STRINGS. If numRead is bigger than the size of the set we have seen some
    // elements twice.
    Set<Integer> readNumbers = Sets.newHashSet();
    int numRead = 0;

    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

    while (files.hasNext()) {
        LocatedFileStatus file = files.next();

        if (!file.getPath().toString().endsWith(".valid-length")) {
            int validLength = (int) file.getLen();
            if (dfs.exists(file.getPath().suffix(".valid-length"))) {
                FSDataInputStream inStream = dfs.open(file.getPath().suffix(".valid-length"));
                String validLengthString = inStream.readUTF();
                validLength = Integer.parseInt(validLengthString);
                System.out.println("VALID LENGTH: " + validLength);
            }/*  w  w  w  .j av a  2s. c o m*/
            FSDataInputStream inStream = dfs.open(file.getPath());
            byte[] buffer = new byte[validLength];
            inStream.readFully(0, buffer, 0, validLength);

            ByteArrayInputStream bais = new ByteArrayInputStream(buffer);

            InputStreamReader inStreamReader = new InputStreamReader(bais);
            BufferedReader br = new BufferedReader(inStreamReader);

            String line = br.readLine();
            while (line != null) {
                Matcher matcher = messageRegex.matcher(line);
                if (matcher.matches()) {
                    int messageId = Integer.parseInt(matcher.group(1));
                } else {
                    Assert.fail("Read line does not match expected pattern.");
                line = br.readLine();

    // Verify that we read all strings (at-least-once)
    Assert.assertEquals(NUM_STRINGS, readNumbers.size());

    // Verify that we don't have duplicates (boom!, exactly-once)
    Assert.assertEquals(NUM_STRINGS, numRead);

From source file:org.apache.flink.streaming.connectors.fs.bucketing.BucketingSinkFaultToleranceITCase.java

License:Apache License

public void postSubmit() throws Exception {
    // We read the files and verify that we have read all the strings. If a valid-length
    // file exists we only read the file to that point. (This test should work with
    // FileSystems that support truncate() and with others as well.)

    Pattern messageRegex = Pattern.compile("message (\\d*)");

    // Keep a set of the message IDs that we read. The size must equal the read count and
    // the NUM_STRINGS. If numRead is bigger than the size of the set we have seen some
    // elements twice.
    Set<Integer> readNumbers = Sets.newHashSet();

    HashSet<String> uniqMessagesRead = new HashSet<>();
    HashSet<String> messagesInCommittedFiles = new HashSet<>();

    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

    while (files.hasNext()) {
        LocatedFileStatus file = files.next();

        if (!file.getPath().toString().endsWith(".valid-length")) {
            int validLength = (int) file.getLen();
            if (dfs.exists(file.getPath().suffix(".valid-length"))) {
                FSDataInputStream inStream = dfs.open(file.getPath().suffix(".valid-length"));
                String validLengthString = inStream.readUTF();
                validLength = Integer.parseInt(validLengthString);
                System.out.println("VALID LENGTH: " + validLength);
            }//from   w  w w  . ja  v  a 2  s.c  o  m
            FSDataInputStream inStream = dfs.open(file.getPath());
            byte[] buffer = new byte[validLength];
            inStream.readFully(0, buffer, 0, validLength);

            ByteArrayInputStream bais = new ByteArrayInputStream(buffer);

            InputStreamReader inStreamReader = new InputStreamReader(bais);
            BufferedReader br = new BufferedReader(inStreamReader);

            String line = br.readLine();
            while (line != null) {
                Matcher matcher = messageRegex.matcher(line);
                if (matcher.matches()) {

                    // check that in the committed files there are no duplicates
                    if (!file.getPath().toString().endsWith(IN_PROGRESS_SUFFIX)
                            && !file.getPath().toString().endsWith(PENDING_SUFFIX)) {
                        if (!messagesInCommittedFiles.add(line)) {
                            Assert.fail("Duplicate entry in committed bucket.");

                    int messageId = Integer.parseInt(matcher.group(1));
                } else {
                    Assert.fail("Read line does not match expected pattern.");
                line = br.readLine();

    // Verify that we read all strings (at-least-once)
    Assert.assertEquals(NUM_STRINGS, readNumbers.size());

    // Verify that we don't have duplicates (boom!, exactly-once)
    Assert.assertEquals(NUM_STRINGS, uniqMessagesRead.size());

From source file:org.apache.hive.hcatalog.streaming.TestStreaming.java

License:Apache License

private void corruptDataFile(final String file, final Configuration conf, final int addRemoveBytes)
        throws Exception {
    Path bPath = new Path(file);
    Path cPath = new Path(bPath.getParent(), bPath.getName() + ".corrupt");
    FileSystem fs = bPath.getFileSystem(conf);
    FileStatus fileStatus = fs.getFileStatus(bPath);
    int len = addRemoveBytes == Integer.MIN_VALUE ? 0 : (int) fileStatus.getLen() + addRemoveBytes;
    byte[] buffer = new byte[len];
    FSDataInputStream fdis = fs.open(bPath);
    fdis.readFully(0, buffer, 0, (int) Math.min(fileStatus.getLen(), buffer.length));
    fdis.close();/*from   w  w w .  j a v  a 2s  .  c  o m*/
    FSDataOutputStream fdos = fs.create(cPath, true);
    fdos.write(buffer, 0, buffer.length);
    fs.delete(bPath, false);
    fs.rename(cPath, bPath);

From source file:org.apache.ignite.igfs.HadoopIgfsDualAbstractSelfTest.java

License:Apache License

/**
 * Check how prefetch override works.
 * @throws Exception If failed.
 */
public void testOpenPrefetchOverride() throws Exception {
    // Overview: write more than two blocks to the secondary file system, read the first two
    // blocks through a Hadoop FileSystem configured with a raised "seq reads before prefetch"
    // value, delete the file from the secondary file system, then expect an IOException when
    // the third block is read.
    // NOTE(review): this excerpt appears to have lost several statements and closing braces;
    // the gaps are flagged inline - confirm against the original source.

    // Presumably creates DIR/SUBDIR and FILE on the secondary file system (helper not shown).
    create(igfsSecondary, paths(DIR, SUBDIR), paths(FILE));

    // Write enough data to the secondary file system.
    final int blockSize = IGFS_BLOCK_SIZE;

    IgfsOutputStream out = igfsSecondary.append(FILE, false);

    int totalWritten = 0;

    // Loop until two full blocks plus one extra chunk have been accounted for.
    // NOTE(review): the statement that writes `chunk` to `out` and the loop's closing brace
    // are not visible here; neither is a close of `out`.
    while (totalWritten < blockSize * 2 + chunk.length) {

        totalWritten += chunk.length;


    awaitFileClose(igfsSecondary.asSecondary(), FILE);

    // Instantiate file system with overridden "seq reads before prefetch" property.
    Configuration cfg = new Configuration();


    // One more sequential read than the default threshold is required before prefetching.
    int seqReads = SEQ_READS_BEFORE_PREFETCH + 1;

    cfg.setInt(String.format(PARAM_IGFS_SEQ_READS_BEFORE_PREFETCH, "igfs:grid@"), seqReads);

    FileSystem fs = FileSystem.get(new URI(PRIMARY_URI), cfg);

    // Read the first two blocks.
    Path fsHome = new Path(PRIMARY_URI);
    Path dir = new Path(fsHome, DIR.name());
    Path subdir = new Path(dir, SUBDIR.name());
    Path file = new Path(subdir, FILE.name());

    // NOTE(review): no close of fsIn is visible in this excerpt.
    FSDataInputStream fsIn = fs.open(file);

    final byte[] readBuf = new byte[blockSize * 2];

    // Positioned read of exactly two blocks starting at offset 0.
    fsIn.readFully(0, readBuf, 0, readBuf.length);

    // Wait for a while for prefetch to finish (if any).
    IgfsMetaManager meta = igfs.context().meta();

    IgfsFileInfo info = meta.info(meta.fileId(FILE));

    // The trailing 2 is presumably the zero-based index of the third block - TODO confirm.
    IgfsBlockKey key = new IgfsBlockKey(info.id(), info.affinityKey(), info.evictExclude(), 2);

    // NOTE(review): this statement is truncated (no cache name argument chain, no semicolon).
    GridCache<IgfsBlockKey, byte[]> dataCache = igfs.context().kernalContext().cache()

    // Polls up to 10 times for the third block to appear in the data cache.
    // NOTE(review): the body of the if (and any wait between polls) is not visible here.
    for (int i = 0; i < 10; i++) {
        if (dataCache.containsKey(key))


    // Remove the file from the secondary file system.
    igfsSecondary.delete(FILE, false);

    // Try reading the third block. Should fail.
    GridTestUtils.assertThrows(log, new Callable<Object>() {
        public Object call() throws Exception {
            IgfsInputStream in0 = igfs.open(FILE);

            // Position the stream at the start of the third block.
            in0.seek(blockSize * 2);

            // NOTE(review): the read inside try and the cleanup inside finally are missing.
            try {
            } finally {

            return null;
    }, IOException.class, "Failed to read data due to secondary file system exception: /dir/subdir/file");

From source file:org.apache.ignite.igfs.IgfsHadoopDualAbstractSelfTest.java

License:Apache License

/**
 * Check how prefetch override works.
 * @throws Exception If failed.
 */
public void testOpenPrefetchOverride() throws Exception {
    // Overview: write more than two blocks to the secondary file system, read the first two
    // blocks through a Hadoop FileSystem configured with a raised "seq reads before prefetch"
    // value, delete the file from the secondary file system, then expect an IOException when
    // the third block is read.
    // NOTE(review): this excerpt appears to have lost several statements and closing braces;
    // the gaps are flagged inline - confirm against the original source.

    // Presumably creates DIR/SUBDIR and FILE on the secondary file system (helper not shown).
    create(igfsSecondary, paths(DIR, SUBDIR), paths(FILE));

    // Write enough data to the secondary file system.
    final int blockSize = IGFS_BLOCK_SIZE;

    IgfsOutputStream out = igfsSecondary.append(FILE, false);

    int totalWritten = 0;

    // Loop until two full blocks plus one extra chunk have been accounted for.
    // NOTE(review): the statement that writes `chunk` to `out` and the loop's closing brace
    // are not visible here; neither is a close of `out`.
    while (totalWritten < blockSize * 2 + chunk.length) {

        totalWritten += chunk.length;


    awaitFileClose(igfsSecondary, FILE);

    // Instantiate file system with overridden "seq reads before prefetch" property.
    Configuration cfg = new Configuration();


    // One more sequential read than the default threshold is required before prefetching.
    int seqReads = SEQ_READS_BEFORE_PREFETCH + 1;

    cfg.setInt(String.format(PARAM_IGFS_SEQ_READS_BEFORE_PREFETCH, "igfs:grid@"), seqReads);

    FileSystem fs = FileSystem.get(new URI(PRIMARY_URI), cfg);

    // Read the first two blocks.
    Path fsHome = new Path(PRIMARY_URI);
    Path dir = new Path(fsHome, DIR.name());
    Path subdir = new Path(dir, SUBDIR.name());
    Path file = new Path(subdir, FILE.name());

    // NOTE(review): no close of fsIn is visible in this excerpt.
    FSDataInputStream fsIn = fs.open(file);

    final byte[] readBuf = new byte[blockSize * 2];

    // Positioned read of exactly two blocks starting at offset 0.
    fsIn.readFully(0, readBuf, 0, readBuf.length);

    // Wait for a while for prefetch to finish (if any).
    IgfsMetaManager meta = igfs.context().meta();

    IgfsFileInfo info = meta.info(meta.fileId(FILE));

    // The trailing 2 is presumably the zero-based index of the third block - TODO confirm.
    IgfsBlockKey key = new IgfsBlockKey(info.id(), info.affinityKey(), info.evictExclude(), 2);

    // NOTE(review): this statement is truncated (no cache name argument chain, no semicolon).
    GridCache<IgfsBlockKey, byte[]> dataCache = igfs.context().kernalContext().cache()

    // Polls up to 10 times for the third block to appear in the data cache.
    // NOTE(review): the body of the if (and any wait between polls) is not visible here.
    for (int i = 0; i < 10; i++) {
        if (dataCache.containsKey(key))


    // Remove the file from the secondary file system.
    igfsSecondary.delete(FILE, false);

    // Try reading the third block. Should fail.
    GridTestUtils.assertThrows(log, new Callable<Object>() {
        public Object call() throws Exception {
            IgfsInputStream in0 = igfs.open(FILE);

            // Position the stream at the start of the third block.
            in0.seek(blockSize * 2);

            // NOTE(review): the read inside try and the cleanup inside finally are missing.
            try {
            } finally {

            return null;
    }, IOException.class, "Failed to read data due to secondary file system exception: /dir/subdir/file");

From source file:org.apache.ignite.internal.processors.hadoop.impl.igfs.HadoopIgfsDualAbstractSelfTest.java

License:Apache License

/**
 * Check how prefetch override works.
 * @throws Exception If failed.
 */
public void testOpenPrefetchOverride() throws Exception {
    // Overview: write more than two blocks to the secondary file system, read the first two
    // blocks through a Hadoop FileSystem configured with a raised "seq reads before prefetch"
    // value, delete the file from the secondary file system, then expect an IOException when
    // the third block is read.
    // NOTE(review): this excerpt appears to have lost several statements and closing braces;
    // the gaps are flagged inline - confirm against the original source.

    // Presumably creates DIR/SUBDIR and FILE on the secondary file system (helper not shown).
    create(igfsSecondary, paths(DIR, SUBDIR), paths(FILE));

    // Write enough data to the secondary file system.
    final int blockSize = IGFS_BLOCK_SIZE;

    IgfsOutputStream out = igfsSecondary.append(FILE, false);

    int totalWritten = 0;

    // Loop until two full blocks plus one extra chunk have been accounted for.
    // NOTE(review): the statement that writes `chunk` to `out` and the loop's closing brace
    // are not visible here; neither is a close of `out`.
    while (totalWritten < blockSize * 2 + chunk.length) {

        totalWritten += chunk.length;


    awaitFileClose(igfsSecondary, FILE);

    // Instantiate file system with overridden "seq reads before prefetch" property.
    Configuration cfg = new Configuration();


    // One more sequential read than the default threshold is required before prefetching.
    int seqReads = SEQ_READS_BEFORE_PREFETCH + 1;

    cfg.setInt(String.format(PARAM_IGFS_SEQ_READS_BEFORE_PREFETCH, "igfs@"), seqReads);

    FileSystem fs = FileSystem.get(new URI(PRIMARY_URI), cfg);

    // Read the first two blocks.
    Path fsHome = new Path(PRIMARY_URI);
    Path dir = new Path(fsHome, DIR.name());
    Path subdir = new Path(dir, SUBDIR.name());
    Path file = new Path(subdir, FILE.name());

    // NOTE(review): no close of fsIn is visible in this excerpt.
    FSDataInputStream fsIn = fs.open(file);

    final byte[] readBuf = new byte[blockSize * 2];

    // Positioned read of exactly two blocks starting at offset 0.
    fsIn.readFully(0, readBuf, 0, readBuf.length);

    // Wait for a while for prefetch to finish (if any).
    IgfsMetaManager meta = igfs.context().meta();

    IgfsEntryInfo info = meta.info(meta.fileId(FILE));

    // The trailing 2 is presumably the zero-based index of the third block - TODO confirm.
    IgfsBlockKey key = new IgfsBlockKey(info.id(), info.affinityKey(), info.evictExclude(), 2);

    // NOTE(review): this statement is truncated (no cache name argument chain, no semicolon).
    IgniteCache<IgfsBlockKey, byte[]> dataCache = igfs.context().kernalContext().cache()

    // Polls up to 10 times for the third block to appear in the data cache.
    // NOTE(review): the body of the if (and any wait between polls) is not visible here.
    for (int i = 0; i < 10; i++) {
        if (dataCache.containsKey(key))


    // Remove the file from the secondary file system.
    igfsSecondary.delete(FILE, false);

    // Try reading the third block. Should fail.
    GridTestUtils.assertThrows(log, new Callable<Object>() {
        public Object call() throws Exception {
            IgfsInputStream in0 = igfs.open(FILE);

            // Position the stream at the start of the third block.
            in0.seek(blockSize * 2);

            // NOTE(review): the read inside try and the cleanup inside finally are missing.
            try {
            } finally {

            return null;
    }, IOException.class, "Failed to read data due to secondary file system exception: /dir/subdir/file");

From source file:org.apache.orc.impl.ReaderImpl.java

License:Apache License

/**
 * Ensure this is an ORC file to prevent users from trying to read text
 * files or RC files as ORC files.
 * @param in the file being read
 * @param path the filename for error messages
 * @param psLen the postscript length
 * @param buffer the tail of the file
 * @throws IOException
 */
protected static void ensureOrcFooter(FSDataInputStream in, Path path, int psLen, ByteBuffer buffer)
        throws IOException {
    int magicLength = OrcFile.MAGIC.length();
    int fullLength = magicLength + 1;
    if (psLen < fullLength || buffer.remaining() < fullLength) {
        throw new FileFormatException("Malformed ORC file " + path + ". Invalid postscript length " + psLen);
    int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
    byte[] array = buffer.array();
    // now look for the magic string at the end of the postscript.
    if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
        // If it isn't there, this may be the 0.11.0 version of ORC.
        // Read the first 3 bytes of the file to check for the header
        byte[] header = new byte[magicLength];
        in.readFully(0, header, 0, magicLength);
        // if it isn't there, this isn't an ORC file
        if (!Text.decode(header, 0, magicLength).equals(OrcFile.MAGIC)) {
            throw new FileFormatException("Malformed ORC file " + path + ". Invalid postscript.");

From source file:org.apache.orc.impl.ReaderImpl.java

License:Apache License

/**
 * Opens {@code path}, reads the end of the file and locates the PostScript, footer and
 * metadata sections, then returns them wrapped in an {@link OrcTail}.
 *
 * <p>Also assigns {@code bufferSize} and {@code codec} from the PostScript.
 *
 * <p>NOTE(review): this excerpt appears to have lost several statements and closing braces
 * (flagged inline); confirm against the original source before relying on it.
 *
 * @param fs file system used to open the file and, when needed, fetch its status
 * @param path file to read
 * @param maxFileLength length to treat as the file size, or {@code Long.MAX_VALUE} to use the
 *     length reported by the file system
 * @return the extracted file tail
 * @throws IOException if the file cannot be opened or read
 */
protected OrcTail extractFileTail(FileSystem fs, Path path, long maxFileLength) throws IOException {
    FSDataInputStream file = fs.open(path);
    ByteBuffer buffer;
    OrcProto.PostScript ps;
    OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder();
    long modificationTime;
    try {
        // figure out the size of the file using the option or filesystem
        long size;
        if (maxFileLength == Long.MAX_VALUE) {
            FileStatus fileStatus = fs.getFileStatus(path);
            size = fileStatus.getLen();
            modificationTime = fileStatus.getModificationTime();
        } else {
            size = maxFileLength;
            // No file status is fetched on this path, so -1 is stored instead of a real time.
            modificationTime = -1;

        //read last bytes into buffer to get PostScript
        int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
        buffer = ByteBuffer.allocate(readSize);
        assert buffer.position() == 0;
        file.readFully((size - readSize), buffer.array(), buffer.arrayOffset(), readSize);

        //read the PostScript
        //get length of PostScript
        // The last byte read holds the PostScript length; the mask reads it as unsigned.
        int psLen = buffer.get(readSize - 1) & 0xff;
        ensureOrcFooter(file, path, psLen, buffer);
        int psOffset = readSize - 1 - psLen;
        ps = extractPostScript(buffer, path, psLen, psOffset);
        bufferSize = (int) ps.getCompressionBlockSize();
        codec = WriterImpl.createCodec(CompressionKind.valueOf(ps.getCompression().name()));

        int footerSize = (int) ps.getFooterLength();
        int metadataSize = (int) ps.getMetadataLength();

        //check if extra bytes need to be read
        // extra = number of tail bytes that the initial read did not cover (0 if all were read).
        int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize);
        int tailSize = 1 + psLen + footerSize + metadataSize;
        if (extra > 0) {
            //more bytes need to be read, seek back to the right place and read extra bytes
            ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize);
            file.readFully((size - readSize - extra), extraBuf.array(),
                    extraBuf.arrayOffset() + extraBuf.position(), extra);
            //append with already read bytes
            // NOTE(review): the copy of the previously read bytes into extraBuf is not visible
            // here; as shown, only the first `extra` bytes of extraBuf are filled.
            buffer = extraBuf;
            readSize += extra;
            psOffset = readSize - 1 - psLen;
        } else {
            //footer is already in the bytes in buffer, just adjust position, length
            buffer.position(psOffset - footerSize - metadataSize);
            buffer.limit(buffer.position() + tailSize);

        // NOTE(review): footerOffset is not used in the visible code.
        int footerOffset = psOffset - footerSize;
        ByteBuffer footerBuffer = buffer.slice();
        // NOTE(review): nothing visible stores `footer` (or `ps`) into fileTailBuilder.
        OrcProto.Footer footer = extractFooter(footerBuffer, 0, footerSize, codec, bufferSize);
    } finally {
        // NOTE(review): the try body (presumably closing `file`) is missing from this excerpt.
        try {
        } catch (IOException ex) {
            LOG.error("Failed to close the file after another error", ex);

    // NOTE(review): serializedTail is allocated but no bytes are copied into it in the visible code.
    ByteBuffer serializedTail = ByteBuffer.allocate(buffer.remaining());
    return new OrcTail(fileTailBuilder.build(), serializedTail, modificationTime);

From source file:org.apache.orc.impl.RecordReaderUtils.java

License:Apache License

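/**
 * Read the list of ranges from the file.
 * @param file the file to read
 * @param zcr the zero-copy reader shim to read with, or null to use regular positioned reads
 * @param base the base of the stripe
 * @param range the disk ranges within the stripe to read
 * @param doForceDirect whether to allocate direct byte buffers for the data that is read
 * @return the bytes read for each disk range, which is the same length as
 *    ranges
 * @throws IOException
 */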
// Walks the linked list starting at `range` and builds a BufferChunk for each entry, read
// either through the zero-copy shim (when zcr is non-null) or with a positioned readFully.
// NOTE(review): this excerpt appears to have lost several statements and closing braces
// (flagged inline); confirm against the original source before relying on it.
static DiskRangeList readDiskRanges(FSDataInputStream file, HadoopShims.ZeroCopyReaderShim zcr, long base,
        DiskRangeList range, boolean doForceDirect) throws IOException {
    if (range == null)
        return null;
    // Keep a node in front of the first range so the (possibly replaced) head can be returned
    // as prev.next at the end.
    DiskRangeList prev = range.prev;
    if (prev == null) {
        prev = new MutateHelper(range);
    while (range != null) {
        // NOTE(review): the rest of this branch (presumably skipping to the next iteration)
        // and its closing brace are not visible here.
        if (range.hasData()) {
            range = range.next;
        int len = (int) (range.getEnd() - range.getOffset());
        long off = range.getOffset();
        if (zcr != null) {
            file.seek(base + off);
            boolean hasReplaced = false;
            // A single readBuffer call may return fewer than len bytes, so loop until the
            // whole range has been consumed, advancing off by what was actually read.
            while (len > 0) {
                ByteBuffer partial = zcr.readBuffer(len, false);
                BufferChunk bc = new BufferChunk(partial, off);
                // NOTE(review): both branch bodies are incomplete - the calls that link `bc`
                // into the list are not visible in this excerpt.
                if (!hasReplaced) {
                    hasReplaced = true;
                } else {
                range = bc;
                int read = partial.remaining();
                len -= read;
                off += read;
        } else {
            // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless.
            byte[] buffer = new byte[len];
            file.readFully((base + off), buffer, 0, buffer.length);
            ByteBuffer bb = null;
            if (doForceDirect) {
                // NOTE(review): no copy of `buffer` into the direct buffer is visible here.
                bb = ByteBuffer.allocateDirect(len);
            } else {
                bb = ByteBuffer.wrap(buffer);
            range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset()));
        range = range.next;
    return prev.next;