Example usage for org.apache.hadoop.fs FSDataInputStream readFully

List of usage examples for org.apache.hadoop.fs FSDataInputStream readFully

Introduction

This page collects usage examples for org.apache.hadoop.fs FSDataInputStream readFully.

Prototype

@Override
public void readFully(long position, byte[] buffer, int offset, int length) throws IOException 

Document

Read bytes from the given position in the stream to the given buffer.
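
Before the project-specific examples below, here is a minimal, self-contained sketch of calling this positional read. It assumes the default Hadoop file system from an empty Configuration; the file path, tail size, and class name are illustrative only and do not come from any of the examples on this page.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadFullyExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical file path used only for illustration.
        Path path = new Path("/tmp/example.bin");
        FileSystem fs = path.getFileSystem(conf);

        long fileLen = fs.getFileStatus(path).getLen();
        // Read up to the last 16 bytes of the file.
        int tailSize = (int) Math.min(16L, fileLen);
        byte[] buffer = new byte[tailSize];

        try (FSDataInputStream in = fs.open(path)) {
            // Positional read: fills buffer[0..tailSize) starting at the given
            // file offset, without moving the stream's current position. Unlike
            // a plain read(), it does not return until the requested number of
            // bytes has been read (or it throws an EOFException).
            in.readFully(fileLen - tailSize, buffer, 0, tailSize);
        }
    }
}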

Usage

From source file:com.github.sadikovi.riff.FileFooter.java

License:Open Source License

/**
 * Read footer from input stream.
 * Footer is assumed to be placed at the end of the stream. Seek is performed inside the method.
 * Stream is not closed after operation is complete.
 * @param in input stream
 * @param maxSize maximum stream size
 * @throws IOException
 */
public static FileFooter readFrom(FSDataInputStream in, long maxSize) throws IOException {
    int tailOffset = 8;
    // stream size must be larger than magic + length
    if (maxSize < tailOffset) {
        throw new IOException("Invalid stream, cannot read footer: " + maxSize + " < " + tailOffset);
    }
    // Read 8 bytes: magic 4 bytes and length of the header 4 bytes
    ByteBuffer buffer = ByteBuffer.allocate(tailOffset);
    in.readFully(maxSize - tailOffset, buffer.array(), buffer.arrayOffset(), tailOffset);

    // reconstruct magic and written bytes
    long meta = buffer.getLong();
    int magic = (int) (meta >>> 32);
    if (magic != Riff.MAGIC)
        throw new IOException("Wrong magic: " + magic + " != " + Riff.MAGIC);
    int len = (int) (meta & 0x7fffffff);
    LOG.debug("Read footer content of {} bytes", len);

    // read full footer bytes
    buffer = ByteBuffer.allocate(len);
    in.readFully(maxSize - tailOffset - len, buffer.array(), buffer.arrayOffset(), len);
    // no flip - we have not reset position
    long numRecords = buffer.getLong();
    // read file statistics
    Statistics[] fileStats = new Statistics[buffer.getInt()];
    int i = 0;
    while (i < fileStats.length) {
        fileStats[i] = Statistics.readExternal(buffer);
        LOG.debug("Read file statistics {}", fileStats[i]);
        ++i;
    }
    return new FileFooter(fileStats, numRecords, buffer);
}
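
For context, a hypothetical writer-side helper (not part of the riff project quoted above) could produce the 8-byte tail that readFrom() expects. The layout here, footer body followed by a 4-byte magic and a 4-byte footer length in big-endian order, is inferred from the read code above; the method name and the assumed imports (org.apache.hadoop.fs.FSDataOutputStream, java.io.IOException) are illustrative only.

// Hypothetical sketch only: writes a footer body followed by the 8-byte tail
// ([4-byte magic][4-byte length]) that readFrom() above decodes with
// ByteBuffer.getLong(). FSDataOutputStream extends DataOutputStream, so
// writeInt() emits big-endian values, matching ByteBuffer's default order.
static void writeFooterTail(FSDataOutputStream out, byte[] footerBytes, int magic) throws IOException {
    out.write(footerBytes);            // footer body of `len` bytes
    out.writeInt(magic);               // 4 bytes: magic
    out.writeInt(footerBytes.length);  // 4 bytes: footer length
}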

From source file:org.apache.flink.streaming.connectors.fs.bucketing.BucketingSinkFaultTolerance2ITCase.java

License:Apache License

@Override
public void postSubmit() throws Exception {
    // We read the files and verify that we have read all the strings. If a valid-length
    // file exists we only read the file to that point. (This test should work with
    // FileSystems that support truncate() and with others as well.)

    Pattern messageRegex = Pattern.compile("message (\\d*)");

    // Keep a set of the message IDs that we read. The size must equal the read count and
    // the NUM_STRINGS. If numRead is bigger than the size of the set we have seen some
    // elements twice.
    Set<Integer> readNumbers = Sets.newHashSet();
    int numRead = 0;

    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

    while (files.hasNext()) {
        LocatedFileStatus file = files.next();

        if (!file.getPath().toString().endsWith(".valid-length")) {
            int validLength = (int) file.getLen();
            if (dfs.exists(file.getPath().suffix(".valid-length"))) {
                FSDataInputStream inStream = dfs.open(file.getPath().suffix(".valid-length"));
                String validLengthString = inStream.readUTF();
                validLength = Integer.parseInt(validLengthString);
                System.out.println("VALID LENGTH: " + validLength);
            }
            FSDataInputStream inStream = dfs.open(file.getPath());
            byte[] buffer = new byte[validLength];
            inStream.readFully(0, buffer, 0, validLength);
            inStream.close();

            ByteArrayInputStream bais = new ByteArrayInputStream(buffer);

            InputStreamReader inStreamReader = new InputStreamReader(bais);
            BufferedReader br = new BufferedReader(inStreamReader);

            String line = br.readLine();
            while (line != null) {
                Matcher matcher = messageRegex.matcher(line);
                if (matcher.matches()) {
                    numRead++;
                    int messageId = Integer.parseInt(matcher.group(1));
                    readNumbers.add(messageId);
                } else {
                    Assert.fail("Read line does not match expected pattern.");
                }
                line = br.readLine();
            }
            br.close();
            inStreamReader.close();
            bais.close();
        }
    }

    // Verify that we read all strings (at-least-once)
    Assert.assertEquals(NUM_STRINGS, readNumbers.size());

    // Verify that we don't have duplicates (boom!, exactly-once)
    Assert.assertEquals(NUM_STRINGS, numRead);
}

From source file:org.apache.flink.streaming.connectors.fs.bucketing.BucketingSinkFaultToleranceITCase.java

License:Apache License

@Override
public void postSubmit() throws Exception {
    // We read the files and verify that we have read all the strings. If a valid-length
    // file exists we only read the file to that point. (This test should work with
    // FileSystems that support truncate() and with others as well.)

    Pattern messageRegex = Pattern.compile("message (\\d*)");

    // Keep a set of the message IDs that we read. The size must equal the read count and
    // the NUM_STRINGS. If numRead is bigger than the size of the set we have seen some
    // elements twice.
    Set<Integer> readNumbers = Sets.newHashSet();

    HashSet<String> uniqMessagesRead = new HashSet<>();
    HashSet<String> messagesInCommittedFiles = new HashSet<>();

    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

    while (files.hasNext()) {
        LocatedFileStatus file = files.next();

        if (!file.getPath().toString().endsWith(".valid-length")) {
            int validLength = (int) file.getLen();
            if (dfs.exists(file.getPath().suffix(".valid-length"))) {
                FSDataInputStream inStream = dfs.open(file.getPath().suffix(".valid-length"));
                String validLengthString = inStream.readUTF();
                validLength = Integer.parseInt(validLengthString);
                System.out.println("VALID LENGTH: " + validLength);
            }
            FSDataInputStream inStream = dfs.open(file.getPath());
            byte[] buffer = new byte[validLength];
            inStream.readFully(0, buffer, 0, validLength);
            inStream.close();

            ByteArrayInputStream bais = new ByteArrayInputStream(buffer);

            InputStreamReader inStreamReader = new InputStreamReader(bais);
            BufferedReader br = new BufferedReader(inStreamReader);

            String line = br.readLine();
            while (line != null) {
                Matcher matcher = messageRegex.matcher(line);
                if (matcher.matches()) {
                    uniqMessagesRead.add(line);

                    // check that in the committed files there are no duplicates
                    if (!file.getPath().toString().endsWith(IN_PROGRESS_SUFFIX)
                            && !file.getPath().toString().endsWith(PENDING_SUFFIX)) {
                        if (!messagesInCommittedFiles.add(line)) {
                            Assert.fail("Duplicate entry in committed bucket.");
                        }
                    }

                    int messageId = Integer.parseInt(matcher.group(1));
                    readNumbers.add(messageId);
                } else {
                    Assert.fail("Read line does not match expected pattern.");
                }
                line = br.readLine();
            }
            br.close();
            inStreamReader.close();
            bais.close();
        }
    }

    // Verify that we read all strings (at-least-once)
    Assert.assertEquals(NUM_STRINGS, readNumbers.size());

    // Verify that we don't have duplicates (boom!, exactly-once)
    Assert.assertEquals(NUM_STRINGS, uniqMessagesRead.size());
}

From source file:org.apache.hive.hcatalog.streaming.TestStreaming.java

License:Apache License

private void corruptDataFile(final String file, final Configuration conf, final int addRemoveBytes)
        throws Exception {
    Path bPath = new Path(file);
    Path cPath = new Path(bPath.getParent(), bPath.getName() + ".corrupt");
    FileSystem fs = bPath.getFileSystem(conf);
    FileStatus fileStatus = fs.getFileStatus(bPath);
    int len = addRemoveBytes == Integer.MIN_VALUE ? 0 : (int) fileStatus.getLen() + addRemoveBytes;
    byte[] buffer = new byte[len];
    FSDataInputStream fdis = fs.open(bPath);
    fdis.readFully(0, buffer, 0, (int) Math.min(fileStatus.getLen(), buffer.length));
    fdis.close();
    FSDataOutputStream fdos = fs.create(cPath, true);
    fdos.write(buffer, 0, buffer.length);
    fdos.close();
    fs.delete(bPath, false);
    fs.rename(cPath, bPath);
}

From source file:org.apache.ignite.igfs.HadoopIgfsDualAbstractSelfTest.java

License:Apache License

/**
 * Check how prefetch override works.
 *
 * @throws Exception If failed.
 */
public void testOpenPrefetchOverride() throws Exception {
    create(igfsSecondary, paths(DIR, SUBDIR), paths(FILE));

    // Write enough data to the secondary file system.
    final int blockSize = IGFS_BLOCK_SIZE;

    IgfsOutputStream out = igfsSecondary.append(FILE, false);

    int totalWritten = 0;

    while (totalWritten < blockSize * 2 + chunk.length) {
        out.write(chunk);

        totalWritten += chunk.length;
    }

    out.close();

    awaitFileClose(igfsSecondary.asSecondary(), FILE);

    // Instantiate file system with overridden "seq reads before prefetch" property.
    Configuration cfg = new Configuration();

    cfg.addResource(U.resolveIgniteUrl(PRIMARY_CFG));

    int seqReads = SEQ_READS_BEFORE_PREFETCH + 1;

    cfg.setInt(String.format(PARAM_IGFS_SEQ_READS_BEFORE_PREFETCH, "igfs:grid@"), seqReads);

    FileSystem fs = FileSystem.get(new URI(PRIMARY_URI), cfg);

    // Read the first two blocks.
    Path fsHome = new Path(PRIMARY_URI);
    Path dir = new Path(fsHome, DIR.name());
    Path subdir = new Path(dir, SUBDIR.name());
    Path file = new Path(subdir, FILE.name());

    FSDataInputStream fsIn = fs.open(file);

    final byte[] readBuf = new byte[blockSize * 2];

    fsIn.readFully(0, readBuf, 0, readBuf.length);

    // Wait for a while for prefetch to finish (if any).
    IgfsMetaManager meta = igfs.context().meta();

    IgfsFileInfo info = meta.info(meta.fileId(FILE));

    IgfsBlockKey key = new IgfsBlockKey(info.id(), info.affinityKey(), info.evictExclude(), 2);

    GridCache<IgfsBlockKey, byte[]> dataCache = igfs.context().kernalContext().cache()
            .cache(igfs.configuration().getDataCacheName());

    for (int i = 0; i < 10; i++) {
        if (dataCache.containsKey(key))
            break;
        else
            U.sleep(100);
    }

    fsIn.close();

    // Remove the file from the secondary file system.
    igfsSecondary.delete(FILE, false);

    // Try reading the third block. Should fail.
    GridTestUtils.assertThrows(log, new Callable<Object>() {
        @Override
        public Object call() throws Exception {
            IgfsInputStream in0 = igfs.open(FILE);

            in0.seek(blockSize * 2);

            try {
                in0.read(readBuf);
            } finally {
                U.closeQuiet(in0);
            }

            return null;
        }
    }, IOException.class, "Failed to read data due to secondary file system exception: /dir/subdir/file");
}

From source file:org.apache.ignite.igfs.IgfsHadoopDualAbstractSelfTest.java

License:Apache License

/**
 * Check how prefetch override works.
 *
 * @throws Exception If failed.
 */
public void testOpenPrefetchOverride() throws Exception {
    create(igfsSecondary, paths(DIR, SUBDIR), paths(FILE));

    // Write enough data to the secondary file system.
    final int blockSize = IGFS_BLOCK_SIZE;

    IgfsOutputStream out = igfsSecondary.append(FILE, false);

    int totalWritten = 0;

    while (totalWritten < blockSize * 2 + chunk.length) {
        out.write(chunk);

        totalWritten += chunk.length;
    }

    out.close();

    awaitFileClose(igfsSecondary, FILE);

    // Instantiate file system with overridden "seq reads before prefetch" property.
    Configuration cfg = new Configuration();

    cfg.addResource(U.resolveIgniteUrl(PRIMARY_CFG));

    int seqReads = SEQ_READS_BEFORE_PREFETCH + 1;

    cfg.setInt(String.format(PARAM_IGFS_SEQ_READS_BEFORE_PREFETCH, "igfs:grid@"), seqReads);

    FileSystem fs = FileSystem.get(new URI(PRIMARY_URI), cfg);

    // Read the first two blocks.
    Path fsHome = new Path(PRIMARY_URI);
    Path dir = new Path(fsHome, DIR.name());
    Path subdir = new Path(dir, SUBDIR.name());
    Path file = new Path(subdir, FILE.name());

    FSDataInputStream fsIn = fs.open(file);

    final byte[] readBuf = new byte[blockSize * 2];

    fsIn.readFully(0, readBuf, 0, readBuf.length);

    // Wait for a while for prefetch to finish (if any).
    IgfsMetaManager meta = igfs.context().meta();

    IgfsFileInfo info = meta.info(meta.fileId(FILE));

    IgfsBlockKey key = new IgfsBlockKey(info.id(), info.affinityKey(), info.evictExclude(), 2);

    GridCache<IgfsBlockKey, byte[]> dataCache = igfs.context().kernalContext().cache()
            .cache(igfs.configuration().getDataCacheName());

    for (int i = 0; i < 10; i++) {
        if (dataCache.containsKey(key))
            break;
        else
            U.sleep(100);
    }

    fsIn.close();

    // Remove the file from the secondary file system.
    igfsSecondary.delete(FILE, false);

    // Try reading the third block. Should fail.
    GridTestUtils.assertThrows(log, new Callable<Object>() {
        @Override
        public Object call() throws Exception {
            IgfsInputStream in0 = igfs.open(FILE);

            in0.seek(blockSize * 2);

            try {
                in0.read(readBuf);
            } finally {
                U.closeQuiet(in0);
            }

            return null;
        }
    }, IOException.class, "Failed to read data due to secondary file system exception: /dir/subdir/file");
}

From source file:org.apache.ignite.internal.processors.hadoop.impl.igfs.HadoopIgfsDualAbstractSelfTest.java

License:Apache License

/**
 * Check how prefetch override works.
 *
 * @throws Exception If failed.
 */
public void testOpenPrefetchOverride() throws Exception {
    create(igfsSecondary, paths(DIR, SUBDIR), paths(FILE));

    // Write enough data to the secondary file system.
    final int blockSize = IGFS_BLOCK_SIZE;

    IgfsOutputStream out = igfsSecondary.append(FILE, false);

    int totalWritten = 0;

    while (totalWritten < blockSize * 2 + chunk.length) {
        out.write(chunk);

        totalWritten += chunk.length;
    }

    out.close();

    awaitFileClose(igfsSecondary, FILE);

    // Instantiate file system with overridden "seq reads before prefetch" property.
    Configuration cfg = new Configuration();

    cfg.addResource(U.resolveIgniteUrl(PRIMARY_CFG));

    int seqReads = SEQ_READS_BEFORE_PREFETCH + 1;

    cfg.setInt(String.format(PARAM_IGFS_SEQ_READS_BEFORE_PREFETCH, "igfs@"), seqReads);

    FileSystem fs = FileSystem.get(new URI(PRIMARY_URI), cfg);

    // Read the first two blocks.
    Path fsHome = new Path(PRIMARY_URI);
    Path dir = new Path(fsHome, DIR.name());
    Path subdir = new Path(dir, SUBDIR.name());
    Path file = new Path(subdir, FILE.name());

    FSDataInputStream fsIn = fs.open(file);

    final byte[] readBuf = new byte[blockSize * 2];

    fsIn.readFully(0, readBuf, 0, readBuf.length);

    // Wait for a while for prefetch to finish (if any).
    IgfsMetaManager meta = igfs.context().meta();

    IgfsEntryInfo info = meta.info(meta.fileId(FILE));

    IgfsBlockKey key = new IgfsBlockKey(info.id(), info.affinityKey(), info.evictExclude(), 2);

    IgniteCache<IgfsBlockKey, byte[]> dataCache = igfs.context().kernalContext().cache()
            .jcache(igfs.configuration().getDataCacheConfiguration().getName());

    for (int i = 0; i < 10; i++) {
        if (dataCache.containsKey(key))
            break;
        else
            U.sleep(100);
    }

    fsIn.close();

    // Remove the file from the secondary file system.
    igfsSecondary.delete(FILE, false);

    // Try reading the third block. Should fail.
    GridTestUtils.assertThrows(log, new Callable<Object>() {
        @Override
        public Object call() throws Exception {
            IgfsInputStream in0 = igfs.open(FILE);

            in0.seek(blockSize * 2);

            try {
                in0.read(readBuf);
            } finally {
                U.closeQuiet(in0);
            }

            return null;
        }
    }, IOException.class, "Failed to read data due to secondary file system exception: /dir/subdir/file");
}

From source file:org.apache.orc.impl.ReaderImpl.java

License:Apache License

/**
 * Ensure this is an ORC file to prevent users from trying to read text
 * files or RC files as ORC files.
 * @param in the file being read
 * @param path the filename for error messages
 * @param psLen the postscript length
 * @param buffer the tail of the file
 * @throws IOException
 */
protected static void ensureOrcFooter(FSDataInputStream in, Path path, int psLen, ByteBuffer buffer)
        throws IOException {
    int magicLength = OrcFile.MAGIC.length();
    int fullLength = magicLength + 1;
    if (psLen < fullLength || buffer.remaining() < fullLength) {
        throw new FileFormatException("Malformed ORC file " + path + ". Invalid postscript length " + psLen);
    }
    int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
    byte[] array = buffer.array();
    // now look for the magic string at the end of the postscript.
    if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
        // If it isn't there, this may be the 0.11.0 version of ORC.
        // Read the first 3 bytes of the file to check for the header
        byte[] header = new byte[magicLength];
        in.readFully(0, header, 0, magicLength);
        // if it isn't there, this isn't an ORC file
        if (!Text.decode(header, 0, magicLength).equals(OrcFile.MAGIC)) {
            throw new FileFormatException("Malformed ORC file " + path + ". Invalid postscript.");
        }
    }
}

From source file:org.apache.orc.impl.ReaderImpl.java

License:Apache License

protected OrcTail extractFileTail(FileSystem fs, Path path, long maxFileLength) throws IOException {
    FSDataInputStream file = fs.open(path);
    ByteBuffer buffer;
    OrcProto.PostScript ps;
    OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder();
    long modificationTime;
    try {
        // figure out the size of the file using the option or filesystem
        long size;
        if (maxFileLength == Long.MAX_VALUE) {
            FileStatus fileStatus = fs.getFileStatus(path);
            size = fileStatus.getLen();
            modificationTime = fileStatus.getModificationTime();
        } else {
            size = maxFileLength;
            modificationTime = -1;
        }
        fileTailBuilder.setFileLength(size);

        //read last bytes into buffer to get PostScript
        int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
        buffer = ByteBuffer.allocate(readSize);
        assert buffer.position() == 0;
        file.readFully((size - readSize), buffer.array(), buffer.arrayOffset(), readSize);
        buffer.position(0);

        //read the PostScript
        //get length of PostScript
        int psLen = buffer.get(readSize - 1) & 0xff;
        ensureOrcFooter(file, path, psLen, buffer);
        int psOffset = readSize - 1 - psLen;
        ps = extractPostScript(buffer, path, psLen, psOffset);
        bufferSize = (int) ps.getCompressionBlockSize();
        codec = WriterImpl.createCodec(CompressionKind.valueOf(ps.getCompression().name()));
        fileTailBuilder.setPostscriptLength(psLen).setPostscript(ps);

        int footerSize = (int) ps.getFooterLength();
        int metadataSize = (int) ps.getMetadataLength();

        //check if extra bytes need to be read
        int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize);
        int tailSize = 1 + psLen + footerSize + metadataSize;
        if (extra > 0) {
            //more bytes need to be read, seek back to the right place and read extra bytes
            ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize);
            file.readFully((size - readSize - extra), extraBuf.array(),
                    extraBuf.arrayOffset() + extraBuf.position(), extra);
            extraBuf.position(extra);
            //append with already read bytes
            extraBuf.put(buffer);
            buffer = extraBuf;
            buffer.position(0);
            buffer.limit(tailSize);
            readSize += extra;
            psOffset = readSize - 1 - psLen;
        } else {
            //footer is already in the bytes in buffer, just adjust position, length
            buffer.position(psOffset - footerSize - metadataSize);
            buffer.limit(buffer.position() + tailSize);
        }

        buffer.mark();
        int footerOffset = psOffset - footerSize;
        buffer.position(footerOffset);
        ByteBuffer footerBuffer = buffer.slice();
        buffer.reset();
        OrcProto.Footer footer = extractFooter(footerBuffer, 0, footerSize, codec, bufferSize);
        fileTailBuilder.setFooter(footer);
    } finally {
        try {
            file.close();
        } catch (IOException ex) {
            LOG.error("Failed to close the file after another error", ex);
        }
    }

    ByteBuffer serializedTail = ByteBuffer.allocate(buffer.remaining());
    serializedTail.put(buffer.slice());
    serializedTail.rewind();
    return new OrcTail(fileTailBuilder.build(), serializedTail, modificationTime);
}

From source file:org.apache.orc.impl.RecordReaderUtils.java

License:Apache License

/**
 * Read the list of ranges from the file.
 * @param file the file to read
 * @param base the base of the stripe
 * @param range the disk ranges within the stripe to read
 * @return the bytes read for each disk range, which is the same length as
 *    ranges
 * @throws IOException
 */
static DiskRangeList readDiskRanges(FSDataInputStream file, HadoopShims.ZeroCopyReaderShim zcr, long base,
        DiskRangeList range, boolean doForceDirect) throws IOException {
    if (range == null)
        return null;
    DiskRangeList prev = range.prev;
    if (prev == null) {
        prev = new MutateHelper(range);
    }
    while (range != null) {
        if (range.hasData()) {
            range = range.next;
            continue;
        }
        int len = (int) (range.getEnd() - range.getOffset());
        long off = range.getOffset();
        if (zcr != null) {
            file.seek(base + off);
            boolean hasReplaced = false;
            while (len > 0) {
                ByteBuffer partial = zcr.readBuffer(len, false);
                BufferChunk bc = new BufferChunk(partial, off);
                if (!hasReplaced) {
                    range.replaceSelfWith(bc);
                    hasReplaced = true;
                } else {
                    range.insertAfter(bc);
                }
                range = bc;
                int read = partial.remaining();
                len -= read;
                off += read;
            }
        } else {
            // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless.
            byte[] buffer = new byte[len];
            file.readFully((base + off), buffer, 0, buffer.length);
            ByteBuffer bb = null;
            if (doForceDirect) {
                bb = ByteBuffer.allocateDirect(len);
                bb.put(buffer);
                bb.position(0);
                bb.limit(len);
            } else {
                bb = ByteBuffer.wrap(buffer);
            }
            range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset()));
        }
        range = range.next;
    }
    return prev.next;
}