List of usage examples for org.apache.hadoop.fs.FSDataInputStream.readFully
@Override public void readFully(long position, byte[] buffer, int offset, int length) throws IOException
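Before the project-specific examples below, here is a minimal standalone sketch of the positional-read contract: readFully(position, buffer, offset, length) reads exactly length bytes starting at the given file offset, throws EOFException if the file ends first, and does not move the stream's current position. The file path and buffer size in this sketch are illustrative assumptions, not taken from any example on this page.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadFullyExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical file path, used only for illustration.
        Path path = new Path("/tmp/example.bin");
        FileSystem fs = path.getFileSystem(conf);

        byte[] header = new byte[16];
        try (FSDataInputStream in = fs.open(path)) {
            // Positional read: fills header[0..15] from file offset 0, or throws
            // EOFException if fewer than 16 bytes are available; the stream's
            // current position is left unchanged.
            in.readFully(0, header, 0, header.length);
        }
        System.out.println("Read " + header.length + " header bytes");
    }
}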
From source file:com.github.sadikovi.riff.FileFooter.java
License:Open Source License
/**
 * Read footer from the input stream.
 * The footer is assumed to be placed at the end of the stream; the seek is performed inside the method.
 * The stream is not closed after the operation is complete.
 * @param in input stream
 * @param maxSize maximum stream size
 * @throws IOException
 */
public static FileFooter readFrom(FSDataInputStream in, long maxSize) throws IOException {
    int tailOffset = 8;
    // stream size must be larger than magic + length
    if (maxSize < tailOffset) {
        throw new IOException("Invalid stream, cannot read footer: " + maxSize + " < " + tailOffset);
    }
    // Read 8 bytes: magic 4 bytes and length of the header 4 bytes
    ByteBuffer buffer = ByteBuffer.allocate(tailOffset);
    in.readFully(maxSize - tailOffset, buffer.array(), buffer.arrayOffset(), tailOffset);
    // reconstruct magic and written bytes
    long meta = buffer.getLong();
    int magic = (int) (meta >>> 32);
    if (magic != Riff.MAGIC) throw new IOException("Wrong magic: " + magic + " != " + Riff.MAGIC);
    int len = (int) (meta & 0x7fffffff);
    LOG.debug("Read footer content of {} bytes", len);
    // read full footer bytes
    buffer = ByteBuffer.allocate(len);
    in.readFully(maxSize - tailOffset - len, buffer.array(), buffer.arrayOffset(), len);
    // no flip - we have not reset position
    long numRecords = buffer.getLong();
    // read file statistics
    Statistics[] fileStats = new Statistics[buffer.getInt()];
    int i = 0;
    while (i < fileStats.length) {
        fileStats[i] = Statistics.readExternal(buffer);
        LOG.debug("Read file statistics {}", fileStats[i]);
        ++i;
    }
    return new FileFooter(fileStats, numRecords, buffer);
}
From source file:org.apache.flink.streaming.connectors.fs.bucketing.BucketingSinkFaultTolerance2ITCase.java
License:Apache License
@Override
public void postSubmit() throws Exception {
    // We read the files and verify that we have read all the strings. If a valid-length
    // file exists we only read the file to that point. (This test should work with
    // FileSystems that support truncate() and with others as well.)
    Pattern messageRegex = Pattern.compile("message (\\d*)");

    // Keep a set of the message IDs that we read. The size must equal the read count and
    // the NUM_STRINGS. If numRead is bigger than the size of the set we have seen some
    // elements twice.
    Set<Integer> readNumbers = Sets.newHashSet();
    int numRead = 0;

    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

    while (files.hasNext()) {
        LocatedFileStatus file = files.next();

        if (!file.getPath().toString().endsWith(".valid-length")) {
            int validLength = (int) file.getLen();
            if (dfs.exists(file.getPath().suffix(".valid-length"))) {
                FSDataInputStream inStream = dfs.open(file.getPath().suffix(".valid-length"));
                String validLengthString = inStream.readUTF();
                validLength = Integer.parseInt(validLengthString);
                System.out.println("VALID LENGTH: " + validLength);
            }
            FSDataInputStream inStream = dfs.open(file.getPath());
            byte[] buffer = new byte[validLength];
            inStream.readFully(0, buffer, 0, validLength);
            inStream.close();

            ByteArrayInputStream bais = new ByteArrayInputStream(buffer);
            InputStreamReader inStreamReader = new InputStreamReader(bais);
            BufferedReader br = new BufferedReader(inStreamReader);

            String line = br.readLine();
            while (line != null) {
                Matcher matcher = messageRegex.matcher(line);
                if (matcher.matches()) {
                    numRead++;
                    int messageId = Integer.parseInt(matcher.group(1));
                    readNumbers.add(messageId);
                } else {
                    Assert.fail("Read line does not match expected pattern.");
                }
                line = br.readLine();
            }

            br.close();
            inStreamReader.close();
            bais.close();
        }
    }

    // Verify that we read all strings (at-least-once)
    Assert.assertEquals(NUM_STRINGS, readNumbers.size());

    // Verify that we don't have duplicates (boom!, exactly-once)
    Assert.assertEquals(NUM_STRINGS, numRead);
}
From source file:org.apache.flink.streaming.connectors.fs.bucketing.BucketingSinkFaultToleranceITCase.java
License:Apache License
@Override
public void postSubmit() throws Exception {
    // We read the files and verify that we have read all the strings. If a valid-length
    // file exists we only read the file to that point. (This test should work with
    // FileSystems that support truncate() and with others as well.)
    Pattern messageRegex = Pattern.compile("message (\\d*)");

    // Keep a set of the message IDs that we read. The size must equal the read count and
    // the NUM_STRINGS. If numRead is bigger than the size of the set we have seen some
    // elements twice.
    Set<Integer> readNumbers = Sets.newHashSet();

    HashSet<String> uniqMessagesRead = new HashSet<>();
    HashSet<String> messagesInCommittedFiles = new HashSet<>();

    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

    while (files.hasNext()) {
        LocatedFileStatus file = files.next();

        if (!file.getPath().toString().endsWith(".valid-length")) {
            int validLength = (int) file.getLen();
            if (dfs.exists(file.getPath().suffix(".valid-length"))) {
                FSDataInputStream inStream = dfs.open(file.getPath().suffix(".valid-length"));
                String validLengthString = inStream.readUTF();
                validLength = Integer.parseInt(validLengthString);
                System.out.println("VALID LENGTH: " + validLength);
            }
            FSDataInputStream inStream = dfs.open(file.getPath());
            byte[] buffer = new byte[validLength];
            inStream.readFully(0, buffer, 0, validLength);
            inStream.close();

            ByteArrayInputStream bais = new ByteArrayInputStream(buffer);
            InputStreamReader inStreamReader = new InputStreamReader(bais);
            BufferedReader br = new BufferedReader(inStreamReader);

            String line = br.readLine();
            while (line != null) {
                Matcher matcher = messageRegex.matcher(line);
                if (matcher.matches()) {
                    uniqMessagesRead.add(line);

                    // check that in the committed files there are no duplicates
                    if (!file.getPath().toString().endsWith(IN_PROGRESS_SUFFIX)
                            && !file.getPath().toString().endsWith(PENDING_SUFFIX)) {
                        if (!messagesInCommittedFiles.add(line)) {
                            Assert.fail("Duplicate entry in committed bucket.");
                        }
                    }

                    int messageId = Integer.parseInt(matcher.group(1));
                    readNumbers.add(messageId);
                } else {
                    Assert.fail("Read line does not match expected pattern.");
                }
                line = br.readLine();
            }

            br.close();
            inStreamReader.close();
            bais.close();
        }
    }

    // Verify that we read all strings (at-least-once)
    Assert.assertEquals(NUM_STRINGS, readNumbers.size());

    // Verify that we don't have duplicates (boom!, exactly-once)
    Assert.assertEquals(NUM_STRINGS, uniqMessagesRead.size());
}
From source file:org.apache.hive.hcatalog.streaming.TestStreaming.java
License:Apache License
private void corruptDataFile(final String file, final Configuration conf, final int addRemoveBytes)
        throws Exception {
    Path bPath = new Path(file);
    Path cPath = new Path(bPath.getParent(), bPath.getName() + ".corrupt");
    FileSystem fs = bPath.getFileSystem(conf);
    FileStatus fileStatus = fs.getFileStatus(bPath);
    int len = addRemoveBytes == Integer.MIN_VALUE ? 0 : (int) fileStatus.getLen() + addRemoveBytes;
    byte[] buffer = new byte[len];
    FSDataInputStream fdis = fs.open(bPath);
    fdis.readFully(0, buffer, 0, (int) Math.min(fileStatus.getLen(), buffer.length));
    fdis.close();
    FSDataOutputStream fdos = fs.create(cPath, true);
    fdos.write(buffer, 0, buffer.length);
    fdos.close();
    fs.delete(bPath, false);
    fs.rename(cPath, bPath);
}
From source file:org.apache.ignite.igfs.HadoopIgfsDualAbstractSelfTest.java
License:Apache License
/**
 * Check how prefetch override works.
 *
 * @throws Exception If failed.
 */
public void testOpenPrefetchOverride() throws Exception {
    create(igfsSecondary, paths(DIR, SUBDIR), paths(FILE));

    // Write enough data to the secondary file system.
    final int blockSize = IGFS_BLOCK_SIZE;

    IgfsOutputStream out = igfsSecondary.append(FILE, false);

    int totalWritten = 0;
    while (totalWritten < blockSize * 2 + chunk.length) {
        out.write(chunk);
        totalWritten += chunk.length;
    }

    out.close();

    awaitFileClose(igfsSecondary.asSecondary(), FILE);

    // Instantiate file system with overridden "seq reads before prefetch" property.
    Configuration cfg = new Configuration();

    cfg.addResource(U.resolveIgniteUrl(PRIMARY_CFG));

    int seqReads = SEQ_READS_BEFORE_PREFETCH + 1;

    cfg.setInt(String.format(PARAM_IGFS_SEQ_READS_BEFORE_PREFETCH, "igfs:grid@"), seqReads);

    FileSystem fs = FileSystem.get(new URI(PRIMARY_URI), cfg);

    // Read the first two blocks.
    Path fsHome = new Path(PRIMARY_URI);
    Path dir = new Path(fsHome, DIR.name());
    Path subdir = new Path(dir, SUBDIR.name());
    Path file = new Path(subdir, FILE.name());

    FSDataInputStream fsIn = fs.open(file);

    final byte[] readBuf = new byte[blockSize * 2];

    fsIn.readFully(0, readBuf, 0, readBuf.length);

    // Wait for a while for prefetch to finish (if any).
    IgfsMetaManager meta = igfs.context().meta();

    IgfsFileInfo info = meta.info(meta.fileId(FILE));

    IgfsBlockKey key = new IgfsBlockKey(info.id(), info.affinityKey(), info.evictExclude(), 2);

    GridCache<IgfsBlockKey, byte[]> dataCache = igfs.context().kernalContext().cache()
        .cache(igfs.configuration().getDataCacheName());

    for (int i = 0; i < 10; i++) {
        if (dataCache.containsKey(key))
            break;
        else
            U.sleep(100);
    }

    fsIn.close();

    // Remove the file from the secondary file system.
    igfsSecondary.delete(FILE, false);

    // Try reading the third block. Should fail.
    GridTestUtils.assertThrows(log, new Callable<Object>() {
        @Override public Object call() throws Exception {
            IgfsInputStream in0 = igfs.open(FILE);

            in0.seek(blockSize * 2);

            try {
                in0.read(readBuf);
            }
            finally {
                U.closeQuiet(in0);
            }

            return null;
        }
    }, IOException.class, "Failed to read data due to secondary file system exception: /dir/subdir/file");
}
From source file:org.apache.ignite.igfs.IgfsHadoopDualAbstractSelfTest.java
License:Apache License
/**
 * Check how prefetch override works.
 *
 * @throws Exception If failed.
 */
public void testOpenPrefetchOverride() throws Exception {
    create(igfsSecondary, paths(DIR, SUBDIR), paths(FILE));

    // Write enough data to the secondary file system.
    final int blockSize = IGFS_BLOCK_SIZE;

    IgfsOutputStream out = igfsSecondary.append(FILE, false);

    int totalWritten = 0;
    while (totalWritten < blockSize * 2 + chunk.length) {
        out.write(chunk);
        totalWritten += chunk.length;
    }

    out.close();

    awaitFileClose(igfsSecondary, FILE);

    // Instantiate file system with overridden "seq reads before prefetch" property.
    Configuration cfg = new Configuration();

    cfg.addResource(U.resolveIgniteUrl(PRIMARY_CFG));

    int seqReads = SEQ_READS_BEFORE_PREFETCH + 1;

    cfg.setInt(String.format(PARAM_IGFS_SEQ_READS_BEFORE_PREFETCH, "igfs:grid@"), seqReads);

    FileSystem fs = FileSystem.get(new URI(PRIMARY_URI), cfg);

    // Read the first two blocks.
    Path fsHome = new Path(PRIMARY_URI);
    Path dir = new Path(fsHome, DIR.name());
    Path subdir = new Path(dir, SUBDIR.name());
    Path file = new Path(subdir, FILE.name());

    FSDataInputStream fsIn = fs.open(file);

    final byte[] readBuf = new byte[blockSize * 2];

    fsIn.readFully(0, readBuf, 0, readBuf.length);

    // Wait for a while for prefetch to finish (if any).
    IgfsMetaManager meta = igfs.context().meta();

    IgfsFileInfo info = meta.info(meta.fileId(FILE));

    IgfsBlockKey key = new IgfsBlockKey(info.id(), info.affinityKey(), info.evictExclude(), 2);

    GridCache<IgfsBlockKey, byte[]> dataCache = igfs.context().kernalContext().cache()
        .cache(igfs.configuration().getDataCacheName());

    for (int i = 0; i < 10; i++) {
        if (dataCache.containsKey(key))
            break;
        else
            U.sleep(100);
    }

    fsIn.close();

    // Remove the file from the secondary file system.
    igfsSecondary.delete(FILE, false);

    // Try reading the third block. Should fail.
    GridTestUtils.assertThrows(log, new Callable<Object>() {
        @Override public Object call() throws Exception {
            IgfsInputStream in0 = igfs.open(FILE);

            in0.seek(blockSize * 2);

            try {
                in0.read(readBuf);
            }
            finally {
                U.closeQuiet(in0);
            }

            return null;
        }
    }, IOException.class, "Failed to read data due to secondary file system exception: /dir/subdir/file");
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.igfs.HadoopIgfsDualAbstractSelfTest.java
License:Apache License
/**
 * Check how prefetch override works.
 *
 * @throws Exception If failed.
 */
public void testOpenPrefetchOverride() throws Exception {
    create(igfsSecondary, paths(DIR, SUBDIR), paths(FILE));

    // Write enough data to the secondary file system.
    final int blockSize = IGFS_BLOCK_SIZE;

    IgfsOutputStream out = igfsSecondary.append(FILE, false);

    int totalWritten = 0;
    while (totalWritten < blockSize * 2 + chunk.length) {
        out.write(chunk);
        totalWritten += chunk.length;
    }

    out.close();

    awaitFileClose(igfsSecondary, FILE);

    // Instantiate file system with overridden "seq reads before prefetch" property.
    Configuration cfg = new Configuration();

    cfg.addResource(U.resolveIgniteUrl(PRIMARY_CFG));

    int seqReads = SEQ_READS_BEFORE_PREFETCH + 1;

    cfg.setInt(String.format(PARAM_IGFS_SEQ_READS_BEFORE_PREFETCH, "igfs@"), seqReads);

    FileSystem fs = FileSystem.get(new URI(PRIMARY_URI), cfg);

    // Read the first two blocks.
    Path fsHome = new Path(PRIMARY_URI);
    Path dir = new Path(fsHome, DIR.name());
    Path subdir = new Path(dir, SUBDIR.name());
    Path file = new Path(subdir, FILE.name());

    FSDataInputStream fsIn = fs.open(file);

    final byte[] readBuf = new byte[blockSize * 2];

    fsIn.readFully(0, readBuf, 0, readBuf.length);

    // Wait for a while for prefetch to finish (if any).
    IgfsMetaManager meta = igfs.context().meta();

    IgfsEntryInfo info = meta.info(meta.fileId(FILE));

    IgfsBlockKey key = new IgfsBlockKey(info.id(), info.affinityKey(), info.evictExclude(), 2);

    IgniteCache<IgfsBlockKey, byte[]> dataCache = igfs.context().kernalContext().cache()
        .jcache(igfs.configuration().getDataCacheConfiguration().getName());

    for (int i = 0; i < 10; i++) {
        if (dataCache.containsKey(key))
            break;
        else
            U.sleep(100);
    }

    fsIn.close();

    // Remove the file from the secondary file system.
    igfsSecondary.delete(FILE, false);

    // Try reading the third block. Should fail.
    GridTestUtils.assertThrows(log, new Callable<Object>() {
        @Override public Object call() throws Exception {
            IgfsInputStream in0 = igfs.open(FILE);

            in0.seek(blockSize * 2);

            try {
                in0.read(readBuf);
            }
            finally {
                U.closeQuiet(in0);
            }

            return null;
        }
    }, IOException.class, "Failed to read data due to secondary file system exception: /dir/subdir/file");
}
From source file:org.apache.orc.impl.ReaderImpl.java
License:Apache License
/**
 * Ensure this is an ORC file to prevent users from trying to read text
 * files or RC files as ORC files.
 * @param in the file being read
 * @param path the filename for error messages
 * @param psLen the postscript length
 * @param buffer the tail of the file
 * @throws IOException
 */
protected static void ensureOrcFooter(FSDataInputStream in, Path path, int psLen, ByteBuffer buffer)
        throws IOException {
    int magicLength = OrcFile.MAGIC.length();
    int fullLength = magicLength + 1;
    if (psLen < fullLength || buffer.remaining() < fullLength) {
        throw new FileFormatException("Malformed ORC file " + path + ". Invalid postscript length " + psLen);
    }

    int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
    byte[] array = buffer.array();
    // now look for the magic string at the end of the postscript.
    if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
        // If it isn't there, this may be the 0.11.0 version of ORC.
        // Read the first 3 bytes of the file to check for the header
        byte[] header = new byte[magicLength];
        in.readFully(0, header, 0, magicLength);
        // if it isn't there, this isn't an ORC file
        if (!Text.decode(header, 0, magicLength).equals(OrcFile.MAGIC)) {
            throw new FileFormatException("Malformed ORC file " + path + ". Invalid postscript.");
        }
    }
}
From source file:org.apache.orc.impl.ReaderImpl.java
License:Apache License
protected OrcTail extractFileTail(FileSystem fs, Path path, long maxFileLength) throws IOException {
    FSDataInputStream file = fs.open(path);
    ByteBuffer buffer;
    OrcProto.PostScript ps;
    OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder();
    long modificationTime;
    try {
        // figure out the size of the file using the option or filesystem
        long size;
        if (maxFileLength == Long.MAX_VALUE) {
            FileStatus fileStatus = fs.getFileStatus(path);
            size = fileStatus.getLen();
            modificationTime = fileStatus.getModificationTime();
        } else {
            size = maxFileLength;
            modificationTime = -1;
        }
        fileTailBuilder.setFileLength(size);

        // read last bytes into buffer to get PostScript
        int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
        buffer = ByteBuffer.allocate(readSize);
        assert buffer.position() == 0;
        file.readFully((size - readSize), buffer.array(), buffer.arrayOffset(), readSize);
        buffer.position(0);

        // read the PostScript
        // get length of PostScript
        int psLen = buffer.get(readSize - 1) & 0xff;
        ensureOrcFooter(file, path, psLen, buffer);
        int psOffset = readSize - 1 - psLen;
        ps = extractPostScript(buffer, path, psLen, psOffset);
        bufferSize = (int) ps.getCompressionBlockSize();
        codec = WriterImpl.createCodec(CompressionKind.valueOf(ps.getCompression().name()));
        fileTailBuilder.setPostscriptLength(psLen).setPostscript(ps);

        int footerSize = (int) ps.getFooterLength();
        int metadataSize = (int) ps.getMetadataLength();

        // check if extra bytes need to be read
        int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize);
        int tailSize = 1 + psLen + footerSize + metadataSize;
        if (extra > 0) {
            // more bytes need to be read, seek back to the right place and read extra bytes
            ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize);
            file.readFully((size - readSize - extra), extraBuf.array(),
                extraBuf.arrayOffset() + extraBuf.position(), extra);
            extraBuf.position(extra);
            // append with already read bytes
            extraBuf.put(buffer);
            buffer = extraBuf;
            buffer.position(0);
            buffer.limit(tailSize);
            readSize += extra;
            psOffset = readSize - 1 - psLen;
        } else {
            // footer is already in the bytes in buffer, just adjust position, length
            buffer.position(psOffset - footerSize - metadataSize);
            buffer.limit(buffer.position() + tailSize);
        }
        buffer.mark();

        int footerOffset = psOffset - footerSize;
        buffer.position(footerOffset);
        ByteBuffer footerBuffer = buffer.slice();
        buffer.reset();
        OrcProto.Footer footer = extractFooter(footerBuffer, 0, footerSize, codec, bufferSize);
        fileTailBuilder.setFooter(footer);
    } finally {
        try {
            file.close();
        } catch (IOException ex) {
            LOG.error("Failed to close the file after another error", ex);
        }
    }

    ByteBuffer serializedTail = ByteBuffer.allocate(buffer.remaining());
    serializedTail.put(buffer.slice());
    serializedTail.rewind();
    return new OrcTail(fileTailBuilder.build(), serializedTail, modificationTime);
}
From source file:org.apache.orc.impl.RecordReaderUtils.java
License:Apache License
/**
 * Read the list of ranges from the file.
 * @param file the file to read
 * @param base the base of the stripe
 * @param range the disk ranges within the stripe to read
 * @return the bytes read for each disk range, which is the same length as ranges
 * @throws IOException
 */
static DiskRangeList readDiskRanges(FSDataInputStream file, HadoopShims.ZeroCopyReaderShim zcr, long base,
        DiskRangeList range, boolean doForceDirect) throws IOException {
    if (range == null) return null;
    DiskRangeList prev = range.prev;
    if (prev == null) {
        prev = new MutateHelper(range);
    }
    while (range != null) {
        if (range.hasData()) {
            range = range.next;
            continue;
        }
        int len = (int) (range.getEnd() - range.getOffset());
        long off = range.getOffset();
        if (zcr != null) {
            file.seek(base + off);
            boolean hasReplaced = false;
            while (len > 0) {
                ByteBuffer partial = zcr.readBuffer(len, false);
                BufferChunk bc = new BufferChunk(partial, off);
                if (!hasReplaced) {
                    range.replaceSelfWith(bc);
                    hasReplaced = true;
                } else {
                    range.insertAfter(bc);
                }
                range = bc;
                int read = partial.remaining();
                len -= read;
                off += read;
            }
        } else {
            // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless.
            byte[] buffer = new byte[len];
            file.readFully((base + off), buffer, 0, buffer.length);
            ByteBuffer bb = null;
            if (doForceDirect) {
                bb = ByteBuffer.allocateDirect(len);
                bb.put(buffer);
                bb.position(0);
                bb.limit(len);
            } else {
                bb = ByteBuffer.wrap(buffer);
            }
            range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset()));
        }
        range = range.next;
    }
    return prev.next;
}