List of usage examples for org.apache.hadoop.fs.FSDataInputStream.seek
@Override public void seek(long desired) throws IOException
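Before the project examples below, here is a minimal sketch of the pattern they all share: open a file through a FileSystem, seek to a byte offset, then read from that position. The path, offset, and class name in this sketch are placeholders for illustration only and are not taken from any of the projects listed.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example.bin"); // placeholder path
        try (FSDataInputStream in = fs.open(path)) {
            in.seek(128); // placeholder offset: reposition the stream before reading
            byte[] buf = new byte[4096];
            int read = in.read(buf, 0, buf.length);
            System.out.println("Read " + read + " bytes starting at offset 128");
        }
    }
}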
From source file:com.knewton.mrtool.io.JsonRecordReader.java
License:Apache License
/**
 * Get the line reader to be used for the file. A <code>LineReader</code> can read a file line
 * by line. This separate method helps with testing too.
 *
 * @param fileSplit
 * @param conf
 * @return
 * @throws IOException
 */
protected LineReader initLineReader(FileSplit fileSplit, Configuration conf) throws IOException {
    final Path file = fileSplit.getPath();
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(fileSplit.getPath());
    seekableIn = fileIn;
    boolean skipFirstLine = false;
    LineReader lineReader;
    if (codec != null) {
        lineReader = new LineReader(codec.createInputStream(fileIn), conf);
    } else {
        // If the start is not the beginning of the file then skip the first line to get the
        // next complete json record. The previous json record will be read by the record reader
        // that got assigned the previous InputSplit.
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new LineReader(fileIn, conf);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    return lineReader;
}
From source file:com.linkedin.cubert.block.BlockUtils.java
License:Open Source License
@SuppressWarnings("unchecked") public static Block loadBlock(BlockProperties props, IndexEntry indexEntry, Configuration conf, JsonNode json, BlockSerializationType serializationType, boolean isInMemoryBlock) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, InterruptedException { Block block;//from ww w . j a va 2s . c om if (indexEntry == null) { if (emptyForMissing) return new EmptyBlock(props); throw new IOException(String.format("Index entry is null")); } // populate props props.setBlockId(indexEntry.getBlockId()); props.setNumRecords(indexEntry.getNumRecords()); // Open the file and seek to the offset for this block Path file = new Path(indexEntry.getFile()); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fsin = fs.open(file, BLOCK_BUFFER_SIZE); fsin.seek(indexEntry.getOffset()); // Gather information needed to read this block Class<Tuple> valueClass = (Class<Tuple>) TupleFactory.getInstance().newTuple().getClass(); CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file); // Load the block now if (isInMemoryBlock) { print.f("LOADING IN MEMORY the block %d", indexEntry.getBlockId()); ByteBuffer byteBuffer = inMemoryBlockCache.get(indexEntry); if (byteBuffer == null) { int read = 0; byte[] data = new byte[(int) indexEntry.getLength()]; while (read != data.length) { read += fsin.read(data, read, data.length - read); } fsin.close(); byteBuffer = ByteBuffer.wrap(data); inMemoryBlockCache.put(indexEntry, byteBuffer); } else { print.f("REUSED FROM CACHE!!"); byteBuffer.rewind(); } block = new RubixMemoryBlock(props, conf, byteBuffer, valueClass, codec, serializationType); block.configure(json); return block; } else { print.f("STREAMING the block %d", indexEntry.getBlockId()); InputStream in = new BlockInputStream(fsin, indexEntry.getLength()); if (codec != null) { in = codec.createInputStream(in); } block = new CubertBlock(props, new BlockIterator<Tuple>(conf, in, valueClass, serializationType, props.getSchema())); block.configure(json); print.f("Loaded block id=%d from file=%s offset=%d length=%d", indexEntry.getBlockId(), file.toString(), indexEntry.getOffset(), indexEntry.getLength()); return block; } }
From source file:com.linkedin.cubert.io.rubix.RubixFile.java
License:Open Source License
@SuppressWarnings("unchecked") public List<KeyData<K>> getKeyData() throws IOException, ClassNotFoundException { final FileSystem fs = FileSystem.get(conf); keyData = new ArrayList<KeyData<K>>(); final long filesize = fs.getFileStatus(path).getLen(); FSDataInputStream in = fs.open(path); /* The last long in the file is the start position of the trailer section */ in.seek(filesize - 8); long metaDataStartPos = in.readLong(); in.seek(metaDataStartPos);/*w w w . j a va 2 s.c o m*/ ObjectMapper mapper = new ObjectMapper(); metadataJson = mapper.readValue(in.readUTF(), JsonNode.class); int keySectionSize = in.readInt(); // load the key section byte[] keySection = new byte[keySectionSize]; in.seek(filesize - keySectionSize - 8); in.read(keySection, 0, keySectionSize); in.close(); ByteArrayInputStream bis = new ByteArrayInputStream(keySection); DataInput dataInput = new DataInputStream(bis); int numberOfBlocks = metadataJson.get("numberOfBlocks").getIntValue(); // load the key section keyClass = (Class<K>) ClassCache.forName(JsonUtils.getText(metadataJson, "keyClass")); valueClass = (Class<V>) ClassCache.forName(JsonUtils.getText(metadataJson, "valueClass")); SerializationFactory serializationFactory = new SerializationFactory(conf); Deserializer<K> deserializer = serializationFactory.getDeserializer(keyClass); deserializer.open(bis); while (bis.available() > 0 && numberOfBlocks > 0) { K key = deserializer.deserialize(null); long offset = dataInput.readLong(); long blockId = dataInput.readLong(); long numRecords = dataInput.readLong(); keyData.add(new KeyData<K>(key, offset, 0, numRecords, blockId)); numberOfBlocks--; } // Assign length to each keydata entry int numEntries = keyData.size(); for (int i = 1; i < numEntries; i++) { KeyData<K> prev = keyData.get(i - 1); KeyData<K> current = keyData.get(i); prev.setLength(current.getOffset() - prev.getOffset()); } if (numEntries > 0) { KeyData<K> last = keyData.get(numEntries - 1); last.setLength(metaDataStartPos - last.offset); } return keyData; }
From source file:com.linkedin.cubert.io.rubix.RubixFile.java
License:Open Source License
private static void extract(RubixFile<Tuple, Object> rfile, KeyData<Tuple> keyData, String output)
        throws IOException, InstantiationException, IllegalAccessException, ClassNotFoundException {
    final int BUF_SIZE = 32 * 1024;

    Configuration conf = new JobConf();
    File outFile = new File(output);
    if (outFile.exists())
        outFile.delete();
    outFile.createNewFile();

    long offset = keyData.getOffset();
    long length = keyData.getLength();
    Tuple key = keyData.getKey();

    print.f("Extracting block %d (off=%d len=%d) from %s", keyData.getBlockId(), offset, length,
            rfile.path.toString());

    BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(outFile));

    // copy the data
    if (length > 0) {
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream in = fs.open(rfile.path);
        in.seek(offset);

        byte[] data = new byte[BUF_SIZE];
        long toRead = length;
        while (toRead > 0) {
            int thisRead = toRead > BUF_SIZE ? BUF_SIZE : (int) toRead;
            in.readFully(data, 0, thisRead);

            bos.write(data, 0, thisRead);
            toRead -= thisRead;

            System.out.print(".");
        }
        System.out.println();
    }

    // copy the key section
    ByteArrayOutputStream keySectionStream = new ByteArrayOutputStream();
    DataOutput keySectionOut = new DataOutputStream(keySectionStream);
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Serializer<Tuple> keySerializer = serializationFactory.getSerializer(rfile.getKeyClass());
    keySerializer.open(keySectionStream);

    keySerializer.serialize(key);
    keySectionOut.writeLong(0); // position
    keySectionOut.writeLong(keyData.getBlockId());
    keySectionOut.writeLong(keyData.getNumRecords());
    byte[] trailerBytes = keySectionStream.toByteArray();

    JsonNode json = JsonUtils.cloneNode(rfile.metadataJson);
    ((ObjectNode) json).put("numberOfBlocks", 1);

    DataOutput out = new DataOutputStream(bos);
    out.writeUTF(json.toString());
    out.writeInt(trailerBytes.length);
    out.write(trailerBytes);
    out.writeLong(length); // trailer start offset

    bos.close();
}
From source file:com.linkedin.cubert.io.rubix.RubixRecordReader.java
License:Open Source License
public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    @SuppressWarnings("unchecked")
    RubixInputSplit<K, V> rsplit = (RubixInputSplit<K, V>) split;

    SerializationFactory serializationFactory = new SerializationFactory(conf);
    switch (rsplit.getBlockSerializationType()) {
    case DEFAULT:
        valueDeserializer = serializationFactory.getDeserializer(rsplit.getValueClass());
        break;
    case COMPACT:
        BlockSchema schema = rsplit.getSchema();
        valueDeserializer = new CompactDeserializer<V>(schema);
        break;
    }

    key = rsplit.getKey();

    // store the blockid and partition key in the conf
    conf.setLong("MY_BLOCK_ID", rsplit.getBlockId());
    conf.setLong("MY_NUM_RECORDS", rsplit.getNumRecords());
    ByteArrayOutputStream tmpOut = new ByteArrayOutputStream();
    ((Tuple) key).write(new DataOutputStream(tmpOut));
    String keySerialized = SerializerUtils.serializeToString(tmpOut.toByteArray());
    conf.set("MY_PARTITION_KEY", keySerialized);

    Path path = rsplit.getFilename();
    offset = rsplit.getOffset();
    length = rsplit.getLength();

    FileSystem fs = path.getFileSystem(conf);
    FSDataInputStream fsin = fs.open(path);
    fsin.seek(offset);

    in = new BlockInputStream(fsin, length);

    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    if (codec != null) {
        print.f("codec is not null and it is %s", codec.getClass().toString());
        in = codec.createInputStream(in);
    } else {
        print.f("codec is null");
    }

    valueDeserializer.open(in);
}
From source file:com.marklogic.mapreduce.examples.WikiLoader.java
License:Apache License
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    byte[] buf = new byte[BUFFER_SIZE];
    long bytesTotal = inSplit.getLength();
    long start = ((FileSplit) inSplit).getStart();
    fileIn.seek(start);
    long bytesRead = 0;
    StringBuilder pages = new StringBuilder();
    int sindex = -1;
    while (true) {
        int length = (int) Math.min(bytesTotal - bytesRead, buf.length);
        int read = fileIn.read(buf, 0, length);
        if (read == -1) {
            System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
            break;
        }
        bytesRead += read;
        String temp = new String(new String(buf, 0, read));
        if (sindex == -1) { // haven't found the start yet
            sindex = temp.indexOf(BEGIN_PAGE_TAG);
            if (sindex > -1) {
                pages.append(temp.substring(sindex));
            }
        } else if (bytesRead < bytesTotal) { // haven't completed the split
            pages.append(temp);
        } else {
            // reached the end of this split; look for end
            int eindex = 0;
            if (temp.contains(END_DOC_TAG) || // reached the end of doc
                    temp.endsWith(END_PAGE_TAG)) {
                eindex = temp.lastIndexOf(END_PAGE_TAG);
                pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                System.out.println("Found end of doc.");
            } else { // need to read ahead to look for end of page
                while (true) {
                    read = fileIn.read(buf, 0, READ_AHEAD_SIZE);
                    if (read == -1) { // no more to read
                        System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
                        System.out.println(temp);
                        break;
                    }
                    bytesRead += read;
                    // look for end
                    temp = new String(buf, 0, read);
                    eindex = temp.indexOf(END_PAGE_TAG);
                    if (eindex > -1) {
                        pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                        break;
                    } else {
                        pages.append(temp);
                    }
                }
            }
            break;
        }
    }
    fileIn.close();
    articles = WikiModelProcessor.process(pages);
}
From source file:com.mellanox.r4h.TestHFlush.java
License:Apache License
/**
 * The method starts new cluster with defined Configuration; creates a file
 * with specified block_size and writes 10 equal sections in it; it also calls
 * hflush/hsync after each write and throws an IOException in case of an error.
 *
 * @param conf cluster configuration
 * @param fileName of the file to be created and processed as required
 * @param block_size value to be used for the file's creation
 * @param replicas is the number of replicas
 * @param isSync hsync or hflush
 * @param syncFlags specify the semantic of the sync/flush
 * @throws IOException in case of any errors
 */
public static void doTheJob(Configuration conf, final String fileName, long block_size, short replicas,
        boolean isSync, EnumSet<SyncFlag> syncFlags) throws IOException {
    byte[] fileContent;
    final int SECTIONS = 10;

    fileContent = AppendTestUtil.initBuffer(MiniDFSClusterBridge.getAppendTestUtils_FILE_SIZE());
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(replicas).build();
    // Make sure we work with DFS in order to utilize all its functionality
    DistributedFileSystem fileSystem = (DistributedFileSystem) cluster.getFileSystem();

    FSDataInputStream is;
    try {
        Path path = new Path(fileName);
        FSDataOutputStream stm = fileSystem.create(path, false, 4096, replicas, block_size);
        System.out.println("Created file " + fileName);

        int tenth = MiniDFSClusterBridge.getAppendTestUtils_FILE_SIZE() / SECTIONS;
        int rounding = MiniDFSClusterBridge.getAppendTestUtils_FILE_SIZE() - tenth * SECTIONS;
        for (int i = 0; i < SECTIONS; i++) {
            System.out.println(
                    "Writing " + (tenth * i) + " to " + (tenth * (i + 1)) + " section to file " + fileName);
            // write to the file
            stm.write(fileContent, tenth * i, tenth);

            // Wait while hflush/hsync pushes all packets through built pipeline
            if (isSync) {
                ((DFSOutputStream) stm.getWrappedStream()).hsync(syncFlags);
            } else {
                ((DFSOutputStream) stm.getWrappedStream()).hflush();
            }

            // Check file length if updatelength is required
            if (isSync && syncFlags.contains(SyncFlag.UPDATE_LENGTH)) {
                long currentFileLength = fileSystem.getFileStatus(path).getLen();
                assertEquals("File size doesn't match for hsync/hflush with updating the length",
                        tenth * (i + 1), currentFileLength);
            }
            byte[] toRead = new byte[tenth];
            byte[] expected = new byte[tenth];
            System.arraycopy(fileContent, tenth * i, expected, 0, tenth);
            // Open the same file for read. Need to create new reader after every write operation(!)
            is = fileSystem.open(path);
            is.seek(tenth * i);
            int readBytes = is.read(toRead, 0, tenth);
            System.out.println("Has read " + readBytes);
            assertTrue("Should've get more bytes", (readBytes > 0) && (readBytes <= tenth));
            is.close();
            checkData(toRead, 0, readBytes, expected, "Partial verification");
        }
        System.out.println("Writing " + (tenth * SECTIONS) + " to " + (tenth * SECTIONS + rounding)
                + " section to file " + fileName);
        stm.write(fileContent, tenth * SECTIONS, rounding);
        stm.close();

        assertEquals("File size doesn't match ", MiniDFSClusterBridge.getAppendTestUtils_FILE_SIZE(),
                fileSystem.getFileStatus(path).getLen());
        AppendTestUtil.checkFullFile(fileSystem, path, fileContent.length, fileContent, "hflush()");
    } finally {
        fileSystem.close();
        cluster.shutdown();
    }
}
From source file:com.mellanox.r4h.TestWriteRead.java
License:Apache License
/**
 * Read chunks into buffer repeatedly until a total of visibleLen bytes are read.
 * Return total number of bytes read.
 */
private long readUntilEnd(FSDataInputStream in, byte[] buffer, long size, String fname, long pos,
        long visibleLen, boolean positionReadOption) throws IOException {
    if (pos >= visibleLen || visibleLen <= 0)
        return 0;

    int chunkNumber = 0;
    long totalByteRead = 0;
    long currentPosition = pos;

    int byteRead = 0;
    long byteLeftToRead = visibleLen - pos;
    int byteToReadThisRound = 0;

    if (!positionReadOption) {
        in.seek(pos);
        currentPosition = in.getPos();
    }
    if (verboseOption)
        LOG.info("reader begin: position: " + pos + " ; currentOffset = " + currentPosition
                + " ; bufferSize =" + buffer.length + " ; Filename = " + fname);
    try {
        while (byteLeftToRead > 0 && currentPosition < visibleLen) {
            byteToReadThisRound = (int) (byteLeftToRead >= buffer.length ? buffer.length : byteLeftToRead);
            if (positionReadOption) {
                byteRead = in.read(currentPosition, buffer, 0, byteToReadThisRound);
            } else {
                byteRead = in.read(buffer, 0, byteToReadThisRound);
            }
            if (byteRead <= 0)
                break;
            chunkNumber++;
            totalByteRead += byteRead;
            currentPosition += byteRead;
            byteLeftToRead -= byteRead;

            if (verboseOption) {
                LOG.info("reader: Number of byte read: " + byteRead + " ; totalByteRead = " + totalByteRead
                        + " ; currentPosition=" + currentPosition + " ; chunkNumber =" + chunkNumber
                        + "; File name = " + fname);
            }
        }
    } catch (IOException e) {
        throw new IOException("#### Exception caught in readUntilEnd: reader currentOffset = "
                + currentPosition + " ; totalByteRead =" + totalByteRead + " ; latest byteRead = " + byteRead
                + "; visibleLen= " + visibleLen + " ; bufferLen = " + buffer.length + " ; Filename = " + fname,
                e);
    }
    if (verboseOption)
        LOG.info("reader end: position: " + pos + " ; currentOffset = " + currentPosition
                + " ; totalByteRead =" + totalByteRead + " ; Filename = " + fname);
    return totalByteRead;
}
From source file:com.nearinfinity.mele.store.hdfs.HdfsDirectory.java
License:Apache License
@Override
public IndexInput openInput(final String name) throws IOException {
    final FSDataInputStream inputStream = fileSystem.open(new Path(hdfsDirPath, name));
    return new BufferedIndexInput() {

        private long length = fileLength(name);

        @Override
        public long length() {
            return length;
        }

        @Override
        public void close() throws IOException {
            inputStream.close();
        }

        @Override
        protected void seekInternal(long pos) throws IOException {
        }

        @Override
        protected void readInternal(byte[] b, int offset, int length) throws IOException {
            synchronized (inputStream) {
                long position = getFilePointer();
                inputStream.seek(position);
                inputStream.read(b, offset, length);
            }
        }
    };
}
From source file:com.ramsane.samplehadoop.ReadTwice.java
public static void main(String[] args) {
    Configuration cfg = new Configuration();
    cfg.set("fs.defaultFS", "hdfs://localhost:9000");
    FSDataInputStream in = null;
    try {
        FileSystem fs = FileSystem.get(cfg);
        in = fs.open(new Path("/big"));
        System.out.println("First TIme...");
        IOUtils.copyBytes(in, System.out, 4096, false);
        System.out.println("Second time..");
        in.seek(0); // rewind to the start of the file and stream it again
        IOUtils.copyBytes(in, System.out, 4096, false);
    } catch (IOException ex) {
        System.out.println(ex.getMessage());
    } finally {
        IOUtils.closeStream(in);
    }
}