Example usage for org.apache.hadoop.fs FSDataInputStream seek

List of usage examples for org.apache.hadoop.fs FSDataInputStream seek

Introduction

On this page you can find example usage of org.apache.hadoop.fs.FSDataInputStream.seek, collected from open-source projects.

Prototype

@Override
public void seek(long desired) throws IOException;

Document

Seek to the given offset.
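
Before the project examples below, here is a minimal sketch of the basic pattern: open a file, seek to an absolute byte offset, then read from that position. This is an illustrative example only; the path and offset are placeholders, not taken from any project on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);
        try (FSDataInputStream in = fs.open(path)) {
            in.seek(128L); // reposition the stream to absolute byte offset 128
            byte[] buf = new byte[64];
            int read = in.read(buf); // this read starts at offset 128
            System.out.println("Read " + read + " bytes; position is now " + in.getPos());
        }
    }
}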

Usage

From source file:com.knewton.mrtool.io.JsonRecordReader.java

License:Apache License

/**
 * Get the line reader to be used for the file. A <code>LineReader</code> can read a file line
 * by line. This separate method helps with testing too.
 *
 * @param fileSplit the file split to open
 * @param conf the job configuration
 * @return a LineReader over the split's file
 * @throws IOException if the file cannot be opened or read
 */
protected LineReader initLineReader(FileSplit fileSplit, Configuration conf) throws IOException {
    final Path file = fileSplit.getPath();
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(fileSplit.getPath());
    seekableIn = fileIn;
    boolean skipFirstLine = false;
    LineReader lineReader;
    if (codec != null) {
        lineReader = new LineReader(codec.createInputStream(fileIn), conf);
    } else {
        // if the start is not the beginning of the file then skip the first line to get the
        // next complete json record. The previous json record will be read by the record reader
        // that got assigned the previous InputSplit.
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new LineReader(fileIn, conf);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    return lineReader;
}

From source file:com.linkedin.cubert.block.BlockUtils.java

License:Open Source License

@SuppressWarnings("unchecked")
public static Block loadBlock(BlockProperties props, IndexEntry indexEntry, Configuration conf, JsonNode json,
        BlockSerializationType serializationType, boolean isInMemoryBlock) throws IOException,
        ClassNotFoundException, InstantiationException, IllegalAccessException, InterruptedException {
    Block block;
    if (indexEntry == null) {
        if (emptyForMissing)
            return new EmptyBlock(props);

        throw new IOException(String.format("Index entry is null"));
    }

    // populate props
    props.setBlockId(indexEntry.getBlockId());
    props.setNumRecords(indexEntry.getNumRecords());

    // Open the file and seek to the offset for this block
    Path file = new Path(indexEntry.getFile());
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fsin = fs.open(file, BLOCK_BUFFER_SIZE);
    fsin.seek(indexEntry.getOffset());

    // Gather information needed to read this block
    Class<Tuple> valueClass = (Class<Tuple>) TupleFactory.getInstance().newTuple().getClass();
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);

    // Load the block now
    if (isInMemoryBlock) {
        print.f("LOADING IN MEMORY the block %d", indexEntry.getBlockId());

        ByteBuffer byteBuffer = inMemoryBlockCache.get(indexEntry);

        if (byteBuffer == null) {
            int read = 0;
            byte[] data = new byte[(int) indexEntry.getLength()];
            while (read != data.length) {
                read += fsin.read(data, read, data.length - read);
            }
            fsin.close();

            byteBuffer = ByteBuffer.wrap(data);

            inMemoryBlockCache.put(indexEntry, byteBuffer);
        } else {
            print.f("REUSED FROM CACHE!!");
            byteBuffer.rewind();
        }

        block = new RubixMemoryBlock(props, conf, byteBuffer, valueClass, codec, serializationType);
        block.configure(json);
        return block;
    } else {
        print.f("STREAMING the block %d", indexEntry.getBlockId());
        InputStream in = new BlockInputStream(fsin, indexEntry.getLength());

        if (codec != null) {
            in = codec.createInputStream(in);
        }

        block = new CubertBlock(props,
                new BlockIterator<Tuple>(conf, in, valueClass, serializationType, props.getSchema()));
        block.configure(json);

        print.f("Loaded block id=%d from file=%s offset=%d length=%d", indexEntry.getBlockId(), file.toString(),
                indexEntry.getOffset(), indexEntry.getLength());

        return block;
    }
}

From source file:com.linkedin.cubert.io.rubix.RubixFile.java

License:Open Source License

@SuppressWarnings("unchecked")
public List<KeyData<K>> getKeyData() throws IOException, ClassNotFoundException {
    final FileSystem fs = FileSystem.get(conf);
    keyData = new ArrayList<KeyData<K>>();

    final long filesize = fs.getFileStatus(path).getLen();
    FSDataInputStream in = fs.open(path);

    /* The last long in the file is the start position of the trailer section */
    in.seek(filesize - 8);
    long metaDataStartPos = in.readLong();

    in.seek(metaDataStartPos);

    ObjectMapper mapper = new ObjectMapper();
    metadataJson = mapper.readValue(in.readUTF(), JsonNode.class);

    int keySectionSize = in.readInt();

    // load the key section
    byte[] keySection = new byte[keySectionSize];

    in.seek(filesize - keySectionSize - 8);
    in.read(keySection, 0, keySectionSize);
    in.close();

    ByteArrayInputStream bis = new ByteArrayInputStream(keySection);
    DataInput dataInput = new DataInputStream(bis);

    int numberOfBlocks = metadataJson.get("numberOfBlocks").getIntValue();

    // load the key section
    keyClass = (Class<K>) ClassCache.forName(JsonUtils.getText(metadataJson, "keyClass"));
    valueClass = (Class<V>) ClassCache.forName(JsonUtils.getText(metadataJson, "valueClass"));

    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Deserializer<K> deserializer = serializationFactory.getDeserializer(keyClass);

    deserializer.open(bis);

    while (bis.available() > 0 && numberOfBlocks > 0) {
        K key = deserializer.deserialize(null);

        long offset = dataInput.readLong();
        long blockId = dataInput.readLong();
        long numRecords = dataInput.readLong();

        keyData.add(new KeyData<K>(key, offset, 0, numRecords, blockId));
        numberOfBlocks--;
    }

    // Assign length to each keydata entry
    int numEntries = keyData.size();
    for (int i = 1; i < numEntries; i++) {
        KeyData<K> prev = keyData.get(i - 1);
        KeyData<K> current = keyData.get(i);

        prev.setLength(current.getOffset() - prev.getOffset());
    }

    if (numEntries > 0) {
        KeyData<K> last = keyData.get(numEntries - 1);
        last.setLength(metaDataStartPos - last.offset);
    }

    return keyData;
}

From source file:com.linkedin.cubert.io.rubix.RubixFile.java

License:Open Source License

private static void extract(RubixFile<Tuple, Object> rfile, KeyData<Tuple> keyData, String output)
        throws IOException, InstantiationException, IllegalAccessException, ClassNotFoundException {
    final int BUF_SIZE = 32 * 1024;

    Configuration conf = new JobConf();
    File outFile = new File(output);
    if (outFile.exists())
        outFile.delete();
    outFile.createNewFile();

    long offset = keyData.getOffset();
    long length = keyData.getLength();
    Tuple key = keyData.getKey();

    print.f("Extracting block %d (off=%d len=%d) from %s", keyData.getBlockId(), offset, length,
            rfile.path.toString());

    BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(outFile));

    // copy the data
    if (length > 0) {
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream in = fs.open(rfile.path);
        in.seek(offset);

        byte[] data = new byte[BUF_SIZE];
        long toRead = length;
        while (toRead > 0) {
            int thisRead = toRead > BUF_SIZE ? BUF_SIZE : (int) toRead;
            in.readFully(data, 0, thisRead);
            bos.write(data, 0, thisRead);
            toRead -= thisRead;
            System.out.print(".");
        }
        System.out.println();
    }
    // copy the key section
    ByteArrayOutputStream keySectionStream = new ByteArrayOutputStream();
    DataOutput keySectionOut = new DataOutputStream(keySectionStream);
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Serializer<Tuple> keySerializer = serializationFactory.getSerializer(rfile.getKeyClass());
    keySerializer.open(keySectionStream);

    keySerializer.serialize(key);
    keySectionOut.writeLong(0); // position
    keySectionOut.writeLong(keyData.getBlockId());
    keySectionOut.writeLong(keyData.getNumRecords());

    byte[] trailerBytes = keySectionStream.toByteArray();

    JsonNode json = JsonUtils.cloneNode(rfile.metadataJson);
    ((ObjectNode) json).put("numberOfBlocks", 1);

    DataOutput out = new DataOutputStream(bos);
    out.writeUTF(json.toString());
    out.writeInt(trailerBytes.length);
    out.write(trailerBytes);
    out.writeLong(length); // trailer start offset
    bos.close();
}

From source file:com.linkedin.cubert.io.rubix.RubixRecordReader.java

License:Open Source License

public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    @SuppressWarnings("unchecked")
    RubixInputSplit<K, V> rsplit = (RubixInputSplit<K, V>) split;

    SerializationFactory serializationFactory = new SerializationFactory(conf);
    switch (rsplit.getBlockSerializationType()) {
    case DEFAULT:
        valueDeserializer = serializationFactory.getDeserializer(rsplit.getValueClass());
        break;
    case COMPACT:
        BlockSchema schema = rsplit.getSchema();
        valueDeserializer = new CompactDeserializer<V>(schema);
        break;
    }

    key = rsplit.getKey();

    // store the blockid and partition key in the conf
    conf.setLong("MY_BLOCK_ID", rsplit.getBlockId());
    conf.setLong("MY_NUM_RECORDS", rsplit.getNumRecords());
    ByteArrayOutputStream tmpOut = new ByteArrayOutputStream();
    ((Tuple) key).write(new DataOutputStream(tmpOut));
    String keySerialized = SerializerUtils.serializeToString(tmpOut.toByteArray());
    conf.set("MY_PARTITION_KEY", keySerialized);

    Path path = rsplit.getFilename();
    offset = rsplit.getOffset();
    length = rsplit.getLength();

    FileSystem fs = path.getFileSystem(conf);
    FSDataInputStream fsin = fs.open(path);
    fsin.seek(offset);

    in = new BlockInputStream(fsin, length);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    if (codec != null) {
        print.f("codec is not null and it is %s", codec.getClass().toString());
        in = codec.createInputStream(in);
    } else {
        print.f("codec is null");
    }

    valueDeserializer.open(in);
}

From source file:com.marklogic.mapreduce.examples.WikiLoader.java

License:Apache License

@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    byte[] buf = new byte[BUFFER_SIZE];
    long bytesTotal = inSplit.getLength();
    long start = ((FileSplit) inSplit).getStart();
    fileIn.seek(start);
    long bytesRead = 0;
    StringBuilder pages = new StringBuilder();
    int sindex = -1;
    while (true) {
        int length = (int) Math.min(bytesTotal - bytesRead, buf.length);
        int read = fileIn.read(buf, 0, length);
        if (read == -1) {
            System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
            break;
        }
        bytesRead += read;
        String temp = new String(new String(buf, 0, read));
        if (sindex == -1) { // haven't found the start yet    
            sindex = temp.indexOf(BEGIN_PAGE_TAG);
            if (sindex > -1) {
                pages.append(temp.substring(sindex));
            }
        } else if (bytesRead < bytesTotal) { // haven't completed the split
            pages.append(temp);
        } else { // reached the end of this split
            // look for end
            int eindex = 0;
            if (temp.contains(END_DOC_TAG) || // reached the end of doc
                    temp.endsWith(END_PAGE_TAG)) {
                eindex = temp.lastIndexOf(END_PAGE_TAG);
                pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                System.out.println("Found end of doc.");
            } else { // need to read ahead to look for end of page
                while (true) {
                    read = fileIn.read(buf, 0, READ_AHEAD_SIZE);
                    if (read == -1) { // no more to read
                        System.out
                                .println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
                        System.out.println(temp);
                        break;
                    }
                    bytesRead += read;
                    // look for end
                    temp = new String(buf, 0, read);
                    eindex = temp.indexOf(END_PAGE_TAG);
                    if (eindex > -1) {
                        pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                        break;
                    } else {
                        pages.append(temp);
                    }
                }
            }
            break;
        }
    }
    fileIn.close();
    articles = WikiModelProcessor.process(pages);
}

From source file:com.mellanox.r4h.TestHFlush.java

License:Apache License

/**
 * The method starts new cluster with defined Configuration; creates a file
 * with specified block_size and writes 10 equal sections in it; it also calls
 * hflush/hsync after each write and throws an IOException in case of an error.
 *
 * @param conf cluster configuration
 * @param fileName of the file to be created and processed as required
 * @param block_size value to be used for the file's creation
 * @param replicas is the number of replicas
 * @param isSync hsync or hflush         
 * @param syncFlags specify the semantic of the sync/flush
 * @throws IOException in case of any errors
 */
public static void doTheJob(Configuration conf, final String fileName, long block_size, short replicas,
        boolean isSync, EnumSet<SyncFlag> syncFlags) throws IOException {
    byte[] fileContent;
    final int SECTIONS = 10;

    fileContent = AppendTestUtil.initBuffer(MiniDFSClusterBridge.getAppendTestUtils_FILE_SIZE());
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(replicas).build();
    // Make sure we work with DFS in order to utilize all its functionality
    DistributedFileSystem fileSystem = (DistributedFileSystem) cluster.getFileSystem();

    FSDataInputStream is;
    try {
        Path path = new Path(fileName);
        FSDataOutputStream stm = fileSystem.create(path, false, 4096, replicas, block_size);
        System.out.println("Created file " + fileName);

        int tenth = MiniDFSClusterBridge.getAppendTestUtils_FILE_SIZE() / SECTIONS;
        int rounding = MiniDFSClusterBridge.getAppendTestUtils_FILE_SIZE() - tenth * SECTIONS;
        for (int i = 0; i < SECTIONS; i++) {
            System.out.println(
                    "Writing " + (tenth * i) + " to " + (tenth * (i + 1)) + " section to file " + fileName);
            // write to the file
            stm.write(fileContent, tenth * i, tenth);

            // Wait while hflush/hsync pushes all packets through built pipeline
            if (isSync) {
                ((DFSOutputStream) stm.getWrappedStream()).hsync(syncFlags);
            } else {
                ((DFSOutputStream) stm.getWrappedStream()).hflush();
            }

            // Check file length if updatelength is required
            if (isSync && syncFlags.contains(SyncFlag.UPDATE_LENGTH)) {
                long currentFileLength = fileSystem.getFileStatus(path).getLen();
                assertEquals("File size doesn't match for hsync/hflush with updating the length",
                        tenth * (i + 1), currentFileLength);
            }
            byte[] toRead = new byte[tenth];
            byte[] expected = new byte[tenth];
            System.arraycopy(fileContent, tenth * i, expected, 0, tenth);
            // Open the same file for read. Need to create new reader after every write operation(!)
            is = fileSystem.open(path);
            is.seek(tenth * i);
            int readBytes = is.read(toRead, 0, tenth);
            System.out.println("Has read " + readBytes);
            assertTrue("Should've get more bytes", (readBytes > 0) && (readBytes <= tenth));
            is.close();
            checkData(toRead, 0, readBytes, expected, "Partial verification");
        }
        System.out.println("Writing " + (tenth * SECTIONS) + " to " + (tenth * SECTIONS + rounding)
                + " section to file " + fileName);
        stm.write(fileContent, tenth * SECTIONS, rounding);
        stm.close();

        assertEquals("File size doesn't match ", MiniDFSClusterBridge.getAppendTestUtils_FILE_SIZE(),
                fileSystem.getFileStatus(path).getLen());
        AppendTestUtil.checkFullFile(fileSystem, path, fileContent.length, fileContent, "hflush()");
    } finally {
        fileSystem.close();
        cluster.shutdown();
    }
}

From source file:com.mellanox.r4h.TestWriteRead.java

License:Apache License

/**
 * Read chunks into the buffer repeatedly until a total of visibleLen bytes have been read.
 * Returns the total number of bytes read.
 */
private long readUntilEnd(FSDataInputStream in, byte[] buffer, long size, String fname, long pos,
        long visibleLen, boolean positionReadOption) throws IOException {

    if (pos >= visibleLen || visibleLen <= 0)
        return 0;

    int chunkNumber = 0;
    long totalByteRead = 0;
    long currentPosition = pos;
    int byteRead = 0;
    long byteLeftToRead = visibleLen - pos;
    int byteToReadThisRound = 0;

    if (!positionReadOption) {
        in.seek(pos);
        currentPosition = in.getPos();
    }
    if (verboseOption)
        LOG.info("reader begin: position: " + pos + " ; currentOffset = " + currentPosition + " ; bufferSize ="
                + buffer.length + " ; Filename = " + fname);
    try {
        while (byteLeftToRead > 0 && currentPosition < visibleLen) {
            byteToReadThisRound = (int) (byteLeftToRead >= buffer.length ? buffer.length : byteLeftToRead);
            if (positionReadOption) {
                byteRead = in.read(currentPosition, buffer, 0, byteToReadThisRound);
            } else {
                byteRead = in.read(buffer, 0, byteToReadThisRound);
            }
            if (byteRead <= 0)
                break;
            chunkNumber++;
            totalByteRead += byteRead;
            currentPosition += byteRead;
            byteLeftToRead -= byteRead;

            if (verboseOption) {
                LOG.info("reader: Number of byte read: " + byteRead + " ; totalByteRead = " + totalByteRead
                        + " ; currentPosition=" + currentPosition + " ; chunkNumber =" + chunkNumber
                        + "; File name = " + fname);
            }
        }
    } catch (IOException e) {
        throw new IOException("#### Exception caught in readUntilEnd: reader  currentOffset = "
                + currentPosition + " ; totalByteRead =" + totalByteRead + " ; latest byteRead = " + byteRead
                + "; visibleLen= " + visibleLen + " ; bufferLen = " + buffer.length + " ; Filename = " + fname,
                e);
    }

    if (verboseOption)
        LOG.info("reader end:   position: " + pos + " ; currentOffset = " + currentPosition
                + " ; totalByteRead =" + totalByteRead + " ; Filename = " + fname);

    return totalByteRead;
}
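
The readUntilEnd method above switches between two idioms seen throughout this page: a stateful seek followed by read, which moves the stream position, and a positioned read, which leaves it untouched. Below is a minimal hedged sketch of that contrast; the path and offsets are placeholders, not taken from the project above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekVersusPositionedRead {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.dat"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);
        byte[] buf = new byte[256];
        try (FSDataInputStream in = fs.open(path)) {
            // Stateful read: seek moves the stream position and the read advances it further.
            in.seek(1024L);
            int n1 = in.read(buf, 0, buf.length);

            // Positioned read: reads at an explicit offset without changing the stream position.
            int n2 = in.read(4096L, buf, 0, buf.length);

            System.out.println("seek+read=" + n1 + " bytes, positioned read=" + n2
                    + " bytes, stream position=" + in.getPos());
        }
    }
}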

From source file:com.nearinfinity.mele.store.hdfs.HdfsDirectory.java

License:Apache License

@Override
public IndexInput openInput(final String name) throws IOException {
    final FSDataInputStream inputStream = fileSystem.open(new Path(hdfsDirPath, name));
    return new BufferedIndexInput() {

        private long length = fileLength(name);

        @Override
        public long length() {
            return length;
        }

        @Override
        public void close() throws IOException {
            inputStream.close();
        }

        @Override
        protected void seekInternal(long pos) throws IOException {

        }

        @Override
        protected void readInternal(byte[] b, int offset, int length) throws IOException {
            synchronized (inputStream) {
                long position = getFilePointer();
                inputStream.seek(position);
                inputStream.read(b, offset, length);
            }
        }
    };
}

From source file:com.ramsane.samplehadoop.ReadTwice.java

public static void main(String[] args) {
    Configuration cfg = new Configuration();
    cfg.set("fs.defaultFS", "hdfs://localhost:9000");
    FSDataInputStream in = null;
    try {
        FileSystem fs = FileSystem.get(cfg);
        in = fs.open(new Path("/big"));
        System.out.println("First TIme...");
        IOUtils.copyBytes(in, System.out, 4096, false);
        System.out.println("Second time..");
        in.seek(0);
        IOUtils.copyBytes(in, System.out, 4096, false);
    } catch (IOException ex) {
        System.out.println(ex.getMessage());
    } finally {
        IOUtils.closeStream(in);
    }
}