Example usage for org.apache.hadoop.fs FSDataInputStream seek

List of usage examples for org.apache.hadoop.fs FSDataInputStream seek

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FSDataInputStream.seek.

Prototype

@Override
public void seek(long desired) throws IOException 

Source Link

Document

Seek to the given offset.
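
Before the project examples below, here is a minimal, self-contained sketch of the typical call pattern: open the file through a FileSystem, seek to an absolute byte offset, and read from that position. The path, offset, and buffer size are placeholder values chosen for illustration; they are not taken from any of the projects listed on this page.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.dat"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);
        FSDataInputStream in = fs.open(path);
        try {
            in.seek(128L); // jump to absolute byte offset 128
            byte[] buffer = new byte[64];
            in.readFully(buffer); // read 64 bytes starting at that offset
            System.out.println("Position after read: " + in.getPos());
        } finally {
            in.close();
        }
    }
}

Note that the offset passed to seek is absolute, measured from the start of the file, not relative to the current position.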

Usage

From source file:org.broadinstitute.sting.gatk.hadoop.SplittingBAMIndexer.java

License:Open Source License

public void index(final Path file, Configuration cfg) throws IOException {

    FileSystem fs = file.getFileSystem(cfg);

    final FSDataInputStream fin = fs.open(file);

    fin.seek(0);

    final BlockCompressedInputStream in = new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(fin, fs.getFileStatus(file).getLen(), file));

    String pstr = file.toString();
    String nstr = pstr.replace(file.getName(), "_" + file.getName() + ".splitting-bai");

    FSDataOutputStream fout = fs.create(new Path(nstr), true);

    final OutputStream out = new BufferedOutputStream(fout);

    final LongBuffer lb = byteBuffer.order(ByteOrder.BIG_ENDIAN).asLongBuffer();

    skipToAlignmentList(in);

    // Always write the first one to make sure it's not skipped
    lb.put(0, in.getFilePointer());
    out.write(byteBuffer.array());

    long prevPrint = in.getFilePointer() >> 16;

    for (int i = 0;;) {
        final PtrSkipPair pair = readAlignment(in);
        if (pair == null)
            break;

        if (++i == granularity) {
            i = 0;
            lb.put(0, pair.ptr);
            out.write(byteBuffer.array());

            final long filePos = pair.ptr >> 16;
            if (filePos - prevPrint >= PRINT_EVERY) {
                System.out.print("-");
                prevPrint = filePos;
            }
        }
        fullySkip(in, pair.skip);
    }
    lb.put(0, fs.getFileStatus(file).getLen() << 16);
    out.write(byteBuffer.array());
    out.close();
    in.close();
}

From source file:org.cloudgraph.mapreduce.GraphXmlRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    // This InputSplit is a FileSplit
    FileSplit split = (FileSplit) inputSplit;

    this.context = context;
    this.configuration = context.getConfiguration();
    this.getCounter = Counters.retrieveGetCounterWithStringsParams(context);

    this.rootNamespaceUri = configuration.get(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_URI);
    this.rootNamespacePrefix = configuration.get(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_PREFIX, "ns1");

    this.unmarshalOptions = new DefaultOptions(this.rootNamespaceUri);
    this.unmarshalOptions.setRootNamespacePrefix(this.rootNamespacePrefix);
    this.unmarshalOptions.setValidate(false);
    this.unmarshalOptions.setFailOnValidationError(false);
    this.unmarshaler = new StreamUnmarshaller(this.unmarshalOptions, null);

    // Retrieve from the configuration the maximum
    // allowed bytes for a single record
    this.maxLineLength = configuration.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);

    // Split "S" is responsible for all records
    // between the "start" and "end" positions
    start = split.getStart();
    end = start + split.getLength();

    // Retrieve file containing Split "S"
    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(this.configuration);
    FSDataInputStream fileIn = fs.open(split.getPath());

    // If Split "S" starts at byte 0, first line will be processed
    // If Split "S" does not start at byte 0, the first line has already been
    // processed by "S-1" and therefore needs to be silently ignored
    boolean skipFirstLine = false;
    if (start != 0) {
        skipFirstLine = true;
        // Set the file pointer at "start - 1" position.
        // This is to make sure we won't miss any line
        // It could happen if "start" is located on an EOL
        --start;
        fileIn.seek(start);
    }

    in = new LineReader(fileIn, this.configuration);

    // If the first line needs to be skipped, read it
    // and store its content in a dummy Text
    if (skipFirstLine) {
        Text dummy = new Text();
        // Reset "start" to "start + line offset"
        start += in.readLine(dummy, 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }

    // Position is the actual start
    this.pos = start;
}

From source file:org.commoncrawl.service.queryserver.query.DomainURLListQuery.java

License:Open Source License

private static void readPaginatedResults(final DatabaseIndexV2.MasterDatabaseIndex masterIndex, long domainId,
        FSDataInputStream inputStream, long length, String sortByField, int sortOrder, int pageNumber,
        int pageSize, QueryResult<URLFPV2, CrawlDatumAndMetadata> resultOut) throws IOException {
    // if descending sort order ... 
    // take pageNumber * pageSize as starting point
    long offset = 0;
    long startPos = 0;
    long endPos = 0;

    // calculate total record count ... 
    int totalRecordCount = (int) (length / FP_RECORD_SIZE);

    resultOut.getResults().clear();
    resultOut.setPageNumber(pageNumber);
    resultOut.setTotalRecordCount(totalRecordCount);

    // flip pr due to bug in how we sort pr 
    if (sortByField.equals(SORT_BY_PR)) {
        if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING)
            sortOrder = ClientQueryInfo.SortOrder.DESCENDING;
        else
            sortOrder = ClientQueryInfo.SortOrder.ASCENDING;

    }

    if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) {
        startPos = pageNumber * pageSize;
        endPos = Math.min(startPos + pageSize, totalRecordCount);
        offset = pageNumber * pageSize;
    } else {
        startPos = totalRecordCount - ((pageNumber + 1) * pageSize);
        endPos = startPos + pageSize;
        startPos = Math.max(0, startPos);
        offset = totalRecordCount - ((pageNumber + 1) * pageSize);
    }
    //LOG.info("readPaginatedResults called on Index with sortOrder:" + sortOrder + " pageNumber: " + pageNumber + " pageSize:" + pageSize + " offset is:" + offset);
    if (startPos < totalRecordCount) {

        //LOG.info("Seeking to Offset:" + startPos);
        inputStream.seek(startPos * FP_RECORD_SIZE);
        //LOG.info("Reading from:"+ startPos + " to:" + endPos + " (exclusive)");
        for (long i = startPos; i < endPos; ++i) {

            URLFPV2 key = new URLFPV2();

            key.setDomainHash(domainId);
            key.setUrlHash(inputStream.readLong());

            // ok time to find this item in the master index ... 
            CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata();
            long timeStart = System.currentTimeMillis();
            MetadataOut metadataOut = masterIndex.queryMetadataAndURLGivenFP(key);
            long timeEnd = System.currentTimeMillis();

            //LOG.info("Metadata Retrieval for Index:"+ i + " took:" + (timeEnd - timeStart));

            if (metadataOut == null) {
                LOG.error("Failed to Retrieve URL and Metadata for Domain:" + domainId + " FP:"
                        + key.getUrlHash());
                metadataObject.setUrl("NULL-DH(" + key.getDomainHash() + ")-FP(" + key.getUrlHash() + ")");
            } else {
                metadataObject.setUrl(metadataOut.url.toString());
                metadataObject.setStatus(metadataOut.fetchStatus);
                if (metadataOut.lastFetchTime > 0) {
                    metadataObject.getMetadata().setLastFetchTimestamp(metadataOut.lastFetchTime);
                }
                metadataObject.getMetadata().setPageRank(metadataOut.pageRank);
            }

            if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) {
                resultOut.getResults().add(0,
                        new QueryResultRecord<URLFPV2, CrawlDatumAndMetadata>(key, metadataObject));
            } else {
                resultOut.getResults()
                        .add(new QueryResultRecord<URLFPV2, CrawlDatumAndMetadata>(key, metadataObject));
            }
        }
    }
}

From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java

License:Open Source License

private static void readPaginatedInlinkingDomainInfo(final DatabaseIndexV2.MasterDatabaseIndex masterIndex,
        FileSystem indexFileSystem, Path indexPath, Path detailPath, int sortOrder, int pageNumber,
        int pageSize, QueryResult<Writable, Writable> resultOut) throws IOException {
    // if descending sort order ... 
    // take pageNumber * pageSize as starting point
    long offset = 0;
    long startPos = 0;
    long endPos = 0;

    FSDataInputStream indexStream = indexFileSystem.open(indexPath);

    try {

        // read in the total record count ... 
        int totalRecordCount = indexStream.readInt();

        LOG.info("***RecordCount:" + totalRecordCount + " Allocating Buffer Of:" + (totalRecordCount * 4)
                + " bytes. FileLength:" + indexFileSystem.getFileStatus(indexPath).getLen());
        // read in index header data upfront 
        byte indexHeaderData[] = new byte[totalRecordCount * 4];
        // read it 
        indexStream.readFully(indexHeaderData);
        // mark the start position of the detail data
        long detailStartPos = indexStream.getPos();
        // initialize index header reader stream 
        DataInputBuffer indexHeaderStream = new DataInputBuffer();
        indexHeaderStream.reset(indexHeaderData, 0, indexHeaderData.length);

        resultOut.getResults().clear();
        resultOut.setPageNumber(pageNumber);
        resultOut.setTotalRecordCount(totalRecordCount);

        if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) {
            startPos = pageNumber * pageSize;
            endPos = Math.min(startPos + pageSize, totalRecordCount);
            offset = pageNumber * pageSize;
        } else {
            startPos = totalRecordCount - ((pageNumber + 1) * pageSize);
            endPos = startPos + pageSize;
            startPos = Math.max(0, startPos);
            offset = totalRecordCount - ((pageNumber + 1) * pageSize);
        }
        //LOG.info("readPaginatedResults called on Index with sortOrder:" + sortOrder + " pageNumber: " + pageNumber + " pageSize:" + pageSize + " offset is:" + offset);
        if (startPos < totalRecordCount) {

            //LOG.info("Seeking to Offset:" + startPos);
            indexHeaderStream.skip(startPos * 4);
            //LOG.info("Reading from:"+ startPos + " to:" + endPos + " (exclusive)");
            for (long i = startPos; i < endPos; ++i) {

                // read data offset ... 
                int domainDataPos = indexHeaderStream.readInt();
                // seek to it 
                indexStream.seek(detailStartPos + domainDataPos);
                // read the detail data  
                InlinkingDomainInfo domainInfo = new InlinkingDomainInfo();
                domainInfo.readFields(indexStream);
                // ok extract name 
                String domainName = domainInfo.getDomainName();
                if (domainName.length() == 0) {
                    //TODO: NEED TO TRACK THIS DOWN 
                    domainName = "<<OOPS-NULL>>";
                }
                Text key = new Text(domainName);
                domainInfo.setFieldClean(InlinkingDomainInfo.Field_DOMAINNAME);

                if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) {
                    resultOut.getResults().add(0, new QueryResultRecord<Writable, Writable>(key, domainInfo));
                } else {
                    resultOut.getResults().add(new QueryResultRecord<Writable, Writable>(key, domainInfo));
                }
            }
        }
    } finally {
        indexStream.close();
    }
}

From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java

License:Open Source License

private static void readPaginatedResults(final DatabaseIndexV2.MasterDatabaseIndex masterIndex,
        FSDataInputStream inputStream, long length, int sortOrder, int pageNumber, int pageSize,
        QueryResult<Writable, Writable> resultOut) throws IOException {
    // if descending sort order ... 
    // take pageNumber * pageSize as starting point
    long offset = 0;
    long startPos = 0;
    long endPos = 0;

    // calculate total record count ... 
    int totalRecordCount = (int) (length / FP_RECORD_SIZE);

    resultOut.getResults().clear();
    resultOut.setPageNumber(pageNumber);
    resultOut.setTotalRecordCount(totalRecordCount);

    if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) {
        startPos = pageNumber * pageSize;
        endPos = Math.min(startPos + pageSize, totalRecordCount);
        offset = pageNumber * pageSize;
    } else {
        startPos = totalRecordCount - ((pageNumber + 1) * pageSize);
        endPos = startPos + pageSize;
        startPos = Math.max(0, startPos);
        offset = totalRecordCount - ((pageNumber + 1) * pageSize);
    }
    //LOG.info("readPaginatedResults called on Index with sortOrder:" + sortOrder + " pageNumber: " + pageNumber + " pageSize:" + pageSize + " offset is:" + offset);
    if (startPos < totalRecordCount) {

        //LOG.info("Seeking to Offset:" + startPos);
        inputStream.seek(startPos * FP_RECORD_SIZE);
        //LOG.info("Reading from:"+ startPos + " to:" + endPos + " (exclusive)");
        for (long i = startPos; i < endPos; ++i) {

            URLFPV2 key = new URLFPV2();

            key.setDomainHash(inputStream.readLong());
            key.setUrlHash(inputStream.readLong());

            // ok time to find this item in the master index ... 
            CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata();
            MetadataOut metadataOut = masterIndex.queryMetadataAndURLGivenFP(key);

            if (metadataOut == null) {
                LOG.error("Failed to Retrieve URL and Metadata for Domain:" + key.getDomainHash() + " FP:"
                        + key.getUrlHash());
                metadataObject.setUrl("NULL-DH(" + key.getDomainHash() + ")-FP(" + key.getUrlHash() + ")");
            } else {
                metadataObject.setUrl(metadataOut.url.toString());
                metadataObject.setStatus(metadataOut.fetchStatus);
                if (metadataOut.lastFetchTime > 0) {
                    metadataObject.getMetadata().setLastFetchTimestamp(metadataOut.lastFetchTime);
                }
                metadataObject.getMetadata().setPageRank(metadataOut.pageRank);
            }

            if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) {
                resultOut.getResults().add(0, new QueryResultRecord<Writable, Writable>(key, metadataObject));
            } else {
                resultOut.getResults().add(new QueryResultRecord<Writable, Writable>(key, metadataObject));
            }
        }
    }
}

From source file:org.deepak.joins.CustomRecordReader.java

License:Apache License

public CustomRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    curSplit = split;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:org.godhuli.rhipe.RXLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine((new RHText()).getText(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
        // linecounter ++ ;
    }
    this.pos = start;
}

From source file:org.hedera.util.SeekableInputStream.java

License:Apache License

public static SeekableInputStream getInstance(Path path, long start, long end, FileSystem fs,
        CompressionCodecFactory compressionCodecs) throws IOException {
    CompressionCodec codec = compressionCodecs.getCodec(path);
    FSDataInputStream din = fs.open(path);
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            SplittableCompressionCodec scodec = (SplittableCompressionCodec) codec;
            SplitCompressionInputStream cin = scodec.createInputStream(din, decompressor, start, end,
                    SplittableCompressionCodec.READ_MODE.BYBLOCK);
            return new SeekableInputStream(cin);
        } else {
            // non-splittable compression input stream
            // no seeking or offsetting is needed
            assert start == 0;
            CompressionInputStream cin = codec.createInputStream(din, decompressor);
            return new SeekableInputStream(cin, din);
        }
    } else {
        // no compression codec, so we seek
        // directly to the start of the split
        din.seek(start);
        return new SeekableInputStream(din);
    }
}

From source file:org.huahinframework.core.lib.input.SimpleRecordReader.java

License:Apache License

/**
 * {@inheritDoc}
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }

    // skip first line and re-establish "start".
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }

    this.fileName = file.getName();
    this.fileLength = fs.getFileStatus(file).getLen();
    this.conf = context.getConfiguration();
    this.pos = start;
    this.separator = conf.get(SimpleJob.SEPARATOR, StringUtil.COMMA);
    this.regex = conf.getBoolean(SimpleJob.SEPARATOR_REGEX, false);

    init();
}

From source file:org.jclouds.examples.blobstore.hdfs.io.HdfsPayloadSlicer.java

License:Apache License

protected Payload doSlice(final FSDataInputStream inputStream, final long offset, final long length) {
    return new InputStreamSupplierPayload(new InputSupplier<InputStream>() {
        public InputStream getInput() throws IOException {
            if (offset > 0) {
                try {
                    inputStream.seek(offset);
                } catch (IOException e) {
                    Closeables.closeQuietly(inputStream);
                    throw e;
                }
            }
            return new LimitInputStream(inputStream, length);
        }
    });
}