Example usage for org.apache.hadoop.fs FSDataInputStream seek

List of usage examples for org.apache.hadoop.fs FSDataInputStream seek

Introduction

This page collects example usages of org.apache.hadoop.fs.FSDataInputStream.seek, drawn from open-source projects.

Prototype

@Override
public void seek(long desired) throws IOException 

Document

Seek to the given offset.
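
Before the project examples below, here is a minimal sketch of the basic pattern (the path and offset are hypothetical): open a file through the FileSystem API, seek to a byte offset, and read from that position.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("hdfs://namenode:8020/data/example.bin"); // hypothetical path
        FileSystem fs = path.getFileSystem(conf);

        try (FSDataInputStream in = fs.open(path)) {
            in.seek(1024L);                                  // jump to byte offset 1024
            System.out.println("Position: " + in.getPos());  // prints 1024
            byte[] buffer = new byte[4096];
            int read = in.read(buffer);                      // reading resumes at the seeked offset
            System.out.println("Read " + read + " bytes");
        }
    }
}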

Usage

From source file: org.archive.wayback.resourcestore.resourcefile.ResourceFactory.java

License: Apache License

public static Resource getResource(URI uri, long offset)
        throws IOException, ResourceNotAvailableException, URISyntaxException {

    Resource r = null;

    // FIXME: Put this into static initialization?  or require
    //        explicit init during startup?  Or just create it each
    //        time?
    // 

    // Attempt at fix: Only initializing file system once    
    if (hdfsSys == null) {
        Configuration conf = new Configuration();

        // Assume that the URL is a fully-qualified HDFS url, like:
        //   hdfs://namenode:6100/collections/foo/some.arc.gz
        // create fs with just the default URL

        URI defaultURI = new URI(uri.getScheme() + "://" + uri.getHost() + ":" + uri.getPort() + "/");
        hdfsSys = FileSystem.get(defaultURI, conf);
    }

    Path path = new Path(uri.getPath());

    FSDataInputStream is = hdfsSys.open(path);
    is.seek(offset);

    if (isArc(path.getName())) {
        ArchiveReader reader = ARCReaderFactory.get(path.getName(), is, false);
        r = ARCArchiveRecordToResource(reader.get(), reader);
    } else if (isWarc(path.getName())) {
        ArchiveReader reader = WARCReaderFactory.get(path.getName(), is, false);
        r = WARCArchiveRecordToResource(reader.get(), reader);
    } else {
        is.close();
        throw new ResourceNotAvailableException("Unknown extension");
    }

    return r;
}
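
A small hedged variation on the example above (openAt is a hypothetical helper, not part of the project): validate the record offset against the file length before seeking, since on HDFS a seek past the end of the file typically fails with an EOFException, and an early check gives a clearer error message.

/**
 * Hypothetical helper: open a file and seek to a record offset, failing
 * early if the offset lies outside the file.
 */
static FSDataInputStream openAt(FileSystem fs, Path path, long offset) throws IOException {
    long fileLen = fs.getFileStatus(path).getLen();
    if (offset < 0 || offset >= fileLen) {
        throw new IOException("Offset " + offset + " is outside " + path + " (length " + fileLen + ")");
    }
    FSDataInputStream in = fs.open(path);
    in.seek(offset);
    return in;
}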

From source file: org.bdgenomics.adam.io.FastqRecordReader.java

License: Apache License

/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *   to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
    maxLineLength = conf.getInt(MAX_READ_LENGTH_PROPERTY, DEFAULT_MAX_READ_LENGTH);

    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    // if our codec is splittable, we can (tentatively) say that
    // we too are splittable.
    //
    // if we get a bgzfenhancedcodec, the codec might not actually
    // be splittable. however, if we get a non-splittable gz file,
    // several things happen:
    //
    // 1. the input format will detect this, and will not split the
    //    file
    // 2. the bgzfenhancedcodec will check the underlying data type
    //    (BGZF vs GZIP) at input stream creation time, and will
    //    apply the appropriate codec.
    //
    // if we get an unsplittable codec, really all that we do differently
    // is skip the positioning check, since we know that we're at the
    // start of the file and can get to reading immediately
    isSplittable = (codec instanceof SplittableCompressionCodec);

    if (codec == null) {
        // no codec.  Uncompressed file.
        int bytesToSkip = positionAtFirstRecord(fileIn, null);
        inputStream = fileIn;
        inputStream.skip(bytesToSkip);
        lineReader = new LineReader(inputStream);
    } else if (isSplittable) {
        // file is compressed, but uses a splittable codec
        isCompressed = true;
        int bytesToSkip = positionAtFirstRecord(fileIn, codec);

        // apparent fun finding: if you don't seek back to 0,
        // SplittableCompressionCodec.createInputStream will seek in the stream
        // to a start position, and funny things happen..
        fileIn.seek(0);
        inputStream = ((SplittableCompressionCodec) codec).createInputStream(fileIn, codec.createDecompressor(),
                start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);

        inputStream.skip(bytesToSkip);
        lineReader = new ResettableCompressedSplitLineReader((SplitCompressionInputStream) inputStream, conf);
    } else {
        // unsplittable compressed file
        // expect a single split, first record at offset 0
        isCompressed = true;
        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
        lineReader = new LineReader(inputStream);
    }
}

From source file: org.bdgenomics.adam.io.FastqRecordReader.java

License: Apache License

/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected final int positionAtFirstRecord(final FSDataInputStream stream, final CompressionCodec codec)
        throws IOException {
    Text buffer = new Text();
    long originalStart = start;

    LineReader reader;
    if (codec == null) {
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one.  We then seek the file to that position.
        stream.seek(start);
        reader = new LineReader(stream);
    } else {
        // Unlike the codec == null case, we don't seek before creating the
        // reader, SplittableCompressionCodec.createInputStream places the
        // stream at the start of the first compression block after our
        // split start
        //
        // as noted above, we need to be at pos 0 in the stream before
        // calling this
        reader = new LineReader(((SplittableCompressionCodec) codec).createInputStream(stream, null, start, end,
                SplittableCompressionCodec.READ_MODE.BYBLOCK));
    }

    int bytesRead = 0;
    do {
        bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
        int bufferLength = buffer.getLength();
        if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
            start += bytesRead;
        } else {

            // line starts with @.  Read two more and verify that it starts
            // with a +:
            //
            // @<readname>
            // <sequence>
            // +[readname]
            //
            // if the second line we read starts with a @, we know that
            // we've read:
            //
            // <qualities> <-- @ is a valid ASCII phred encoding
            // @<readname>
            //
            // and thus, the second read is the delimiter and we can break
            long trackForwardPosition = start + bytesRead;

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            if (buffer.getLength() > 0 && buffer.getBytes()[0] == '@') {
                start = trackForwardPosition;
                break;
            } else {
                trackForwardPosition += bytesRead;
            }

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            trackForwardPosition += bytesRead;
            if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                break; // all good!
            } else {
                start = trackForwardPosition;
            }
        }
    } while (bytesRead > 0);

    pos = start;
    start = originalStart;
    stream.seek(start);
    return (int) (pos - originalStart);
}
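
The method above scans forward from the split start to find a record boundary and then seeks back so the caller still sees the stream at its original position. A simplified, hedged sketch of that probe-and-restore idiom follows (findRecordStart is illustrative, uses the same LineReader and Text classes as above, and omits the extra two-line check the real method needs because '@' is also a valid quality character in FASTQ):

static long findRecordStart(FSDataInputStream in, long splitStart) throws IOException {
    long saved = in.getPos();                 // remember where the caller left the stream
    in.seek(splitStart);                      // jump to the split start and probe forward
    LineReader probe = new LineReader(in);
    Text line = new Text();
    long candidate = splitStart;
    int bytes;
    // LineReader buffers ahead of the underlying stream, so track the offset
    // by summing readLine() return values instead of calling in.getPos().
    while ((bytes = probe.readLine(line)) > 0) {
        if (line.getLength() > 0 && line.getBytes()[0] == '@') {
            break;                            // '@' marks a candidate record header
        }
        candidate += bytes;
    }
    in.seek(saved);                           // restore the original position for the caller
    return candidate;
}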

From source file: org.bgi.flexlab.gaea.data.mapreduce.input.adaptor.AdaptorRecordReader.java

License: Open Source License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    System.out.println(split.toString());
    Configuration job = context.getConfiguration();
    System.err.println(split.getPath().toString());
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: org.bgi.flexlab.gaea.data.mapreduce.input.bam.GaeaBamRecordReader.java

License: Open Source License

@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // This method should only be called once (see Hadoop API). However,
    // there seems to be disagreement between implementations that call
    // initialize() and Hadoop-BAM's own code that relies on
    // {@link BAMInputFormat} to call initialize() when the reader is
    // created. Therefore we add this check for the time being.
    if (isInitialized)
        close();
    isInitialized = true;

    final Configuration conf = ContextUtil.getConfiguration(ctx);

    final FileVirtualSplit split = (FileVirtualSplit) spl;
    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(conf);

    this.stringency = SAMHeaderReader.getValidationStringency(conf);

    final FSDataInputStream in = fs.open(file);

    codec = new BAMRecordCodec(SAMHeaderReader.readSAMHeaderFrom(in, conf));

    in.seek(0);
    bci = new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

    final long virtualStart = split.getStartVirtualOffset();

    fileStart = virtualStart >>> 16;
    virtualEnd = split.getEndVirtualOffset();

    bci.seek(virtualStart);
    codec.setInputStream(bci);

    if (GaeaBamInputFormat.DEBUG_BAM_SPLITTER) {
        final long recordStart = virtualStart & 0xffff;
        System.err.println(
                "XXX inizialized BAMRecordReader byte offset: " + fileStart + " record offset: " + recordStart);
    }
}
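
The split boundaries used here are BGZF virtual file offsets: the upper 48 bits address the compressed block within the file and the lower 16 bits address a position inside that block once decompressed. A small hedged sketch of the decomposition the reader performs (variable names are illustrative):

// Decompose a BGZF virtual offset, as done above for the split start.
long virtualOffset = split.getStartVirtualOffset();  // virtual offset from the FileVirtualSplit
long blockAddress  = virtualOffset >>> 16;           // byte offset of the compressed BGZF block in the file
int  withinBlock   = (int) (virtualOffset & 0xffff); // offset inside the uncompressed block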

From source file: org.bgi.flexlab.gaea.data.mapreduce.input.fastq.FastqBasicReader.java

License: Open Source License

public FastqBasicReader(Configuration job, FileSplit split, byte[] recordDelimiter) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(split.getPath());

    String multiSampleList = job.get("multiSampleList");
    if (multiSampleList != null && !multiSampleList.isEmpty()) {
        FastqMultipleSample samplelist;
        samplelist = new FastqMultipleSample(multiSampleList, false);
        FastqSample slist = samplelist.getID(split.getPath().toString());
        if (slist != null) {
            sampleID = String.valueOf(slist.getId());
        } else {
            sampleID = "+";
        }
    }

    start = split.getStart();
    end = split.getStart() + split.getLength();

    // open the file and seek to the start of the split
    FileSystem fs = split.getPath().getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(codec.createInputStream(fileIn), job);
        } else {
            in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
    }

    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    getFirstFastqLine();
    this.pos = start;
}

From source file: org.bgi.flexlab.gaea.data.mapreduce.input.vcf.VCFRecordReader.java

License: Open Source License

@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext ctx) throws IOException, InterruptedException {
    conf = ctx.getConfiguration();
    FileSplit split = (FileSplit) inputSplit;
    start = split.getStart();
    this.length = split.getLength();
    file = split.getPath();
    fileID = mVcfHeader.getId(file.toString());
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream is = fs.open(file);

    reader = new AsciiLineReader(is);
    it = new AsciiLineReaderIterator(reader);

    Object header = codec.readHeader(it);
    if (!(header instanceof FeatureCodecHeader)
            || !(((FeatureCodecHeader) header).getHeaderValue() instanceof VCFHeader)) {
        throw new IOException("No VCF header found in " + file);
    }

    if (start != 0) {
        is.seek(start - 1);
        reader = new AsciiLineReader(is);
        reader.readLine();
        it = new AsciiLineReaderIterator(reader);
    } else {
        currentPos = it.getPosition();
        is.seek(0);
        reader = new AsciiLineReader(is);
        it = new AsciiLineReaderIterator(reader);
        while (keepReading(it, currentPos)) {
            it.next();
        }

        if (!it.hasNext() || it.getPosition() > currentPos) {
            throw new IOException("Empty vcf file " + file);
        }
    }
}

From source file: org.broadinstitute.sting.gatk.hadoop.BAMRecordReader.java

License: Open Source License

@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    final FileVirtualSplit split = (FileVirtualSplit) spl;

    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(ctx.getConfiguration());

    final FSDataInputStream in = fs.open(file);
    codec = new BAMRecordCodec(new SAMFileReader(in).getFileHeader());

    in.seek(0);
    bci = new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

    final long virtualStart = split.getStartVirtualOffset();

    fileStart = virtualStart >>> 16;
    virtualEnd = split.getEndVirtualOffset();

    bci.seek(virtualStart);
    codec.setInputStream(bci);
}

From source file: org.broadinstitute.sting.gatk.hadoop.hadoopsrc.LineRecordReader.java

License: Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    XFileSplit split = (XFileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    this.start = split.getStart();
    fileIn.seek(this.start);
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    in = new LineReader(fileIn, job);
    this.pos = start;
    this.end = Long.MAX_VALUE;
}

From source file: org.broadinstitute.sting.gatk.hadoop.LociRecordReader.java

License: Open Source License

public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    final FileVirtualSplit split = (FileVirtualSplit) spl;

    file = split.getPath();
    fs = file.getFileSystem(ctx.getConfiguration());

    final FSDataInputStream in = fs.open(file);
    codec = new BAMRecordCodec(new SAMFileReader(in).getFileHeader());

    in.seek(0);
    bci = new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

    virtualStart = split.getStartVirtualOffset();
    fileStart = virtualStart >>> 16;
    virtualEnd = split.getEndVirtualOffset();
    fileEnd = virtualEnd >>> 16;
    idx = new SplittingBAMIndex(file.getFileSystem(ctx.getConfiguration()).open(getIdxPath(file)));
    codec.setInputStream(bci);
    bci.seek(virtualStart);

    JobConf job = new JobConf(ctx.getConfiguration());
    jobDir = new String(job.getJobLocalDir());
    attemptID = ctx.getTaskAttemptID().toString();
}