List of usage examples for org.apache.hadoop.fs FSDataInputStream seek
@Override public void seek(long desired) throws IOException
From source file:org.archive.wayback.resourcestore.resourcefile.ResourceFactory.java
License:Apache License
public static Resource getResource(URI uri, long offset) throws IOException, ResourceNotAvailableException, URISyntaxException { Resource r = null;/*from ww w . j a v a2s . c o m*/ // FIXME: Put this into static initialization? or require // explicit init during startup? Or just create it each // time? // // Attempt at fix: Only initializing file system once if (hdfsSys == null) { Configuration conf = new Configuration(); // Assume that the URL is a fully-qualified HDFS url, like: // hdfs://namenode:6100/collections/foo/some.arc.gz // create fs with just the default URL URI defaultURI = new URI(uri.getScheme() + "://" + uri.getHost() + ":" + uri.getPort() + "/"); hdfsSys = FileSystem.get(defaultURI, conf); } Path path = new Path(uri.getPath()); FSDataInputStream is = hdfsSys.open(path); is.seek(offset); if (isArc(path.getName())) { ArchiveReader reader = ARCReaderFactory.get(path.getName(), is, false); r = ARCArchiveRecordToResource(reader.get(), reader); } else if (isWarc(path.getName())) { ArchiveReader reader = WARCReaderFactory.get(path.getName(), is, false); r = WARCArchiveRecordToResource(reader.get(), reader); } else { is.close(); throw new ResourceNotAvailableException("Unknown extension"); } return r; }
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/** * Builds a new record reader given a config file and an input split. * * @param conf The Hadoop configuration object. Used for gaining access * to the underlying file system./* w ww . j av a 2s. co m*/ * @param split The file split to read. */ protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException { maxLineLength = conf.getInt(MAX_READ_LENGTH_PROPERTY, DEFAULT_MAX_READ_LENGTH); file = split.getPath(); start = split.getStart(); end = start + split.getLength(); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(file); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(file); // if our codec is splittable, we can (tentatively) say that // we too are splittable. // // if we get a bgzfenhancedcodec, the codec might not actually // be splittable. however, if we get a non-splittable gz file, // several things happen: // // 1. the input format will detect this, and will not split the // file // 2. the bgzfenhancedcodec will check the underlying data type // (BGZF vs GZIP) at input stream creation time, and will // apply the appropriate codec. // // if we get an unsplittable codec, really all that we do differently // is skip the positioning check, since we know that we're at the // start of the file and can get to reading immediately isSplittable = (codec instanceof SplittableCompressionCodec); if (codec == null) { // no codec. Uncompressed file. int bytesToSkip = positionAtFirstRecord(fileIn, null); inputStream = fileIn; inputStream.skip(bytesToSkip); lineReader = new LineReader(inputStream); } else if (isSplittable) { // file is compressed, but uses a splittable codec isCompressed = true; int bytesToSkip = positionAtFirstRecord(fileIn, codec); // apparent fun finding: if you don't seek back to 0, // SplittableCompressionCodec.createInputStream will seek in the stream // to a start position, and funny things happen.. 
fileIn.seek(0); inputStream = ((SplittableCompressionCodec) codec).createInputStream(fileIn, codec.createDecompressor(), start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); inputStream.skip(bytesToSkip); lineReader = new ResettableCompressedSplitLineReader((SplitCompressionInputStream) inputStream, conf); } else { // unsplittable compressed file // expect a single split, first record at offset 0 isCompressed = true; inputStream = codec.createInputStream(fileIn); end = Long.MAX_VALUE; // read until the end of the file lineReader = new LineReader(inputStream); } }
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/**
 * Position the input stream at the start of the first record.
 *
 * <p>Reads lines from the split start with a temporary {@link LineReader},
 * advancing the {@code start} field until a line is found that passes the
 * record-start checks below. The located position is stored in {@code pos};
 * {@code start} is then restored to its original value and {@code stream} is
 * sought back to it, so the caller is responsible for skipping forward by the
 * returned amount.
 *
 * @param stream The stream to reposition.
 * @param codec The compression codec of the file, or {@code null} for an
 *   uncompressed file. When non-null it is cast to
 *   {@code SplittableCompressionCodec}.
 * @return the distance in bytes between the original {@code start} and the
 *   located record start ({@code pos - originalStart}), narrowed to
 *   {@code int}.
 * @throws IOException if seeking or reading the stream fails.
 */
protected final int positionAtFirstRecord(final FSDataInputStream stream, final CompressionCodec codec)
        throws IOException {
    Text buffer = new Text();
    // remembered so that the start field can be restored before returning
    long originalStart = start;

    LineReader reader;
    if (codec == null) {
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one.  We then seek the file to that position.
        stream.seek(start);
        reader = new LineReader(stream);
    } else {
        // Unlike the codec == null case, we don't seek before creating the
        // reader, SplittableCompressionCodec.createInputStream places the
        // stream at the start of the first compression block after our
        // split start
        //
        // as noted above, we need to be at pos 0 in the stream before
        // calling this
        reader = new LineReader(((SplittableCompressionCodec) codec).createInputStream(stream, null, start, end,
                SplittableCompressionCodec.READ_MODE.BYBLOCK));
    }

    int bytesRead = 0;
    do {
        bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
        int bufferLength = buffer.getLength();
        // presumably checkBuffer reports whether the line looks like a record
        // header (starts with '@') - verify against its definition
        if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
            // not a candidate header line: move past it and keep scanning
            start += bytesRead;
        } else {
            // line starts with @.  Read two more and verify that it starts
            // with a +:
            //
            // @<readname>
            // <sequence>
            // +[readname]
            //
            // if the second line we read starts with a @, we know that
            // we've read:
            //
            // <qualities> <-- @ is a valid ASCII phred encoding
            // @<readname>
            //
            // and thus, the second read is the delimiter and we can break
            long trackForwardPosition = start + bytesRead;

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            if (buffer.getLength() > 0 && buffer.getBytes()[0] == '@') {
                // the record starts at this second line
                start = trackForwardPosition;
                break;
            } else {
                trackForwardPosition += bytesRead;
            }

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            trackForwardPosition += bytesRead;
            if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                break; // all good!
            } else {
                // third line is not a '+' separator: resume scanning after it
                start = trackForwardPosition;
            }
        }
    } while (bytesRead > 0);

    // publish the located position, then undo the changes to start and to the
    // stream position made while scanning
    pos = start;
    start = originalStart;
    stream.seek(start);
    return (int) (pos - originalStart);
}
From source file:org.bgi.flexlab.gaea.data.mapreduce.input.adaptor.AdaptorRecordReader.java
License:Open Source License
@Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; System.out.println(split.toString()); Configuration job = context.getConfiguration(); System.err.println(split.getPath().toString()); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart();//from www . jav a2 s . c om end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { in = new LineReader(codec.createInputStream(fileIn), job); end = Long.MAX_VALUE; } else { if (start != 0) { skipFirstLine = true; --start; fileIn.seek(start); } in = new LineReader(fileIn, job); } if (skipFirstLine) { // skip first line and re-establish "start". start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:org.bgi.flexlab.gaea.data.mapreduce.input.bam.GaeaBamRecordReader.java
License:Open Source License
@Override public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { // This method should only be called once (see Hadoop API). However, // there seems to be disagreement between implementations that call // initialize() and Hadoop-BAM's own code that relies on // {@link BAMInputFormat} to call initialize() when the reader is // created. Therefore we add this check for the time being. if (isInitialized) close();/*from www. ja v a 2 s .c o m*/ isInitialized = true; final Configuration conf = ContextUtil.getConfiguration(ctx); final FileVirtualSplit split = (FileVirtualSplit) spl; final Path file = split.getPath(); final FileSystem fs = file.getFileSystem(conf); this.stringency = SAMHeaderReader.getValidationStringency(conf); final FSDataInputStream in = fs.open(file); codec = new BAMRecordCodec(SAMHeaderReader.readSAMHeaderFrom(in, conf)); in.seek(0); bci = new BlockCompressedInputStream( new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file)); final long virtualStart = split.getStartVirtualOffset(); fileStart = virtualStart >>> 16; virtualEnd = split.getEndVirtualOffset(); bci.seek(virtualStart); codec.setInputStream(bci); if (GaeaBamInputFormat.DEBUG_BAM_SPLITTER) { final long recordStart = virtualStart & 0xffff; System.err.println( "XXX inizialized BAMRecordReader byte offset: " + fileStart + " record offset: " + recordStart); } }
From source file:org.bgi.flexlab.gaea.data.mapreduce.input.fastq.FastqBasicReader.java
License:Open Source License
/**
 * Creates a reader over one split of a FASTQ file.
 *
 * <p>Reads the maximum line length and the optional {@code multiSampleList}
 * setting from the job configuration, opens the split's file (through its
 * compression codec when one applies), positions the underlying
 * {@code LineReader} at the first full line of the split and then calls
 * {@code getFirstFastqLine()}.
 *
 * @param job job configuration
 * @param split the file split to read
 * @param recordDelimiter record delimiter bytes.
 *        NOTE(review): this parameter is never read in this constructor; the
 *        code consults the {@code recordDelimiterBytes} field instead - confirm
 *        whether the field was meant to be assigned from it.
 * @throws IOException if the file cannot be opened, positioned or read
 */
public FastqBasicReader(Configuration job, FileSplit split, byte[] recordDelimiter) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(split.getPath());

    String multiSampleList = job.get("multiSampleList");
    // Compare by content: "!=" on Strings tests reference identity, so an empty
    // value read from the configuration would not be recognized as empty.
    if (multiSampleList != null && !multiSampleList.isEmpty()) {
        FastqMultipleSample samplelist = new FastqMultipleSample(multiSampleList, false);
        FastqSample slist = samplelist.getID(split.getPath().toString());
        if (slist != null) {
            sampleID = String.valueOf(slist.getId());
        } else {
            sampleID = "+";
        }
    }

    start = split.getStart();
    end = split.getStart() + split.getLength();

    // open the file and seek to the start of the split
    FileSystem fs = split.getPath().getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    boolean skipFirstLine = false;
    if (codec != null) {
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(codec.createInputStream(fileIn), job);
        } else {
            in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
        }
        // compressed input: read the decompressed stream to its end
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            // back up one byte so the partial-line skip below ends at a line start
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
    }

    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }

    getFirstFastqLine();
    this.pos = start;
}
From source file:org.bgi.flexlab.gaea.data.mapreduce.input.vcf.VCFRecordReader.java
License:Open Source License
/**
 * Prepares this reader for the given split of a VCF file.
 *
 * <p>Opens the file and reads its header through {@code codec}, failing if no
 * VCF header is found. For a split that starts at offset 0 the stream is
 * rewound and lines are consumed while {@code keepReading} allows it; for any
 * other split the stream is positioned one byte before the split start and the
 * first line read is discarded.
 *
 * @param inputSplit the split to read; cast to {@code FileSplit}
 * @param ctx task context supplying the configuration
 * @throws IOException if the file cannot be read, has no VCF header, or is
 *         reported as an empty VCF file
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext ctx) throws IOException, InterruptedException {
    conf = ctx.getConfiguration();

    FileSplit fileSplit = (FileSplit) inputSplit;
    start = fileSplit.getStart();
    this.length = fileSplit.getLength();
    file = fileSplit.getPath();
    fileID = mVcfHeader.getId(file.toString());

    FSDataInputStream stream = file.getFileSystem(conf).open(file);

    reader = new AsciiLineReader(stream);
    it = new AsciiLineReaderIterator(reader);

    Object parsedHeader = codec.readHeader(it);
    boolean hasVcfHeader = parsedHeader instanceof FeatureCodecHeader
            && ((FeatureCodecHeader) parsedHeader).getHeaderValue() instanceof VCFHeader;
    if (!hasVcfHeader) {
        throw new IOException("No VCF header found in " + file);
    }

    if (start == 0) {
        // remember where the iterator stood after the header, then start over
        currentPos = it.getPosition();
        stream.seek(0);
        reader = new AsciiLineReader(stream);
        it = new AsciiLineReaderIterator(reader);
        while (keepReading(it, currentPos)) {
            it.next();
        }
        if (!it.hasNext() || it.getPosition() > currentPos) {
            throw new IOException("Empty vcf file " + file);
        }
    } else {
        // position one byte before the split start and discard the first line read
        stream.seek(start - 1);
        reader = new AsciiLineReader(stream);
        reader.readLine();
        it = new AsciiLineReaderIterator(reader);
    }
}
From source file:org.broadinstitute.sting.gatk.hadoop.BAMRecordReader.java
License:Open Source License
@Override public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { final FileVirtualSplit split = (FileVirtualSplit) spl; final Path file = split.getPath(); final FileSystem fs = file.getFileSystem(ctx.getConfiguration()); final FSDataInputStream in = fs.open(file); codec = new BAMRecordCodec(new SAMFileReader(in).getFileHeader()); in.seek(0); bci = new BlockCompressedInputStream( new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file)); final long virtualStart = split.getStartVirtualOffset(); fileStart = virtualStart >>> 16; virtualEnd = split.getEndVirtualOffset(); bci.seek(virtualStart);//ww w .j a va2s .c o m codec.setInputStream(bci); }
From source file:org.broadinstitute.sting.gatk.hadoop.hadoopsrc.LineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { XFileSplit split = (XFileSplit) genericSplit; Configuration job = context.getConfiguration(); final Path file = split.getPath(); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); this.start = split.getStart(); fileIn.seek(this.start); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); in = new LineReader(fileIn, job); this.pos = start; this.end = Long.MAX_VALUE; }
From source file:org.broadinstitute.sting.gatk.hadoop.LociRecordReader.java
License:Open Source License
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { final FileVirtualSplit split = (FileVirtualSplit) spl; file = split.getPath();//w w w . j av a 2 s .co m fs = file.getFileSystem(ctx.getConfiguration()); final FSDataInputStream in = fs.open(file); codec = new BAMRecordCodec(new SAMFileReader(in).getFileHeader()); in.seek(0); bci = new BlockCompressedInputStream( new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file)); virtualStart = split.getStartVirtualOffset(); fileStart = virtualStart >>> 16; virtualEnd = split.getEndVirtualOffset(); fileEnd = virtualEnd >>> 16; idx = new SplittingBAMIndex(file.getFileSystem(ctx.getConfiguration()).open(getIdxPath(file))); codec.setInputStream(bci); bci.seek(virtualStart); JobConf job = new JobConf(ctx.getConfiguration()); jobDir = new String(job.getJobLocalDir()); attemptID = ctx.getTaskAttemptID().toString(); }