// Copyright (c) 2012 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

// File created: 2012-02-02 11:39:46

package org.seqdoop.hadoop_bam;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMFormatException;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMTextHeaderCodec;
import htsjdk.samtools.ValidationStringency;

import org.seqdoop.hadoop_bam.util.SAMHeaderReader;

import hbparquet.hadoop.util.ContextUtil;

/** See {@link BAMRecordReader} for the meaning of the key. */
public class SAMRecordReader
    extends RecordReader<LongWritable, SAMRecordWritable>
{
    private LongWritable key = new LongWritable();
    private SAMRecordWritable record = new SAMRecordWritable();

    private FSDataInputStream input;
    private SAMRecordIterator iterator;
    private long start, end;
    private boolean isInitialized = false;

    private WorkaroundingStream waInput;

    @Override public void initialize(InputSplit spl, TaskAttemptContext ctx)
        throws IOException
    {
        // This method should only be called once (see Hadoop API). However,
        // there seems to be disagreement between implementations that call
        // initialize() and Hadoop-BAM's own code that relies on
        // {@link SAMInputFormat} to call initialize() when the reader is
        // created. Therefore we add this check for the time being.
        if (isInitialized)
            close();
        isInitialized = true;

        final FileSplit split = (FileSplit) spl;

        this.start =         split.getStart();
        this.end   = start + split.getLength();

        final Configuration conf = ContextUtil.getConfiguration(ctx);

        final ValidationStringency stringency =
            SAMHeaderReader.getValidationStringency(conf);

        final Path file = split.getPath();
        final FileSystem fs = file.getFileSystem(conf);

        input = fs.open(file);

        // SAMFileReader likes to make our life difficult, so complexity
        // ensues.
        //
        // The basic problem is that SAMFileReader buffers its input
        // internally, which causes two issues.
        //
        // Issue #1 is that SAMFileReader requires that its input begins with
        // a SAM header. This is not fine for reading from the middle of a
        // file. Because of the buffering, if we have the reader read the
        // header from the beginning of the file and then seek to where we
        // want to read records from, it'll have buffered some records from
        // immediately after the header, which is no good. Thus we need to
        // read the header separately and then use a custom stream that wraps
        // the input stream, inserting the header at the beginning of it.
        // (Note the spurious re-encoding of the header so that the reader can
        // decode it.)
        //
        // Issue #2 is handling the boundary between two input splits. The
        // best way seems to be the classic "in later splits, skip the first
        // line, and in every split finish reading a partial line at the end
        // of the split", but that latter part is a bit complicated here. Due
        // to the buffering, we can easily overshoot: as soon as the stream
        // moves past the end of the split, SAMFileReader has buffered some
        // records past the end. The basic fix here is to have our custom
        // stream count the number of bytes read and to stop after the split
        // size. Unfortunately this prevents us from reading the last partial
        // line, so our stream actually allows reading to the next newline
        // after the actual end.

        ValidationStringency origStringency = null;
        try {
            if (stringency != null) {
                origStringency =
                    SAMFileReader.getDefaultValidationStringency();
                SAMFileReader.setDefaultValidationStringency(stringency);
            }

            final SAMFileHeader header =
                new SAMFileReader(input, false).getFileHeader();

            waInput = new WorkaroundingStream(input, header);

            final boolean firstSplit = this.start == 0;

            if (firstSplit) {
                // Skip the header because we already have it, and adjust the
                // start to match.
                final int headerLength = waInput.getRemainingHeaderLength();
                input.seek(headerLength);
                this.start += headerLength;
            } else
                input.seek(--this.start);

            // Creating the iterator causes reading from the stream, so make
            // sure to start counting this early.
            waInput.setLength(this.end - this.start);

            iterator = new SAMFileReader(waInput, false).iterator();

            if (!firstSplit) {
                // Skip the first line, it'll be handled with the previous
                // split.
                try {
                    if (iterator.hasNext())
                        iterator.next();
                } catch (SAMFormatException e) {}
            }
        } finally {
            if (origStringency != null)
                SAMFileReader.setDefaultValidationStringency(origStringency);
        }
    }

    @Override public void close() throws IOException { iterator.close(); }

    @Override public float getProgress() throws IOException {
        final long pos = input.getPos();
        if (pos >= end)
            return 1;
        else
            return (float) (pos - start) / (end - start);
    }

    @Override public LongWritable      getCurrentKey()   { return key; }
    @Override public SAMRecordWritable getCurrentValue() { return record; }

    @Override public boolean nextKeyValue() {
        if (!iterator.hasNext())
            return false;

        final SAMRecord r = iterator.next();
        key.set(BAMRecordReader.getKey(r));
        record.set(r);
        return true;
    }
}

// See the long comment in SAMRecordReader.initialize() for what this does.
class WorkaroundingStream extends InputStream {
    private final InputStream stream, headerStream;
    private boolean headerRemaining;
    private long length;
    private int headerLength;

    private boolean lookingForEOL = false,
                    foundEOL      = false,
                    strippingAts  = false; // HACK, see read(byte[], int, int).
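
    // Added summary (not in the original source): how the fields above
    // cooperate. headerStream serves the re-encoded header before any bytes
    // of the wrapped stream, so SAMFileReader always sees a header first
    // (Issue #1). length counts down how many bytes this split may still
    // consume; lookingForEOL/foundEOL let a read that hits the boundary run
    // on to the next '\n' so the final partial line can be completed (Issue
    // #2). strippingAts marks that the header has been fully served and that
    // leading '@' bytes from a mid-line split start must be discarded.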
    public WorkaroundingStream(InputStream stream, SAMFileHeader header) {
        this.stream = stream;

        String text = header.getTextHeader();
        if (text == null) {
            StringWriter writer = new StringWriter();
            new SAMTextHeaderCodec().encode(writer, header);
            text = writer.toString();
        }
        byte[] b;
        try {
            b = text.getBytes("UTF-8");
        } catch (UnsupportedEncodingException e) {
            b = null;
            assert false; // UTF-8 support is mandatory, so this cannot happen.
        }
        headerRemaining = true;
        headerLength = b.length;
        headerStream = new ByteArrayInputStream(b);

        this.length = Long.MAX_VALUE;
    }

    public void setLength(long length) { this.length = length; }

    public int getRemainingHeaderLength() { return headerLength; }

    private byte[] readBuf = new byte[1];

    @Override public int read() throws IOException {
        for (;;) switch (read(readBuf)) {
            case  0: continue;
            // Mask to avoid sign extension: InputStream.read() must return
            // the byte as an int in the range 0-255, not a negative value.
            case  1: return readBuf[0] & 0xff;
            case -1: return -1;
        }
    }

    @Override public int read(byte[] buf, int off, int len)
        throws IOException
    {
        if (!headerRemaining)
            return streamRead(buf, off, len);

        int h;
        if (strippingAts)
            h = 0;
        else {
            h = headerStream.read(buf, off, len);
            if (h == -1) {
                // This should only happen when there was no header at all, in
                // which case Picard doesn't throw an error until trying to
                // read a record, for some reason. (Perhaps an oversight.)
                // Thus we need to handle that case here.
                assert (headerLength == 0);
                h = 0;
            } else if (h < headerLength) {
                headerLength -= h;
                return h;
            }
            strippingAts = true;
            headerStream.close();
        }

        final int newOff = off + h;
        int s = streamRead(buf, newOff, len - h);

        if (s <= 0)
            return strippingAts ? s : h;

        // HACK HACK HACK.
        //
        // We gave all of the header, which means that SAMFileReader is still
        // trying to read more header lines. If we're in a split that isn't at
        // the start of the SAM file, we could be in the middle of a line and
        // thus see @ characters at the start of our data. Then SAMFileReader
        // would try to understand those as header lines and the end result is
        // that it throws an error, since they aren't actually header lines,
        // they're just part of a SAM record.
        //
        // So, if we're done with the header, strip all @ characters we see.
        // Thus SAMFileReader will stop reading the header there and won't
        // throw an exception until we use its SAMRecordIterator, at which
        // point we can catch it, because we know to expect it.
        //
        // headerRemaining remains true while it's possible that there are
        // still @ characters coming.

        int i = newOff - 1;
        while (buf[++i] == '@' && --s > 0);

        if (i != newOff)
            System.arraycopy(buf, i, buf, newOff, s);

        headerRemaining = s == 0;
        return h + s;
    }

    private int streamRead(byte[] buf, int off, int len) throws IOException {
        if (len > length) {
            if (foundEOL)
                return 0;
            lookingForEOL = true;
        }
        int n = stream.read(buf, off, len);
        if (n > 0) {
            n = tryFindEOL(buf, off, n);
            length -= n;
        }
        return n;
    }

    private int tryFindEOL(byte[] buf, int off, int len) {
        assert !foundEOL;

        if (!lookingForEOL || len < length)
            return len;

        // Find the first EOL between length and len.

        // len >= length, so length fits in an int.
        int i = Math.max(0, (int) length - 1);

        for (; i < len; ++i) {
            if (buf[off + i] == '\n') {
                foundEOL = true;
                return i + 1;
            }
        }
        return len;
    }

    @Override public void close() throws IOException {
        stream.close();
    }

    @Override public int available() throws IOException {
        return headerRemaining ? headerStream.available()
                               : stream.available();
    }
}
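
// ---------------------------------------------------------------------------
// Usage sketch (added; not part of the original file). SAMRecordReader is not
// normally constructed by hand: SAMInputFormat creates one per input split.
// The minimal map-only job below counts SAM records to show the plumbing. The
// class name, mapper name, and job name are hypothetical, and fully qualified
// names are used so the imports above stay untouched.
class SAMRecordCountExample {
    static class CountMapper extends org.apache.hadoop.mapreduce.Mapper<
        LongWritable, SAMRecordWritable,
        org.apache.hadoop.io.NullWritable, LongWritable>
    {
        private long count = 0;

        // Each input value is a SAMRecordWritable produced by
        // SAMRecordReader.nextKeyValue(); value.get() yields the SAMRecord.
        @Override protected void map(
            LongWritable key, SAMRecordWritable value, Context ctx)
        {
            ++count;
        }

        @Override protected void cleanup(Context ctx)
            throws IOException, InterruptedException
        {
            ctx.write(org.apache.hadoop.io.NullWritable.get(),
                      new LongWritable(count));
        }
    }

    public static void main(String[] args) throws Exception {
        final org.apache.hadoop.mapreduce.Job job =
            org.apache.hadoop.mapreduce.Job.getInstance(
                new Configuration(), "sam-record-count");
        job.setJarByClass(SAMRecordCountExample.class);

        // SAMInputFormat splits the SAM file and instantiates a
        // SAMRecordReader for each split.
        job.setInputFormatClass(SAMInputFormat.class);
        job.setMapperClass(CountMapper.class);
        job.setNumReduceTasks(0); // Map-only: mapper output is the result.
        job.setOutputKeyClass(org.apache.hadoop.io.NullWritable.class);
        job.setOutputValueClass(LongWritable.class);

        org.apache.hadoop.mapreduce.lib.input.FileInputFormat
            .addInputPath(job, new Path(args[0]));
        org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
            .setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}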