// Java tutorial
// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Hadoop-BAM.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
package fi.tkk.ics.hadoop.bam; import hbparquet.hadoop.util.ContextUtil; import java.io.IOException; import java.io.InputStream; import java.nio.charset.CharacterCodingException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import fi.tkk.ics.hadoop.bam.FormatConstants.BaseQualityEncoding; import fi.tkk.ics.hadoop.bam.util.ConfHelper; /** * Reads the Illumina qseq sequence format. * Key: instrument, run number, lane, tile, xpos, ypos, read number, delimited by ':' characters. * Value: a SequencedFragment object representing the entry. */ public class QseqInputFormat extends FileInputFormat<Text, SequencedFragment> { public static final String CONF_BASE_QUALITY_ENCODING = "hbam.qseq-input.base-quality-encoding"; public static final String CONF_FILTER_FAILED_QC = "hbam.qseq-input.filter-failed-qc"; public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "illumina"; public static class QseqRecordReader extends RecordReader<Text, SequencedFragment> { /* * qseq format: * 11 tab-separated columns * * 1) Instrument * 2) Run id * 3) Lane number * 4) Tile number * 5) X pos * 6) Y pos * 7) Index sequence (0 for runs without multiplexing) * 8) Read Number * 9) Base Sequence * 10) Base Quality * 11) Filter: did the read pass filtering? 0 - No, 1 - Yes. */ // start: first valid data index private long start; // end: first index value beyond the slice, i.e. 
slice is in range [start,end) private long end; // pos: current position in file private long pos; // file: the file being read private Path file; private LineReader lineReader; private InputStream inputStream; private Text currentKey = new Text(); private SequencedFragment currentValue = new SequencedFragment(); private Text buffer = new Text(); private static final int NUM_QSEQ_COLS = 11; // for these, we have one per qseq field private int[] fieldPositions = new int[NUM_QSEQ_COLS]; private int[] fieldLengths = new int[NUM_QSEQ_COLS]; private BaseQualityEncoding qualityEncoding; private boolean filterFailedQC = false; private static final String Delim = "\t"; // How long can a qseq line get? public static final int MAX_LINE_LENGTH = 20000; public QseqRecordReader(Configuration conf, FileSplit split) throws IOException { setConf(conf); file = split.getPath(); start = split.getStart(); end = start + split.getLength(); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(file); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(file); if (codec == null) // no codec. Uncompressed file. { positionAtFirstRecord(fileIn); inputStream = fileIn; } else { // compressed file if (start != 0) throw new RuntimeException( "Start position for compressed file is not 0! (found " + start + ")"); inputStream = codec.createInputStream(fileIn); end = Long.MAX_VALUE; // read until the end of the file } lineReader = new LineReader(inputStream); } /* * Position the input stream at the start of the first record. */ private void positionAtFirstRecord(FSDataInputStream stream) throws IOException { if (start > 0) { // Advance to the start of the first line in our slice. // We use a temporary LineReader to read a partial line and find the // start of the first one on or after our starting position. 
// In case our slice starts right at the beginning of a line, we need to back // up by one position and then discard the first line. start -= 1; stream.seek(start); LineReader reader = new LineReader(stream); int bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); start = start + bytesRead; stream.seek(start); } // else // if start == 0 we're starting at the beginning of a line pos = start; } protected void setConf(Configuration conf) { String encoding = conf.get(QseqInputFormat.CONF_BASE_QUALITY_ENCODING, conf.get(FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING, CONF_BASE_QUALITY_ENCODING_DEFAULT)); if ("illumina".equals(encoding)) qualityEncoding = BaseQualityEncoding.Illumina; else if ("sanger".equals(encoding)) qualityEncoding = BaseQualityEncoding.Sanger; else throw new RuntimeException("Unknown input base quality encoding value " + encoding); filterFailedQC = ConfHelper.parseBoolean(conf.get(QseqInputFormat.CONF_FILTER_FAILED_QC, conf.get(FormatConstants.CONF_INPUT_FILTER_FAILED_QC)), false); } /** * Added to use mapreduce API. */ public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { } /** * Added to use mapreduce API. */ public Text getCurrentKey() { return currentKey; } /** * Added to use mapreduce API. */ public SequencedFragment getCurrentValue() { return currentValue; } /** * Added to use mapreduce API. */ public boolean nextKeyValue() throws IOException, InterruptedException { return next(currentKey, currentValue); } /** * Close this RecordReader to future operations. */ public void close() throws IOException { inputStream.close(); } /** * Create an object of the appropriate type to be used as a key. */ public Text createKey() { return new Text(); } /** * Create an object of the appropriate type to be used as a value. */ public SequencedFragment createValue() { return new SequencedFragment(); } /** * Returns the current position in the input. 
*/ public long getPos() { return pos; } /** * How much of the input has the RecordReader consumed i.e. */ public float getProgress() { if (start == end) return 1.0f; else return Math.min(1.0f, (pos - start) / (float) (end - start)); } public String makePositionMessage(long pos) { return file.toString() + ":" + pos; } public String makePositionMessage() { return file.toString() + ":" + pos; } /* * Read a single record. * * Reads a single line of input and scans it with scanQseqLine, which * sets key and value accordingly. The method updates this.pos. * * @return The number of bytes read. If no bytes were read, the EOF was reached. */ private int lowLevelQseqRead(Text key, SequencedFragment value) throws IOException { int bytesRead = lineReader.readLine(buffer, MAX_LINE_LENGTH); pos += bytesRead; if (bytesRead >= MAX_LINE_LENGTH) { String line; try { line = Text.decode(buffer.getBytes(), 0, 500); } catch (java.nio.charset.CharacterCodingException e) { line = "(line not convertible to printable format)"; } throw new RuntimeException("found abnormally large line (length " + bytesRead + ") at " + makePositionMessage(pos - bytesRead) + ": " + line); } else if (bytesRead > 0) scanQseqLine(buffer, key, value); return bytesRead; } /** * Reads the next key/value pair from the input for processing. 
*/ public boolean next(Text key, SequencedFragment value) throws IOException { if (pos >= end) return false; // past end of slice int bytesRead = 0; boolean goodRecord; do { bytesRead = lowLevelQseqRead(key, value); // if bytesRead <= 0 EOF has been reached goodRecord = (bytesRead > 0) && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); } while (bytesRead > 0 && !goodRecord); if (goodRecord) // post process the record only if it's going to be used { try { postProcessSequencedFragment(value); } catch (FormatException e) { throw new FormatException(e.getMessage() + " Position: " + makePositionMessage(this.pos - bytesRead) + "; line: " + buffer); // last line read is still in the buffer } } return goodRecord; } /* * Scans the text line to find the position and the lengths of the fields * within it. The positions and lengths are saved into the instance arrays * 'fieldPositions' and 'fieldLengths'. * * @exception FormatException Line doesn't have the expected number of fields. */ private void setFieldPositionsAndLengths(Text line) { int pos = 0; // the byte position within the record int fieldno = 0; // the field index within the record while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) // iterate over each field { int endpos = line.find(Delim, pos); // the field's end position if (endpos < 0) endpos = line.getLength(); fieldPositions[fieldno] = pos; fieldLengths[fieldno] = endpos - pos; pos = endpos + 1; // the next starting position is the current end + 1 fieldno += 1; } if (fieldno != NUM_QSEQ_COLS) throw new FormatException("found " + fieldno + " fields instead of 11 at " + makePositionMessage(this.pos - line.getLength()) + ". Line: " + line); } private void scanQseqLine(Text line, Text key, SequencedFragment fragment) { setFieldPositionsAndLengths(line); // Build the key. We concatenate all fields from 0 to 5 (machine to y-pos) // and then the read number, replacing the tabs with colons. 
key.clear(); // append up and including field[5] key.append(line.getBytes(), 0, fieldPositions[5] + fieldLengths[5]); // replace tabs with : byte[] bytes = key.getBytes(); int temporaryEnd = key.getLength(); for (int i = 0; i < temporaryEnd; ++i) if (bytes[i] == '\t') bytes[i] = ':'; // append the read number key.append(line.getBytes(), fieldPositions[7] - 1, fieldLengths[7] + 1); // +/- 1 to catch the preceding tab. // convert the tab preceding the read number into a : key.getBytes()[temporaryEnd] = ':'; // now the fragment try { fragment.clear(); fragment.setInstrument(Text.decode(line.getBytes(), fieldPositions[0], fieldLengths[0])); fragment.setRunNumber( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[1], fieldLengths[1]))); //fragment.setFlowcellId(); fragment.setLane( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[2], fieldLengths[2]))); fragment.setTile( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[3], fieldLengths[3]))); fragment.setXpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[4], fieldLengths[4]))); fragment.setYpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[5], fieldLengths[5]))); fragment.setRead( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[7], fieldLengths[7]))); fragment.setFilterPassed(line.getBytes()[fieldPositions[10]] != '0'); //fragment.setControlNumber(); if (fieldLengths[6] > 0 && line.getBytes()[fieldPositions[6]] == '0') // 0 is a null index sequence fragment.setIndexSequence(null); else fragment.setIndexSequence( Text.decode(line.getBytes(), fieldPositions[6], fieldLengths[6]).replace('.', 'N')); } catch (CharacterCodingException e) { throw new FormatException("Invalid character format at " + makePositionMessage(this.pos - line.getLength()) + "; line: " + line); } fragment.getSequence().append(line.getBytes(), fieldPositions[8], fieldLengths[8]); fragment.getQuality().append(line.getBytes(), fieldPositions[9], fieldLengths[9]); } /* * This 
method applies some transformations to the read and quality data. * * <ul> * <li>'.' in the read are converted to 'N'</li> * <li>the base quality encoding is converted to 'sanger', unless otherwise * requested by the configuration.</li> * </ul> * * @exception FormatException Thrown if the record contains base quality scores * outside the range allowed by the format. */ private void postProcessSequencedFragment(SequencedFragment fragment) { byte[] bytes = fragment.getSequence().getBytes(); // replace . with N for (int i = 0; i < fieldLengths[8]; ++i) if (bytes[i] == '.') bytes[i] = 'N'; if (qualityEncoding == BaseQualityEncoding.Illumina) { // convert illumina to sanger scale SequencedFragment.convertQuality(fragment.getQuality(), BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger); } else // sanger qualities. { int outOfRangeElement = SequencedFragment.verifyQuality(fragment.getQuality(), BaseQualityEncoding.Sanger); if (outOfRangeElement >= 0) { throw new FormatException( "qseq base quality score out of range for Sanger Phred+33 format (found " + (fragment.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" + "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n"); } } } } @Override public boolean isSplitable(JobContext context, Path path) { CompressionCodec codec = new CompressionCodecFactory(ContextUtil.getConfiguration(context)).getCodec(path); return codec == null; } public RecordReader<Text, SequencedFragment> createRecordReader(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { context.setStatus(genericSplit.toString()); return new QseqRecordReader(ContextUtil.getConfiguration(context), (FileSplit) genericSplit); // cast as per example in TextInputFormat } }