fi.tkk.ics.hadoop.bam.FastqInputFormat.java Source code

Introduction

Here is the source code for fi.tkk.ics.hadoop.bam.FastqInputFormat.java
Source

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Hadoop-BAM.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

package fi.tkk.ics.hadoop.bam;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.InputStream;
import java.io.IOException;
import java.io.EOFException;

import java.util.regex.*;

import fi.tkk.ics.hadoop.bam.FormatConstants.BaseQualityEncoding;
import fi.tkk.ics.hadoop.bam.util.ConfHelper;

import hbparquet.hadoop.util.ContextUtil;

public class FastqInputFormat extends FileInputFormat<Text, SequencedFragment> {
    public static final String CONF_BASE_QUALITY_ENCODING = "hbam.fastq-input.base-quality-encoding";
    public static final String CONF_FILTER_FAILED_QC = "hbam.fastq-input.filter-failed-qc";
    public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "sanger";

    public static class FastqRecordReader extends RecordReader<Text, SequencedFragment> {
        /*
         * fastq format:
         * <fastq>   :=   <block>+
         * <block>   :=   @<seqname>\n<seq>\n+[<seqname>]\n<qual>\n
         * <seqname>   :=   [A-Za-z0-9_.:-]+
         * <seq>   :=   [A-Za-z\n\.~]+
         * <qual>   :=   [!-~\n]+
         *
         * LP: this format is broken, no?  You can have multi-line sequence and quality strings,
         * and the quality encoding includes '@' in its valid character range.  So how should one
         * distinguish between \n@ as a record delimiter and and \n@ as part of a multi-line
         * quality string?
         *
         * For now I'm going to assume single-line sequences.  This works for our sequencing
         * application.  We'll see if someone complains in other applications.
         */

        // start:  first valid data index
        private long start;
        // end:  first index value beyond the slice, i.e. slice is in range [start,end)
        private long end;
        // pos: current position in file
        private long pos;
        // file:  the file being read
        private Path file;

        private LineReader lineReader;
        private InputStream inputStream;
        private Text currentKey = new Text();
        private SequencedFragment currentValue = new SequencedFragment();

        /* If true, will scan the identifier for read data as specified in the Casava
         * users' guide v1.8:
         * @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence>
         * After the first name that doesn't match lookForIlluminaIdentifier will be
         * set to false and no further scanning will be done.
         */
        private boolean lookForIlluminaIdentifier = true;
        private static final Pattern ILLUMINA_PATTERN = Pattern
                .compile("([^:]+):(\\d+):([^:]*):(\\d+):(\\d+):(-?\\d+):(-?\\d+)\\s+([123]):([YN]):(\\d+):(.*)");

        private Text buffer = new Text();

        private BaseQualityEncoding qualityEncoding;
        private boolean filterFailedQC = false;

        // How long can a read get?
        private static final int MAX_LINE_LENGTH = 10000;

        public FastqRecordReader(Configuration conf, FileSplit split) throws IOException {
            setConf(conf);
            file = split.getPath();
            start = split.getStart();
            end = start + split.getLength();

            FileSystem fs = file.getFileSystem(conf);
            FSDataInputStream fileIn = fs.open(file);

            CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
            CompressionCodec codec = codecFactory.getCodec(file);

            if (codec == null) // no codec.  Uncompressed file.
            {
                positionAtFirstRecord(fileIn);
                inputStream = fileIn;
            } else { // compressed file
                if (start != 0)
                    throw new RuntimeException(
                            "Start position for compressed file is not 0! (found " + start + ")");

                inputStream = codec.createInputStream(fileIn);
                end = Long.MAX_VALUE; // read until the end of the file
            }

            lineReader = new LineReader(inputStream);
        }

        protected void setConf(Configuration conf) {
            String encoding = conf.get(FastqInputFormat.CONF_BASE_QUALITY_ENCODING,
                    conf.get(FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING,
                            FastqInputFormat.CONF_BASE_QUALITY_ENCODING_DEFAULT));

            if ("illumina".equals(encoding))
                qualityEncoding = BaseQualityEncoding.Illumina;
            else if ("sanger".equals(encoding))
                qualityEncoding = BaseQualityEncoding.Sanger;
            else
                throw new RuntimeException("Unknown input base quality encoding value " + encoding);

            filterFailedQC = ConfHelper.parseBoolean(conf.get(FastqInputFormat.CONF_FILTER_FAILED_QC,
                    conf.get(FormatConstants.CONF_INPUT_FILTER_FAILED_QC)), false);
        }

        /*
         * Position the input stream at the start of the first record.
         */
        private void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
            if (start > 0) {
                // Advance to the start of the first record
                // We use a temporary LineReader to read lines until we find the
                // position of the right one.  We then seek the file to that position.
                stream.seek(start);
                LineReader reader = new LineReader(stream);

                int bytesRead = 0;
                do {
                    bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                    if (bytesRead > 0 && (buffer.getLength() <= 0 || buffer.getBytes()[0] != '@'))
                        start += bytesRead;
                    else {
                        // line starts with @.  Read two more and verify that it starts with a +
                        //
                        // If this isn't the start of a record, we want to backtrack to its end
                        long backtrackPosition = start + bytesRead;

                        bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                        bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                        if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+')
                            break; // all good!
                        else {
                            // backtrack to the end of the record we thought was the start.
                            start = backtrackPosition;
                            stream.seek(start);
                            reader = new LineReader(stream);
                        }
                    }
                } while (bytesRead > 0);

                stream.seek(start);
            }
            // else
            //   if start == 0 we presume it starts with a valid fastq record
            pos = start;
        }

        /**
         * Added to use mapreduce API.
         */
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
        }

        /**
         * Added to use mapreduce API.
         */
        public Text getCurrentKey() {
            return currentKey;
        }

        /**
         * Added to use mapreduce API.
         */
        public SequencedFragment getCurrentValue() {
            return currentValue;
        }

        /**
         * Added to use mapreduce API.
         */
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return next(currentKey, currentValue);
        }

        /**
         * Close this RecordReader to future operations.
         */
        public void close() throws IOException {
            inputStream.close();
        }

        /**
         * Create an object of the appropriate type to be used as a key.
         */
        public Text createKey() {
            return new Text();
        }

        /**
         * Create an object of the appropriate type to be used as a value.
         */
        public SequencedFragment createValue() {
            return new SequencedFragment();
        }

        /**
         * Returns the current position in the input.
         */
        public long getPos() {
            return pos;
        }

        /**
         * How much of the input has the RecordReader consumed i.e.
         */
        public float getProgress() {
            if (start == end)
                return 1.0f;
            else
                return Math.min(1.0f, (pos - start) / (float) (end - start));
        }

        public String makePositionMessage() {
            return file.toString() + ":" + pos;
        }

        protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException {
            // ID line
            long skipped = lineReader.skip(1); // skip @
            pos += skipped;
            if (skipped == 0)
                return false; // EOF

            // ID
            readLineInto(key);
            // sequence
            value.clear();
            readLineInto(value.getSequence());
            readLineInto(buffer);
            if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+')
                throw new RuntimeException("unexpected fastq line separating sequence and quality at "
                        + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key);
            readLineInto(value.getQuality());

            // look for the Illumina-formatted name.  Once it isn't found lookForIlluminaIdentifier will be set to false
            lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value);
            if (!lookForIlluminaIdentifier)
                scanNameForReadNumber(key, value);
            return true;
        }

        /**
         * Reads the next key/value pair from the input for processing.
         */
        public boolean next(Text key, SequencedFragment value) throws IOException {
            if (pos >= end)
                return false; // past end of slice
            try {
                boolean gotData;
                boolean goodRecord;
                do {
                    gotData = lowLevelFastqRead(key, value);
                    goodRecord = gotData
                            && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed());
                } while (gotData && !goodRecord);

                if (goodRecord) // goodRecord falso also when we couldn't read any more data
                {
                    if (qualityEncoding == BaseQualityEncoding.Illumina) {
                        try {
                            // convert illumina to sanger scale
                            SequencedFragment.convertQuality(value.getQuality(), BaseQualityEncoding.Illumina,
                                    BaseQualityEncoding.Sanger);
                        } catch (FormatException e) {
                            throw new FormatException(e.getMessage() + " Position: " + makePositionMessage()
                                    + "; Sequence ID: " + key);
                        }
                    } else // sanger qualities.
                    {
                        int outOfRangeElement = SequencedFragment.verifyQuality(value.getQuality(),
                                BaseQualityEncoding.Sanger);
                        if (outOfRangeElement >= 0) {
                            throw new FormatException(
                                    "fastq base quality score out of range for Sanger Phred+33 format (found "
                                            + (value.getQuality().getBytes()[outOfRangeElement]
                                                    - FormatConstants.SANGER_OFFSET)
                                            + ").\n"
                                            + "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n"
                                            + "Position: " + makePositionMessage() + "; Sequence ID: " + key);
                        }
                    }
                }
                return goodRecord;
            } catch (EOFException e) {
                throw new RuntimeException("unexpected end of file in fastq record at " + makePositionMessage()
                        + ".  Id: " + key.toString());
            }
        }

        private void scanNameForReadNumber(Text name, SequencedFragment fragment) {
            // look for a /[0-9] at the end of the name
            if (name.getLength() >= 2) {
                byte[] bytes = name.getBytes();
                int last = name.getLength() - 1;

                if (bytes[last - 1] == '/' && bytes[last] >= '0' && bytes[last] <= '9')
                    fragment.setRead(bytes[last] - '0');
            }
        }

        private boolean scanIlluminaId(Text name, SequencedFragment fragment) {
            Matcher m = ILLUMINA_PATTERN.matcher(name.toString());
            boolean matches = m.matches();
            if (matches) {
                fragment.setInstrument(m.group(1));
                fragment.setRunNumber(Integer.parseInt(m.group(2)));
                fragment.setFlowcellId(m.group(3));
                fragment.setLane(Integer.parseInt(m.group(4)));
                fragment.setTile(Integer.parseInt(m.group(5)));
                fragment.setXpos(Integer.parseInt(m.group(6)));
                fragment.setYpos(Integer.parseInt(m.group(7)));
                fragment.setRead(Integer.parseInt(m.group(8)));
                fragment.setFilterPassed("N".equals(m.group(9)));
                fragment.setControlNumber(Integer.parseInt(m.group(10)));
                fragment.setIndexSequence(m.group(11));
            }
            return matches;
        }

        private int readLineInto(Text dest) throws EOFException, IOException {
            int bytesRead = lineReader.readLine(dest, MAX_LINE_LENGTH);
            if (bytesRead <= 0)
                throw new EOFException();
            pos += bytesRead;
            return bytesRead;
        }
    }

    @Override
    public boolean isSplitable(JobContext context, Path path) {
        CompressionCodec codec = new CompressionCodecFactory(ContextUtil.getConfiguration(context)).getCodec(path);
        return codec == null;
    }

    public RecordReader<Text, SequencedFragment> createRecordReader(InputSplit genericSplit,
            TaskAttemptContext context) throws IOException, InterruptedException {
        context.setStatus(genericSplit.toString());
        return new FastqRecordReader(ContextUtil.getConfiguration(context), (FileSplit) genericSplit); // cast as per example in TextInputFormat
    }
}