com.google.cloud.genomics.dataflow.readers.bam.Reader.java Source code

Introduction

Here is the source code for com.google.cloud.genomics.dataflow.readers.bam.Reader.java
Source

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.readers.bam;

import com.google.api.services.storage.Storage;
import com.google.api.services.storage.Storage.Objects;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.genomics.utils.Contig;
import com.google.cloud.genomics.utils.grpc.ReadUtils;
import com.google.common.base.Stopwatch;
import com.google.genomics.v1.Read;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamReader;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;

/**
 * Get all the reads specified by a given shard from a given BAM file.
 * Meant to be called from the DoFn processing the shard.
 */
public class Reader {
    private static final Logger LOG = Logger.getLogger(Reader.class.getName());

    Storage.Objects storageClient;
    BAMShard shard;
    DoFn<BAMShard, Read>.ProcessContext c;
    Stopwatch timer;
    ReaderOptions options;

    SAMRecordIterator iterator;

    enum Filter {
        UNMAPPED_ONLY, MAPPED_AND_UNMAPPED, MAPPED_ONLY
    }

    Filter filter;

    public int recordsBeforeStart = 0;
    public int recordsAfterEnd = 0;
    public int mismatchedSequence = 0;
    public int recordsProcessed = 0;
    public int readsGenerated = 0;

    public Reader(Objects storageClient, ReaderOptions options, BAMShard shard,
            DoFn<BAMShard, Read>.ProcessContext c) {
        super();
        this.storageClient = storageClient;
        this.shard = shard;
        this.c = c;
        this.options = options;
        filter = setupFilter(options, shard.contig.referenceName);
    }

    public static Filter setupFilter(ReaderOptions options, String referenceName) {
        /*
         * The common way of asking for unmapped reads is by specifying asterisk
         * as the sequence name.
         * From SAM format, section 1.4. paragraph 3:
         * "RNAME: Reference sequence NAME of the alignment.
         * If @SQ header lines are present, RNAME (if not *) must be present in
         * one of the SQ-SN tag.
         * An unmapped segment without coordinate has a * at this field."
         * https://samtools.github.io/hts-specs/SAMv1.pdf
         */
        if (referenceName.equals("*")) {
            return Filter.UNMAPPED_ONLY;
        } else if (options.getIncludeUnmappedReads()) {
            return Filter.MAPPED_AND_UNMAPPED;
        }
        return Filter.MAPPED_ONLY;
    }

    public void process() throws IOException {
        timer = Stopwatch.createStarted();
        openFile();

        while (iterator.hasNext()) {
            processRecord(iterator.next());
        }

        dumpStats();
    }

    void openFile() throws IOException {
        LOG.info("Processing shard " + shard);
        final SamReader reader = BAMIO.openBAM(storageClient, shard.file, options.getStringency());
        iterator = null;
        if (reader.hasIndex() && reader.indexing() != null) {
            if (filter == Filter.UNMAPPED_ONLY) {
                LOG.info("Processing unmapped");
                iterator = reader.queryUnmapped();
            } else if (shard.span != null) {
                LOG.info("Processing span for " + shard.contig);
                iterator = reader.indexing().iterator(shard.span);
            } else if (shard.contig.referenceName != null && !shard.contig.referenceName.isEmpty()) {
                LOG.info("Processing all bases for " + shard.contig);
                iterator = reader.query(shard.contig.referenceName, (int) shard.contig.start,
                        (int) shard.contig.end, false);
            }
        }
        if (iterator == null) {
            LOG.info("Processing all reads");
            iterator = reader.iterator();
        }
    }

    /**
     * Checks if the record matches our filter.
     */
    static boolean passesFilter(SAMRecord record, Filter filter, String referenceName) {
        // If we are looking for only mapped or only unmapped reads then we will use
        // the UnmappedFlag to decide if this read should be rejected.
        if (filter == Filter.UNMAPPED_ONLY && !record.getReadUnmappedFlag()) {
            return false;
        }

        if (filter == Filter.MAPPED_ONLY && record.getReadUnmappedFlag()) {
            return false;
        }

        // If we are looking for mapped reads, then we check the reference name
        // of the read matches the one we are looking for.
        final boolean referenceNameMismatch = referenceName != null && !referenceName.isEmpty()
                && !referenceName.equals(record.getReferenceName());

        // Note that unmapped mate pair of mapped read will have a reference
        // name set to the reference of its mapped mate.
        if ((filter == Filter.MAPPED_ONLY || filter == Filter.MAPPED_AND_UNMAPPED) && referenceNameMismatch) {
            return false;
        }

        return true;
    }

    boolean passesFilter(SAMRecord record) {
        return passesFilter(record, filter, shard.contig.referenceName);
    }

    void processRecord(SAMRecord record) {
        recordsProcessed++;
        if (!passesFilter(record)) {
            mismatchedSequence++;
            return;
        }
        if (record.getAlignmentStart() < shard.contig.start) {
            recordsBeforeStart++;
            return;
        }
        if (record.getAlignmentStart() > shard.contig.end) {
            recordsAfterEnd++;
            return;
        }
        c.output(ReadUtils.makeReadGrpc(record));
        readsGenerated++;
    }

    void dumpStats() {
        timer.stop();
        long elapsed = timer.elapsed(TimeUnit.MILLISECONDS);
        if (elapsed == 0)
            elapsed = 1;
        LOG.info("Processed " + recordsProcessed + " outputted " + readsGenerated + " in " + timer + ". Speed: "
                + (recordsProcessed * 1000) / elapsed + " reads/sec" + ", filtered out by reference and mapping "
                + mismatchedSequence + ", skippedBefore " + recordsBeforeStart + ", skipped after "
                + recordsAfterEnd);
    }

    /**
     * To compare how sharded reading works vs. plain HTSJDK sequential iteration,
     * this method implements such iteration.
     * This makes it easier to discover errors such as reads that are somehow
     * skipped by a sharded approach.
     */
    public static Iterable<Read> readSequentiallyForTesting(Objects storageClient, String storagePath,
            Contig contig, ReaderOptions options) throws IOException {
        Stopwatch timer = Stopwatch.createStarted();
        SamReader samReader = BAMIO.openBAM(storageClient, storagePath, options.getStringency());
        SAMRecordIterator iterator = samReader.queryOverlapping(contig.referenceName, (int) contig.start + 1,
                (int) contig.end);
        List<Read> reads = new ArrayList<Read>();

        int recordsBeforeStart = 0;
        int recordsAfterEnd = 0;
        int mismatchedSequence = 0;
        int recordsProcessed = 0;
        Filter filter = setupFilter(options, contig.referenceName);
        while (iterator.hasNext()) {
            SAMRecord record = iterator.next();
            final boolean passesFilter = passesFilter(record, filter, contig.referenceName);

            if (!passesFilter) {
                mismatchedSequence++;
                continue;
            }
            if (record.getAlignmentStart() < contig.start) {
                recordsBeforeStart++;
                continue;
            }
            if (record.getAlignmentStart() > contig.end) {
                recordsAfterEnd++;
                continue;
            }
            reads.add(ReadUtils.makeReadGrpc(record));
            recordsProcessed++;
        }
        timer.stop();
        LOG.info("NON SHARDED: Processed " + recordsProcessed + " in " + timer + ". Speed: "
                + (recordsProcessed * 1000) / timer.elapsed(TimeUnit.MILLISECONDS) + " reads/sec"
                + ", skipped other sequences " + mismatchedSequence + ", skippedBefore " + recordsBeforeStart
                + ", skipped after " + recordsAfterEnd);
        return reads;
    }
}