com.google.cloud.genomics.dataflow.readers.bam.BAMShard.java Source code

Java tutorial

Introduction

Here is the source code for com.google.cloud.genomics.dataflow.readers.bam.BAMShard.java

Source

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.readers.bam;

import com.google.cloud.genomics.utils.Contig;
import com.google.common.collect.Lists;

import htsjdk.samtools.BAMFileIndexImpl;
import htsjdk.samtools.Chunk;
import htsjdk.samtools.SAMFileSpanImpl;

import java.io.Serializable;
import java.util.List;

/**
 * A shard of BAM data that is created during sharding and then used to drive the reading.
 * During shard generation a shard is built iteratively by extending it bin by bin
 * (see {@link #addBin(List, long)}).
 * At the end of the process the shard is finalized
 * (see {@link #finalize(BAMFileIndexImpl, long)})
 * and a SAMFileSpan that has all the chunks we want to read is produced.
 */
public class BAMShard implements Serializable {
    /** Name/path of the BAM file this shard refers to (stored as given to the constructor). */
    public String file;
    /**
     * File span covering the chunks of this shard. It is {@code null} for a shard created
     * with the (file, referenceName, firstLocus) constructor until the first call to
     * {@link #addBin(List, long)} or {@link #finalize(BAMFileIndexImpl, long)}.
     */
    public SAMFileSpanImpl span;
    /** Reference name and locus range of this shard; the end is -1 until it is known. */
    public Contig contig;
    /**
     * Chunks collected so far. It is {@code null} for shards created with a fixed span,
     * which is how such non-extendable shards are recognized.
     */
    public List<Chunk> chunks;
    /** Cached result of {@link #approximateSizeInBytes()}; a negative value means "not computed". */
    public long cachedSizeInBytes = -1;

    /**
     * Begins a new shard with an empty chunk list and a starting locus.
     * The end of the contig is set to -1 and the span is left unset.
     *
     * @param file name/path of the BAM file
     * @param referenceName name of the reference sequence the shard belongs to
     * @param firstLocus first locus covered by the shard
     */
    public BAMShard(String file, String referenceName, long firstLocus) {
        this.file = file;
        this.contig = new Contig(referenceName, firstLocus, -1);
        this.chunks = Lists.newLinkedList();
        this.span = null;
    }

    /**
     * Creates a shard with a known file span.
     * Such a shard is not expected to be extended: calling addBin or finalize on it
     * fails with an {@link IllegalStateException}.
     * This constructor is used for "degenerate" shards like unmapped reads or
     * all reads in cases where we don't have an index.
     *
     * @param file name/path of the BAM file
     * @param span file span to read for this shard
     * @param contig reference name and locus range of this shard
     */
    public BAMShard(String file, SAMFileSpanImpl span, Contig contig) {
        this.file = file;
        this.span = span;
        this.contig = contig;
        this.chunks = null;
    }

    /**
     * Appends chunks from another bin to the list and moves the end position.
     * The file span is recomputed and the cached byte size is invalidated.
     *
     * @param chunksToAdd chunks of the bin being added
     * @param lastLocus new last locus of the shard
     * @throws IllegalStateException if this shard was created with a fixed span
     */
    public void addBin(List<Chunk> chunksToAdd, long lastLocus) {
        // Checked explicitly (not with assert) so the failure does not depend on -ea,
        // and before any field is touched so a rejected call leaves the shard unchanged.
        checkExtendable("addBin");
        contig = new Contig(contig.referenceName, contig.start, lastLocus);
        chunks.addAll(chunksToAdd);
        updateSpan();
    }

    /**
     * Generates a final list of chunks, now that we know the exact bounding loci
     * for this shard. The chunk list accumulated so far is replaced by the chunks
     * the index reports as overlapping the final locus range, and the file span
     * is recomputed from them.
     *
     * @param index index that is queried for the overlapping chunks
     * @param lastLocus final last locus of the shard
     * @return this shard, to allow call chaining
     * @throws IllegalStateException if this shard was created with a fixed span
     */
    public BAMShard finalize(BAMFileIndexImpl index, long lastLocus) {
        checkExtendable("finalize");
        contig = new Contig(contig.referenceName, contig.start, lastLocus);
        // NOTE(review): narrowing casts - assumes loci fit into an int; confirm.
        this.chunks = index.getChunksOverlapping(contig.referenceName, (int) contig.start, (int) contig.end);
        updateSpan();
        return this;
    }

    /**
     * Rejects extension of shards that were created with a fixed span
     * (recognized by their {@code null} chunk list).
     *
     * @param operation name of the calling method, used in the error message
     * @throws IllegalStateException if the chunk list is {@code null}
     */
    private void checkExtendable(String operation) {
        if (chunks == null) {
            throw new IllegalStateException(
                "Cannot call " + operation + " on a shard created with a fixed span: " + this);
        }
    }

    /**
     * Updates the underlying file span by optimizing and coalescing the current chunk list.
     * Also resets the cached byte size so that it is recomputed on next request.
     */
    private void updateSpan() {
        // NOTE(review): the contig start (a locus) is passed as the second argument of
        // optimizeChunkList; presumably that parameter is a minimum file offset in
        // htsjdk - confirm that passing a locus here is intended.
        span = new SAMFileSpanImpl(Chunk.optimizeChunkList(chunks, this.contig.start));
        cachedSizeInBytes = -1;
    }

    /**
     * Returns the number of loci covered by this shard.
     *
     * @return {@code contig.end - contig.start} when the end is positive, otherwise 0
     *         (a freshly started shard has its end set to -1)
     */
    public long sizeInLoci() {
        return contig.end > 0 ? contig.end - contig.start : 0;
    }

    /**
     * Returns the approximate size of the file span in bytes, as reported by the span.
     * The value is computed on first use and cached until the span changes.
     *
     * @return approximate size of the span in bytes
     * @throws NullPointerException if no span has been set yet and no value is cached
     */
    public long approximateSizeInBytes() {
        if (cachedSizeInBytes < 0) {
            cachedSizeInBytes = span.approximateSizeInBytes();
        }
        return cachedSizeInBytes;
    }

    /**
     * Describes the shard by file, contig and locus size; the span size is appended
     * only when it has already been computed.
     */
    @Override
    public String toString() {
        String str = file + ": " + contig.toString() + ", locus size = " + sizeInLoci();
        if (cachedSizeInBytes >= 0) {
            // Only return this as part of the string if it's already cached and calculated.
            // Otherwise calling this function causes the object to be mutated.
            str += ", span size = " + cachedSizeInBytes;
        }
        return str;
    }
}