Java tutorial: importing VCFs into TileDB/GenomicsDB with GenomicsDBImporter
/* * The MIT License (MIT) * Copyright (c) 2016 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a copy of * this software and associated documentation files (the "Software"), to deal in * the Software without restriction, including without limitation the rights to * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of * the Software, and to permit persons to whom the Software is furnished to do so, * subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package com.intel.genomicsdb; import htsjdk.samtools.util.RuntimeIOException; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import htsjdk.variant.vcf.VCFHeader; import htsjdk.tribble.CloseableTribbleIterator; import htsjdk.tribble.AbstractFeatureReader; import htsjdk.samtools.SAMSequenceDictionary; //JSON operations import org.json.simple.parser.JSONParser; import org.json.simple.JSONObject; import org.json.simple.JSONArray; import org.json.simple.parser.ParseException; import java.io.IOException; import java.io.OutputStream; import java.io.StringWriter; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; /** * Java wrapper for vcf2tiledb - imports VCFs into TileDB/GenomicsDB. * All vid information is assumed to be set correctly by the user (JSON files) */ public class GenomicsDBImporter { static { try { boolean loaded = GenomicsDBUtils.loadLibrary(); if (!loaded) throw new GenomicsDBException("Could not load genomicsdb native library"); } catch (UnsatisfiedLinkError ule) { throw new GenomicsDBException("Could not load genomicsdb native library"); } } private static long mDefaultBufferCapacity = 20480; //20KiB /** * Buffer stream implementation - it's silent in the sense that when the buffer is full, * it doesn't raise an exception but just marks a flag as full. It's up to the caller * to check the flag and retry later. * Why?
Most likely, it's faster to check a flag rather than throw and catch an exception */ private class SilentByteBufferStream extends OutputStream { private byte mBuffer[] = null; private long mNumValidBytes = 0; private long mMarker = 0; private boolean mOverflow = false; /** * Constructor - uses default value of buffer capacity (20KiB) */ public SilentByteBufferStream() { mBuffer = new byte[(int) mDefaultBufferCapacity]; } /** * Constructor - uses specified buffer capacity * @param capacity size of buffer in bytes */ public SilentByteBufferStream(final long capacity) { mBuffer = new byte[(int) capacity]; } @Override public void close() throws IOException //does nothing { } @Override public void flush() throws IOException //does nothing { } @Override public void write(byte[] b, int off, int len) throws IOException { if (mOverflow) return; if (len + mNumValidBytes > mBuffer.length) mOverflow = true; else { System.arraycopy(b, off, mBuffer, (int) mNumValidBytes, len); mNumValidBytes += len; } } @Override public void write(byte[] b) throws IOException { write(b, 0, b.length); } @Override public void write(int b) throws IOException { if (mOverflow) return; if (mNumValidBytes + 1 > mBuffer.length) mOverflow = true; else { mBuffer[(int) mNumValidBytes] = (byte) b; ++mNumValidBytes; } } /** * Returns buffer capacity in bytes * @return buffer capacity in bytes */ public int size() { return mBuffer.length; } /** * Resizes buffer to new size - data is retained * @param newSize new capacity of the buffer */ public void resize(final long newSize) { byte tmp[] = new byte[(int) newSize]; System.arraycopy(mBuffer, 0, tmp, 0, mBuffer.length); mBuffer = tmp; //hopefully Java GC does its job } /** * Returns if the buffer has overflowed * @return true if the buffer has overflowed */ public boolean overflow() { return mOverflow; } /** * Set overflow value * @param value overflow value */ public void setOverflow(final boolean value) { mOverflow = value; } /** * Get number of valid bytes * @return number of valid bytes */ public long getNumValidBytes() { return mNumValidBytes; } /** * Set number of valid bytes * @param value number of valid bytes */ public void setNumValidBytes(final long value) { mNumValidBytes = value; } /** * Caller code can use this function to mark a certain point in the buffer * This is generally used to mark the position in the buffer after the last * complete VariantContext object written * @param value set marker value */ public void setMarker(final long value) { mMarker = value; } /** * Get marker value * @return marker value */ public long getMarker() { return mMarker; } /** * Get byte buffer for this stream * @return byte buffer for this stream */ public byte[] getBuffer() { return mBuffer; } } /** * Utility class wrapping a stream and a VariantContextWriter for a given stream * Each GenomicsDB import stream consists of a buffer stream and a writer object * If the caller provides an iterator, then mCurrentVC points to the VariantContext object * to be written, if any. 
*/ private class GenomicsDBImporterStreamWrapper { public VariantContextWriter mVCWriter = null; public SilentByteBufferStream mStream = null; private Iterator<VariantContext> mIterator = null; private VariantContext mCurrentVC = null; /** * Constructor * @param vcfHeader VCF header for the stream * @param bufferCapacity Capacity of the stream buffer in bytes * @param streamType BCF_STREAM or VCF_STREAM * @param vcIterator iterator over VariantContext objects, can be null if the caller is managing * the buffer explicitly */ public GenomicsDBImporterStreamWrapper(final VCFHeader vcfHeader, final long bufferCapacity, final VariantContextWriterBuilder.OutputType streamType, Iterator<VariantContext> vcIterator) throws GenomicsDBException { mIterator = vcIterator; if (vcIterator != null && vcIterator.hasNext()) mCurrentVC = vcIterator.next(); boolean headerWritten = false; long currentCapacity = bufferCapacity; //Must ensure that the header gets written into the buffer stream //Why this big outer loop? VCFWriter/BCFWriter seems to store some state which makes //calling writeHeader() multiple times impossible //Hence, create new objects in every iteration of the loop //Since this function is called only once per stream, not really //a performance concern while (!headerWritten) { mStream = new SilentByteBufferStream(currentCapacity); switch (streamType) { case BCF_STREAM: mVCWriter = new VariantContextWriterBuilder().setOutputBCFStream(mStream) .unsetOption(Options.INDEX_ON_THE_FLY).build(); break; case VCF_STREAM: mVCWriter = new VariantContextWriterBuilder().setOutputVCFStream(mStream) .unsetOption(Options.INDEX_ON_THE_FLY).build(); break; default: throw new GenomicsDBException("Unknown stream type " + streamType.toString()); } //Why clone the header? //The writer modifies the VCFHeader object passed to writeHeader() - however, //we might need to call writeHeader multiple times if the underlying buffer //in mStream is too small. Hence, always pass a clone of the original, //unmodified header in each call of writeHeader mVCWriter.writeHeader(new VCFHeader(vcfHeader)); if (mStream.overflow()) currentCapacity = 2 * currentCapacity + 1; else headerWritten = true; } } /** * Returns true if a non-null Iterator over VariantContext objects was provided for this stream * @return true if a non-null Iterator over VariantContext objects was provided for this stream */ public boolean hasIterator() { return (mIterator != null); } /** * Returns the next VariantContext object iff the Iterator over VariantContext objects * is non-null and has a next() object, * else returns null. 
Stores the result in mCurrentVC * @return the next VariantContext object or null */ public VariantContext next() { if (mIterator != null && mIterator.hasNext()) mCurrentVC = mIterator.next(); else mCurrentVC = null; return mCurrentVC; } /** * Returns mCurrentVC - could be null if mIterator is null or !mIterator.hasNext() * @return VariantContext object to be written */ public VariantContext getCurrentVC() { return mCurrentVC; } } /** * Utility class that stores row index and globally unique name for a given sample */ public static class SampleInfo { public String mName = null; public long mRowIdx = -1; public SampleInfo(final String name, final long rowIdx) { mName = name; mRowIdx = rowIdx; } } /** * Utility class to represent a chromosome interval * Contains 3 members - chr name, start, end (1-based) */ public static class ChromosomeInterval { public String mChromosomeName = null; public long mBegin = -1; public long mEnd = -1; public ChromosomeInterval(final String name, final long begin, final long end) { mChromosomeName = name; mBegin = begin; mEnd = end; } } /** * Given an AbstractFeatureReader over VariantContext objects * and a list of ChromosomeInterval objects, this class * is an Iterator over VariantContext for all the chromosome intervals in the * list * @param <SOURCE> LineIterator for VCFs, PositionalBufferedStream for BCFs */ public static class MultiChromosomeIterator<SOURCE> implements Iterator<VariantContext> { private ArrayList<ChromosomeInterval> mChromosomeIntervals = null; private AbstractFeatureReader<VariantContext, SOURCE> mReader = null; private int mIdxInIntervalList = 0; private CloseableTribbleIterator<VariantContext> mIterator = null; private VCFHeader mHeader = null; /** * Constructor * @param reader AbstractFeatureReader over VariantContext objects - SOURCE can vary - BCF v/s VCF for example * @param chromosomeIntervals chromosome intervals over which to iterate * @throws IOException when the reader's query method throws an IOException */ public MultiChromosomeIterator(AbstractFeatureReader<VariantContext, SOURCE> reader, final List<ChromosomeInterval> chromosomeIntervals) throws IOException { mReader = reader; mHeader = (VCFHeader) (reader.getHeader()); mChromosomeIntervals = new ArrayList<ChromosomeInterval>(); //Only add intervals whose chromosomes are present in the VCF header final SAMSequenceDictionary contigDictionary = mHeader.getSequenceDictionary(); for (ChromosomeInterval currInterval : chromosomeIntervals) if (contigDictionary.getSequenceIndex(currInterval.mChromosomeName) != -1) mChromosomeIntervals.add(currInterval); if (mChromosomeIntervals.size() > 0) { ChromosomeInterval currInterval = mChromosomeIntervals.get(0); mIterator = mReader.query(currInterval.mChromosomeName, (int) currInterval.mBegin, (int) currInterval.mEnd); } } @Override public boolean hasNext() { if (mIterator == null) return false; return mIterator.hasNext(); } @Override public VariantContext next() throws NoSuchElementException { try { if (mIterator == null) throw new NoSuchElementException("next() called for iterator with no more elements"); VariantContext returnValue = mIterator.next(); //within the same chromosome if (mIterator.hasNext()) return returnValue; //move to next chromosome and iterate //It's possible that the reader has no record for the next contig, but could have records //for subsequent contigs for (mIdxInIntervalList = mIdxInIntervalList + 1; mIdxInIntervalList < mChromosomeIntervals .size(); ++mIdxInIntervalList) { ChromosomeInterval currInterval = 
mChromosomeIntervals.get(mIdxInIntervalList); mIterator = mReader.query(currInterval.mChromosomeName, (int) currInterval.mBegin, (int) currInterval.mEnd); if (mIterator.hasNext()) return returnValue; } mIterator = null; return returnValue; } catch (IOException e) { throw new NoSuchElementException("Caught IOException: " + e.getMessage()); } } } /** * JNI functions */ /** * Creates GenomicsDBImporter object when importing VCF files (no streams) * @param loaderJSONFile Path to loader JSON file * @param rank Rank of object - corresponds to the partition index in the loader * for which this object will import data * @param lbRowIdx Smallest row idx which should be imported by this object * @param ubRowIdx Largest row idx which should be imported by this object * @return status - 0 if everything was ok, -1 otherwise */ private native int jniGenomicsDBImporter(String loaderJSONFile, int rank, long lbRowIdx, long ubRowIdx); /** * Creates GenomicsDBImporter object when importing VCF files (no streams) * @param loaderJSONFile Path to loader JSON file * @param rank Rank of object - corresponds to the partition index in the * loader for which this object will import data * @param lbRowIdx Smallest row idx which should be imported by this object * @param ubRowIdx Largest row idx which should be imported by this object * @return "pointer"/address to GenomicsDBImporter object in memory, * if 0, then something went wrong */ private native long jniInitializeGenomicsDBImporterObject(String loaderJSONFile, int rank, long lbRowIdx, long ubRowIdx); /** * Notify importer object that a new stream is to be added * @param genomicsDBImporterHandle "pointer" returned by jniInitializeGenomicsDBImporterObject * @param streamName name of the stream * @param isBCF use BCF format to pass data to C++ layer * @param bufferCapacity in bytes * @param buffer initialization buffer containing the VCF/BCF header * @param numValidBytesInBuffer num valid bytes in the buffer (length of the header) */ private native void jniAddBufferStream(long genomicsDBImporterHandle, String streamName, boolean isBCF, long bufferCapacity, byte[] buffer, long numValidBytesInBuffer); /** * Setup loader after all the buffer streams are added * @param genomicsDBImporterHandle "pointer" returned by jniInitializeGenomicsDBImporterObject * @param callsetMappingJSON JSON formatted string containing globally consistent callset * name to row index mapping * @return maximum number of buffer stream identifiers that can be returned in * mExhaustedBufferStreamIdentifiers later * (this depends on the number of partitions and the number of buffer streams) */ private native long jniSetupGenomicsDBLoader(long genomicsDBImporterHandle, final String callsetMappingJSON); /** * @param handle "pointer" returned by jniInitializeGenomicsDBImporterObject * @param streamIdx stream index * @param partitionIdx partition index (unused now) * @param buffer buffer containing data * @param numValidBytesInBuffer num valid bytes in the buffer */ private native void jniWriteDataToBufferStream(long handle, int streamIdx, int partitionIdx, byte[] buffer, long numValidBytesInBuffer); /** * Import the next batch of data into TileDB/GenomicsDB * @param genomicsDBImporterHandle "pointer" returned by jniInitializeGenomicsDBImporterObject * @param exhaustedBufferIdentifiers contains the list of exhausted buffer stream identifiers * - the number of * exhausted streams is stored in the last element of the array * @return true if the whole import process is completed, false otherwise */ 
private native boolean jniImportBatch(long genomicsDBImporterHandle, long[] exhaustedBufferIdentifiers); /** * Obtain the chromosome intervals for the column partition specified in the loader JSON file * identified by the rank. The information is returned as a string in JSON format * { * "contigs": [ * { "chr1": [ 100, 200] }, * { "chr2": [ 500, 600] } * ] * } * @param loaderJSONFile path to loader JSON file * @param rank rank/partition index * @return chromosome intervals for the queried column partition in JSON format */ private static native String jniGetChromosomeIntervalsForColumnPartition(final String loaderJSONFile, final int rank); private String mLoaderJSONFile = null; private int mRank = 0; private long mLbRowIdx = 0; private long mUbRowIdx = Long.MAX_VALUE - 1; //For buffered streams private boolean mContainsBufferStreams = false; private long mGenomicsDBImporterObjectHandle = 0; private ArrayList<GenomicsDBImporterStreamWrapper> mBufferStreamWrapperVector = null; private boolean mIsLoaderSetupDone = false; //To find out which buffer streams are exhausted private long mMaxBufferStreamIdentifiers = 0; private long[] mExhaustedBufferStreamIdentifiers = null; private long mNumExhaustedBufferStreams = 0; //Done flag - useful only for buffered streams private boolean mDone = false; //JSON object that specifies callset/sample name to row_idx mapping in the buffer private JSONObject mCallsetMappingJSON = null; /** * Constructor */ public GenomicsDBImporter() { } /** * Constructor * @param loaderJSONFile GenomicsDB loader JSON configuration file */ public GenomicsDBImporter(String loaderJSONFile) { initialize(loaderJSONFile, 0, 0, Long.MAX_VALUE - 1); } /** * Constructor * @param loaderJSONFile GenomicsDB loader JSON configuration file * @param rank Rank of this process (TileDB/GenomicsDB partition idx) */ public GenomicsDBImporter(String loaderJSONFile, int rank) { initialize(loaderJSONFile, rank, 0, Long.MAX_VALUE - 1); } /** * Constructor * @param loaderJSONFile GenomicsDB loader JSON configuration file * @param rank Rank of this process (TileDB/GenomicsDB partition idx) * @param lbRowIdx Smallest row idx which should be imported by this object * @param ubRowIdx Largest row idx which should be imported by this object */ public GenomicsDBImporter(String loaderJSONFile, int rank, long lbRowIdx, long ubRowIdx) { initialize(loaderJSONFile, rank, lbRowIdx, ubRowIdx); } /** * Initialize variables * @param loaderJSONFile GenomicsDB loader JSON configuration file * @param rank Rank of this process (TileDB/GenomicsDB partition idx) * @param lbRowIdx Smallest row idx which should be imported by this object * @param ubRowIdx Largest row idx which should be imported by this object */ private void initialize(String loaderJSONFile, int rank, long lbRowIdx, long ubRowIdx) { mLoaderJSONFile = loaderJSONFile; mRank = rank; mLbRowIdx = lbRowIdx; mUbRowIdx = ubRowIdx; } /** * Static function that reads sample names from the vcfHeader and adds entries to the map. 
* The function assumes that the samples will be assigned row indexes beginning at rowIdx * and that the sample names specified in the header * are globally unique (across all streams/files) * @param sampleIndexToInfo map: key=sampleIndex in vcfHeader: value=SampleInfo * @param vcfHeader VCF header * @param rowIdx Starting row index from which to assign * @return rowIdx+#samples in the header */ public static long initializeSampleInfoMapFromHeader(Map<Integer, SampleInfo> sampleIndexToInfo, final VCFHeader vcfHeader, final long rowIdx) { final List<String> headerSampleNames = vcfHeader.getGenotypeSamples(); final int numSamplesInHeader = headerSampleNames.size(); for (int i = 0; i < numSamplesInHeader; ++i) sampleIndexToInfo.put(i, new SampleInfo(headerSampleNames.get(i), rowIdx + i)); return rowIdx + numSamplesInHeader; } /** * Add a buffer stream as the data source - caller must: * 1. Call setupGenomicsDBImporter() once all streams are added * 2. Provide VC objects using the add() function * 3. Call importBatch() * 4. Get list of exhausted buffer streams using getNumExhaustedBufferStreams(), * getExhaustedBufferStreamIndex() * 5. If !isDone() goto 2 * @param streamName Name of the stream being added - must be unique with respect to this * GenomicsDBImporter object * @param vcfHeader VCF header for the stream * @param bufferCapacity Capacity of the stream buffer in bytes * @param streamType BCF_STREAM or VCF_STREAM * @param sampleIndexToInfo map from sample index in the vcfHeader to SampleInfo object which * contains row index and globally unique name * can be set to null, which implies that the mapping is stored in a callsets JSON file * @return returns buffer stream index */ public int addBufferStream(final String streamName, final VCFHeader vcfHeader, final long bufferCapacity, final VariantContextWriterBuilder.OutputType streamType, final Map<Integer, SampleInfo> sampleIndexToInfo) throws GenomicsDBException { return addBufferStream(streamName, vcfHeader, bufferCapacity, streamType, null, sampleIndexToInfo); } /** * Add a sorted VC iterator as the data source - caller must: * 1. Call setupGenomicsDBImporter() once all iterators are added * 2. Call importBatch() * 3. Done! * @param streamName Name of the stream being added - must be unique with respect * to this GenomicsDBImporter object * @param vcfHeader VCF header for the stream * @param vcIterator Iterator over VariantContext objects * @param bufferCapacity Capacity of the stream buffer in bytes * @param streamType BCF_STREAM or VCF_STREAM * @param sampleIndexToInfo map from sample index in the vcfHeader to SampleInfo object * which contains row index and globally unique name * can be set to null, which implies that the mapping is * stored in a callsets JSON file * @return returns the stream index */ public int addSortedVariantContextIterator(final String streamName, final VCFHeader vcfHeader, Iterator<VariantContext> vcIterator, final long bufferCapacity, final VariantContextWriterBuilder.OutputType streamType, final Map<Integer, SampleInfo> sampleIndexToInfo) throws GenomicsDBException { return addBufferStream(streamName, vcfHeader, bufferCapacity, streamType, vcIterator, sampleIndexToInfo); } /** * Sets sorted VC iterator as the data source and calls setupGenomicsDBImporter(). * No more streams/iterators can * be added after this function is called. The caller must: * 1. Call importBatch() * 2. Done! 
* @param streamName Name of the stream being added - must be unique with respect to * this GenomicsDBImporter object * @param vcfHeader VCF header for the stream * @param vcIterator Iterator over VariantContext objects * @param bufferCapacity Capacity of the stream buffer in bytes * @param streamType BCF_STREAM or VCF_STREAM * @param sampleIndexToInfo map from sample index in the vcfHeader to SampleInfo * object which contains row index and globally unique name * can be set to null, which implies that the mapping is stored in a * callsets JSON file * @return returns the stream index * @throws GenomicsDBException thrown if incorrect iterator or missing JSON configuration * @throws IOException thrown if incorrect iterator or missing JSON configuration * files */ public int setSortedVariantContextIterator(final String streamName, final VCFHeader vcfHeader, Iterator<VariantContext> vcIterator, final long bufferCapacity, final VariantContextWriterBuilder.OutputType streamType, final Map<Integer, SampleInfo> sampleIndexToInfo) throws GenomicsDBException, IOException { int streamIdx = addSortedVariantContextIterator(streamName, vcfHeader, vcIterator, bufferCapacity, streamType, sampleIndexToInfo); setupGenomicsDBImporter(); return streamIdx; } /** * Add a buffer stream or VC iterator - internal function * @param streamName Name of the stream being added - must be unique with respect to this * GenomicsDBImporter object * @param vcfHeader VCF header for the stream * @param bufferCapacity Capacity of the stream buffer in bytes * @param streamType BCF_STREAM or VCF_STREAM * @param vcIterator Iterator over VariantContext objects - can be null * @param sampleIndexToInfo map from sample index in the vcfHeader to SampleInfo object which * contains row index and globally unique name can be set to null, * which implies that the mapping is stored in a callsets JSON file * @return returns the stream index */ private int addBufferStream(final String streamName, final VCFHeader vcfHeader, final long bufferCapacity, final VariantContextWriterBuilder.OutputType streamType, Iterator<VariantContext> vcIterator, final Map<Integer, SampleInfo> sampleIndexToInfo) throws GenomicsDBException { if (mIsLoaderSetupDone) throw new GenomicsDBException( "Cannot add buffer streams after " + "setupGenomicsDBImporter() is called"); //First time a buffer is added if (!mContainsBufferStreams) { mGenomicsDBImporterObjectHandle = jniInitializeGenomicsDBImporterObject(mLoaderJSONFile, mRank, mLbRowIdx, mUbRowIdx); if (mGenomicsDBImporterObjectHandle == 0) throw new GenomicsDBException("Could not initialize GenomicsDBImporter object"); mBufferStreamWrapperVector = new ArrayList<GenomicsDBImporterStreamWrapper>(); mCallsetMappingJSON = new JSONObject(); mContainsBufferStreams = true; } mBufferStreamWrapperVector .add(new GenomicsDBImporterStreamWrapper(vcfHeader, bufferCapacity, streamType, vcIterator)); int currIdx = mBufferStreamWrapperVector.size() - 1; SilentByteBufferStream currStream = mBufferStreamWrapperVector.get(currIdx).mStream; jniAddBufferStream(mGenomicsDBImporterObjectHandle, streamName, streamType == VariantContextWriterBuilder.OutputType.BCF_STREAM, bufferCapacity, currStream.getBuffer(), currStream.getNumValidBytes()); if (sampleIndexToInfo != null) { for (Map.Entry<Integer, SampleInfo> currEntry : sampleIndexToInfo.entrySet()) { JSONObject sampleJSON = new JSONObject(); sampleJSON.put("row_idx", currEntry.getValue().mRowIdx); sampleJSON.put("stream_name", streamName); sampleJSON.put("idx_in_file", 
currEntry.getKey()); mCallsetMappingJSON.put(currEntry.getValue().mName, sampleJSON); } } return currIdx; } /** * Set up the importer after all the buffer streams are added, but before any * data is inserted into any stream * No more buffer streams can be added once setupGenomicsDBImporter() is called * @throws IOException throws IOException if modified callsets JSON cannot be written */ public void setupGenomicsDBImporter() throws IOException { if (mIsLoaderSetupDone) return; //Callset mapping JSON - convert to string JSONObject topCallsetJSON = new JSONObject(); topCallsetJSON.put("callsets", mCallsetMappingJSON); StringWriter stringWriter = new StringWriter(); topCallsetJSON.writeJSONString(stringWriter); //Call native setupGenomicsDBImporter() mMaxBufferStreamIdentifiers = jniSetupGenomicsDBLoader(mGenomicsDBImporterObjectHandle, stringWriter.toString()); //Why 2* - each identifier is a pair<buffer_stream_idx, partition_idx> //Why +1 - the last element will contain the number of exhausted stream identifiers //when importBatch() is called mExhaustedBufferStreamIdentifiers = new long[2 * ((int) mMaxBufferStreamIdentifiers) + 1]; //Set all streams to empty //Add all streams to mExhaustedBufferStreamIdentifiers - this way when importBatch is //called the first time all streams' data are written for (int i = 0, idx = 0; i < mBufferStreamWrapperVector.size(); ++i, idx += 2) { SilentByteBufferStream currStream = mBufferStreamWrapperVector.get(i).mStream; currStream.setNumValidBytes(0); mExhaustedBufferStreamIdentifiers[idx] = i; mExhaustedBufferStreamIdentifiers[idx + 1] = 0; } //Set number of exhausted buffer streams - all streams are exhausted the first time mNumExhaustedBufferStreams = mBufferStreamWrapperVector.size(); mExhaustedBufferStreamIdentifiers[(int) (2 * mMaxBufferStreamIdentifiers)] = mNumExhaustedBufferStreams; mIsLoaderSetupDone = true; } /** * Write VariantContext object to stream - may fail if the buffer is full * It's the caller's responsibility to keep track of the VC object that's not written * @param vc VariantContext object * @param streamIdx index of the stream returned by the addBufferStream() call * @return true if the vc object was written successfully, false otherwise */ public boolean add(VariantContext vc, final int streamIdx) throws GenomicsDBException, RuntimeIOException { if (streamIdx < 0 || streamIdx >= mBufferStreamWrapperVector.size()) throw new GenomicsDBException("Invalid stream idx " + Integer.toString(streamIdx) + " must be between [0-" + Long.toString(mBufferStreamWrapperVector.size() - 1) + "]"); if (!mIsLoaderSetupDone) throw new GenomicsDBException( "Cannot add VariantContext objects " + "to streams before calling setupGenomicsDBImporter()"); GenomicsDBImporterStreamWrapper currWrapper = mBufferStreamWrapperVector.get(streamIdx); currWrapper.mVCWriter.add(vc); SilentByteBufferStream currStream = currWrapper.mStream; //at least one record already existed in the buffer if (currStream.overflow() && currStream.getMarker() > 0) { //Set num valid bytes to marker - marker points to location //after the last valid serialized vc in the buffer currStream.setNumValidBytes(currStream.getMarker()); return false; } else { //the first record to be added to the buffer is too large, resize buffer while (currStream.overflow()) { currStream.resize(2 * currStream.size() + 1); currStream.setNumValidBytes(0); currStream.setOverflow(false); currWrapper.mVCWriter.add(vc); } //Update marker - marker points to location after the last valid serialized vc in the buffer
currStream.setMarker(currStream.getNumValidBytes()); return true; } } /** * @return true if the import process is done * @throws IOException if the import fails */ public boolean importBatch() throws IOException { if (mDone) return true; if (!mIsLoaderSetupDone) setupGenomicsDBImporter(); boolean allExhaustedStreamsHaveIterators = true; while (!mDone && allExhaustedStreamsHaveIterators) { //Write data from buffer streams exhausted in the previous round into GenomicsDB for (int i = 0, idx = 0; i < mNumExhaustedBufferStreams; ++i, idx += 2) { int bufferStreamIdx = (int) mExhaustedBufferStreamIdentifiers[idx]; GenomicsDBImporterStreamWrapper currWrapper = mBufferStreamWrapperVector.get(bufferStreamIdx); //If iterator is provided, get data from iterator if (currWrapper.hasIterator()) { while (currWrapper.getCurrentVC() != null) { boolean added = add(currWrapper.getCurrentVC(), bufferStreamIdx); if (added) currWrapper.next(); else break; //buffer full } } SilentByteBufferStream currStream = currWrapper.mStream; jniWriteDataToBufferStream(mGenomicsDBImporterObjectHandle, bufferStreamIdx, 0, currStream.getBuffer(), currStream.getNumValidBytes()); } mDone = jniImportBatch(mGenomicsDBImporterObjectHandle, mExhaustedBufferStreamIdentifiers); mNumExhaustedBufferStreams = mExhaustedBufferStreamIdentifiers[mExhaustedBufferStreamIdentifiers.length - 1]; //Reset markers, numValidBytesInBuffer and overflow flag for the exhausted streams for (long i = 0, idx = 0; i < mNumExhaustedBufferStreams; ++i, idx += 2) { int bufferStreamIdx = (int) mExhaustedBufferStreamIdentifiers[(int) idx]; GenomicsDBImporterStreamWrapper currWrapper = mBufferStreamWrapperVector.get(bufferStreamIdx); if (!currWrapper.hasIterator()) allExhaustedStreamsHaveIterators = false; SilentByteBufferStream currStream = currWrapper.mStream; currStream.setOverflow(false); currStream.setMarker(0); currStream.setNumValidBytes(0); } if (mDone) { mGenomicsDBImporterObjectHandle = 0; mContainsBufferStreams = false; mIsLoaderSetupDone = false; } } return mDone; } /** * @return number of buffer streams for which new data must be supplied */ public long getNumExhaustedBufferStreams() { return mNumExhaustedBufferStreams; } /** * Get buffer stream index of i-th exhausted stream * There are mNumExhaustedBufferStreams such streams, and the caller must provide data for streams * with indexes getExhaustedBufferStreamIndex(0), getExhaustedBufferStreamIndex(1),..., * getExhaustedBufferStreamIndex(mNumExhaustedBufferStreams-1) * @param i i-th exhausted buffer stream * @return the buffer stream index of the i-th exhausted stream */ public int getExhaustedBufferStreamIndex(final long i) { assert i < mNumExhaustedBufferStreams && i >= 0; //why 2* - exhausted buffer stream identifier is a pair<stream_idx, partition_idx> return (int) mExhaustedBufferStreamIdentifiers[2 * ((int) i)]; } /** * Is the import process completed * @return true if complete, false otherwise */ public boolean isDone() { return mDone; } /** * Utility function that returns a list of ChromosomeInterval objects for * the column partition specified by the loader JSON file and rank/partition index * @param loaderJSONFile path to loader JSON file * @param partitionIdx rank/partition index * @return list of ChromosomeInterval objects for the specified partition * @throws ParseException when there is a bug in the JNI interface and a faulty JSON is returned */ public static ArrayList<ChromosomeInterval> getChromosomeIntervalsForColumnPartition( final String loaderJSONFile, final int partitionIdx) throws
ParseException { final String chromosomeIntervalsJSONString = jniGetChromosomeIntervalsForColumnPartition(loaderJSONFile, partitionIdx); /* JSON format { "contigs": [ { "chr1": [ 100, 200] }, { "chr2": [ 500, 600] } ] } */ ArrayList<ChromosomeInterval> chromosomeIntervals = new ArrayList<ChromosomeInterval>(); JSONParser parser = new JSONParser(); JSONObject topObj = (JSONObject) (parser.parse(chromosomeIntervalsJSONString)); assert topObj.containsKey("contigs"); JSONArray listOfDictionaries = (JSONArray) (topObj.get("contigs")); for (Object currDictObj : listOfDictionaries) { JSONObject currDict = (JSONObject) currDictObj; assert currDict.size() == 1; //1 entry for (Object currEntryObj : currDict.entrySet()) { Map.Entry<String, JSONArray> currEntry = (Map.Entry<String, JSONArray>) currEntryObj; JSONArray currValue = currEntry.getValue(); assert currValue.size() == 2; chromosomeIntervals.add(new ChromosomeInterval(currEntry.getKey(), (Long) (currValue.get(0)), (Long) (currValue.get(1)))); } } return chromosomeIntervals; } /** * Utility function that returns a MultiChromosomeIterator given an AbstractFeatureReader * that will iterate over the VariantContext objects provided by the reader belonging * to the column partition specified by the loader JSON file and rank/partition index * @param <SOURCE> LineIterator for VCFs, PositionalBufferedStream for BCFs * @param reader AbstractFeatureReader over VariantContext objects - SOURCE can vary - BCF v/s VCF for example * @param loaderJSONFile path to loader JSON file * @param partitionIdx rank/partition index * @return MultiChromosomeIterator that iterates over VariantContext objects in the reader * belonging to the specified column partition * @throws IOException when the reader's query method throws an IOException * @throws ParseException when there is a bug in the JNI interface and a faulty JSON is returned */ public static <SOURCE> MultiChromosomeIterator<SOURCE> columnPartitionIterator( AbstractFeatureReader<VariantContext, SOURCE> reader, final String loaderJSONFile, final int partitionIdx) throws ParseException, IOException { return new MultiChromosomeIterator<SOURCE>(reader, GenomicsDBImporter.getChromosomeIntervalsForColumnPartition(loaderJSONFile, partitionIdx)); } /** * Utility function that returns a MultiChromosomeIterator given an AbstractFeatureReader * that will iterate over the VariantContext objects provided by the reader belonging * to the column partition specified by this object's loader JSON file and rank/partition index * @param <SOURCE> LineIterator for VCFs, PositionalBufferedStream for BCFs * @param reader AbstractFeatureReader over VariantContext objects - SOURCE can vary - BCF v/s VCF for example * @return MultiChromosomeIterator that iterates over VariantContext objects in the reader * belonging to the specified column partition * @throws IOException when the reader's query method throws an IOException * @throws ParseException when there is a bug in the JNI interface and a faulty JSON is returned */ public <SOURCE> MultiChromosomeIterator<SOURCE> columnPartitionIterator( AbstractFeatureReader<VariantContext, SOURCE> reader) throws ParseException, IOException { return GenomicsDBImporter.columnPartitionIterator(reader, mLoaderJSONFile, mRank); } /** * Write to TileDB/GenomicsDB using the configuration specified in the * loader file passed to constructor */ public void write() throws GenomicsDBException { write(mLoaderJSONFile, mRank, 0, Long.MAX_VALUE - 1); } /** * Write to TileDB/GenomicsDB using the configuration 
specified in the * loader file passed to constructor * @param lbRowIdx Minimum row idx from which new data will be added */ public void write(long lbRowIdx) throws GenomicsDBException { write(mLoaderJSONFile, mRank, lbRowIdx, Long.MAX_VALUE - 1); } /** * Write to TileDB/GenomicsDB using the configuration specified in the * loader file passed to constructor * @param rank Rank of this process (TileDB/GenomicsDB partition idx) * @param lbRowIdx Minimum row idx from which new data will be added */ public void write(int rank, long lbRowIdx) throws GenomicsDBException { write(mLoaderJSONFile, rank, lbRowIdx, Long.MAX_VALUE - 1); } /** * Write to TileDB/GenomicsDB using the configuration specified in the * loader file passed to constructor * @param rank Rank of this process (TileDB/GenomicsDB partition idx) * @param lbRowIdx Minimum row idx from which new data will be added * @param ubRowIdx Maximum row idx up to which new data will be added */ public void write(int rank, long lbRowIdx, long ubRowIdx) throws GenomicsDBException { write(mLoaderJSONFile, rank, lbRowIdx, ubRowIdx); } /** * Write to TileDB/GenomicsDB * @param loaderJSONFile GenomicsDB loader JSON configuration file * @param rank Rank of this process (TileDB/GenomicsDB partition idx) * @param lbRowIdx Minimum row idx from which new data will be added * @param ubRowIdx Maximum row idx up to which new data will be added */ public void write(String loaderJSONFile, int rank, long lbRowIdx, long ubRowIdx) throws GenomicsDBException { mDone = false; if (loaderJSONFile == null) throw new GenomicsDBException("Loader JSON file not specified"); if (mContainsBufferStreams) throw new GenomicsDBException("Cannot call write() functions if buffer streams are added"); int status = jniGenomicsDBImporter(loaderJSONFile, rank, lbRowIdx, ubRowIdx); if (status != 0) throw new GenomicsDBException( "GenomicsDBImporter write failed for loader JSON: " + loaderJSONFile + " rank: " + rank); mDone = true; } }
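
To see the class in action, the simplest path is the non-streaming import: the loader JSON file describes the input VCFs and the target TileDB/GenomicsDB workspace, and write() runs the whole import for one partition through the native jniGenomicsDBImporter call. The sketch below is illustrative only; the path loader.json and rank 0 are placeholders for an existing loader configuration.

import com.intel.genomicsdb.GenomicsDBImporter;
import com.intel.genomicsdb.GenomicsDBException;

public class SimpleImportExample {
    public static void main(String[] args) {
        // "loader.json" is a placeholder path to a GenomicsDB loader configuration file;
        // rank 0 selects the first column partition defined in that file
        GenomicsDBImporter importer = new GenomicsDBImporter("loader.json", 0);
        try {
            importer.write();   // imports everything described by the loader file for this partition
            System.out.println("Import done: " + importer.isDone());
        } catch (GenomicsDBException e) {
            System.err.println("Import failed: " + e.getMessage());
        }
    }
}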
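
For more control, a stream can be driven by a sorted iterator over VariantContext objects. The sketch below assumes a block-compressed, tabix-indexed VCF named sample1.vcf.gz, a loader configuration loader.json, a 10 KiB buffer and the stream name stream0, all of which are placeholder values. It opens an htsjdk AbstractFeatureReader, maps the header samples to row indexes with initializeSampleInfoMapFromHeader(), restricts iteration to this partition's chromosome intervals via columnPartitionIterator(), and then follows the documented iterator workflow: add the iterator, call setupGenomicsDBImporter(), call importBatch().

import com.intel.genomicsdb.GenomicsDBImporter;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import java.util.HashMap;
import java.util.Map;

public class IteratorImportExample {
    public static void main(String[] args) throws Exception {
        // Placeholder loader JSON; rank 0 = first column partition
        GenomicsDBImporter importer = new GenomicsDBImporter("loader.json", 0);
        // Placeholder input VCF; requireIndex=true because columnPartitionIterator() issues queries
        AbstractFeatureReader<VariantContext, LineIterator> reader =
            AbstractFeatureReader.getFeatureReader("sample1.vcf.gz", new VCFCodec(), true);
        VCFHeader header = (VCFHeader) reader.getHeader();

        // Assign consecutive, globally unique row indexes (starting at row 0) to the header samples
        Map<Integer, GenomicsDBImporter.SampleInfo> sampleInfoMap = new HashMap<>();
        GenomicsDBImporter.initializeSampleInfoMapFromHeader(sampleInfoMap, header, 0);

        // Iterate only over records that fall in this partition's chromosome intervals
        GenomicsDBImporter.MultiChromosomeIterator<LineIterator> vcIterator =
            importer.columnPartitionIterator(reader);

        importer.addSortedVariantContextIterator("stream0", header, vcIterator,
            10240, VariantContextWriterBuilder.OutputType.BCF_STREAM, sampleInfoMap);
        importer.setupGenomicsDBImporter();
        importer.importBatch();   // drains iterator-backed streams until the import completes
        reader.close();
    }
}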
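
When records cannot be wrapped in an iterator, for example because they arrive asynchronously, the raw buffer-stream workflow from the addBufferStream() javadoc applies: provide records with add(), call importBatch(), then refill only the streams reported as exhausted, and repeat until isDone(). The helper below is a sketch of that loop for a single stream; the pending-record bookkeeping, the loader path and the stream name are illustrative and not part of the class, and the records iterator is assumed to be sorted by genomic position.

import com.intel.genomicsdb.GenomicsDBImporter;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFHeader;
import java.util.Iterator;
import java.util.Map;

public class BufferStreamImportExample {
    // Streams position-sorted VariantContext records into GenomicsDB through one
    // explicitly managed buffer stream. A record that does not fit in the current
    // buffer stays in 'pending' and is retried after the next importBatch() call.
    static void streamRecords(Iterator<VariantContext> records, VCFHeader header,
                              Map<Integer, GenomicsDBImporter.SampleInfo> sampleInfoMap)
            throws Exception {
        GenomicsDBImporter importer = new GenomicsDBImporter("loader.json", 0); // placeholder path
        final int streamIdx = importer.addBufferStream("stream0", header, 10240,
            VariantContextWriterBuilder.OutputType.BCF_STREAM, sampleInfoMap);
        importer.setupGenomicsDBImporter();

        VariantContext pending = records.hasNext() ? records.next() : null;
        while (!importer.isDone()) {
            // Refill every stream reported as exhausted in the previous round
            for (long i = 0; i < importer.getNumExhaustedBufferStreams(); ++i) {
                if (importer.getExhaustedBufferStreamIndex(i) != streamIdx)
                    continue;   // only one stream in this example
                // add() returns false once the buffer is full; the record stays pending
                while (pending != null && importer.add(pending, streamIdx))
                    pending = records.hasNext() ? records.next() : null;
            }
            importer.importBatch();   // pushes the filled buffers into TileDB/GenomicsDB
        }
    }
}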
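
Finally, the chromosome intervals that columnPartitionIterator() relies on can also be retrieved directly, which is useful for checking how a loader configuration partitions the column space. Again, loader.json and rank 0 are placeholders.

import com.intel.genomicsdb.GenomicsDBImporter;

public class PartitionIntervalsExample {
    public static void main(String[] args) throws Exception {
        // Prints one line per contig interval assigned to partition 0 of the (placeholder) loader file
        for (GenomicsDBImporter.ChromosomeInterval interval :
                GenomicsDBImporter.getChromosomeIntervalsForColumnPartition("loader.json", 0))
            System.out.println(interval.mChromosomeName + ":" + interval.mBegin + "-" + interval.mEnd);
    }
}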