com.google.cloud.genomics.cba.GGAnnotateVariants.java Source code

Introduction

Here is the source code for com.google.cloud.genomics.cba.GGAnnotateVariants.java
Source

package com.google.cloud.genomics.cba;

/*
 * Copyright (C) 2016-2018 Google Inc. and Stanford University.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 * The code was derived from https://github.com/googlegenomics/dataflow-java/blob/master/src/main/java/com/google/cloud/genomics/dataflow/pipelines/AnnotateVariants.java
 *
 */

import com.google.api.client.util.Strings;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.api.services.genomics.Genomics;
import com.google.api.services.genomics.model.Annotation;
import com.google.api.services.genomics.model.AnnotationSet;
import com.google.api.services.genomics.model.ListBasesResponse;
import com.google.api.services.genomics.model.SearchAnnotationsRequest;
import com.google.api.services.genomics.model.VariantAnnotation;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.KV;

import com.google.cloud.genomics.dataflow.coders.GenericJsonCoder;
import com.google.cloud.genomics.dataflow.utils.CallSetNamesOptions;
import com.google.cloud.genomics.dataflow.utils.GCSOutputOptions;
import com.google.cloud.genomics.dataflow.utils.GenomicsOptions;
import com.google.cloud.genomics.dataflow.utils.ShardOptions;
import com.google.cloud.genomics.utils.GenomicsFactory;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.cloud.genomics.utils.Paginator;
import com.google.cloud.genomics.utils.ShardBoundary;
import com.google.cloud.genomics.utils.ShardUtils;
import com.google.cloud.genomics.utils.grpc.VariantStreamIterator;
import com.google.cloud.genomics.utils.grpc.VariantUtils;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Range;
import com.google.genomics.v1.StreamVariantsRequest;
import com.google.genomics.v1.StreamVariantsResponse;
import com.google.genomics.v1.Variant;

import htsjdk.samtools.util.IntervalTree;
import htsjdk.samtools.util.IntervalTree.Node;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;

/**
 * <h1>Annotating Google Genomics Variant Sets</h1> This class an annotationSet,
 * and imports annotations. It prepares and converts Annotation fields to the
 * corresponding fields in the Google Genomics annotation set APIs.
 * 
 * @param variantSetId
 *            The ID of the Google Genomics variant set this pipeline is
 *            accessing.
 * @param callSetNames
 *            This provides the name for the AnnotationSet.
 * @param transcriptSetIds
 *            The IDs of the Google Genomics transcript sets.
 * @param variantAnnotationSetIds
 *            The IDs of the Google Genomics variant annotation sets.
 * @param supportChrM
 *            Some annotation reference datasets do not provide information
 *            about ChrM; therefore, before searching for chrM, user must set
 *            this variable TRUE. NOTE: All annotation sets in the query must
 *            support chrM. Otherwise, the query will be failed due to
 *            "error 404: Not Found". Default value is FALSE.
 * @param onlySNP
 *            By setting this value to TRUE, the software only annotate SNPs.
 *            Default value is FALSE.
 * @param printVCFInfo
 *            By setting this value to TRUE, the software prints out Info
 *            section of the VCF file. Default value is FALSE.
 * 
 * @version 1.0
 * @since 2016-07-01
 */

final class GGAnnotateVariants extends DoFn<StreamVariantsRequest, KV<String, String>> {

    private static final long serialVersionUID = -8100934752121655658L;
    public static boolean DEBUG = false;
    private static HashMap<String, Integer> ColInfoTranscript = new HashMap<String, Integer>();
    private static HashMap<String, Integer> ColInfoVariantAnnotation = new HashMap<String, Integer>();
    // private static String HEADER="#referenceName start end referenceBases
    // alternateBases info genotype";
    private static String HEADER = "#referenceName   start   end   referenceBases   alternateBases";

    public static interface Options extends
            //         // Options for call set names.
            CallSetNamesOptions,
            //         // Options for calculating over regions, chromosomes, or whole
            //         // genomes.
            ShardOptions,
            //         // Options for the output destination.
            GCSOutputOptions {

        @Description("The ID of the Google Genomics variant set this pipeline is accessing.")
        String getVariantSetId();

        @Description("The names of the Google Genomics call sets this pipeline is working with, comma "
                + "delimited.")
        String getCallSetNames();

        @Description("The IDs of the Google Genomics transcript sets this pipeline is working with, "
                + "comma delimited.")
        @Default.String("")
        String getTranscriptSetIds();

        void setTranscriptSetIds(String transcriptSetIds);

        @Description("The IDs of the Google Genomics variant annotation sets this pipeline is working "
                + "with, comma delimited.")
        @Default.String("")
        String getVariantAnnotationSetIds();

        void setVariantAnnotationSetIds(String variantAnnotationSetIds);

        @Description("This provides whether all the annotation sets support Chromosome M or not. Default is false. ")
        boolean getSupportChrM();

        void setSupportChrM(boolean supportChrM);

        @Description("This option helps to redirect the output of annottaion process to BigQuery or Google Storage. Default is false (Google Storage).")
        boolean getBigQuerySort();

        void setBigQuerySort(boolean BigQuerySort);

        @Description("Genomic window \"bin\" size to use for grouping variants")
        @Default.Integer(1000000)
        int getBinSize();

        void setBinSize(int BinSize);

        @Description("If you want to only annotate SNPs, set this value true. Default is false. ")
        boolean getOnlySNP();

        void setOnlySNP(boolean onlySNP);

        @Description("If you want to print out VCF info column, set this value true. Default is false. ")
        boolean getPrintVCFInfo();

        void setPrintVCFInfo(boolean printVCFInfo);

        @Description("This provides the path to the local output file.")
        @Default.String("")
        String getLocalOutputFilePath();

        void setLocalOutputFilePath(String LocalOutputFilePath);

        @Description("This provides BigQuery Dataset ID.")
        @Default.String("")

        String getBigQueryDatasetId();

        void setBigQueryDatasetId(String BigQueryDatasetId);

        @Description("This provides BigQuery Table.")
        @Default.String("")
        String getBigQueryTable();

        void setBigQueryTable(String BigQueryTable);

        public static class Methods {
            public static void validateOptions(Options options) {
                GCSOutputOptions.Methods.validateOptions(options);
            }
        }

    }

    private static final Logger LOG = Logger.getLogger(GGAnnotateVariants.class.getName());

    /*
     * To visit variants efficiently, the program creates
     * "VariantStreamIterator" by considering "VARIANT_FIELDS". VARIANT_FIELDS
     * represents the structure of response. By specifying VARIANT_FIELDS, we
     * can avoid loading extra information. Tip: Use the API explorer to test
     * which fields to include in partial responses.
     * https://developers.google.com/apis-explorer/#p/genomics/v1/genomics.
     * variants.stream
     */
    private static final String VARIANT_FIELDS = "variants(id,referenceName,start,end,alternateBases,referenceBases,calls,quality)";

    private final OfflineAuth auth;
    private final List<String> callSetIds, transcriptSetIds, variantAnnotationSetIds;
    private final HashMap<String, Integer> VariantColInfo;
    private final HashMap<String, Integer> TranscriptColInfo;
    private final boolean supportChrM;
    private final boolean onlySNP;
    private final boolean BigQuery;

    public GGAnnotateVariants(OfflineAuth auth, List<String> callSetIds, List<String> transcriptSetIds,
            List<String> variantAnnotationSetIds, HashMap<String, Integer> _VariantColInfo,
            HashMap<String, Integer> _TranscriptColInfo, boolean _supportChrM, boolean _onlySNP,
            boolean _bigQuery) {
        this.auth = auth;
        this.callSetIds = callSetIds;
        this.transcriptSetIds = transcriptSetIds;
        this.variantAnnotationSetIds = variantAnnotationSetIds;
        this.VariantColInfo = _VariantColInfo;
        this.TranscriptColInfo = _TranscriptColInfo;
        this.supportChrM = _supportChrM;
        this.onlySNP = _onlySNP;
        this.BigQuery = _bigQuery;
    }

    @org.apache.beam.sdk.transforms.DoFn.ProcessElement
    public void processElement(DoFn<StreamVariantsRequest, KV<String, String>>.ProcessContext c) throws Exception {

        Genomics genomics = GenomicsFactory.builder().build().fromOfflineAuth(auth);

        StreamVariantsRequest request = StreamVariantsRequest.newBuilder(c.element()).addAllCallSetIds(callSetIds)
                .build();

        if (canonicalizeRefName(request.getReferenceName()).equals("M") && supportChrM == false) {
            LOG.info("There is no information about Chr M in the provided AnnotationSet!");
            return;
        }

        Iterator<StreamVariantsResponse> streamVariantIter = VariantStreamIterator.enforceShardBoundary(auth,
                request, ShardBoundary.Requirement.STRICT, VARIANT_FIELDS);

        if (!streamVariantIter.hasNext()) {
            LOG.info("region has no variants, skipping");
            return;
        }

        Stopwatch stopwatch = Stopwatch.createStarted();
        int varCount = 0;

        ListMultimap<Range<Long>, Annotation> variantAnnotationSetList = null;
        if (this.variantAnnotationSetIds != null)
            variantAnnotationSetList = retrieveVariantAnnotations(genomics, request);

        IntervalTree<Annotation> transcripts = null;
        if (this.transcriptSetIds != null)
            transcripts = retrieveTranscripts(genomics, request);

        while (streamVariantIter.hasNext()) {
            Iterable<Variant> varIter;
            if (onlySNP)
                varIter = FluentIterable.from(streamVariantIter.next().getVariantsList())
                        .filter(VariantUtils.IS_SNP);
            else
                varIter = FluentIterable.from(streamVariantIter.next().getVariantsList());

            for (Variant variant : varIter) {
                Range<Long> pos = Range.closedOpen(variant.getStart(), variant.getEnd());

                // This variable helps to keep track of alignment
                String VCFOutput = "";

                // Keep track of Empty VCF records
                boolean EmptyVCF = false;

                // Variant Annotation Section
                if (variantAnnotationSetList != null) {

                    // Sort the list of matched annotations
                    SortedSet<String> VariantAnnotationKeys = new TreeSet<String>(VariantColInfo.keySet());

                    // Retrieve a list of matched variant annotations
                    List<Annotation> listMatchedAnnotations = variantAnnotationSetList.get(pos);

                    // Visit overlapped annotations in order, and the matches in
                    // order (First convert to VCF format, and then add it to
                    // VCFOutput);
                    int index = 0;
                    for (String key : VariantAnnotationKeys) {
                        // The following variables help to put a semicolon
                        // between multiple matches from the same annotationSet
                        // e.g., allele_freq1;allele_freq2;...;allele_freqn;
                        boolean SemiColon = false;

                        for (Annotation match : listMatchedAnnotations) {
                            if (match.getAnnotationSetId().compareTo(key) == 0) {
                                // if (match.getVariant().getAlternateBases() !=
                                // null
                                // && variant.getAlternateBasesList() != null)
                                {
                                    // check if Variant's alternate bases are
                                    // the same as the matched annotation's
                                    // alternate bases

                                    if (compareAlternateBases(match.getVariant().getAlternateBases(),
                                            variant.getAlternateBasesList(), variant.getReferenceBases())) {

                                        EmptyVCF = true;

                                        if (DEBUG)
                                            LOG.info("MATCHED: variant: (" + variant.getStart() + ", Annotation: "
                                                    + match.getStart() + ") ");

                                        if (!SemiColon) {
                                            VCFOutput += createVCFFormat(variant, match);
                                            SemiColon = true;
                                            // Activate it for the next matched
                                            // element

                                            // TESTING
                                            VCFOutput += "ALT:" + match.getVariant().getAlternateBases() + "\t";
                                        } else {
                                            VCFOutput += ";" + createVCFFormat(variant, match);

                                            // TESTING
                                            VCFOutput += "ALT:" + match.getVariant().getAlternateBases() + "\t";
                                        }
                                    }
                                }
                            }
                        }
                        index++;
                        /*
                         * formatTabs function helps to keep track of alignment
                         * in the VCF format (e.g., if there is no match for
                         * Variant X in AnnotationSet Y then add spaces equals
                         * to the number of AnnotationSet Y's columns in the VCF
                         * file)
                         */
                        if (VCFOutput.isEmpty()
                                && (VariantAnnotationKeys.size() > index || TranscriptColInfo.size() > 0)) {
                            VCFOutput += formatTabs(VariantColInfo.get(key));
                        }
                    } // end of keys
                    if (!EmptyVCF)
                        VCFOutput = "";
                } // End of Variant Annotation

                // Transcript Annotation Section
                if (transcripts != null) {

                    // Find all the overlapped matches and create an interval
                    // tree
                    Iterator<Node<Annotation>> transcriptIter = transcripts
                            .overlappers(pos.lowerEndpoint().intValue(), pos.upperEndpoint().intValue() - 1); // Inclusive.

                    Iterator<Node<Annotation>> StartPoint = transcriptIter;
                    if (transcriptIter != null) {
                        // Sort the list of matched annotations
                        SortedSet<String> transcriptKeys = new TreeSet<String>(TranscriptColInfo.keySet());
                        int index = 0;
                        // Check annotations in order, and in the case of match
                        // convert the matches to VCF format
                        for (String key : transcriptKeys) {
                            transcriptIter = StartPoint;
                            boolean SemiColon = false;
                            while (transcriptIter.hasNext()) {
                                Annotation transcript = transcriptIter.next().getValue();
                                if (transcript.getAnnotationSetId().compareTo(key) == 0) {
                                    if (!SemiColon) {
                                        VCFOutput += createVCFFormat(variant, transcript);
                                        SemiColon = true;
                                    } else
                                        VCFOutput += ";" + createVCFFormat(variant, transcript);
                                }
                            }
                            index++;

                            if (VCFOutput.isEmpty() && transcriptKeys.size() > index) {
                                VCFOutput += formatTabs(TranscriptColInfo.get(key));
                            }
                        }
                    }
                } // End of Transcripts

                String varintALTs = "";
                for (int index = 0; index < variant.getAlternateBasesCount(); index++) {
                    if (index > 0)
                        varintALTs += ",";
                    varintALTs += variant.getAlternateBases(index);
                }

                // The following section helps to add genotypes
                /*
                 * String VariantGenotype=""; List<VariantCall> Genotypes =
                 * variant.getCallsList();
                 * 
                 * for(String CId: callSetIds){ for(VariantCall VC:Genotypes){
                 * if(VC.getCallSetId().equals(CId)){
                 * 
                 * List<Integer> GentotypeList = VC.getGenotypeList(); for(int
                 * index=0; index < GentotypeList.size(); index++){ int Genotype
                 * = GentotypeList.get(index);
                 * 
                 * if(index>0) VariantGenotype += "/";
                 * 
                 * VariantGenotype += Genotype; } } } VariantGenotype += "\t"; }
                 */
                // Map<String, ListValue> VariantInfoMap = variant.getInfo();
                /*
                 * String VariantInfo=""; List<VariantCall> VariantCall =
                 * variant.getCallsList(); for (Iterator<VariantCall> iter =
                 * VariantCall.iterator(); iter.hasNext(); ) { VariantCall
                 * element = iter.next(); Map<String, ListValue> VariantCallInfo
                 * = element.getInfo(); for (Map.Entry<String, ListValue> entry
                 * : VariantCallInfo.entrySet()) { VariantInfo +=entry.getKey()
                 * + ":" +
                 * entry.getValue().getValuesList().get(0).getStringValue() +
                 * ";"; } }
                 * 
                 * 
                 * 
                 * /* for (Map.Entry<String, ListValue> entry :
                 * VariantInfoMap.entrySet()) { //System.out.println("Key = " +
                 * entry.getKey() + ", Value = " + entry.getValue());
                 * VariantInfo += entry.getKey() + ":" + entry.getValue() + ";";
                 * }
                 */

                /*
                 * Emit the information in the form of <Key, Value> Print out
                 * the variant w/ or w/o any matched annotations Key: (ChromId,
                 * Start, End) Value:(variant's <referenceName start end
                 * referenceBases alternateBases quality>, + The content of
                 * "VCFOutput" OR Annotation's fields
                 */
                if (this.BigQuery) {
                    if (!VCFOutput.isEmpty()) {
                        c.output(KV.of(
                                variant.getReferenceName() + ";" + Long.toString(variant.getStart()) + ";"
                                        + Long.toString(variant.getEnd()),
                                // Value
                                VCFOutput));
                    }
                } else {
                    if (!VCFOutput.isEmpty()) {
                        c.output(KV.of(
                                variant.getReferenceName() + ";" + Long.toString(variant.getStart()) + ";"
                                        + Long.toString(variant.getEnd()),
                                // Value
                                variant.getReferenceName()
                                        // <-- increment by 1 => convert to 1-based
                                        // -->
                                        + "\t" + (variant.getStart() + 1) + "\t" + variant.getEnd() + "\t"
                                        + variant.getReferenceBases() + "\t" + varintALTs
                                        // + "\t" + VariantInfo
                                        // + "\t" + variant.getQuality()
                                        // + "\t" + VariantGenotype
                                        + "\t" + VCFOutput));
                    }
                }

                varCount++;
                if (varCount % 1e3 == 0) {
                    LOG.info(String.format("read %d variants (%.2f / s)", varCount,
                            (double) varCount / stopwatch.elapsed(TimeUnit.SECONDS)));
                }
            }

        }

        LOG.info("finished reading " + varCount + " variants in " + stopwatch);
    }

    /**
     * <h1>This function compares Variant's alternateBases with the potential
     * matched annotation.
     * 
     * @param alternateBases
     *            The potential matched annotation's alternateBases
     * @param variantAlternateBasesList
     *            The variant's alternate bases
     * @return
     */
    private boolean compareAlternateBases(String annotationAlternateBases, List<String> variantAlternateBasesList,
            String variantReferenceBases) {

        if ((annotationAlternateBases.isEmpty() && variantAlternateBasesList.isEmpty())
                || (variantAlternateBasesList.isEmpty() && annotationAlternateBases.equals(" "))) {
            LOG.info("Empty Matched!");
            return true;
        }

        // String variantString = "";

        for (String variantAlternateBases : variantAlternateBasesList) {

            if (variantAlternateBases.equals(annotationAlternateBases))
                return true;

            if (variantReferenceBases.concat(annotationAlternateBases).equals(variantAlternateBases))
                return true;
            /*
             * // Variant is a part of the annotation // Annotation A->AC //
             * Variant A->C else
             * if(annotationAlternateBases.contains(variantReferenceBases))
             * return true; ///String p1 = variantReferenceBases.substring(1,
             * variantReferenceBases.length()); String p2 =
             * variantAlternateBases.substring(1,
             * variantAlternateBases.length()); String p3 =
             * variantAlternateBases.substring(0, 1); if(p1.equals(p2) &&
             * p3.equals(annotationAlternateBases)) { return true; }
             * 
             * // Annotation is a part of the variant else
             * if(variantReferenceBases.contains(annotationAlternateBases))
             * 
             * return true;
             */
        }

        return false;

        /*
         * if (alternateBases.equals(vString)) { LOG.info(
         * "Matched: Variant Alt: " + vString + ", Annotation Alt:" +
         * alternateBases);
         * 
         * return true; }
         * 
         * LOG.info("Not Matched: Variant Alt: " + vString + ", Annotation Alt:"
         * + alternateBases);
         * 
         * return false;
         */
    }

    /**
     * <h1>This function prepares the matched annotation's information in the
     * form of VCF (extended column).
     * 
     * @param variant
     * @param match
     *            The matched annotation
     * @return AnnotationMap VCFFormat of annotation
     */
    private String createVCFFormat(Variant variant, Annotation match) {
        /*
         * VCF format: #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1
         * Sample2 Sample3
         */

        String AnnotationMap = "";

        /* Sorting Annotation Info items based on key */
        SortedSet<String> keys = new TreeSet<String>(match.getInfo().keySet());
        for (String key : keys) {
            String stringValues = "";
            List<Object> objectValue = match.getInfo().get(key);
            for (Object value : objectValue)
                stringValues += value.toString();
            // AnnotationMap += key + "=" + stringValues + ";"; //INFO Field in
            // VCF
            if (DEBUG)
                AnnotationMap += key + "=" + stringValues + "\t"; // Extended
            // Column
            else
                AnnotationMap += stringValues + "\t"; // Extended Column
        }

        if (DEBUG) {
            if (match.getType().equalsIgnoreCase("VARIANT"))
                AnnotationMap += "\t Variant (Ref: " + variant.getReferenceBases() + ", Alt: "
                        + variant.getAlternateBasesList().toString() + ") " + "Annotation ("
                        + match.getVariant().getAlternateBases() + ")";

            AnnotationMap += "\t Annotation[S:" + match.getStart() + ", E:" + match.getEnd() + ")";
        }

        return AnnotationMap;
    }

    /**
     * <h1>This function first send a request to Google Genomics to receive all
     * Generic/Transcript annotations (transcriptSetIds), and after receiving
     * the information, it creates and return back an interval tree of all the
     * overlapped annotations.
     * 
     * @param genomics
     *            Genomics
     * @param request
     *            StreamVariantsRequest
     * @return transcripts IntervalTree<Annotation>
     */

    private IntervalTree<Annotation> retrieveTranscripts(Genomics genomics, StreamVariantsRequest request) {
        Stopwatch stopwatch = Stopwatch.createStarted();
        IntervalTree<Annotation> transcripts = new IntervalTree<>();
        Iterable<Annotation> transcriptIter = Paginator.Annotations
                .create(genomics, ShardBoundary.Requirement.OVERLAPS)
                .search(new SearchAnnotationsRequest().setAnnotationSetIds(transcriptSetIds)
                        .setReferenceName(canonicalizeRefName(request.getReferenceName()))
                        .setStart(request.getStart()).setEnd(request.getEnd()));
        for (Annotation annotation : transcriptIter) {
            transcripts.put(annotation.getStart().intValue(), annotation.getEnd().intValue(), annotation);
        }
        LOG.info(String.format("read %d transcripts in %s (%.2f / s)", transcripts.size(), stopwatch,
                (double) transcripts.size() / stopwatch.elapsed(TimeUnit.SECONDS)));
        return transcripts;
    }

    /**
     * <h1>This function first send a request to Google Genomics to receive all
     * variant annotations (variantAnnotationSetIds), and after receiving the
     * information, it creates and return back a ListMultimap that contains all
     * the overlapped annotations plus their positions.
     * 
     * @param genomics
     *            Genomics
     * @param request
     *            StreamVariantsRequest
     * @return annotationMap ListMultimap<Range<Long>, Annotation>
     */

    private ListMultimap<Range<Long>, Annotation> retrieveVariantAnnotations(Genomics genomics,
            StreamVariantsRequest request) {
        Stopwatch stopwatch = Stopwatch.createStarted();
        ListMultimap<Range<Long>, Annotation> annotationMap = ArrayListMultimap.create();
        Iterable<Annotation> annotationIter = Paginator.Annotations
                .create(genomics, ShardBoundary.Requirement.OVERLAPS)
                .search(new SearchAnnotationsRequest().setAnnotationSetIds(variantAnnotationSetIds)
                        .setReferenceName(canonicalizeRefName(request.getReferenceName()))
                        .setStart(request.getStart()).setEnd(request.getEnd()));

        for (Annotation annotation : annotationIter) {
            long start = 0;
            if (annotation.getStart() != null) {
                start = annotation.getStart();

                if (annotation.getStart() == 126310)
                    LOG.info("HERE is the annotation: " + annotation.toString() + ") ");

            }
            annotationMap.put(Range.closedOpen(start, annotation.getEnd()), annotation);
        }

        LOG.info(String.format("read %d variant annotations in %s (%.2f / s)", annotationMap.size(), stopwatch,
                (double) annotationMap.size() / stopwatch.elapsed(TimeUnit.SECONDS)));

        return annotationMap;
    }

    /**
     * <h1>This function remove "chr" from the beginning of referenceName (e.g,
     * --references=chr17:1:81000000, chr17 becomes 17)
     * 
     * @param refName
     *            Reference Name
     * @return refName Reference Name w/o "chr"
     */

    private static String canonicalizeRefName(String refName) {
        return refName.replace("chr", "");
    }

    /**
     * <h1>This function is the main function that creates and calls dataflow
     * pipeline
     */
    public static void run(String[] args) throws Exception {

        // Register the options so that they show up via --help
        PipelineOptionsFactory.register(Options.class);
        Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
        // Option validation is not yet automatic, we make an explicit call
        // here.
        Options.Methods.validateOptions(options);

        // Set up the prototype request and auth.
        StreamVariantsRequest prototype = CallSetNamesOptions.Methods.getRequestPrototype(options);
        OfflineAuth auth = GenomicsOptions.Methods.getGenomicsAuth(options);
        Genomics genomics = GenomicsFactory.builder().build().fromOfflineAuth(auth);

        // check whether user provided VariantSetId
        if (options.getVariantSetId().isEmpty()) {
            throw new IllegalArgumentException(
                    "VaraiantSetIds must be specified (e.g., 10473108253681171589 for 1000 Genomes.)");
        }

        // check whether user provided CallSetNames
        if (options.getCallSetNames().isEmpty()) {
            throw new IllegalArgumentException("CallSetNames must be specified (e.g., HG00261 for 1000 Genomes.)");
        }

        // Add Genotype field to the VCF HEADER!
        addGenotypetoHeader(options.getCallSetNames());

        if (options.getVariantAnnotationSetIds().isEmpty() && options.getTranscriptSetIds().isEmpty()) {
            throw new IllegalArgumentException(
                    "Both VaraiantAnnotationSetIds/TranscriptSetIds are empty! At least one of them must be specified."
                            + "(e.g., CIjfoPXj9LqPlAEQ5vnql4KewYuSAQ for UCSC refGene (hg19) and CILSqfjtlY6tHxC0nNH-4cu-xlQ for ClinVar (GRCh37))");
        }

        List<String> callSetIds = CallSetNamesOptions.Methods.getCallSetIds(options);

        List<String> variantAnnotationSetIds = null;
        if (!Strings.isNullOrEmpty(options.getVariantAnnotationSetIds())) {
            variantAnnotationSetIds = validateAnnotationSetsFlag(genomics, options.getVariantAnnotationSetIds(),
                    "VARIANT");
            prepareHeaderAnnotationSets(genomics, variantAnnotationSetIds, 5, "VARIANT");
            /*
             * The first five columns are redundant [referenceName, start, end,
             * referenceBases, alternateBases] are provided by variants
             */
        }

        List<String> transcriptSetIds = null;
        // TODO: Transcript and Generic should be separated
        if (!Strings.isNullOrEmpty(options.getTranscriptSetIds())) {
            transcriptSetIds = validateAnnotationSetsFlag(genomics, options.getTranscriptSetIds(), "GENERIC");
            prepareHeaderAnnotationSets(genomics, transcriptSetIds, 3, "GENERIC");
            /*
             * The first three columns are redundant - [referenceName, start,
             * end, referenceBases, alternateBases] are provided by variants
             */
        }

        List<StreamVariantsRequest> requests = options.isAllReferences()
                ? ShardUtils.getVariantRequests(prototype, ShardUtils.SexChromosomeFilter.INCLUDE_XY,
                        options.getBasesPerShard(), auth)
                : ShardUtils.getVariantRequests(prototype, options.getBasesPerShard(), options.getReferences());

        System.out.println("The Num of Extensible Columns: " + getNumCols());
        System.out.println("ChrM: " + options.getSupportChrM());

        // Here is the dataflow pipeline
        Pipeline p = Pipeline.create(options);
        p.getCoderRegistry().registerCoderForClass(Annotation.class,
                (Coder<Annotation>) GenericJsonCoder.of(Annotation.class));
        p.getCoderRegistry().registerCoderForClass(AnnotationSet.class,
                (Coder<AnnotationSet>) GenericJsonCoder.of(AnnotationSet.class));
        p.getCoderRegistry().registerCoderForClass(ListBasesResponse.class,
                (Coder<ListBasesResponse>) GenericJsonCoder.of(ListBasesResponse.class));
        p.getCoderRegistry().registerCoderForClass(SearchAnnotationsRequest.class,
                (Coder<SearchAnnotationsRequest>) GenericJsonCoder.of(SearchAnnotationsRequest.class));
        p.getCoderRegistry().registerCoderForClass(VariantAnnotation.class,
                (Coder<VariantAnnotation>) GenericJsonCoder.of(VariantAnnotation.class));

        long tempEstimatedTime;
        // START - Reset start for dataflow pipeline
        long startTime = System.currentTimeMillis();

        ////////////////////////////// Redirect Output to Google Storage ////////////////////////////////////
        if (!options.getBigQuerySort()) {
            p.begin().apply(Create.of(requests))
                    .apply("Annotate Variants",
                            ParDo.of(new GGAnnotateVariants(auth, callSetIds, transcriptSetIds,
                                    variantAnnotationSetIds, ColInfoVariantAnnotation, ColInfoTranscript,
                                    options.getSupportChrM(), options.getOnlySNP(), options.getBigQuerySort())))

                    .apply("Print Variants", ParDo.of(new DoFn<KV<String, String>, String>() {
                        private static final long serialVersionUID = -5065486967339199473L;

                        @org.apache.beam.sdk.transforms.DoFn.ProcessElement
                        public void processElement(ProcessContext c) {
                            if (!DEBUG) {
                                c.output(c.element().getValue());
                            } else {
                                c.output(c.element().getKey() + ": " + c.element().getValue());
                            }
                        }
                    })).apply(TextIO.write().to(options.getOutput()));

        } else {
            //////////////////////////////////////// BIG-QUERY [Google Genomics APIs]////////////////////

            // check whether user provided BigQueryDatasetID
            if (options.getBigQueryDatasetId().isEmpty()) {
                throw new IllegalArgumentException("BigQueryDataset must be specified (e.g., my_dataset)");
            }

            // check whether user provided BigQueryTable
            if (options.getBigQueryTable().isEmpty()) {
                throw new IllegalArgumentException("BigQuery Table must be specified (e.g., my_table)");
            } else {
                // check whether user provided the Path to the local output file 
                if (options.getLocalOutputFilePath().isEmpty()) {
                    throw new IllegalArgumentException("e.g., /home/user/output.vcf");
                }
            }

            TableReference tableRef = new TableReference();
            tableRef.setProjectId(options.getProject());
            tableRef.setDatasetId(options.getBigQueryDatasetId());
            tableRef.setTableId(options.getBigQueryTable());

            p.begin().apply(Create.of(requests))
                    .apply(ParDo.of(new GGAnnotateVariants(auth, callSetIds, transcriptSetIds,
                            variantAnnotationSetIds, ColInfoVariantAnnotation, ColInfoTranscript,
                            options.getSupportChrM(), options.getOnlySNP(), options.getBigQuerySort())))
                    .apply(ParDo.of(new FormatStatsFn()))
                    .apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(getSchema())
                            .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                            .withWriteDisposition(WriteDisposition.WRITE_APPEND));

        }

        p.run().waitUntilFinish();

        //END - RUN time 
        tempEstimatedTime = System.currentTimeMillis() - startTime;
        System.out.println("Execution Time for Pipeline: " + tempEstimatedTime);
        /////////////////////////////////END - RUN time ///////////////////////////////////////////

        if (options.getBigQuerySort()) {

            /////////////////////// START - Reset start for Sorting/Downloading //////////////////////////////
            startTime = System.currentTimeMillis();

            //         BigQueryFunctions.sortAllAndPrintDataflow(options.getProject(), options.getBigQueryDataset(),
            //               options.getBigQueryTable(), options.getLocalOutputFilePath());

            ///////////////////////////////// END - RUN time ////////////////////////////////////////
            //TODO: This is a dynamic sort -> first find the number of annotated variants, 
            // then dynamically partition based on chromosome and interval, and sort each interval     
            //BigQueryFunctions.sort(options.getProject(), options.getBigQueryDataset(), 
            //      options.getBigQueryTable(), options.getLocalOutputFilePath());

            try {
                BigQueryFunctions.sortByBin(options.getProject(), options.getBigQueryDatasetId(),
                        options.getBigQueryTable(), options.getLocalOutputFilePath(), options.getBinSize());
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }

            //////////////////////////////Remove the intermediate table used for sorting ///////////// 
            //BigQueryFunctions.deleteTable(options.getBigQueryDataset(), options.getBigQueryTable());

            tempEstimatedTime = System.currentTimeMillis() - startTime;
            System.out.println("Execution Time for Sort and Transfer: " + tempEstimatedTime);

        } else {

            Path VCF_Filename = Paths.get(options.getOutput());
            System.out.println("");
            System.out.println("");
            System.out.println("[INFO]------------------------------------------------------------------------");
            System.out.println("[INFO] Below is the header of " + VCF_Filename.getFileName().toString());
            System.out.println(HEADER);
            System.out.println();
            System.out.println(
                    "[INFO] To check the current status of your job, use the following command under 'DataflowPipelineRunner':");
            System.out.println("\t ~: gcloud alpha dataflow jobs describe $YOUR_JOB_ID$");
            System.out.println("");
            System.out.println(
                    "[INFO] To gather the results into a single VCF file after the completion of job (i.e., currentState:JOB_STATE_DONE), run the following command:");
            System.out.println("\t ~: gsutil cat " + options.getOutput() + "* | sort > "
                    + VCF_Filename.getFileName().toString());
            System.out.println(
                    "\t ~: you can also install GNU parallel sort in linux or MAC (e.g., in MAC: brew coreutils) and leverage parallel sort:");
            System.out.println("\t ~: gsutil cat " + options.getOutput()
                    + "* | awk '{ print length($1), $0 }' | sort -k 2,1 -n | awk '{$1 = \"\"; print}' > "
                    + VCF_Filename.getFileName().toString());
            //System.out.println("\t ~: gsutil cat " + options.getOutput() + "* | gsort --parallel=N -s > " + VCF_Filename.getFileName().toString());
            System.out.println("");
            System.out
                    .println("[INFO] To remove the output files from the cloud storage run the following command:");
            System.out.println("\t ~: gsutil rm " + options.getOutput() + "*");
            System.out.println("[INFO] ------------------------------------------------------------------------");
            System.out.println("");
            System.out.println("");

        }

    }

    private static void addGenotypetoHeader(String optionCallSetNames) {
        String callSetNames = "";
        for (String name : callSetNames.split(",")) {
            callSetNames += name + "\t";
        }
        HEADER += "\t" + callSetNames;
    }

    private static String formatTabs(int num) {
        String output = "";
        for (int i = 0; i < num; i++)
            output += "\t";
        return output;
    }

    /**
     * <h1>This method prepares the header for the output VCF file
     * 
     * @param genomics
     * @param annosetIds
     *            This is the input annotaionIds
     * @param redundant
     * @param annotationType
     */

    private static void prepareHeaderAnnotationSets(Genomics genomics, List<String> annosetIds, int redundant,
            String annotationType) throws IOException {

        for (String annosetId : annosetIds) {
            Map<String, List<Object>> gotInfo = genomics.annotationsets().get(annosetId).execute().getInfo();
            List<Object> objectValue = gotInfo.get("header");

            if (objectValue != null) {
                String[] fields = objectValue.get(0).toString().split(",");

                for (int index = redundant; index < fields.length; index++) {
                    HEADER += "\t" + fields[index];
                }

                addColInfo(annosetId, fields.length - redundant, annotationType);
            } else {
                throw new IllegalArgumentException("No header found for the annotionSet " + annosetId);
            }

        }
    }

    /**
     * <h1>This method checks whether the types of annotationSets and variantSet
     * are the same or not. In the case of matched, it will return back a list
     * of annotionIds
     * 
     * @param genomics
     * @param flagValue
     *            This is the input annotaionIds
     * @param wantType
     * @return annosetIds
     */
    private static List<String> validateAnnotationSetsFlag(Genomics genomics, String flagValue, String wantType)
            throws IOException {
        List<String> annosetIds = ImmutableList.copyOf(flagValue.split(","));
        for (String annosetId : annosetIds) {
            AnnotationSet annoset = genomics.annotationsets().get(annosetId).execute();
            if (!wantType.equals(annoset.getType())) {
                throw new IllegalArgumentException("annotation set " + annosetId + " has type " + annoset.getType()
                        + ", wanted type " + wantType);
            }
        }
        return annosetIds;
    }

    /**
     * <h1>This function visits all input annotation sets, and calculate the
     * maximum number of extended columns.
     * 
     * @return NumCols The function returns back the maximum number of columns
     *         that will be added after the annotation to the VCF file.
     */

    public static int getNumCols() {
        int NumCols = 0;

        for (Map.Entry<String, Integer> entry : ColInfoTranscript.entrySet()) {
            String key = entry.getKey();
            int value = entry.getValue();
            System.out.println("Transcript AnnotationSetId: " + key + "\t Number of Columns: " + value);
            NumCols += value;
        }

        for (Map.Entry<String, Integer> entry : ColInfoVariantAnnotation.entrySet()) {
            String key = entry.getKey();
            int value = entry.getValue();
            System.out.println("Variant AnnotationSetId: " + key + "\t Number of Columns: " + value);
            NumCols += value;
        }
        return NumCols;
    }

    /**
     * <h1>This function simply adds information <AnnotationSetId, Max. number
     * of extended columns>. The information is used to aligned VCF columns.
     */

    public static void addColInfo(String asId, int numCols, String type) {
        if (type.equalsIgnoreCase("VARIANT"))
            ColInfoVariantAnnotation.put(asId, numCols);
        else if (type.equalsIgnoreCase("TRANSCRIPT") || type.equalsIgnoreCase("GENERIC"))
            ColInfoTranscript.put(asId, numCols);
    }

    /**
     * Defines the BigQuery schema used for the output.
     */
    static TableSchema getSchema() {
        List<TableFieldSchema> fields = new ArrayList<>();
        fields.add(new TableFieldSchema().setName("chrm").setType("STRING"));
        fields.add(new TableFieldSchema().setName("start").setType("INTEGER"));
        fields.add(new TableFieldSchema().setName("end").setType("INTEGER"));
        fields.add(new TableFieldSchema().setName("info").setType("STRING"));
        TableSchema schema = new TableSchema().setFields(fields);
        return schema;
    }

    /**
     * Format the results to a TableRow, to save to
     * BigQuery.
     */
    static class FormatStatsFn extends DoFn<KV<String, String>, TableRow> {
        /**
         * 
         */
        private static final long serialVersionUID = -7241249982422720375L;

        @org.apache.beam.sdk.transforms.DoFn.ProcessElement
        public void processElement(ProcessContext c) {
            String[] key = c.element().getKey().toString().split(";");
            TableRow row = new TableRow().set("chrm", key[0]).set("start", key[1]).set("end", key[2]).set("Info",
                    c.element().getValue().toString());
            c.output(row);
        }
    }

}