org.phenotips.variantstore.db.solr.SolrVariantUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.phenotips.variantstore.db.solr.SolrVariantUtils.java

Source

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses/
 */
package org.phenotips.variantstore.db.solr;

import org.phenotips.variantstore.db.DatabaseException;
import org.phenotips.variantstore.shared.GACallInfoFields;
import org.phenotips.variantstore.shared.GAVariantInfoFields;

import static org.phenotips.variantstore.shared.VariantUtils.addInfo;
import static org.phenotips.variantstore.shared.VariantUtils.getInfo;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.StrUtils;
import org.ga4gh.GACall;
import org.ga4gh.GAVariant;

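/**
 * Static helpers for converting between GA4GH variants/calls and the Solr documents that store them.
 *
 * @version $Id: caf6e010b4736d4be6a19e906b9eeb3058dc550b $
 */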
public final class SolrVariantUtils {
    /**
     * The ID of the metadocument that stores all individual ids. {@link #getMetaDocument(SolrClient)} looks the
     * document up under this id and, when it does not exist yet, creates an empty one carrying it.
     */
    public static final String METADATA_DOC_ID = "metadata";

    /**
     * Not instantiable: every member of this class is static. Fails loudly if invoked anyway.
     */
    private SolrVariantUtils() {
        throw new AssertionError();
    }

    /**
     * Regroup per-document variant maps by callset, so that each callset id ends up with the list of all
     * variants recorded for it, in the order the documents were supplied.
     *
     * @param mapList a list of maps of callset id to variant, one map per document
     *
     * @return a map of callset id to the list of that callset's variants
     */
    public static Map<String, List<GAVariant>> variantListToCallsetMap(List<Map<String, GAVariant>> mapList) {
        Map<String, List<GAVariant>> byCallset = new HashMap<>();

        for (Map<String, GAVariant> perDocument : mapList) {
            for (Map.Entry<String, GAVariant> entry : perDocument.entrySet()) {
                List<GAVariant> bucket = byCallset.get(entry.getKey());
                if (bucket == null) {
                    // first variant seen for this callset
                    bucket = new ArrayList<>();
                    byCallset.put(entry.getKey(), bucket);
                }
                bucket.add(entry.getValue());
            }
        }

        return byCallset;
    }

    /**
     * Convert every document of a SolrDocumentList into its callset-to-variant map, preserving document order.
     *
     * @param documentList the SolrDocumentList
     *
     * @return one map of callset id to GAVariant per document
     */
    public static List<Map<String, GAVariant>> documentListToMapList(SolrDocumentList documentList) {
        List<Map<String, GAVariant>> converted = new ArrayList<>();
        for (SolrDocument document : documentList) {
            converted.add(docToVariantMap(document));
        }
        return converted;
    }

    /**
     * Get all the individual variants out of a doc, one per callset id listed on it.
     *
     * @param doc the doc
     *
     * @return a map of callset id to variant; empty when the doc lists no callset ids
     */
    public static Map<String, GAVariant> docToVariantMap(SolrDocument doc) {
        Map<String, GAVariant> map = new HashMap<>();

        // getFieldValues() wraps a lone value in a collection and returns null for an absent field, so a
        // document without callset ids (or with exactly one) no longer fails on a blind cast to List.
        Collection<Object> callsetIds = doc.getFieldValues(VariantsSchema.CALLSET_IDS);
        if (callsetIds == null) {
            return map;
        }

        for (Object rawId : callsetIds) {
            String callsetId = (String) rawId;
            map.put(callsetId, docToVariant(doc, callsetId));
        }

        return map;
    }

    /**
     * Build the GAVariant that one callset contributes to a Solr variant document. The variant carries the
     * locus, the alleles, the gene annotations, any frequency/count fields present on the document, and a
     * single GACall holding the callset-specific values.
     *
     * @param doc       the SolrDocument
     * @param callsetId the id of the callset
     *
     * @return a new GAVariant
     */
    public static GAVariant docToVariant(SolrDocument doc, String callsetId) {
        // TODO: Whole function needs to be newed, should be doc to list<Variant>
        final GAVariant result = new GAVariant();

        // Locus and alleles: read unconditionally from the document.
        result.setReferenceName(doc.get(VariantsSchema.CHROM).toString());
        result.setReferenceBases(doc.get(VariantsSchema.REF).toString());
        result.setStart(Long.valueOf(doc.get(VariantsSchema.START).toString()));
        result.setEnd(Long.valueOf(doc.get(VariantsSchema.END).toString()));
        result.setAlternateBases(Collections.singletonList(doc.get(VariantsSchema.ALT).toString()));

        // Gene annotations are always copied, even when the document holds no value for them.
        addInfo(result, GAVariantInfoFields.GENE, doc.get(VariantsSchema.GENE));
        addInfo(result, GAVariantInfoFields.GENE_EFFECT, doc.get(VariantsSchema.GENE_EFFECT));
        addInfo(result, GAVariantInfoFields.GENE_HGVS, doc.get(VariantsSchema.GENE_HGVS));

        // Frequency and count fields are copied only when the document has the key.
        if (doc.containsKey(VariantsSchema.EXAC_AF)) {
            addInfo(result, GAVariantInfoFields.EXAC_AF, doc.get(VariantsSchema.EXAC_AF));
        }
        if (doc.containsKey(VariantsSchema.GT_HET)) {
            addInfo(result, GAVariantInfoFields.GT_HET, doc.get(VariantsSchema.GT_HET));
        }
        if (doc.containsKey(VariantsSchema.GT_HOM)) {
            addInfo(result, GAVariantInfoFields.GT_HOM, doc.get(VariantsSchema.GT_HOM));
        }
        if (doc.containsKey(VariantsSchema.AC_TOT)) {
            addInfo(result, GAVariantInfoFields.AC_TOT, doc.get(VariantsSchema.AC_TOT));
        }

        result.setCalls(Collections.singletonList(readCall(doc, callsetId)));
        return result;
    }

    /**
     * Assemble the GACall of one callset from the callset-specific fields of a document.
     *
     * @param doc       the SolrDocument
     * @param callsetId the id of the callset
     *
     * @return a new GACall with quality, filter, Exomiser scores and genotype set
     */
    private static GACall readCall(SolrDocument doc, String callsetId) {
        final GACall result = new GACall();

        addInfo(result, GACallInfoFields.QUALITY, getCallsetField(doc, callsetId, VariantsSchema.QUAL));
        addInfo(result, GACallInfoFields.FILTER, getCallsetField(doc, callsetId, VariantsSchema.FILTER));
        addInfo(result, GACallInfoFields.EXOMISER_VARIANT_SCORE,
                getCallsetField(doc, callsetId, VariantsSchema.EXOMISER_VARIANT_SCORE));
        addInfo(result, GACallInfoFields.EXOMISER_GENE_PHENO_SCORE,
                getCallsetField(doc, callsetId, VariantsSchema.EXOMISER_GENE_PHENO_SCORE));
        addInfo(result, GACallInfoFields.EXOMISER_GENE_VARIANT_SCORE,
                getCallsetField(doc, callsetId, VariantsSchema.EXOMISER_GENE_VARIANT_SCORE));
        addInfo(result, GACallInfoFields.EXOMISER_GENE_COMBINED_SCORE,
                getCallsetField(doc, callsetId, VariantsSchema.EXOMISER_GENE_COMBINED_SCORE));

        // An allele count of 2 becomes genotype 1/1; every other count becomes 0/1.
        final int alleleCount = (int) getCallsetField(doc, callsetId, VariantsSchema.AC);
        result.setGenotype(alleleCount == 2 ? Arrays.asList(1, 1) : Arrays.asList(0, 1));

        return result;
    }

    /**
     * Get the value of a field on the doc that is unique to a callset (i.e. not shared by two callsets).
     * For example, a variant quality indicator would be specific to an individual's read.
     *
     * @param doc       the doc to read the field from
     * @param callsetId the callset this field belongs to
     * @param fieldName the name of the field
     *
     * @return the value of the field
     */
    public static Object getCallsetField(SolrDocument doc, String callsetId, String fieldName) {
        final String callsetFieldName = VariantsSchema.getCallsetsFieldName(callsetId, fieldName);
        return doc.get(callsetFieldName);
    }

    /**
     * Create the Solr document for a GAVariant. Only callset-independent data is written: id, locus, alleles
     * and their lengths, gene annotations and the ExAC frequency. The aggregate counters start at zero and
     * the list of callset ids starts empty.
     *
     * @param variant the GAVariant
     *
     * @return the SolrDocument
     */
    public static SolrDocument variantToDoc(GAVariant variant) {
        final SolrDocument result = new SolrDocument();

        result.setField(VariantsSchema.ID, getHash(variant));
        result.setField(VariantsSchema.CHROM, variant.getReferenceName());
        result.setField(VariantsSchema.START, variant.getStart());

        // The end position is the start plus the length of the reference allele.
        final int refLength = variant.getReferenceBases().length();
        result.setField(VariantsSchema.END, variant.getStart() + refLength);
        result.setField(VariantsSchema.REF, variant.getReferenceBases());
        result.setField(VariantsSchema.REF_LENGTH, refLength);

        // Only the first alternate allele is stored.
        final int altLength = variant.getAlternateBases().get(0).length();
        result.setField(VariantsSchema.ALT, variant.getAlternateBases().get(0));
        result.setField(VariantsSchema.ALT_LENGHT, altLength);
        result.setField(VariantsSchema.LENGTH, Math.max(refLength, altLength));

        result.setField(VariantsSchema.GENE, getInfo(variant, GAVariantInfoFields.GENE));
        result.setField(VariantsSchema.GENE_EFFECT, getInfo(variant, GAVariantInfoFields.GENE_EFFECT));
        result.setField(VariantsSchema.GENE_HGVS, getInfo(variant, GAVariantInfoFields.GENE_HGVS));

        result.setField(VariantsSchema.EXAC_AF, safeValueOf(getInfo(variant, GAVariantInfoFields.EXAC_AF)));

        // Aggregate counters start at zero.
        result.setField(VariantsSchema.AC_TOT, 0);
        result.setField(VariantsSchema.GT_HET, 0);
        result.setField(VariantsSchema.GT_HOM, 0);

        // initialize multi-value field so that it clones as an empty list rather than a null value
        result.setField(VariantsSchema.CALLSET_IDS, Collections.emptyList());

        return result;
    }

    /**
     * Record one callset's observation of a variant on an existing document: register the callset id, update
     * the aggregate allele/genotype counters and store the callset-specific fields.
     *
     * @param doc       The existing document
     * @param variant   the variant; only its first call is read
     * @param callsetId the id of the callset
     * @param isPublic  whether these variants can be used in an aggregate
     *                  search.
     */
    public static void addVariantToDoc(SolrDocument doc, GAVariant variant, String callsetId, boolean isPublic) {
        addMultiFieldValue(doc, VariantsSchema.CALLSET_IDS, callsetId);

        final GACall firstCall = variant.getCalls().get(0);

        // Number of genotype entries equal to 1, i.e. copies of the alternate allele.
        int altCopies = 0;
        for (int allele : firstCall.getGenotype()) {
            altCopies += (allele == 1) ? 1 : 0;
        }

        incrementIntField(doc, VariantsSchema.AC_TOT, altCopies);
        switch (altCopies) {
            case 1:
                incrementIntField(doc, VariantsSchema.GT_HET, 1);
                break;
            case 2:
                incrementIntField(doc, VariantsSchema.GT_HOM, 1);
                break;
            default:
                // neither heterozygous nor homozygous: genotype counters stay as they are
                break;
        }

        setCallsetField(doc, callsetId, VariantsSchema.PUBLIC, isPublic);
        setCallsetField(doc, callsetId, VariantsSchema.AC, altCopies);
        setCallsetField(doc, callsetId, VariantsSchema.QUAL, getInfo(firstCall, GACallInfoFields.QUALITY));
        setCallsetField(doc, callsetId, VariantsSchema.FILTER, getInfo(firstCall, GACallInfoFields.FILTER));
        setCallsetField(doc, callsetId, VariantsSchema.EXOMISER_VARIANT_SCORE,
                safeValueOf(getInfo(firstCall, GACallInfoFields.EXOMISER_VARIANT_SCORE)));
        setCallsetField(doc, callsetId, VariantsSchema.EXOMISER_GENE_PHENO_SCORE,
                safeValueOf(getInfo(firstCall, GACallInfoFields.EXOMISER_GENE_PHENO_SCORE)));
        setCallsetField(doc, callsetId, VariantsSchema.EXOMISER_GENE_VARIANT_SCORE,
                safeValueOf(getInfo(firstCall, GACallInfoFields.EXOMISER_GENE_VARIANT_SCORE)));
        setCallsetField(doc, callsetId, VariantsSchema.EXOMISER_GENE_COMBINED_SCORE,
                safeValueOf(getInfo(firstCall, GACallInfoFields.EXOMISER_GENE_COMBINED_SCORE)));
    }

    /**
     * Add {@code delta} to an integer-valued field of a document.
     *
     * @param doc   the SolrDocument
     * @param field the name of the integer field
     * @param delta the amount to add
     */
    private static void incrementIntField(SolrDocument doc, String field, int delta) {
        doc.setField(field, (int) doc.getFieldValue(field) + delta);
    }

    /**
     * Append a value to a multi-valued field of a document. The field is created when the document does not
     * have it yet.
     *
     * @param doc   the SolrDocument
     * @param key   the name of the field where the specified value to be added
     * @param value the specified value to be added
     */
    public static void addMultiFieldValue(SolrDocument doc, String key, Object value) {
        Collection<Object> existing = doc.getFieldValues(key);

        // Copy into a fresh list: the stored collection is sometimes unmodifiable, and getFieldValues()
        // returns null when the field is absent, which previously caused a NullPointerException here.
        List<Object> values = new ArrayList<>();
        if (existing != null) {
            values.addAll(existing);
        }
        values.add(value);
        doc.setField(key, values);
    }

    /**
     * Set a field on the doc that is unique to a callset (i.e. not shared by two callsets). For example, a
     * variant quality indicator would be specific to an individual's read.
     *
     * @param doc       the doc to set the field on
     * @param callsetId the callset this field belongs to
     * @param fieldName the name of the field
     * @param value     the value
     */
    static void setCallsetField(SolrDocument doc, String callsetId, String fieldName, Object value) {
        final String callsetFieldName = VariantsSchema.getCallsetsFieldName(callsetId, fieldName);
        doc.setField(callsetFieldName, value);
    }

    /**
     * Parse a double, passing {@code null} through instead of throwing a NullPointerException. Malformed
     * input still raises the NumberFormatException of {@link Double#valueOf(String)}.
     *
     * @param s the string, may be {@code null}
     *
     * @return the parsed double, or {@code null} when {@code s} is {@code null}
     */
    private static Double safeValueOf(String s) {
        return s == null ? null : Double.valueOf(s);
    }

    /**
     * Build the signature of a variant: reference name, start, reference bases and first alternate bases,
     * joined with colons.
     *
     * @param variant the variant
     *
     * @return the signature
     */
    public static String getHash(GAVariant variant) {
        final String separator = ":";

        StringBuilder signature = new StringBuilder();
        signature.append(variant.getReferenceName()).append(separator);
        signature.append(variant.getStart().toString()).append(separator);
        signature.append(variant.getReferenceBases()).append(separator);
        signature.append(variant.getAlternateBases().get(0));

        return signature.toString();
    }

    /**
     * Turns bytes[] into a lower-case hex String, two characters per byte. Adapted from {@link
     * org.apache.solr.update.processor.SignatureUpdateProcessorFactory}.
     *
     * @param bytes a byte array
     *
     * @return the hex representation, e.g. {@code {0x00, 0xff}} becomes {@code "00ff"}
     */
    private static String byteArrayToString(byte[] bytes) {
        char[] arr = new char[bytes.length << 1];

        for (int i = 0; i < bytes.length; i++) {
            int b = bytes[i];
            int idx = i << 1;
            // high nibble first, then low nibble; masking discards the sign extension of negative bytes
            arr[idx] = Character.forDigit((b >> 4) & 0xf, 16);
            arr[idx + 1] = Character.forDigit(b & 0xf, 16);
        }

        // Arrays.toString(char[]) would render "[a, b, ...]"; the String constructor gives the plain hex text.
        return new String(arr);
    }

    /**
     * Remove a variant from a document. The call set ID must be provided to
     * specify which callset to remove the variant from. This undoes
     * {@link #addVariantToDoc(SolrDocument, GAVariant, String, boolean)}: the callset id is unregistered, the
     * aggregate allele/genotype counters are decremented and the callset-specific fields are dropped.
     *
     * @param doc       the SolrDocument
     * @param variant   the GAVariant to remove; only its first call is read
     * @param callsetId the callsetId that the variant belongs to
     * @param isPublic  whether the variant was public
     */
    public static void removeVariantFromDoc(SolrDocument doc, GAVariant variant, String callsetId,
            boolean isPublic) {

        removeMultiFieldValue(doc, VariantsSchema.CALLSET_IDS, callsetId);

        GACall call = variant.getCalls().get(0);

        // Count the copies of the alternate allele exactly as addVariantToDoc does; counting downwards
        // made the value negative, so the total grew and the het/hom branches below never ran.
        int copies = 0;
        for (int allele : call.getGenotype()) {
            if (allele == 1) {
                copies++;
            }
        }

        doc.setField(VariantsSchema.AC_TOT, (int) doc.getFieldValue(VariantsSchema.AC_TOT) - copies);
        // Decrement the genotype counter that was incremented on add (previously AC_TOT was overwritten).
        if (copies == 1) {
            doc.setField(VariantsSchema.GT_HET, (int) doc.getFieldValue(VariantsSchema.GT_HET) - 1);
        } else if (copies == 2) {
            doc.setField(VariantsSchema.GT_HOM, (int) doc.getFieldValue(VariantsSchema.GT_HOM) - 1);
        }

        removeCallsetFieldValue(doc, callsetId, VariantsSchema.PUBLIC, isPublic);
        removeCallsetFieldValue(doc, callsetId, VariantsSchema.AC, copies);
        removeCallsetFieldValue(doc, callsetId, VariantsSchema.QUAL, getInfo(call, GACallInfoFields.QUALITY));
        removeCallsetFieldValue(doc, callsetId, VariantsSchema.FILTER, getInfo(call, GACallInfoFields.FILTER));
        removeCallsetFieldValue(doc, callsetId, VariantsSchema.EXOMISER_VARIANT_SCORE,
                safeValueOf(getInfo(call, GACallInfoFields.EXOMISER_VARIANT_SCORE)));
        removeCallsetFieldValue(doc, callsetId, VariantsSchema.EXOMISER_GENE_PHENO_SCORE,
                safeValueOf(getInfo(call, GACallInfoFields.EXOMISER_GENE_PHENO_SCORE)));
        removeCallsetFieldValue(doc, callsetId, VariantsSchema.EXOMISER_GENE_VARIANT_SCORE,
                safeValueOf(getInfo(call, GACallInfoFields.EXOMISER_GENE_VARIANT_SCORE)));
        removeCallsetFieldValue(doc, callsetId, VariantsSchema.EXOMISER_GENE_COMBINED_SCORE,
                safeValueOf(getInfo(call, GACallInfoFields.EXOMISER_GENE_COMBINED_SCORE)));
    }

    /**
     * Remove one occurrence of a value from a multi-valued field of a document. Nothing happens when the
     * document does not have the field.
     *
     * @param doc   the SolrDocument
     * @param key   the name of the field from which the specified value to be removed
     * @param value the specified value to be removed
     */
    public static void removeMultiFieldValue(SolrDocument doc, String key, Object value) {
        Collection<Object> existing = doc.getFieldValues(key);
        if (existing == null) {
            // getFieldValues() returns null for an absent field; there is nothing to remove, and copying
            // null into a list previously caused a NullPointerException.
            return;
        }

        // clone array, sometimes it's unmodifiable
        List<Object> values = new ArrayList<>(existing);
        values.remove(value);
        doc.setField(key, values);
    }

    /**
     * Drop a callset-specific field from a document, and remove its value from the multi-valued field of the
     * same base name.
     *
     * @param doc       the SolrDocument
     * @param callsetId the callset the field belongs to
     * @param fieldName the base name of the field
     * @param value     the value to take out of the multi-valued field
     */
    private static void removeCallsetFieldValue(SolrDocument doc, String callsetId, String fieldName,
            Object value) {
        final String callsetFieldName = VariantsSchema.getCallsetsFieldName(callsetId, fieldName);
        doc.removeFields(callsetFieldName);

        // since callset fields are copied to multivalued fields, we need to remove that too.
        removeMultiFieldValue(doc, fieldName, value);
    }

    /**
     * Retrieve the metadocument that stores all individual ids. When the server has no such document yet, a
     * new one is created in memory with the metadata id and an empty list of callset ids; it is not sent to
     * the server here.
     *
     * @param server the solr client used to look the document up
     *
     * @return the stored metadocument, or a freshly initialised one
     * @throws SolrServerException if the lookup fails on the Solr side
     * @throws IOException if communicating with the server fails
     */
    public static SolrDocument getMetaDocument(SolrClient server) throws SolrServerException, IOException {
        final SolrDocument stored = server.getById(METADATA_DOC_ID);
        if (stored != null) {
            return stored;
        }

        final SolrDocument fresh = new SolrDocument();
        fresh.setField(VariantsSchema.ID, METADATA_DOC_ID);
        fresh.setField(VariantsSchema.CALLSET_IDS, Collections.emptyList());
        return fresh;
    }

    /**
     * Send a document to Solr and, once the add has succeeded, clear the document's contents. The document
     * is left untouched when the add fails.
     *
     * @param doc    the SolrInputDocument to be added
     * @param server the solr client the document is sent through
     * @throws DatabaseException if Solr rejects the document or the communication fails; the original
     *                           exception is kept as the cause
     */
    public static void addDoc(SolrInputDocument doc, SolrClient server) throws DatabaseException {
        try {
            server.add(doc);
            doc.clear();
        } catch (SolrServerException | IOException cause) {
            throw new DatabaseException("Error adding variants to Solr", cause);
        }
    }
}