uk.ac.ebi.atlas.solr.query.SolrQueryService.java Source code

Java tutorial

Introduction

Here is the source code for uk.ac.ebi.atlas.solr.query.SolrQueryService.java

Source

/*
 * Copyright 2008-2013 Microarray Informatics Team, EMBL-European Bioinformatics Institute
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *
 * For further details of the Gene Expression Atlas project, including source code,
 * downloads and documentation, please see:
 *
 * http://gxa.github.com/gxa
 */

package uk.ac.ebi.atlas.solr.query;

import com.google.common.base.Optional;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.springframework.context.annotation.Scope;
import org.springframework.util.StopWatch;
import uk.ac.ebi.atlas.bioentity.properties.BioEntityPropertyDao;
import uk.ac.ebi.atlas.model.Species;
import uk.ac.ebi.atlas.solr.BioentityProperty;
import uk.ac.ebi.atlas.solr.query.builders.SolrQueryBuilderFactory;

import javax.inject.Inject;
import javax.inject.Named;
import java.text.MessageFormat;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static com.google.common.base.Preconditions.checkArgument;
import static uk.ac.ebi.atlas.solr.BioentityType.GENE;

/**
 * Service that resolves gene queries and bioentity identifiers against the Solr index reached
 * through {@link GxaSolrServer}, with miRNA lookups additionally going through
 * {@link BioEntityPropertyDao}.
 *
 * <p>The bean is a singleton; all collaborators are injected once through the constructor and
 * held in final fields, so the class itself carries no mutable state.
 */
@Named
@Scope("singleton")
//can be singleton because HttpSolrServer is documented to be thread safe, please be careful not to add any other non thread safe state!
public class SolrQueryService {
    private static final Logger LOGGER = Logger.getLogger(SolrQueryService.class);

    public static final String BIOENTITY_IDENTIFIER_FIELD = "bioentity_identifier";
    public static final String BIOENTITY_TYPE_FIELD = "bioentity_type";
    public static final String PROPERTY_NAME_FIELD = "property_name";

    public static final String PROPERTY_EDGENGRAM_FIELD = "property_value_edgengram";

    // MessageFormat pattern; {0} is replaced by the (escaped) identifier inside a quoted phrase.
    // NOTE(review): there is no space between "ensgene" and the first OR. The text is kept
    // byte-identical because it is the query currently sent to Solr - confirm before normalising.
    private static final String BIOENTITY_TYPE_QUERY = "(property_name:\"ensgene\""
            + "OR property_name:\"mirna\" OR property_name:\"ensprotein\" OR property_name:\"enstranscript\") AND property_value_lower: \"{0}\"";

    // Maximum number of rows requested when looking up the identifier property of a bioentity.
    private static final int PROPERTY_VALUES_LIMIT = 1000;

    private final BioentityPropertyValueTokenizer bioentityPropertyValueTokenizer;

    private final GxaSolrServer solrServer;

    private final SolrQueryBuilderFactory solrQueryBuilderFactory;
    private final BioEntityPropertyDao bioEntityPropertyDao;

    /**
     * Creates the service with all of its collaborators.
     *
     * @param bioentityPropertyValueTokenizer splits a gene query string into individual tokens
     * @param solrServer                      executes the Solr queries
     * @param solrQueryBuilderFactory         creates the builders used to assemble Solr queries
     * @param bioEntityPropertyDao            looks up property values of a gene identifier
     */
    @Inject
    public SolrQueryService(BioentityPropertyValueTokenizer bioentityPropertyValueTokenizer,
            GxaSolrServer solrServer, SolrQueryBuilderFactory solrQueryBuilderFactory,
            BioEntityPropertyDao bioEntityPropertyDao) {
        this.bioentityPropertyValueTokenizer = bioentityPropertyValueTokenizer;
        this.solrServer = solrServer;
        this.solrQueryBuilderFactory = solrQueryBuilderFactory;
        this.bioEntityPropertyDao = bioEntityPropertyDao;
    }

    /**
     * Finds the property that carries the identifier of the given bioentity.
     *
     * <p>Example of the query that is sent:
     * {@code (property_name:"ensgene"OR property_name:"mirna" OR property_name:"ensprotein" OR property_name:"enstranscript") AND property_value_lower: "hsa-mir-6717"}
     *
     * <p>Among the returned properties (at most {@code PROPERTY_VALUES_LIMIT}) the first one whose
     * value is exactly equal to its own bioentity identifier is returned.
     *
     * @param bioentityId the identifier to look up; must not be {@code null}
     * @return the matching property, or {@code null} when none of the results qualifies
     */
    public BioentityProperty findBioentityIdentifierProperty(String bioentityId) {
        String query = MessageFormat.format(BIOENTITY_TYPE_QUERY, escapeForQuotedPhrase(bioentityId));

        SolrQuery solrQuery = new SolrQuery(query);
        solrQuery.setRows(PROPERTY_VALUES_LIMIT);
        QueryResponse response = solrServer.query(solrQuery);

        for (BioentityProperty candidate : response.getBeans(BioentityProperty.class)) {
            if (candidate.getBioentityIdentifier().equals(candidate.getValue())) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Escapes a raw value so that it can be embedded between the double quotes of the
     * {@code property_value_lower} phrase in {@code BIOENTITY_TYPE_QUERY}.
     *
     * <p>Besides {@code :}, {@code [} and {@code ]}, the backslash and the double quote are
     * escaped as well, because an unescaped {@code "} would terminate the phrase early and a
     * trailing {@code \} would swallow the closing quote.
     */
    private static String escapeForQuotedPhrase(String value) {
        // The backslash goes first so that the backslashes added below are not doubled again.
        return value.replace("\\", "\\\\")
                .replace("\"", "\\\"")
                .replace(":", "\\:")
                .replace("[", "\\[")
                .replace("]", "\\]");
    }

    /**
     * Queries Solr for the bioentity identifiers matching a query string, restricted to one
     * bioentity type and to the given property names.
     *
     * <p>Example query:
     * {@code property_value_lower:"hsa-mir-636" AND (bioentity_type:"ensgene") AND (property_name:"mirbase_id")}
     *
     * @param queryString   the value to search for
     * @param bioentityType the single bioentity type the search is restricted to
     * @param toUppercase   forwarded to {@link GxaSolrServer}; presumably upper-cases the returned
     *                      identifiers - confirm against that class
     * @param propertyNames the property names the search is restricted to
     * @return the values of the {@code bioentity_identifier} field of the matching documents
     */
    public Set<String> fetchGeneIdentifiersFromSolr(String queryString, String bioentityType, boolean toUppercase,
            String... propertyNames) {

        SolrQuery solrQuery = solrQueryBuilderFactory.createGeneBioentityIdentifierQueryBuilder()
                .forQueryString(queryString, false)
                .withBioentityTypes(Sets.newHashSet(bioentityType))
                .withPropertyNames(propertyNames)
                .build();

        return solrServer.query(solrQuery, BIOENTITY_IDENTIFIER_FIELD, toUppercase);
    }

    /**
     * Expands gene identifiers into mature RNA identifiers.
     *
     * <p>For every gene identifier the {@code mirbase_id} property values are fetched and one of
     * them (the first the set iterates over) is used, falling back to the gene identifier itself
     * when there is none. That value is searched as {@code hairpin_id} among {@code mirna}
     * bioentities. When Solr returns identifiers they are all added to the result; otherwise the
     * mirbase id, if one was found, is added instead.
     *
     * @param geneIdentifiers the gene identifiers to expand
     * @return a new mutable set with the expanded identifiers; empty when nothing was found
     */
    public Set<String> findMatureRNAIds(Set<String> geneIdentifiers) {
        Set<String> expandedIdentifiers = Sets.newHashSet();

        for (String geneIdentifier : geneIdentifiers) {
            Set<String> mirbaseIds = bioEntityPropertyDao.fetchPropertyValuesForGeneId(geneIdentifier,
                    "mirbase_id");
            String mirbaseId = mirbaseIds.isEmpty() ? null : mirbaseIds.iterator().next();
            String hairpinQuery = (mirbaseId != null) ? mirbaseId : geneIdentifier;

            Set<String> matureRNAIds = fetchGeneIdentifiersFromSolr(hairpinQuery, "mirna", false, "hairpin_id");
            if (!matureRNAIds.isEmpty()) {
                expandedIdentifiers.addAll(matureRNAIds);
            } else if (mirbaseId != null) {
                expandedIdentifiers.add(mirbaseId);
            }
        }
        return expandedIdentifiers;
    }

    /**
     * Tokenizes the gene query and expands the resulting tokens with
     * {@link #findMatureRNAIds(Set)}.
     */
    Set<String> findMatureRNAIds(String geneQuery) {
        return findMatureRNAIds(Sets.newHashSet(bioentityPropertyValueTokenizer.split(geneQuery)));
    }

    /**
     * Collects the {@code ensgene} bioentity identifiers whose {@code mirbase_id} property matches
     * any of the given identifiers. Each identifier is looked up with its own Solr query.
     *
     * @param identifiers the mirbase identifiers to resolve
     * @return a new mutable set with the union of all identifiers found
     */
    public Set<String> findGenesFromMirBaseIDs(Collection<String> identifiers) {
        Set<String> ensemblIDs = Sets.newHashSet();
        for (String identifier : identifiers) {
            ensemblIDs.addAll(fetchGeneIdentifiersFromSolr(identifier, "ensgene", true, "mirbase_id"));
        }
        return ensemblIDs;
    }

    /**
     * Splits the gene query into tokens and fetches the gene ids of each token separately.
     *
     * @param geneQuery  the gene query; must not be blank
     * @param exactMatch forwarded to the query builder for every token
     * @param species    converted with {@link Species#convertToEnsemblSpecies(String)} before use
     * @return a response associating every query token with the gene ids found for it
     * @throws IllegalArgumentException if {@code geneQuery} is blank
     */
    public GeneQueryResponse fetchGeneIdsOrSetsGroupedByGeneQueryToken(String geneQuery, boolean exactMatch,
            String species) {

        checkArgument(StringUtils.isNotBlank(geneQuery), "Please specify a gene query");

        String ensemblSpecies = Species.convertToEnsemblSpecies(species);

        GeneQueryResponse geneQueryResponse = new GeneQueryResponse();

        //associate gene ids with each token in the query string
        for (String queryToken : bioentityPropertyValueTokenizer.split(geneQuery)) {
            geneQueryResponse.addGeneIds(queryToken, fetchGeneIds(queryToken, exactMatch, ensemblSpecies));
        }
        return geneQueryResponse;
    }

    /**
     * Exact-match variant of {@link #expandGeneQueryIntoGeneIds(String, String, boolean)} that
     * passes an empty species, i.e. searches across all species.
     */
    public Optional<Set<String>> expandGeneQueryExactMatchIntoGeneIdsAnySpecies(String geneQuery) {
        return expandGeneQueryIntoGeneIds(geneQuery, "", true);
    }

    /**
     * Exact-match variant of {@link #expandGeneQueryIntoGeneIds(String, String, boolean)} for the
     * given species.
     */
    public Optional<Set<String>> expandGeneQueryExactMatchIntoGeneIds(String geneQuery, String species) {
        return expandGeneQueryIntoGeneIds(geneQuery, species, true);
    }

    /**
     * Resolves a gene query into gene ids and adds the mature RNA ids found for its tokens.
     *
     * @param geneQuery    the gene query to expand
     * @param specie       empty string will search across all species, and return orthologs
     * @param isExactMatch forwarded to the query builder
     * @return Optional.absent if geneQuery is blank, empty Set if no genes found, otherwise Set of geneids found
     */
    public Optional<Set<String>> expandGeneQueryIntoGeneIds(String geneQuery, String specie, boolean isExactMatch) {
        if (StringUtils.isBlank(geneQuery)) {
            return Optional.absent();
        }

        // Plain concatenation on purpose: the query is user supplied and must never be used as a
        // format string, otherwise a '%' in it makes String.format throw.
        LOGGER.info("<expandGeneQueryIntoGeneIds> geneQuery=" + geneQuery);

        StopWatch stopWatch = new StopWatch(getClass().getSimpleName());
        stopWatch.start();

        //resolve any gene keywords to identifiers
        Set<String> geneIds = findGeneIdsOrSets(geneQuery, isExactMatch, specie);

        Set<String> matureRNAIds = findMatureRNAIds(geneQuery);
        geneIds.addAll(matureRNAIds);

        stopWatch.stop();
        LOGGER.info(String.format("<expandGeneQueryIntoGeneIds> %s results, took %s seconds", geneIds.size(),
                stopWatch.getTotalTimeSeconds()));

        return Optional.of(geneIds);
    }

    /**
     * Validates the query, converts the species and delegates to
     * {@link #fetchGeneIds(String, boolean, String)} with the whole query string.
     *
     * <p>NB: if species = "" then will search across all species
     *
     * @throws IllegalArgumentException if {@code geneQuery} is blank
     */
    Set<String> findGeneIdsOrSets(String geneQuery, boolean exactMatch, String species) {

        checkArgument(StringUtils.isNotBlank(geneQuery), "Please specify a gene query");

        return fetchGeneIds(geneQuery, exactMatch, Species.convertToEnsemblSpecies(species));
    }

    /**
     * Fetches the identifiers of gene bioentities matching the query.
     *
     * <p>Example query:
     * {@code {!lucene q.op=OR df=property_value_lower}(property_value_lower:Q9NHV9) AND (bioentity_type:"mirna" OR bioentity_type:"ensgene")}
     * with {@code fl=bioentity_identifier&group=true&group.field=bioentity_identifier&group.main=true}
     *
     * @param geneQuery  the query string
     * @param exactMatch forwarded to the query builder
     * @param species    forwarded to the query builder as is (no conversion happens here)
     * @return the values of the {@code bioentity_identifier} field of the matching documents
     */
    public Set<String> fetchGeneIds(String geneQuery, boolean exactMatch, String species) {

        Stopwatch stopwatch = Stopwatch.createStarted();

        SolrQuery solrQuery = solrQueryBuilderFactory.createGeneBioentityIdentifierQueryBuilder()
                .forQueryString(geneQuery, true)
                .withExactMatch(exactMatch)
                .withSpecies(species)
                .withBioentityTypes(GENE.getSolrAliases())
                .build();

        Set<String> geneIds = solrServer.query(solrQuery, BIOENTITY_IDENTIFIER_FIELD, false);

        stopwatch.stop();
        // Only pay for formatting the message when it is actually going to be written.
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug(String.format("Fetched gene ids for %s: returned %s results in %s secs", geneQuery,
                    geneIds.size(), stopwatch.elapsed(TimeUnit.MILLISECONDS) / 1000D));
        }

        return geneIds;
    }

}