org.mitre.opensextant.extraction.TaxonMatcher.java Source code

Introduction

Here is the source code for org.mitre.opensextant.extraction.TaxonMatcher.java
Source

/**
 * Copyright 2009-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 *
 * **************************************************************************
 * NOTICE This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 *
 */
package org.mitre.opensextant.extraction;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;

import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.mitre.opensextant.util.FileUtility;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Connects to a Solr sever via HTTP and tags place names in document. The
 * <code>SOLR_HOME</code> environment variable must be set to the location of
 * the Solr server.
 *
 * @author Marc Ubaldino - ubaldino@mitre.org
 */
public class TaxonMatcher {

    private static String requestHandler = "/tag";
    private static ModifiableSolrParams params;
    //private final String fields = "";
    private static SolrProxy solr = null;
    private Logger log = LoggerFactory.getLogger(this.getClass());
    private boolean debug = log.isDebugEnabled();
    private boolean tag_all = true;
    private SolrTaggerRequest tag_request = null;

    /**
     *
     * @throws IOException
     */
    public TaxonMatcher() throws IOException {
        TaxonMatcher.initialize();

        // Instance variable that will have the transient payload to tag
        // this is not thread safe and is not static:
        tag_request = new SolrTaggerRequest(params, SolrRequest.METHOD.POST);

        // Pre-loading the Solr FST
        // 
        try {
            tagText("trivial priming of the solr pump", "__initialization___");
        } catch (MatcherException initErr) {
            throw new IOException("Unable to prime the tagger", initErr);
        }
    }

    protected static void initialize() throws IOException {

        if (solr != null) {
            return;
        }

        String config_solr_home = System.getProperty("solr.solr.home");
        solr = new SolrProxy(config_solr_home, "taxcat");

        params = new ModifiableSolrParams();
        params.set(CommonParams.QT, requestHandler);
        params.set(CommonParams.FL, "id,catalog,taxnode,phrase,tag");

        params.set("tagsLimit", 100000);
        params.set("subTags", false);
        params.set("matchText", false);//we've got the input doc as a string instead

        /* Possible overlaps: ALL, NO_SUB, LONGEST_DOMINANT_RIGHT
         * See Solr Text Tagger documentation for details.
         */
        params.set("overlaps", "NO_SUB");

    }

    /** Catalogs is a list of catalogs caller wants to tag for.
     * If set, only taxon matches with the catalog ID in this list will be returned by tagText()
     */
    public Set<String> catalogs = new HashSet<>();

    public void addCatalogFilters(String[] cats) {
        catalogs.addAll(Arrays.asList(cats));
        tag_all = false;
        //reset();
    }

    public void addCatalogFilter(String cat) {
        catalogs.add(cat);
        tag_all = false;
    }

    public void removeFilters() {
        catalogs.clear();
        tag_all = true;
    }

    /**
     * Close solr resources.
     */
    public static void shutdown() {
        if (solr != null) {
            solr.close();
        }
    }

    /**
     * "tags" are instances of the matching text spans from your input buffer
     * "matchingDocs" are records from the taxonomy catalog. They have all the
     * metadata.
     *
     * tags' ids array are pointers into matchingDocs, by Solr record ID.
     *
     * // "tagsCount":10, "tags":[{ "ids":[35], "endOffset":40,
     * "startOffset":38}, // { "ids":[750308, 2769912, 2770041, 10413973,
     * 10417546], "endOffset":49, // "startOffset":41}, // ... //
     * "matchingDocs":{"numFound":75, "start":0, "docs":[ // {records matching}]
     *
     */
    public List<TaxonMatch> tagText(String buffer, String docid) throws MatcherException {

        List<TaxonMatch> matches = new ArrayList<>();

        // Setup request to tag... 
        tag_request.input = buffer;
        QueryResponse response = null;
        try {
            response = tag_request.process(solr.getInternalSolrServer());
        } catch (Exception err) {
            throw new MatcherException("Failed to tag document", err);
        }

        // -- Process Solr Response

        SolrDocumentList docList = (SolrDocumentList) response.getResponse().get("matchingDocs");
        Map<Integer, Taxon> labelMap = new HashMap<>(docList.size());
        for (SolrDocument solrDoc : docList) {

            String _cat = SolrProxy.getString(solrDoc, "catalog");

            // Filter out unused matching records.
            if (!tag_all && !this.catalogs.contains(_cat)) {
                continue;
            }
            Taxon label = new Taxon();

            label.catalog = _cat;
            label.name = SolrProxy.getString(solrDoc, "taxnode");
            label.addTerm(SolrProxy.getString(solrDoc, "phrase"));
            label.addTags(solrDoc.getFieldValues("tag"));

            // Hashed on "id"
            Integer id = (Integer) solrDoc.getFirstValue("id");
            labelMap.put(id, label);
        }

        @SuppressWarnings("unchecked")
        List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");

        if (debug) {
            log.debug("TAGS SIZE = " + tags.size());
        }

        /**
         * Retrieve all offsets into a long list.
         */
        TaxonMatch m = null;
        int x1 = -1, x2 = -1;
        int tag_count = 0;
        String id_prefix = docid + "#";

        for (NamedList<?> tag : tags) {
            m = new TaxonMatch();
            x1 = (Integer) tag.get("startOffset");
            x2 = (Integer) tag.get("endOffset");//+1 char after last matched
            m.start = x1;
            m.end = x2;
            m.pattern_id = "taxtag";
            ++tag_count;
            m.match_id = id_prefix + tag_count;

            // Could have enabled the "matchText" option from the tagger to get
            // this, but since we already have the content as a String then
            // we might as well not make the tagger do any more work.
            m.setText(buffer.substring(x1, x2));

            @SuppressWarnings("unchecked")
            List<Integer> taxonIDs = (List<Integer>) tag.get("ids");

            for (Integer solrId : taxonIDs) {
                m.addTaxon(labelMap.get(solrId));
            }

            // If the match has valid taxons add the match to the 
            // accumulator for this document.
            // 
            if (m.hasTaxons()) {
                matches.add(m);
            }
        }

        if (debug) {
            log.debug("FOUND LABELS count=" + matches.size());
        }

        return matches;
    }

    public void testDoc(String buf) throws MatcherException {
        List<TaxonMatch> matches = this.tagText(buf, "test");

        for (TaxonMatch tx : matches) {
            System.out.println(tx.toString());
        }
    }

    /**
     * Do a basic test
     */
    public static void main(String[] args) throws Exception {
        gnu.getopt.Getopt opts = new gnu.getopt.Getopt("TaxTagger", args, "f:");

        int c = -1;
        String file = null;
        while ((c = opts.getopt()) != -1) {

            switch (c) {
            case 'f':
                file = opts.getOptarg();
                break;

            default:
                System.out.println("Usage  -f filename ");
                System.exit(-1);
            }

        }
        TaxonMatcher taxtag = new TaxonMatcher();

        try {
            //String doc = "Fruits of paradise are like pineapple, guava, passion fruit. "+
            //        " You may abandon the calories by eating fewer than one a day";

            String doc = FileUtility.readFile(file);

            // No filters.
            taxtag.testDoc(doc);

            // Invalid filter
            System.out.println("Testing invalid catalog");
            taxtag.addCatalogFilter("Boo");
            //taxtag.reset();            
            taxtag.testDoc(doc);

            // Invalid filter + valid filter.
            System.out.println("Testing a valid catalog");
            taxtag.addCatalogFilter("CWMD");
            //taxtag.reset();
            taxtag.testDoc(doc);

            TaxonMatcher.shutdown();
        } catch (Exception err) {
            err.printStackTrace();
        }
    }
}