Java tutorial
/** * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. * * * ************************************************************************** * NOTICE This software was produced for the U. S. Government under Contract No. * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer * Software and Noncommercial Computer Software Documentation Clause * 252.227-7014 (JUN 1995) * * (c) 2012 The MITRE Corporation. All Rights Reserved. * ************************************************************************** * * Continue contributions: * Copyright 2013-2015 The MITRE Corporation. */ package org.opensextant.extractors.xtax; ///** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| // //_____ ____ __ __ ///\ __`\ /\ _`\ /\ \__ /\ \__ //\ \ \/\ \ _____ __ ___ \ \,\L\_\ __ __ _\ \ ,_\ __ ___ \ \ ,_\ //\ \ \ \ \ /\ '__`\ /'__`\ /' _ `\ \/_\__ \ /'__`\/\ \/'\\ \ \/ /'__`\ /' _ `\\ \ \/ //\ \ \_\ \\ \ \L\ \/\ __/ /\ \/\ \ /\ \L\ \ /\ __/\/> </ \ \ \_ /\ \L\.\_ /\ \/\ \\ \ \_ //\ \_____\\ \ ,__/\ \____\\ \_\ \_\ \ `\____\\ \____\/\_/\_\ \ \__\\ \__/.\_\\ \_\ \_\\ \__\ //\/_____/ \ \ \/ \/____/ \/_/\/_/ \/_____/ \/____/\//\/_/ \/__/ \/__/\/_/ \/_/\/_/ \/__/ // \ \_\ // \/_/ // //OpenSextant TaxonMatcher //* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| //*/ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; import org.opensextant.ConfigException; import org.opensextant.data.Taxon; import org.opensextant.data.TextInput; import org.opensextant.extraction.ExtractionException; import org.opensextant.extraction.Extractor; import org.opensextant.extraction.SolrMatcherSupport; import org.opensextant.extraction.TextMatch; import org.opensextant.util.SolrProxy; import org.opensextant.util.TextUtils; /** * TaxonMatcher uses SolrTextTagger to tag mentions of phrases in documents. The * phrases can be from simple word lists or they can connect to a taxonomy of * sorts -- the "taxcat" solr core (see Xponents/solr/taxcat and Xponents/XTax * for implementation) * * JVM arg to use is "opensextant.solr" to point to the local path Less tested: * solr.solr.home might conflict with a Solr document server instead of this * tagger. solr.url is good for RESTful integration, but not recommended * * @author Marc Ubaldino - ubaldino@mitre.org */ public class TaxonMatcher extends SolrMatcherSupport implements Extractor { private static ModifiableSolrParams params; static { params = new ModifiableSolrParams(); // params.set(CommonParams.QT, requestHandler); params.set(CommonParams.FL, "id,catalog,taxnode,phrase,tag,name_type"); params.set("tagsLimit", 100000); params.set("subTags", false); params.set("matchText", false); params.set(CommonParams.FQ, "valid:true"); /* * Possible overlaps: ALL, NO_SUB, LONGEST_DOMINANT_RIGHT See Solr Text * Tagger documentation for details. */ params.set("overlaps", "NO_SUB"); } private boolean tagAll = true; private boolean filterNonAcronyms = true; // private ProgressMonitor progressMonitor; /** * * @throws IOException * @throws ConfigException */ public TaxonMatcher() throws IOException, ConfigException { configure(); } /** * Extractor interface. */ @Override public void cleanup() { this.shutdown(); } /** * Be explicit about the solr core to use for tagging */ @Override public String getCoreName() { return "taxcat"; } /** * Return the Solr Parameters for the tagger op. * * @return solr params */ @Override public SolrParams getMatcherParameters() { return params; } /** * Create a Taxon tag, which is filtered based on established catalog * filters. * * Caller must implement their domain objects, POJOs... this callback * handler only hashes them. * * @param refData * solr doc * @return tag data */ @Override public Object createTag(SolrDocument refData) { String _cat = SolrProxy.getString(refData, "catalog"); // Filter out unused matching records. if (!tagAll && !this.catalogs.contains(_cat)) { return null; } return createTaxon(refData); } /** * Parse the taxon reference data from a solr doc and return Taxon obj. * * @param refData * solr doc * @return taxon obj */ public static Taxon createTaxon(SolrDocument refData) { Taxon label = new Taxon(); label.name = SolrProxy.getString(refData, "taxnode"); label.isAcronym = "A".equals(SolrProxy.getString(refData, "name_type")); label.catalog = SolrProxy.getString(refData, "catalog"); label.addTerm(SolrProxy.getString(refData, "phrase")); label.addTags(refData.getFieldValues("tag")); return label; } /** * Extractor interface: getName * * @return */ @Override public String getName() { return "XTax"; } @Override public void configure() throws ConfigException { try { initialize(); extract(new TextInput("__initialization___", "trivial priming of the solr pump")); } catch (Exception err) { throw new ConfigException("Failed to configure TaxMatcher", err); } } /** * Configure an Extractor using a config file named by a path * * @param patfile * configuration file path */ @Override public void configure(String patfile) throws ConfigException { throw new ConfigException("Not a valid configuration routine"); } /** * Configure an Extractor using a config file named by a URL * * @param patfile * configuration URL */ @Override public void configure(java.net.URL patfile) throws ConfigException { throw new ConfigException("Not a valid configuration routine"); } /** * Catalogs is a list of catalogs caller wants to tag for. If set, only * taxon matches with the catalog ID in this list will be returned by * tagText() */ public Set<String> catalogs = new HashSet<String>(); public void addCatalogFilters(String[] cats) { catalogs.addAll(Arrays.asList(cats)); tagAll = false; // reset(); } public void addCatalogFilter(String cat) { catalogs.add(cat); tagAll = false; } public void removeFilters() { catalogs.clear(); tagAll = true; } /** * Light-weight usage: text in, matches out. Behaviors: ACRONYMS matching * lower case terms will automatically be omitted from results. * */ @Override public List<TextMatch> extract(String input_buf) throws ExtractionException { return extractorImpl(null, input_buf); } /** * Implementation details -- use with or without the formal ID/buffer * pairing. * * @param id * doc id * @param buf * input text * @return list of matches * @throws ExtractionException */ private List<TextMatch> extractorImpl(String id, String buf) throws ExtractionException { List<TextMatch> matches = new ArrayList<TextMatch>(); String docid = (id != null ? id : NO_DOC_ID); Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100); QueryResponse response = tagTextCallSolrTagger(buf, docid, beanMap); @SuppressWarnings("unchecked") List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags"); log.debug("TAGS SIZE = {}", tags.size()); /* * Retrieve all offsets into a long list. */ TaxonMatch m = null; // int x1 = -1, x2 = -1; int tag_count = 0; String id_prefix = docid + "#"; for (NamedList<?> tag : tags) { m = new TaxonMatch(); m.start = ((Integer) tag.get("startOffset")).intValue(); m.end = ((Integer) tag.get("endOffset")).intValue();// +1 char after // last matched // m.pattern_id = "taxtag"; ++tag_count; m.match_id = id_prefix + tag_count; // m.setText((String) tag.get("matchText")); // Not reliable. // matchText can be null. m.setText(buf.substring(m.start, m.end)); if (TextUtils.countFormattingSpace(m.getText()) > 1) { // Phrases with words broken across more than one line are not // valid matches. // Phrase with a single TAB is okay continue; } @SuppressWarnings("unchecked") List<Integer> taxonIDs = (List<Integer>) tag.get("ids"); for (Integer solrId : taxonIDs) { Object refData = beanMap.get(solrId); if (refData == null) { continue; } /* * Filter out non-Acronyms. e.g., 'who' is not a match for 'WHO' */ Taxon tx = (Taxon) refData; if (this.filterNonAcronyms) { if (tx.isAcronym && !m.isUpper()) { continue; } } m.addTaxon(tx); } // If the match has valid taxons add the match to the // accumulator for this document. // if (m.hasTaxons()) { matches.add(m); } } log.debug("FOUND LABELS count={}", matches.size()); return matches; } /** * "tags" are instances of the matching text spans from your input buffer * "matchingDocs" are records from the taxonomy catalog. They have all the * metadata. * * tags' ids array are pointers into matchingDocs, by Solr record ID. * * // "tagsCount":10, "tags":[{ "ids":[35], "endOffset":40, * "startOffset":38}, // { "ids":[750308, 2769912, 2770041, 10413973, * 10417546], "endOffset":49, // "startOffset":41}, // ... // * "matchingDocs":{"numFound":75, "start":0, "docs":[ // {records matching}] * */ @Override public List<TextMatch> extract(TextInput input) throws ExtractionException { return extractorImpl(input.id, input.buffer); } public static List<Taxon> search(SolrServer index, String query) throws SolrServerException { ModifiableSolrParams qp = new ModifiableSolrParams(); qp.set(CommonParams.FL, "id,catalog,taxnode,phrase,tag,name_type"); qp.set(CommonParams.Q, query); return search(index, qp); } public static List<Taxon> search(SolrServer index, SolrParams qparams) throws SolrServerException { QueryResponse response = index.query(qparams, SolrRequest.METHOD.GET); List<Taxon> taxons = new ArrayList<>(); SolrDocumentList docList = response.getResults(); for (SolrDocument solrDoc : docList) { taxons.add(createTaxon(solrDoc)); } return taxons; } /** * search the current taxonomic catalog. * * @param query * Solr "q" parameter only * @return list of taxons * @throws SolrServerException * on err */ public List<Taxon> search(String query) throws SolrServerException { return search(this.solr.getInternalSolrServer(), query); } /** * search the current taxonomic catalog. * * @param qparams * Solr parameters in full. * @return list of taxons * @throws SolrServerException * on err */ public List<Taxon> search(SolrParams qparams) throws SolrServerException { return search(this.solr.getInternalSolrServer(), qparams); } }