Java tutorial: building the Searchbox suggester data structure from a Solr index with OpenNLP and Lucene

The class below reads every live document from a Solr index, splits the stored text into sentences and tokens, and feeds word n-grams into a trie-backed suggester that is later normalized into probabilities.
/*******************************************************************************
 * Copyright Searchbox - http://www.searchbox.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.searchbox;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;

import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.Bits;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.LoggerFactory;

/**
 *
 * @author andrew
 */
public class SuggeterDataStructureBuilder {

    private static org.slf4j.Logger LOGGER = LoggerFactory.getLogger(SuggesterComponent.class);
    private SentenceDetectorME sentenceDetector = null;
    private String sentenceDetectorModelName = "en-sent.bin";
    private TokenizerME tokenizer = null;
    private String tokenizerModelName = "en-token.bin";
    private HashSet<String> stopwords;
    private Analyzer analyzer;
    private String[] fields;
    public int NGRAMS;
    public int numdocs;
    public int counts[];
    private SuggesterTreeHolder suggester;

    /*------------*/
    // load the OpenNLP tokenizer model from the classpath
    public void Tokenizer(String filename_model) throws FileNotFoundException {
        InputStream modelIn = getClass().getResourceAsStream("/" + filename_model);
        if (modelIn == null) {
            throw new FileNotFoundException("Model " + filename_model + " not found on the classpath");
        }
        try {
            TokenizerModel model = new TokenizerModel(modelIn);
            tokenizer = new TokenizerME(model);
        } catch (IOException e) {
            LOGGER.error("Failed to load tokenizer model " + filename_model, e);
        }
    }

    /*------------*/
    // break text into tokens using the Lucene analyzer taken from the Solr schema
    // (note: the OpenNLP tokenizer loaded above is not used here)
    private String[] getTokens(String fulltext) {
        LinkedList<String> tokens = new LinkedList<String>();
        try {
            TokenStream tokenStream = analyzer.tokenStream(fields[0], new StringReader(fulltext));
            tokenStream.reset();
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            while (tokenStream.incrementToken()) {
                String token = charTermAttribute.toString();
                tokens.add(token);
            }
        } catch (IOException ex) {
            LOGGER.error("Failure reading tokens from stream", ex);
        }
        return tokens.toArray(new String[0]);
    }

    /*------------*/
    // load the OpenNLP sentence-detector model from the classpath
    public void SentenceParser(String filename_model) throws FileNotFoundException {
        InputStream modelIn = getClass().getResourceAsStream("/" + filename_model);
        if (modelIn == null) {
            throw new FileNotFoundException("Model " + filename_model + " not found on the classpath");
        }
        try {
            SentenceModel model = new SentenceModel(modelIn);
            sentenceDetector = new SentenceDetectorME(model);
        } catch (IOException e) {
            LOGGER.error("Failed to load sentence detector model " + filename_model, e);
        }
    }

    // break text into sentences
    public String[] getSentences(String fulltext) {
        return sentenceDetector.sentDetect(fulltext);
    }

    /*------------*/
    private void init() {
        try {
            SentenceParser(sentenceDetectorModelName);
            Tokenizer(tokenizerModelName);
        } catch (FileNotFoundException ex) {
            LOGGER.error("File not found", ex);
        }
    }

    private void iterateThroughDocuments(SolrIndexSearcher searcher, String[] fields, int maxNumDocs) {
        IndexReader reader = searcher.getIndexReader();
        // WARNING: getLiveDocs() returns null if there are no deletions
        Bits liveDocs = MultiFields.getLiveDocs(reader);
        // -1 means "no limit": analyze every document in the index
        if (maxNumDocs == -1) {
            maxNumDocs = reader.maxDoc();
        }
        maxNumDocs = Math.min(maxNumDocs, reader.maxDoc());
        LOGGER.info("Analyzing up to " + maxNumDocs + " docs");
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            if (numdocs > maxNumDocs) {
                break;
            }
            if (liveDocs != null && !liveDocs.get(docID)) {
                continue; // deleted
            }
            if ((docID % 1000) == 0) {
                LOGGER.debug("Doing " + docID + " of " + maxNumDocs);
            }
            StringBuilder text = new StringBuilder();
            for (String field : fields) {
                /*
                 * Not sure if this is the best way; it might make sense to
                 * process the text of each field individually, but then
                 * book-keeping the per-term document frequency becomes awkward.
                 */
                try {
                    IndexableField[] multifield = reader.document(docID).getFields(field);
                    for (IndexableField singlefield : multifield) {
                        // concatenate all of the text in the document's fields
                        // into one big string for processing later on
                        text.append(". " + singlefield.stringValue());
                    }
                } catch (IOException ex) {
                    LOGGER.warn("Failed to read field (" + field + ") of document " + docID + "...ignoring", ex);
                }
            }
            // skip documents that had no text in the requested fields
            if (text.length() > 0) {
                // process the combined string built from all of the fields above
                processText(text.toString().toLowerCase());
                numdocs++;
            }
        }
        LOGGER.info("Number of documents analyzed: \t" + numdocs);
        for (int zz = 0; zz < counts.length; zz++) {
            LOGGER.info("Number of " + (zz + 1) + "-grams: \t" + counts[zz]);
        }
    }

    public SuggesterTreeHolder getSuggester() {
        return suggester;
    }

    SuggeterDataStructureBuilder(SolrIndexSearcher searcher, String[] fields, int ngrams, int minDocFreq,
            int minTermFreq, int maxNumDocs, String nonpruneFileName, List<String> stopWords) {
        NGRAMS = ngrams;
        counts = new int[NGRAMS];
        suggester = new SuggesterTreeHolder(NGRAMS, nonpruneFileName);
        // Solr 4.4 method change
        analyzer = searcher.getCore().getLatestSchema().getAnalyzer();
        // analyzer = searcher.getCore().getSchema().getAnalyzer();
        this.stopwords = new HashSet<String>(stopWords);
        this.fields = fields;
        init();
        iterateThroughDocuments(searcher, fields, maxNumDocs);
        computeNormalizers(minDocFreq, minTermFreq);
    }

    private void processText(String text) {
        LOGGER.trace("Processing text:\t" + text);
        HashSet<String> seenTerms = new HashSet<String>();
        for (String sentence : getSentences(text)) {
            String[] tokens = getTokens(sentence);
            // for each token in this sentence...
            for (int zz = 0; zz < tokens.length; zz++) {
                String localtoken = tokens[zz];
                // TODO: should do a skip gram, but we'll look into that later
                // SBSUGGEST-3
                if (stopwords.contains(localtoken)) {
                    continue;
                }
                // add the token as a possible completion
                TrieNode tokenNode = suggester.AddString(localtoken);
                counts[0]++;
                // and bump its phrase and term counts
                tokenNode.AddPhraseIncrementCount(localtoken, .1);
                tokenNode.termfreq++;
                /*
                 * If this token hasn't been seen in this document yet, record
                 * it and increase the token's document frequency.
                 */
                if (!seenTerms.contains(localtoken)) {
                    tokenNode.docfreq++;
                    seenTerms.add(localtoken);
                }
                int numterms = 1;
                int yy = zz;
                StringBuilder sb = new StringBuilder();
                sb.append(localtoken);
                /*
                 * Build up possible phrases: keep appending tokens while the
                 * next token is a stop word, so that we get
                 * "republic of ireland" instead of "republic of" or
                 * "of ireland" as suggestions.
                 */
                while (true) {
                    yy++;
                    // no more tokens in this sentence...
                    if (yy >= tokens.length) {
                        break;
                    }
                    String localtoken_2 = tokens[yy];
                    sb.append(" " + localtoken_2);
                    // stop word in the middle ("president of ..."): keep extending
                    if (stopwords.contains(localtoken_2)) {
                        continue;
                    }
                    numterms++;
                    counts[numterms - 1]++;
                    double rightplace = numterms / 10.0;
                    String gram = sb.toString();
                    tokenNode.AddPhraseIncrementCount(gram, rightplace);
                    if (numterms >= NGRAMS) {
                        break;
                    }
                }
            }
        }
    }

    /*
     * Since it's a probability, we need to normalize the overall counts so
     * that they fall in the range 0 to 1.
     */
    private void computeNormalizers(int minDocFreq, int minTermFreq) {
        suggester.computeNormalizers(numdocs, minDocFreq, minTermFreq);
    }
}
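The SentenceParser and Tokenizer methods above follow the standard OpenNLP 1.5 loading pattern: read a pre-trained binary model, wrap it in a model object, and hand it to the corresponding maximum-entropy detector. Here is a minimal standalone sketch of that pattern; the file paths and the example sentence are assumptions for illustration, not part of the class above.

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Arrays;

import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class OpenNlpSketch {
    public static void main(String[] args) throws Exception {
        // load the pre-trained models from disk (adjust the paths to wherever
        // en-sent.bin and en-token.bin live on your machine)
        InputStream sentIn = new FileInputStream("en-sent.bin");
        InputStream tokIn = new FileInputStream("en-token.bin");
        SentenceDetectorME sentenceDetector = new SentenceDetectorME(new SentenceModel(sentIn));
        TokenizerME tokenizer = new TokenizerME(new TokenizerModel(tokIn));
        sentIn.close();
        tokIn.close();

        String text = "The Republic of Ireland is in Europe. Dublin is its capital.";
        for (String sentence : sentenceDetector.sentDetect(text)) {
            // tokenize() splits one sentence into word tokens
            String[] tokens = tokenizer.tokenize(sentence);
            System.out.println(Arrays.toString(tokens));
        }
    }
}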
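getTokens() relies on the Lucene Analyzer pulled from the Solr schema rather than the OpenNLP tokenizer; it reads terms off a TokenStream with the usual reset / incrementToken / CharTermAttribute loop. The sketch below shows the same consumption pattern against a plain StandardAnalyzer as a stand-in for the schema analyzer (Lucene 4.4 API; the field name is arbitrary).

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class AnalyzerSketch {
    public static void main(String[] args) throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
        TokenStream stream = analyzer.tokenStream("text", new StringReader("The Republic of Ireland"));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        List<String> tokens = new ArrayList<String>();
        stream.reset();                      // must be called before incrementToken()
        while (stream.incrementToken()) {
            tokens.add(term.toString());     // lower-cased, stop-word-filtered terms
        }
        stream.end();
        stream.close();
        System.out.println(tokens);
    }
}

With StandardAnalyzer's default English stop set this prints [republic, ireland]; the analyzer configured in the Solr schema may of course tokenize differently.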
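iterateThroughDocuments() walks the index by raw document id, so it has to skip deleted documents itself: MultiFields.getLiveDocs() returns a Bits set of the live documents, or null when the index contains no deletions. A condensed sketch of that loop against a standalone Lucene 4.x index follows; the index path and the stored field name "title" are hypothetical.

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

public class LiveDocsSketch {
    public static void main(String[] args) throws Exception {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
        Bits liveDocs = MultiFields.getLiveDocs(reader);   // null when nothing was deleted
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            if (liveDocs != null && !liveDocs.get(docID)) {
                continue;                                  // skip deleted documents
            }
            Document doc = reader.document(docID);         // loads the stored fields
            String title = doc.get("title");               // null if the field is not stored
            System.out.println(docID + "\t" + title);
        }
        reader.close();
    }
}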
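The inner loop of processText() is the core of the builder: a stop word never starts a suggestion, but once a phrase has begun, stop words are kept inside it and only non-stop tokens count toward the n-gram limit, which is how "republic of ireland" survives as a single phrase. The sketch below reproduces just that phrase-expansion logic on a plain token array, printing the grams instead of feeding them into the trie; the class and variable names are illustrative.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class PhraseSketch {
    public static void main(String[] args) {
        int NGRAMS = 3;
        Set<String> stopwords = new HashSet<String>(Arrays.asList("the", "of", "in"));
        String[] tokens = {"the", "republic", "of", "ireland", "joined", "in", "1973"};

        for (int zz = 0; zz < tokens.length; zz++) {
            String head = tokens[zz];
            if (stopwords.contains(head)) {
                continue;                       // never start a phrase on a stop word
            }
            System.out.println(head);           // the 1-gram itself
            int numterms = 1;
            StringBuilder sb = new StringBuilder(head);
            for (int yy = zz + 1; yy < tokens.length && numterms < NGRAMS; yy++) {
                sb.append(' ').append(tokens[yy]);
                if (stopwords.contains(tokens[yy])) {
                    continue;                   // keep the stop word in the phrase, but don't count it
                }
                numterms++;
                System.out.println(sb);         // e.g. "republic of ireland"
            }
        }
    }
}

With NGRAMS = 3, the head token "republic" yields "republic", "republic of ireland", and "republic of ireland joined", mirroring the grams the class feeds into counts[] and the suggester trie.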