Example usage for org.apache.lucene.analysis TokenStream reset

List of usage examples for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usage of org.apache.lucene.analysis TokenStream reset.

Prototype

public void reset() throws IOException 

Source Link

Document

This method is called by a consumer before it begins consumption using #incrementToken().
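
Before looking at the examples, the contract itself is easy to demonstrate. The following minimal sketch shows the full reset() / incrementToken() / end() / close() cycle; the StandardAnalyzer, the field name "body", and the sample text are illustrative assumptions, not taken from the examples below.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetContractSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources closes the stream even if consumption fails
        try (TokenStream stream = analyzer.tokenStream("body", new StringReader("Lucene token streams"))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                          // must be called before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // one analyzed term per iteration
            }
            stream.end();                            // lets the stream report end-of-stream state
        }
        analyzer.close();
    }
}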

Usage

From source file:com.faqit.similarity.NGramExtractor.java

License:Open Source License

/**
 * Extracts n-grams from a String of text. Can handle n-grams of any
 * length and can also perform stop word removal before extraction.
 *
 * @param text
 *            the text that the ngrams should be extracted from
 * @param length
 *            the length of the ngrams
 * @param stopWords
 *            whether or not stopwords should be removed before extraction
 * @param overlap
 *            whether or not the ngrams should overlap
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {

    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /*
     * If length is 1 we want unigrams: use a StandardAnalyzer when stop
     * words should be removed and a SimpleAnalyzer when stop words should
     * be kept.
     */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer();
        } else {
            analyzer = new SimpleAnalyzer();
        }
    } else { // Bigger than unigrams so use ShingleAnalyzerWrapper. Once
             // again, different analyzers depending on stop word removal
        if (this.stopWords) {
            // This is a hack to use Lucene 2.4, since in 2.4 position increments
            // weren't preserved by default. Using a later version puts underscores
            // (_) in the place of removed stop words.
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(), length, length, " ", false, false, "");
        } else {
            analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), length, length, " ", false, false, "");
        }
    }

    // Code to process and extract the ngrams
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    // OffsetAttribute offsetAttribute =
    // tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    // int tokenCount = 0;
    tokenStream.reset();
    while (tokenStream.incrementToken()) {

        // int startOffset = offsetAttribute.startOffset();
        // int endOffset = offsetAttribute.endOffset();
        String termToken = charTermAttribute.toString(); // The actual token term
        nGrams.add(termToken); // Add all ngrams to the ngram LinkedList

        // If n-grams are not allowed to overlap, then increment to point of
        // no overlap
        if (!overlap) {
            for (int i = 0; i < length - 1; i++) {
                if (!tokenStream.incrementToken()) {
                    break; // stop skipping if the stream is exhausted
                }
            }
        }

    }

    // Store unique nGrams and frequencies in hash tables
    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }

}
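
A hedged usage sketch for the extractor above (the no-argument constructor is an assumption; the actual class may require different construction):

NGramExtractor extractor = new NGramExtractor();
// extract overlapping bigrams with stop-word removal enabled
extractor.extract("the quick brown fox jumps over the quick dog", 2, true, true);
// after extract() returns, the instance's nGrams list holds every bigram,
// uniqueNGrams holds each distinct bigram once, and nGramFreqs maps each
// distinct bigram to its frequency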

From source file:com.fujitsu.ca.fic.caissepop.evaluation.TokenizeText.java

License:Apache License

@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
        return null;
    }

    DataBag bagOfTokens = bagFactory.newDefaultBag();
    TokenStream tokenStream = null;
    try {
        String lineOfText = input.get(0).toString();
        StringReader textInput = new StringReader(lineOfText);
        tokenStream = analyzer.tokenStream(noField, textInput);
        CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            Tuple termText = tupleFactory.newTuple(termAttribute.toString());
            bagOfTokens.add(termText);
            termAttribute.setEmpty();
        }
    } finally {
        if (tokenStream != null) {
            tokenStream.close();
        }
    }
    return bagOfTokens;
}

From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java

License:Apache License

private String analyzeQuery(String query, Analyzer analyzer) {

    if (analyzer != null && query != null && query.length() > 0) {
        TokenStream tokenStream = analyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME, new StringReader(query));

        StringBuilder newQueryB = new StringBuilder();
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
                // OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
                // TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

                newQueryB.append(term.toString());
                newQueryB.append(' ');

            }
            tokenStream.end();
            return newQueryB.toString().trim();

        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        } finally {
            try {
                tokenStream.close();
            } catch (IOException e) {
                throw new RuntimeException("uncaught exception in synonym processing", e);
            }
        }
    }

    return query;

}

From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java

License:Apache License

/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 *
 * @param synonymAnalyzer the analyzer that performs the synonym expansion
 * @param solrParams the request parameters that control phrase construction and bagging
 * @return the synonym queries built from the expanded alternate queries
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {

    String origQuery = getQueryStringFromParser();
    int queryLen = origQuery.length();

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(origQuery));

    SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

    boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false);

    boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false);
    List<String> synonymBag = new ArrayList<>();

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                String termToAdd = term.toString();

                if (typeAttribute.type().equals("SYNONYM")) {
                    synonymBag.add(termToAdd);
                }

                // Don't quote single-term synonyms
                if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")
                        && termToAdd.contains(" ")) {
                    // Don't quote when the original is already surrounded by quotes
                    if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen
                            || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"'
                            || origQuery.charAt(offsetAttribute.endOffset()) != '"') {
                        // make a phrase out of the synonym
                        termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString();
                    }
                }
                if (!bag) {
                    // create a graph of all possible synonym combinations,
                    // e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
                    TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());

                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery);
                }
            }
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        }
    }

    List<String> alternateQueries = synonymBag;

    if (!bag) {
        // use a graph rather than a bag
        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size());
        sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new)
                .collect(Collectors.toList()));

        // have to use the start positions and end positions to figure out all possible combinations
        alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery);
    }

    // save for debugging purposes
    expandedSynonyms = alternateQueries;

    return createSynonymQueries(solrParams, alternateQueries);
}

From source file:com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java

License:Apache License

public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();

    TokenStream tokenStream = null;
    CharTermAttribute charTermAttribute = null;

    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    List<String> actualIdTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("id", "1");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualIdTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedIdTermList, actualIdTermList);

    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    List<String> actualTextTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("text", "Lucene is a Full-text search library.");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualTextTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedTextTermList, actualTextTermList);
}

From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java

License:Apache License

private List<LuceneToken> readTokens(TokenStream tokenStream) throws IOException {
    ArrayList<LuceneToken> tokens = new ArrayList<LuceneToken>();
    HashMap<Integer, LuceneToken> tokensByStartOffset = new HashMap<Integer, LuceneToken>();
    addAttributes(tokenStream);
    tokenStream.reset();

    while (tokenStream.incrementToken()) {
        if (tokenStream.hasAttributes()) {
            LuceneToken token = new LuceneToken();

            readOffset(tokenStream, token);

            // Lucene may output multiple tokens for compound words
            LuceneToken tokenWithSameStartOffset = tokensByStartOffset.get(token.getStartOffset());
            if (tokenWithSameStartOffset != null) {
                if (token.getEndOffset() >= tokenWithSameStartOffset.getEndOffset()) {
                    continue;
                } else {
                    tokens.remove(tokenWithSameStartOffset);
                }
            }

            readReading(tokenStream, token);
            readPartOfSpeech(tokenStream, token);
            readInflection(tokenStream, token);
            readBaseForm(tokenStream, token);

            tokensByStartOffset.put(token.getStartOffset(), token);
            tokens.add(token);
        }
    }

    tokenStream.end();
    tokenStream.close();
    return tokens;
}

From source file:com.github.rnewson.couchdb.lucene.util.AnalyzersTest.java

License:Apache License

private String[] analyze(final String analyzerName, final String text) throws Exception {
    final Analyzer analyzer = Analyzers.getAnalyzer(analyzerName);
    final TokenStream stream = analyzer.tokenStream("default", new StringReader(text));
    stream.reset();
    final List<String> result = new ArrayList<String>();
    while (stream.incrementToken()) {
        final CharTermAttribute c = stream.getAttribute(CharTermAttribute.class);
        result.add(c.toString());
    }
    return result.toArray(new String[0]);
}

From source file:com.github.tteofili.looseen.MinHashClassifier.java

License:Apache License

private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(field, value);
    ts.reset();
    while (ts.incrementToken()) {
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}

From source file:com.globalsight.ling.lucene.Index.java

License:Apache License

/**
 * Executes a search in the index returning no more than p_maxHits
 * (suggested: 5-10), and having no score smaller than p_minScore.
 *
 * This implementation is based on Lucene, and Lucene score values
 * vary widely, making it hard to specify a useful cut-off like
 * 0.7 or 0.5. Good scores can be < 0.2. All that is guaranteed is
 * that scores are numerically ordered. Use p_maxHits instead.
 */
public Hits search(String p_text, int end, int begin, float p_minScore)
        throws IOException, InterruptedException {
    synchronized (m_state) {
        if (m_state != STATE_OPENED) {
            throw new IOException("index is not available");
        }
    }

    try {
        m_lock.readLock().acquire();

        try {
            // Search the current index.
            //IndexReader reader = DirectoryReader.open(m_fsDir);
            //IndexSearcher searcher = new IndexSearcher(reader);
            IndexSearcher searcher = LuceneCache.getLuceneCache(m_directory).getIndexSearcher();

            Query query = getQuery(p_text);

            int maxHits = end - begin;
            TopDocs topDocs = searcher.search(query, maxHits);

            if (topDocs.totalHits > 0) {
                noResult = false;
            }

            // Store results in our own object.
            Hits result = new Hits(searcher, topDocs.scoreDocs, end, begin, p_minScore, p_text);

            // Highlight query terms in long results.
            if (m_type == TYPE_TEXT) {
                // Note: rewrite MultiTermQuery, RangeQuery or PrefixQuery.

                // TODO: optimize object creation if it all works.
                Highlighter highlighter = new Highlighter(new SimpleFormatter(), new QueryScorer(query));

                int max = Math.min(end, topDocs.totalHits);
                for (int i = begin; i < max; i++) {
                    Document doc = searcher.doc(topDocs.scoreDocs[i].doc);
                    String text = doc.get(IndexDocument.TEXT);

                    TokenStream tokenStream = m_analyzer.tokenStream(IndexDocument.TEXT,
                            new StringReader(text));
                    tokenStream.reset();

                    // Get 3 best fragments and separate with "..."
                    String hilite = highlighter.getBestFragments(tokenStream, text, 3, "...");

                    result.getHit(i).setText(hilite);
                }
            }

            //searcher.close();
            // reader.close();

            return result;
        } finally {
            m_lock.readLock().release();
        }
    } catch (InterruptedException ex) {
        throw new IOException(ex.getMessage());
    }
}

From source file:com.globalsight.ling.lucene.TbFuzzyIndex.java

License:Apache License

protected Query getQuery(String p_text) throws IOException {
    BooleanQuery result = new BooleanQuery();

    if (AnalyzerFactory.TOKENIZE_3GRAM == m_tokenize) {
        m_analyzer = AnalyzerFactory.getInstance(getLocale(), m_tokenize);
    }

    TokenStream tokens = m_analyzer.tokenStream(IndexDocument.TEXT, new StringReader(p_text));
    tokens.reset();

    Token t;
    while ((t = LuceneUtil.getNextToken(tokens)) != null) {
        result.add(new BooleanClause(new TermQuery(new Term(IndexDocument.TEXT, t.toString())), Occur.SHOULD));
    }

    return result;
}