Example usage for org.apache.lucene.analysis TokenStream close

Introduction

This page collects example usages of the org.apache.lucene.analysis TokenStream close method.

Prototype

@Override
public void close() throws IOException 

Document

Releases resources associated with this stream.
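
For orientation, here is a minimal sketch of the canonical consume-and-close lifecycle. TokenStream implements Closeable, so try-with-resources can take over the close() call; the field name "body", the sample text, and the StandardAnalyzer choice are illustrative assumptions, not taken from the examples on this page.

// imports assumed: org.apache.lucene.analysis.*, org.apache.lucene.analysis.standard.StandardAnalyzer,
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute
Analyzer analyzer = new StandardAnalyzer();
try (TokenStream ts = analyzer.tokenStream("body", "some sample text")) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                      // mandatory before the first incrementToken()
    while (ts.incrementToken()) {
        System.out.println(termAtt.toString());
    }
    ts.end();                        // records final state before close()
}                                    // try-with-resources invokes ts.close(), even on exception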

Usage

From source file:org.apache.tika.eval.tokens.TokenCounter.java

License:Apache License

private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;

    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    while (ts.incrementToken()) {
        String token = termAtt.toString();
        MutableInt cnt = tokenMap.get(token);
        if (cnt == null) {
            cnt = new MutableInt(1);
            tokenMap.put(token, cnt);
        } else {
            cnt.increment();
        }
        totalTokens++;
    }
    ts.end();   // contract: end() is called after the last incrementToken() and before close()
    ts.close();

    int totalUniqueTokens = tokenMap.size();

    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;

    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);

    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();

        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < e.getValue().intValue(); i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }

    }
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }

    tokenStatistics.put(field,
            new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));

}
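
A note on the counting idiom in the loop above: since Java 8, the get/null-check/put sequence can collapse into one line with Map.computeIfAbsent. The line below is a sketch of an equivalent, not the actual Tika code (MutableInt is org.apache.commons.lang3.mutable.MutableInt, as in the original):

// equivalent to the null-check-and-put above; creates the counter on first sight, then bumps it
tokenMap.computeIfAbsent(token, k -> new MutableInt(0)).increment();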

From source file:org.apache.tika.eval.tokens.TokenCounterTest.java

License:Apache License

@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog ";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        Integer count = tokens.get(t);
        count = (count == null) ? 0 : count;
        count++;
        tokens.put(t, count);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    assertEquals(Integer.valueOf(1), tokens.get(""));
}

From source file:org.apdplat.evaluation.impl.SmartCNEvaluation.java

License:Open Source License

private static String segText(String text) {
    StringBuilder result = new StringBuilder();
    try {
        TokenStream tokenStream = SMART_CHINESE_ANALYZER.tokenStream("text", new StringReader(text));
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            result.append(charTermAttribute.toString()).append(" ");
        }
        tokenStream.end();   // end() before close(), per the TokenStream contract
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result.toString();
}

From source file:org.apdplat.word.lucene.ChineseWordAnalyzer.java

License:Open Source License

public static void main(String args[]) throws IOException {
    Analyzer analyzer = new ChineseWordAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("text",
            "??APDPlat???");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.end();   // end() before close(), per the TokenStream contract
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text",
            "word????????ysc");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.end();
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text", "5?");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);

        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.end();
    tokenStream.close();
}

From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java

License:Open Source License

@Test
public void test1() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text",
                "??APDPlat???");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        String expResult = "[??, , apdplat, , ?, ?, ?, , ]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[??, , apdplat, , , ?, ?, ?, , ]";
        }
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}

From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java

License:Open Source License

@Test
public void test2() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        String expResult = "[??, , , , , , ]";
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}

From source file:org.betaconceptframework.astroboa.model.impl.query.xpath.XPathUtils.java

License:Open Source License

private static String analyzeTextToFind(String textToFind) throws IOException {
    // Filter textToFind through GreekAnalyzer
    TokenStream stream = greekAnalyzer.tokenStream("", new StringReader(textToFind));
    stream.reset();

    StringBuilder analyzedTextTofind = new StringBuilder();

    try {
        while (stream.incrementToken()) {

            String term = stream.getAttribute(TermAttribute.class).term();

            analyzedTextTofind.append(term);
            analyzedTextTofind.append(" ");

        }
    } catch (IOException e) {
        e.printStackTrace();

        analyzedTextTofind.append(textToFind);
    } finally {
        stream.end();
        stream.close();

    }

    String result = analyzedTextTofind.toString().trim();

    if (StringUtils.isBlank(result))
        return textToFind;

    return result;

}

From source file:org.chombo.util.BasicUtils.java

License:Apache License

/**
 * Analyzes text with the given analyzer and returns the analyzed text
 * @param text text to analyze
 * @param analyzer analyzer to tokenize and filter with
 * @return analyzed text, tokens separated by single spaces
 * @throws IOException
 */
public static String analyze(String text, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    StringBuilder stBld = new StringBuilder();

    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        String token = termAttribute.toString();
        stBld.append(token).append(" ");
    }
    stream.end();
    stream.close();
    return stBld.toString();
}
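
A hypothetical call site for the helper above; the WhitespaceAnalyzer choice and the input string are assumptions for illustration:

// prints "Hello Lucene World " (note the trailing space left by the join)
Analyzer analyzer = new WhitespaceAnalyzer();
System.out.println(BasicUtils.analyze("Hello Lucene World", analyzer));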

From source file:org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.java

License:Apache License

/** NOTE: this method closes the TokenStream, even on exception, which is awkward
 *  because really the caller who called {@link Analyzer#tokenStream} should close it,
 *  but when trying that there are recursion issues when we try to use the same
 *  TokenStream twice in the same recursion... */
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
    int numTokens = 0;
    boolean success = false;
    try {
        stream.reset();
        consumer.reset(stream);
        while (stream.incrementToken()) {
            consumer.nextToken();
            numTokens++;
        }
        consumer.end();
        success = true;
    } finally {
        if (success) {
            stream.close();
        } else {
            IOUtils.closeWhileHandlingException(stream);
        }
    }
    return numTokens;
}
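
The success-flag-plus-finally pattern above predates habitual try-with-resources; below is a sketch of a roughly equivalent variant. It is an assumption-laden rewrite, not the Elasticsearch code: the caller hands in the Analyzer, and a close() failure during an in-flight exception surfaces as a suppressed exception rather than being swallowed by IOUtils.closeWhileHandlingException.

// sketch: try-with-resources guarantees stream.close() on every exit path
public static int analyze(Analyzer analyzer, String field, String text, TokenConsumer consumer)
        throws IOException {
    int numTokens = 0;
    try (TokenStream stream = analyzer.tokenStream(field, text)) {
        stream.reset();
        consumer.reset(stream);
        while (stream.incrementToken()) {
            consumer.nextToken();
            numTokens++;
        }
        consumer.end();
    }
    return numTokens;
}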

From source file:org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizerTest.java

License:Apache License

@Test
public void testNullSynonyms() throws Exception {
    Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1);
    TokenStream stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,1,1/,1,2,1/,2,3,1/,3,4,1/,4,5,1/,5,6,1");
    stream.close();   // close each stream before requesting the next one

    a = new NGramSynonymTokenizerTestAnalyzer(2);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,2,1/,1,3,1/,2,4,1/,3,5,1/,4,6,1");
    stream.close();
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,1,1");
    stream.close();
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,2,1");

    a = new NGramSynonymTokenizerTestAnalyzer(3);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,3,1/,1,4,1/,2,5,1/,3,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(4);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,4,1/,1,5,1/,2,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(5);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,5,1/,1,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(6);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(7);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(8);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");
}