List of usage examples for org.apache.lucene.analysis.TokenStream.reset()
public void reset() throws IOException
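Before the examples, here is a minimal sketch of the reset() contract: a TokenStream must be reset() exactly once before the first call to incrementToken(), then end() and close() after the stream is consumed. This sketch assumes a recent Lucene (4.x or later); StandardAnalyzer and the class, field, and text values are purely illustrative.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetLifecycleExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("body", "a quick demo of the reset contract");
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // advances to the next token; false at end of stream
                System.out.println(termAtt.toString());
            }
            ts.end();                     // records end-of-stream state (e.g. final offset)
            ts.close();                   // releases underlying resources
        }
    }
}

The examples that follow all use this same cycle, though several of the older ones order end() and close() differently or omit them.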
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("the"));
    assertTrue(seen.contains("and"));
    assertTrue(seen.contains("dog"));
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testCommon() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer common = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
            fail("Shouldn't have found a numeric");
        }
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("dirty"));
    assertFalse(seen.contains("the"));
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testTokenCountFilter() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 101000; i++) {
        sb.append("the ");
    }
    TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
    ts.reset();
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
    }
    ts.end();
    ts.close();

    // The analyzer's token-count filter caps the stream at 100,000 tokens,
    // even though 101,000 were fed in.
    assertEquals(100000, tokens);
}
From source file:org.apache.tika.eval.tokens.TokenCounter.java
License:Apache License
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;
    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    while (ts.incrementToken()) {
        String token = termAtt.toString();
        MutableInt cnt = tokenMap.get(token);
        if (cnt == null) {
            cnt = new MutableInt(1);
            tokenMap.put(token, cnt);
        } else {
            cnt.increment();
        }
        totalTokens++;
    }
    ts.end();   // end() must precede close()
    ts.close();

    int totalUniqueTokens = tokenMap.size();
    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();
        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < e.getValue().intValue(); i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }
    }
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }
    /*
    Collections.sort(allTokens);
    List<TokenIntPair> topNList = new ArrayList<>(topN);
    for (int i = 0; i < topN && i < allTokens.size(); i++) {
        topNList.add(allTokens.get(i));
    }
    */
    tokenStatistics.put(field,
            new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));
}
From source file:org.apache.tika.eval.tokens.TokenCounterTest.java
License:Apache License
@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog ";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        Integer count = tokens.get(t);
        count = (count == null) ? 0 : count;
        count++;
        tokens.put(t, count);
    }
    ts.end();
    ts.close();

    assertEquals(7, tokens.size());
    assertEquals(Integer.valueOf(1), tokens.get(""));
}
From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamConcatenator.java
License:Apache License
public void reset() throws IOException {
    for (TokenStream tokenStream : tokenStreams) {
        tokenStream.reset();
    }
    tokenStreamIterator = tokenStreams.iterator();
    if (tokenStreamIterator.hasNext()) {
        currentTokenStream = tokenStreamIterator.next();
    } else {
        currentTokenStream = null;
    }
}
From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java
License:Apache License
private void init() throws IOException {
    for (TokenStream stream : streams) {
        stream.reset();
        stream.incrementToken();
        sortedStreams.add(stream);
    }
    rebuildSortedTokens();
    initialized = true;
}
From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java
License:Apache License
public void reset() throws IOException {
    for (TokenStream stream : streams) {
        stream.reset();
    }
    currentOffset = -1;
    sortedStreams.clear();
    initialized = false;
}
From source file:org.apache.uima.lucas.indexer.FieldBuilder.java
License:Apache License
protected Collection<Field> createStoredFields(String fieldName, TokenStream tokenStream, Store fieldStore,
        String delimiter, Boolean unique, Index fieldIndex, Boolean omitTF) throws FieldBuildingException {
    Collection<Field> fields = new ArrayList<Field>();
    // Only do indexing if we need a unique, indexed AND stored field.
    Index index = unique ? fieldIndex : Field.Index.NO;
    try {
        Field field;
        if (delimiter != null) {
            String value = tokenStreamStringConcatenator.tokenStreamToStringWithDelimiter(tokenStream, delimiter);
            field = new Field(fieldName, value, fieldStore, index);
            if (unique) {
                field.setOmitTermFreqAndPositions(omitTF);
            }
            fields.add(field);
        } else {
            Token nextToken = tokenStream.next(new Token());
            while (nextToken != null) {
                String value = new String(nextToken.termBuffer(), 0, nextToken.termLength());
                field = new Field(fieldName, value, fieldStore, index);
                if (unique) {
                    field.setOmitTermFreqAndPositions(omitTF);
                }
                fields.add(field);
                nextToken = tokenStream.next(nextToken);
            }
        }
        tokenStream.reset();
    } catch (IOException e) {
        throw createException(e);
    }
    return fields;
}
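Note that this example uses the pre-2.9 Token-based consumption API (tokenStream.next(...), termBuffer()), which later Lucene versions removed. As a hedged sketch only, the inner loop could be rewritten against the attribute-based API as below; this assumes Lucene 3.1 through 3.6, where CharTermAttribute coexists with this Field constructor, and the helper method name and parameters are hypothetical, mirroring the example's variables.

// Sketch of an attribute-based replacement for the Token-based loop above.
// Assumes Lucene 3.1-3.6; method name and parameters are illustrative.
private static void addTokenFields(Collection<Field> fields, String fieldName, TokenStream tokenStream,
        Field.Store fieldStore, Field.Index index, boolean unique, boolean omitTF) throws IOException {
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset(); // attribute API: reset before iterating, not after
    while (tokenStream.incrementToken()) {
        Field field = new Field(fieldName, termAtt.toString(), fieldStore, index);
        if (unique) {
            field.setOmitTermFreqAndPositions(omitTF);
        }
        fields.add(field);
    }
    tokenStream.end();
    tokenStream.close();
}

One behavioral difference worth noting: the original snippet calls reset() after consuming the stream, whereas the attribute-based contract requires reset() before the first incrementToken().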
From source file:org.apdplat.evaluation.impl.SmartCNEvaluation.java
License:Open Source License
private static String segText(String text) {
    StringBuilder result = new StringBuilder();
    try {
        TokenStream tokenStream = SMART_CHINESE_ANALYZER.tokenStream("text", new StringReader(text));
        // Fetch the attribute once, outside the loop; it is reused for every token.
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.append(charTermAttribute.toString()).append(" ");
        }
        tokenStream.end();
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result.toString();
}
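For reference, here is a sketch of the same segmentation loop using try-with-resources (Java 7+), which guarantees close() on every path, including when incrementToken() throws. The method name segTextSafely is hypothetical; SMART_CHINESE_ANALYZER is the analyzer from the example above, and this assumes a Lucene version where TokenStream implements Closeable (4.x or later).

// Sketch: try-with-resources variant of segText above (names are illustrative).
private static String segTextSafely(String text) throws IOException {
    StringBuilder result = new StringBuilder();
    try (TokenStream tokenStream = SMART_CHINESE_ANALYZER.tokenStream("text", new StringReader(text))) {
        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.append(termAtt.toString()).append(' ');
        }
        tokenStream.end(); // close() is handled by try-with-resources
    }
    return result.toString();
}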