Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Source Link

Document

Returns the instance of the passed in Attribute contained in this AttributeSource

The caller must pass in a Class<? extends Attribute> value identifying the attribute to retrieve from this AttributeSource.

Usage

From source file:org.apache.tika.eval.tokens.TokenCounter.java

License:Apache License

/**
 * Tokenizes {@code content} with {@code analyzer}, accumulates per-token counts for
 * {@code field}, and stores aggregate statistics (total tokens, unique tokens, entropy,
 * token-length summary, top-N tokens by frequency) into {@code tokenStatistics}.
 *
 * @param field    field name used for analysis and as the key into the count maps
 * @param analyzer analyzer that produces the token stream
 * @param content  raw text to tokenize
 * @throws IOException if token-stream iteration fails
 */
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;

    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    while (ts.incrementToken()) {
        String token = termAtt.toString();
        MutableInt cnt = tokenMap.get(token);
        if (cnt == null) {
            tokenMap.put(token, new MutableInt(1));
        } else {
            cnt.increment();
        }
        totalTokens++;
    }
    // BUG FIX: the TokenStream consumer contract requires end() BEFORE close();
    // the original called close() first and then end() on an already-closed stream.
    ts.end();
    ts.close();

    int totalUniqueTokens = tokenMap.size();

    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;

    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);

    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();

        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        // code points, not chars, so surrogate pairs count as one character
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < termFreq; i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }

    }
    if (totalTokens > 0) {
        // NOTE(review): this scales -sum(p*log p) by 1/totalTokens rather than using
        // plain Shannon entropy — presumably intentional normalization; confirm against
        // TokenStatistics' expectations before changing.
        ent = (-1.0d / (double) totalTokens) * ent;
    }

    tokenStatistics.put(field,
            new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));

}

From source file:org.apache.tika.eval.tokens.TokenCounterTest.java

License:Apache License

/**
 * Verifies that the common-tokens analyzer applies the CJK bigram filter:
 * the sample text must yield exactly 7 distinct tokens, with the empty-string
 * token appearing once.
 */
@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog ";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        // merge() replaces the original null-check/increment dance
        tokens.merge(termAtt.toString(), 1, Integer::sum);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    // Integer.valueOf instead of the deprecated new Integer(...) constructor
    assertEquals(Integer.valueOf(1), tokens.get(""));
}

From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java

License:Apache License

/**
 * Advances this merged stream to the next token, drawing from whichever underlying
 * stream is at the top of the sorted stack. The current token's attribute state is
 * copied into this stream, and the position increment is set to 0 when the token
 * starts at the same offset as the previous one (stacked/synonym-style position).
 *
 * @return true if a token was produced, false when all underlying streams are exhausted
 * @throws IOException if an underlying stream fails
 */
@Override
public boolean incrementToken() throws IOException {
    if (!initialized)
        init(); // lazy one-time build of the sorted stream stack

    if (sortedStreams.size() == 0)
        return false;

    TokenStream currentTokenStream = sortedStreams.pop();

    // Copy the source stream's current token attributes into this stream's attributes.
    restoreState(currentTokenStream.captureState());

    OffsetAttribute offsetAttr = (OffsetAttribute) currentTokenStream.getAttribute(OffsetAttribute.class);
    if (offsetAttr.startOffset() == currentOffset)
        // same start offset as the previously emitted token -> zero increment
        posIncAtt.setPositionIncrement(0);
    else
        posIncAtt.setPositionIncrement(1);

    currentOffset = offsetAttr.startOffset();

    // proceed the token stream to its next token and resort the stack
    if (currentTokenStream.incrementToken())
        sortedStreams.add(currentTokenStream);
    rebuildSortedTokens();

    return true;
}

From source file:org.apache.uima.lucas.ProspectiveSearchAE.java

License:Apache License

/**
 * Runs every registered search query against an in-memory index built from the
 * document in {@code aCAS}. For each query whose score exceeds the matching
 * threshold, a search-result feature structure is added to the CAS; if a
 * matching-text feature is configured, the matching tokens are additionally
 * annotated and linked to the result for highlighting.
 *
 * @param aCAS the CAS containing the document to match queries against
 * @throws AnalysisEngineProcessException if token-stream iteration fails
 */
@Override
public void process(CAS aCAS) throws AnalysisEngineProcessException {

    // First create the index of the document text
    MemoryIndex index = new MemoryIndex();

    List fields = createDocument(aCAS).getFields();

    for (Iterator it = fields.iterator(); it.hasNext();) {
        Field field = (Field) it.next();

        if (field.isIndexed() && field.tokenStreamValue() != null) {
            index.addField(field.name(), field.tokenStreamValue());
        }
    }

    // Search all queries against the one document index
    for (SearchQuery query : searchQueryProvider.getSearchQueries(aCAS)) {

        float score = index.search(query.query());

        if (score > matchingThreshold) {

            // Add a FS to the CAS with the search result
            FeatureStructure searchResult = aCAS.createFS(searchResultType);
            searchResult.setLongValue(searchResultIdFeature, query.id());
            aCAS.addFsToIndexes(searchResult);

            // Find matching tokens and link their annotations
            // in case the user wants search term highlighting
            if (searchResultMatchingTextFeature != null) {

                // Re-create the document: the token streams from the first pass
                // were consumed while building the MemoryIndex above.
                fields = createDocument(aCAS).getFields();

                for (Iterator it = fields.iterator(); it.hasNext();) {

                    Field field = (Field) it.next();

                    if (field.isIndexed() && field.tokenStreamValue() != null) {

                        TokenStream tokenStream = field.tokenStreamValue();

                        Collection<AnnotationFS> matchingTextAnnotations = new LinkedList<AnnotationFS>();

                        QueryScorer scorer = new QueryScorer(query.query(), field.name());
                        scorer.startFragment(new TextFragment(new StringBuffer(aCAS.getDocumentText()), 0, 0));

                        try {
                            scorer.init(tokenStream);

                            OffsetAttribute offsetAttr = null;
                            while (tokenStream.incrementToken()) {
                                offsetAttr = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class);
                                float tokenScore = scorer.getTokenScore();
                                // Only tokens that actually contribute to the query score
                                // become highlight annotations.
                                if (tokenScore > 0) {
                                    AnnotationFS annotation = aCAS.createAnnotation(matchingTextType,
                                            offsetAttr.startOffset(), offsetAttr.endOffset());

                                    matchingTextAnnotations.add(annotation);
                                }
                            }
                        } catch (IOException e) {
                            throw new AnalysisEngineProcessException(e);
                        }

                        // Copy the collected annotations into a CAS array feature.
                        ArrayFS matchtingTextArray = aCAS.createArrayFS(matchingTextAnnotations.size());

                        int matchtingTextArrayIndex = 0;
                        for (AnnotationFS matchingTextAnnotation : matchingTextAnnotations) {
                            matchtingTextArray.set(matchtingTextArrayIndex++, matchingTextAnnotation);
                        }

                        searchResult.setFeatureValue(searchResultMatchingTextFeature, matchtingTextArray);
                    }
                }
            }
        }
    }
}

From source file:org.apache.usergrid.utils.IndexUtils.java

License:Apache License

/**
 * Tokenizes {@code source} with the shared analyzer and returns the analyzed terms.
 * On an I/O failure the error is logged and the tokens collected so far are returned.
 *
 * @param source raw text to tokenize
 * @return the list of keyword tokens, never null (possibly empty)
 */
public static List<String> keywords(String source) {
    List<String> keywords = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source));
    try {
        // BUG FIX: the TokenStream contract requires reset() before incrementToken().
        ts.reset();
        // Attribute instances are reused across tokens, so fetch once outside the loop.
        TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            keywords.add(termAtt.term());
        }
        ts.end();
    } catch (IOException e) {
        LOG.error("Error getting keywords ", e);
    } finally {
        // BUG FIX: the original never closed the stream, leaking analyzer resources.
        try {
            ts.close();
        } catch (IOException e) {
            LOG.error("Error closing token stream ", e);
        }
    }
    return keywords;
}

From source file:org.apdplat.evaluation.impl.SmartCNEvaluation.java

License:Open Source License

/**
 * Segments {@code text} with the SmartChineseAnalyzer and returns the tokens
 * joined by single spaces (with a trailing space when any token was produced).
 * Failures are printed and the tokens collected so far are returned.
 *
 * @param text raw text to segment
 * @return space-separated token string, never null
 */
private static String segText(String text) {
    StringBuilder result = new StringBuilder();
    try {
        TokenStream tokenStream = SMART_CHINESE_ANALYZER.tokenStream("text", new StringReader(text));
        // Attribute instances are reused across tokens, so fetch once outside the loop.
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.append(charTermAttribute.toString()).append(" ");
        }
        // BUG FIX: end() must be called after consuming the stream, before close().
        tokenStream.end();
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result.toString();
}

From source file:org.apdplat.word.elasticsearch.ChineseWordIndicesAnalysisTest.java

License:Open Source License

/**
 * Wires up the analysis module with the ChineseWord binder processor and verifies
 * that both the "word" tokenizer factory and the "word" analyzer resolve to the
 * plugin's implementations and produce the expected token lists.
 */
@Test
public void testChineseWordIndicesAnalysis() throws IOException {
    Index index = new Index("test");

    Injector parentInjector = new ModulesBuilder().add(new SettingsModule(SETTINGS),
            new EnvironmentModule(new Environment(SETTINGS)), new IndicesAnalysisModule()).createInjector();

    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, SETTINGS), new IndexNameModule(index),
                    new AnalysisModule(SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class))
                            .addProcessor(new ChineseWordAnalysisBinderProcessor()))
            .createChildInjector(parentInjector);

    AnalysisService analysisService = injector.getInstance(AnalysisService.class);

    TokenizerFactory tokenizerFactory = analysisService.tokenizer("word");
    boolean match = (tokenizerFactory instanceof ChineseWordTokenizerFactory);
    assertTrue(match);

    Tokenizer tokenizer = tokenizerFactory.create(new StringReader("?"));
    String exp = "[, ?]";
    List<String> result = new ArrayList<>();
    // Attribute instances are reused across tokens, so fetch once outside the loop.
    CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
    // BUG FIX: reset() is required before incrementToken(); the stream must also
    // be end()ed and close()d after consumption.
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        result.add(charTermAttribute.toString());
    }
    tokenizer.end();
    tokenizer.close();
    assertEquals(exp, result.toString());

    Analyzer analyzer = analysisService.analyzer("word").analyzer();
    match = (analyzer instanceof ChineseWordAnalyzer);
    assertTrue(match);

    TokenStream tokenStream = analyzer.tokenStream("text",
            "??APDPlat???");
    exp = "[??, apdplat, , ??, ?, ]";
    result = new ArrayList<>();
    charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    // BUG FIX: same reset()/end()/close() contract as above.
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        result.add(charTermAttribute.toString());
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(exp, result.toString());
}

From source file:org.apdplat.word.lucene.ChineseWordAnalyzer.java

License:Open Source License

/**
 * Demonstrates ChineseWordAnalyzer output by tokenizing three sample texts and
 * logging each term with its character offsets and position increment.
 *
 * @param args ignored
 * @throws IOException if token-stream iteration fails
 */
public static void main(String args[]) throws IOException {
    Analyzer analyzer = new ChineseWordAnalyzer();
    // The original repeated the same tokenize-and-log loop three times verbatim;
    // extracted into printTokens() below.
    printTokens(analyzer, "??APDPlat???");
    printTokens(analyzer, "word????????ysc");
    printTokens(analyzer, "5?");
}

/**
 * Tokenizes {@code text} with {@code analyzer} and logs each term as
 * "term (start - end) positionIncrement".
 */
private static void printTokens(Analyzer analyzer, String text) throws IOException {
    TokenStream tokenStream = analyzer.tokenStream("text", text);
    // Attribute instances are reused across tokens, so fetch once outside the loop.
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = tokenStream
            .getAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    // BUG FIX: end() before close(), per the TokenStream contract (original omitted end()).
    tokenStream.end();
    tokenStream.close();
}

From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java

License:Open Source License

@Test
public void test1() {
    try {//from  w w w  .ja v  a 2s . c  om
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text",
                "??APDPlat???");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , apdplat, , ?, ?, ?, , ]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[??, , apdplat, , , ?, ?, ?, , ]";
        }
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}

From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java

License:Open Source License

/**
 * Verifies ChineseWordAnalyzer segmentation of a second sample sentence against
 * a fixed expected token list.
 */
@Test
public void test2() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??");
        List<String> words = new ArrayList<>();
        // Attribute instances are reused across tokens, so fetch once outside the loop.
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(charTermAttribute.toString());
        }
        // BUG FIX: end() must be called after consuming the stream, before close().
        tokenStream.end();
        tokenStream.close();
        String expResult = "[??, , , , , , ]";
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}