Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
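
The full consumer contract is: register the attributes you need, call reset(), loop while incrementToken() returns true, then call end() and close(). A minimal self-contained sketch of that workflow (assuming Lucene 5.x or later; the analyzer choice, field name, and sample text are illustrative, not taken from the examples below):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources closes the stream even if incrementToken() throws
        try (TokenStream stream = analyzer.tokenStream("text", "The quick brown fox")) {
            // register attributes before consuming the stream
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(termAtt + " [" + offsetAtt.startOffset() + ","
                        + offsetAtt.endOffset() + ")");
            }
            stream.end();                     // finalizes offset bookkeeping after the last token
        }
    }
}

The examples below follow the same shape, differing mainly in which attributes they read and in which Lucene version's API they target.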

Usage

From source file:org.apache.uima.lucas.ProspectiveSearchAE.java

License:Apache License

@Override
public void process(CAS aCAS) throws AnalysisEngineProcessException {

    // First create the index of the document text
    MemoryIndex index = new MemoryIndex();

    List fields = createDocument(aCAS).getFields();

    for (Iterator it = fields.iterator(); it.hasNext();) {
        Field field = (Field) it.next();

        if (field.isIndexed() && field.tokenStreamValue() != null) {
            index.addField(field.name(), field.tokenStreamValue());
        }
    }

    // Search all queries against the one document index
    for (SearchQuery query : searchQueryProvider.getSearchQueries(aCAS)) {

        float score = index.search(query.query());

        if (score > matchingThreshold) {

            // Add a FS to the CAS with the search result
            FeatureStructure searchResult = aCAS.createFS(searchResultType);
            searchResult.setLongValue(searchResultIdFeature, query.id());
            aCAS.addFsToIndexes(searchResult);

            // Find matching tokens and link their annotations
            // in case the user wants search term highlighting
            if (searchResultMatchingTextFeature != null) {

                fields = createDocument(aCAS).getFields();

                for (Iterator it = fields.iterator(); it.hasNext();) {

                    Field field = (Field) it.next();

                    if (field.isIndexed() && field.tokenStreamValue() != null) {

                        TokenStream tokenStream = field.tokenStreamValue();

                        Collection<AnnotationFS> matchingTextAnnotations = new LinkedList<AnnotationFS>();

                        QueryScorer scorer = new QueryScorer(query.query(), field.name());
                        scorer.startFragment(new TextFragment(new StringBuffer(aCAS.getDocumentText()), 0, 0));

                        try {
                            scorer.init(tokenStream);

                            // the attribute instance is fixed for the stream's lifetime,
                            // so it can be fetched once rather than on every token
                            OffsetAttribute offsetAttr = tokenStream.getAttribute(OffsetAttribute.class);
                            while (tokenStream.incrementToken()) {
                                float tokenScore = scorer.getTokenScore();
                                if (tokenScore > 0) {
                                    AnnotationFS annotation = aCAS.createAnnotation(matchingTextType,
                                            offsetAttr.startOffset(), offsetAttr.endOffset());

                                    matchingTextAnnotations.add(annotation);
                                }
                            }
                        } catch (IOException e) {
                            throw new AnalysisEngineProcessException(e);
                        }

                        ArrayFS matchingTextArray = aCAS.createArrayFS(matchingTextAnnotations.size());

                        int matchingTextArrayIndex = 0;
                        for (AnnotationFS matchingTextAnnotation : matchingTextAnnotations) {
                            matchingTextArray.set(matchingTextArrayIndex++, matchingTextAnnotation);
                        }

                        searchResult.setFeatureValue(searchResultMatchingTextFeature, matchingTextArray);
                    }
                }
            }
        }
    }
}
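
QueryScorer comes from Lucene's highlighter module: it scores each token against the query as the stream advances, so a positive getTokenScore() here marks exactly the tokens the query matched, and their offsets are used to create the highlight annotations.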

From source file:org.apache.usergrid.utils.IndexUtils.java

License:Apache License

public static List<String> keywords(String source) {
    TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source));
    List<String> keywords = new ArrayList<String>();
    try {
        ts.reset(); // contract: reset() must be called before the first incrementToken()
        while (ts.incrementToken()) {
            keywords.add(ts.getAttribute(TermAttribute.class).term());
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        LOG.error("Error getting keywords ", e);
    }
    return keywords;
}
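
Note: this example targets the pre-4.0 attribute API. TermAttribute and its term() method were removed in Lucene 4 in favor of CharTermAttribute and toString(), which several of the later examples on this page use.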

From source file:org.apdplat.evaluation.impl.SmartCNEvaluation.java

License:Open Source License

private static String segText(String text) {
    StringBuilder result = new StringBuilder();
    try {
        TokenStream tokenStream = SMART_CHINESE_ANALYZER.tokenStream("text", new StringReader(text));
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            result.append(charTermAttribute.toString()).append(" ");
        }
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result.toString();
}

From source file:org.apdplat.word.elasticsearch.ChineseWordIndicesAnalysisTest.java

License:Open Source License

@Test
public void testChineseWordIndicesAnalysis() throws IOException {
    Index index = new Index("test");

    Injector parentInjector = new ModulesBuilder().add(new SettingsModule(SETTINGS),
            new EnvironmentModule(new Environment(SETTINGS)), new IndicesAnalysisModule()).createInjector();

    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, SETTINGS), new IndexNameModule(index),
                    new AnalysisModule(SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class))
                            .addProcessor(new ChineseWordAnalysisBinderProcessor()))
            .createChildInjector(parentInjector);

    AnalysisService analysisService = injector.getInstance(AnalysisService.class);

    TokenizerFactory tokenizerFactory = analysisService.tokenizer("word");
    boolean match = (tokenizerFactory instanceof ChineseWordTokenizerFactory);
    assertTrue(match);

    Tokenizer tokenizer = tokenizerFactory.create(new StringReader("?"));
    String exp = "[, ?]";
    List<String> result = new ArrayList<>();
    tokenizer.reset(); // contract: reset() must precede the first incrementToken()
    while (tokenizer.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
        result.add(charTermAttribute.toString());
    }
    assertEquals(exp, result.toString());

    Analyzer analyzer = analysisService.analyzer("word").analyzer();
    match = (analyzer instanceof ChineseWordAnalyzer);
    assertTrue(match);

    TokenStream tokenStream = analyzer.tokenStream("text",
            "??APDPlat???");
    exp = "[??, apdplat, , ??, ?, ]";
    result = new ArrayList<>();
    tokenStream.reset(); // contract: reset() must precede the first incrementToken()
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        result.add(charTermAttribute.toString());
    }
    assertEquals(exp, result.toString());
}

From source file:org.apdplat.word.lucene.ChineseWordAnalyzer.java

License:Open Source License

public static void main(String args[]) throws IOException {
    Analyzer analyzer = new ChineseWordAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("text",
            "??APDPlat???");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text",
            "word????????ysc");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text", "5?");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);

        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();
}
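
PositionIncrementAttribute reports how far each token advances past the previous one: values greater than 1 usually mean tokens (e.g., stopwords) were removed in between, while 0 marks a token at the same position as its predecessor, such as a synonym.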

From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java

License:Open Source License

@Test
public void test1() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text",
                "??APDPlat???");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , apdplat, , ?, ?, ?, , ]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[??, , apdplat, , , ?, ?, ?, , ]";
        }
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}

From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java

License:Open Source License

@Test
public void test2() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , , , , , ]";
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}

From source file:org.archive.porky.TokenizeTextUDF.java

License:Apache License

public String exec(Tuple input) throws IOException {

    String emptyString = "";
    if (input == null || input.size() == 0) {
        return emptyString;
    }
    try {
        String textString = (String) input.get(0);
        if (textString == null) {
            return emptyString;
        }
        if (stopSet == null) {
            //initialize
            List<String> stopWords = new ArrayList<String>();
            //read in stop words file
            // Open the file as a local file.
            FileReader fr = new FileReader(stopWordsFile);
            BufferedReader d = new BufferedReader(fr);
            String line;
            while ((line = d.readLine()) != null) {
                stopWords.add(line);
            }
            d.close(); // closing the BufferedReader also closes the underlying FileReader
            stopSet = new CharArraySet(Version.LUCENE_45, stopWords, true);
        }

        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_45, new StringReader(textString));
        tokenStream = new StopFilter(Version.LUCENE_45, tokenStream, stopSet);
        StringBuilder sb = new StringBuilder();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            sb.append(term).append(' ');
        }
        tokenStream.end();
        tokenStream.close();
        return sb.toString();

    } catch (Exception e) {
        return emptyString;
    }
}

From source file:org.betaconceptframework.astroboa.model.impl.query.xpath.XPathUtils.java

License:Open Source License

private static String analyzeTextToFind(String textToFind) throws IOException {
    // Filter textToFind through GreekAnalyzer
    TokenStream stream = greekAnalyzer.tokenStream("", new StringReader(textToFind));
    stream.reset();

    StringBuilder analyzedTextToFind = new StringBuilder();

    try {
        while (stream.incrementToken()) {

            String term = stream.getAttribute(TermAttribute.class).term();

            analyzedTextToFind.append(term);
            analyzedTextToFind.append(" ");

        }
    } catch (IOException e) {
        e.printStackTrace();

        analyzedTextToFind.append(textToFind);
    } finally {
        stream.end();
        stream.close();

    }

    String result = analyzedTextToFind.toString().trim();

    if (StringUtils.isBlank(result))
        return textToFind;

    return result;

}

From source file:org.bibsonomy.lucene.search.LuceneResourceSearch.java

License:Open Source License

/** 
 * Analyzes the given input parameter.
 * 
 * @param fieldName the name of the field
 * @param param the value of the field
 * @return the analyzed string
 * @throws IOException
 */
protected String parseToken(final String fieldName, final String param) throws IOException {
    if (present(param)) {
        // use lucene's new token stream api (see org.apache.lucene.analysis' javadoc at package level)
        final TokenStream ts = this.getAnalyzer().tokenStream(fieldName, new StringReader(param));
        final TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        ts.reset();

        // analyze the parameter - that is: concatenate its normalized tokens
        final StringBuilder analyzedString = new StringBuilder();
        while (ts.incrementToken()) {
            analyzedString.append(" ").append(termAtt.term());
        }

        return analyzedString.toString().trim();
    }

    return "";
}
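
Note the attribute-handling pattern: ts.addAttribute(TermAttribute.class) registers (or retrieves) the attribute once, before the stream is consumed, whereas several earlier examples call getAttribute() inside the loop. Both return the same per-stream attribute instance, but getAttribute() throws an IllegalArgumentException if the attribute was never added, so calling addAttribute() before the loop is the safer, more idiomatic choice.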