Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
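
The full consumer contract is: register the attributes you need, call reset(), loop while incrementToken() returns true, then call end() and close(). A minimal self-contained sketch of that workflow (assuming Lucene 5.x or later; the analyzer choice, field name, and sample text are illustrative, not taken from the examples below):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources closes the stream even if incrementToken() throws
        try (TokenStream stream = analyzer.tokenStream("text", "The quick brown fox")) {
            // register attributes before consuming the stream
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(termAtt + " [" + offsetAtt.startOffset() + ","
                        + offsetAtt.endOffset() + ")");
            }
            stream.end();                     // finalizes offset bookkeeping after the last token
        }
    }
}

The examples below follow the same shape, differing mainly in which attributes they read and in which Lucene version's API they target.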

Usage

From source file:org.apache.uima.lucas.ProspectiveSearchAE.java

License:Apache License

@Override
public void process(CAS aCAS) throws AnalysisEngineProcessException {

    // First create the index of the document text
    MemoryIndex index = new MemoryIndex();

    List fields = createDocument(aCAS).getFields();

    for (Iterator it = fields.iterator(); it.hasNext();) {
        Field field = (Field) it.next();

        if (field.isIndexed() && field.tokenStreamValue() != null) {
            index.addField(field.name(), field.tokenStreamValue());
        }
    }

    // Search all queries against the one document index
    for (SearchQuery query : searchQueryProvider.getSearchQueries(aCAS)) {

        float score = index.search(query.query());

        if (score > matchingThreshold) {

            // Add a FS to the CAS with the search result
            FeatureStructure searchResult = aCAS.createFS(searchResultType);
            searchResult.setLongValue(searchResultIdFeature, query.id());
            aCAS.addFsToIndexes(searchResult);

            // Find matching tokens and link their annotations
            // in case the user wants search term highlighting
            if (searchResultMatchingTextFeature != null) {

                fields = createDocument(aCAS).getFields();

                for (Iterator it = fields.iterator(); it.hasNext();) {

                    Field field = (Field) it.next();

                    if (field.isIndexed() && field.tokenStreamValue() != null) {

                        TokenStream tokenStream = field.tokenStreamValue();

                        Collection<AnnotationFS> matchingTextAnnotations = new LinkedList<AnnotationFS>();

                        QueryScorer scorer = new QueryScorer(query.query(), field.name());
                        scorer.startFragment(new TextFragment(new StringBuffer(aCAS.getDocumentText()), 0, 0));

                        try {
                            scorer.init(tokenStream);

                            // the attribute instance is fixed for the stream's lifetime,
                            // so it can be fetched once rather than on every token
                            OffsetAttribute offsetAttr = tokenStream.getAttribute(OffsetAttribute.class);
                            while (tokenStream.incrementToken()) {
                                float tokenScore = scorer.getTokenScore();
                                if (tokenScore > 0) {
                                    AnnotationFS annotation = aCAS.createAnnotation(matchingTextType,
                                            offsetAttr.startOffset(), offsetAttr.endOffset());

                                    matchingTextAnnotations.add(annotation);
                                }
                            }
                        } catch (IOException e) {
                            throw new AnalysisEngineProcessException(e);
                        }

                        ArrayFS matchingTextArray = aCAS.createArrayFS(matchingTextAnnotations.size());

                        int matchingTextArrayIndex = 0;
                        for (AnnotationFS matchingTextAnnotation : matchingTextAnnotations) {
                            matchingTextArray.set(matchingTextArrayIndex++, matchingTextAnnotation);
                        }

                        searchResult.setFeatureValue(searchResultMatchingTextFeature, matchingTextArray);
                    }
                }
            }
        }
    }
}
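
QueryScorer comes from Lucene's highlighter module: it scores each token against the query as the stream advances, so a positive getTokenScore() here marks exactly the tokens the query matched, and their offsets are used to create the highlight annotations.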

From source file:org.apache.usergrid.utils.IndexUtils.java

License:Apache License

public static List<String> keywords(String source) {
    TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source));
    List<String> keywords = new ArrayList<String>();
    try {
        ts.reset(); // contract: reset() must be called before the first incrementToken()
        while (ts.incrementToken()) {
            keywords.add(ts.getAttribute(TermAttribute.class).term());
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        LOG.error("Error getting keywords ", e);
    }
    return keywords;
}
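
Note: this example targets the pre-4.0 attribute API. TermAttribute and its term() method were removed in Lucene 4 in favor of CharTermAttribute and toString(), which several of the later examples on this page use.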

From source file:org.apdplat.evaluation.impl.SmartCNEvaluation.java

License:Open Source License

private static String segText(String text) {
    StringBuilder result = new StringBuilder();
    try {
        TokenStream tokenStream = SMART_CHINESE_ANALYZER.tokenStream("text", new StringReader(text));
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            result.append(charTermAttribute.toString()).append(" ");
        }
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result.toString();
}

From source file:org.apdplat.word.elasticsearch.ChineseWordIndicesAnalysisTest.java

License:Open Source License

@Test
public void testChineseWordIndicesAnalysis() throws IOException {
    Index index = new Index("test");

    Injector parentInjector = new ModulesBuilder().add(new SettingsModule(SETTINGS),
            new EnvironmentModule(new Environment(SETTINGS)), new IndicesAnalysisModule()).createInjector();

    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, SETTINGS), new IndexNameModule(index),
                    new AnalysisModule(SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class))
                            .addProcessor(new ChineseWordAnalysisBinderProcessor()))
            .createChildInjector(parentInjector);

    AnalysisService analysisService = injector.getInstance(AnalysisService.class);

    TokenizerFactory tokenizerFactory = analysisService.tokenizer("word");
    boolean match = (tokenizerFactory instanceof ChineseWordTokenizerFactory);
    assertTrue(match);

    Tokenizer tokenizer = tokenizerFactory.create(new StringReader("?"));
    String exp = "[, ?]";
    List<String> result = new ArrayList<>();
    tokenizer.reset(); // contract: reset() must precede the first incrementToken()
    while (tokenizer.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
        result.add(charTermAttribute.toString());
    }
    assertEquals(exp, result.toString());

    Analyzer analyzer = analysisService.analyzer("word").analyzer();
    match = (analyzer instanceof ChineseWordAnalyzer);
    assertTrue(match);

    TokenStream tokenStream = analyzer.tokenStream("text",
            "??APDPlat???");
    exp = "[??, apdplat, , ??, ?, ]";
    result = new ArrayList<>();
    tokenStream.reset(); // contract: reset() must precede the first incrementToken()
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        result.add(charTermAttribute.toString());
    }
    assertEquals(exp, result.toString());
}

From source file:org.apdplat.word.lucene.ChineseWordAnalyzer.java

License:Open Source License

public static void main(String args[]) throws IOException {
    Analyzer analyzer = new ChineseWordAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("text",
            "??APDPlat???");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text",
            "word????????ysc");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text", "5?");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);

        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();
}
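
PositionIncrementAttribute reports how far each token advances past the previous one: values greater than 1 usually mean tokens (e.g., stopwords) were removed in between, while 0 marks a token at the same position as its predecessor, such as a synonym.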

From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java

License:Open Source License

@Test
public void test1() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text",
                "??APDPlat???");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , apdplat, , ?, ?, ?, , ]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[??, , apdplat, , , ?, ?, ?, , ]";
        }
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}

From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java

License:Open Source License

@Test
public void test2() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , , , , , ]";
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}

From source file:org.archive.porky.TokenizeTextUDF.java

License:Apache License

public String exec(Tuple input) throws IOException {

    String emptyString = "";
    if (input == null || input.size() == 0) {
        return emptyString;
    }
    try {
        String textString = (String) input.get(0);
        if (textString == null) {
            return emptyString;
        }
        if (stopSet == null) {
            //initialize
            List<String> stopWords = new ArrayList<String>();
            //read in stop words file
            // Open the file as a local file.
            FileReader fr = new FileReader(stopWordsFile);
            BufferedReader d = new BufferedReader(fr);
            String line;
            while ((line = d.readLine()) != null) {
                stopWords.add(line);
            }
            d.close(); // closing the BufferedReader also closes the underlying FileReader
            stopSet = new CharArraySet(Version.LUCENE_45, stopWords, true);
        }

        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_45, new StringReader(textString));
        tokenStream = new StopFilter(Version.LUCENE_45, tokenStream, stopSet);
        StringBuilder sb = new StringBuilder();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            sb.append(term).append(' ');
        }
        tokenStream.end();
        tokenStream.close();
        return sb.toString();

    } catch (Exception e) {
        return emptyString;
    }
}

From source file:org.betaconceptframework.astroboa.model.impl.query.xpath.XPathUtils.java

License:Open Source License

private static String analyzeTextToFind(String textToFind) throws IOException {
    // Filter textToFind through GreekAnalyzer
    TokenStream stream = greekAnalyzer.tokenStream("", new StringReader(textToFind));
    stream.reset();

    StringBuilder analyzedTextToFind = new StringBuilder();

    try {
        while (stream.incrementToken()) {

            String term = stream.getAttribute(TermAttribute.class).term();

            analyzedTextToFind.append(term);
            analyzedTextToFind.append(" ");

        }
    } catch (IOException e) {
        e.printStackTrace();

        analyzedTextToFind.append(textToFind);
    } finally {
        stream.end();
        stream.close();

    }

    String result = analyzedTextToFind.toString().trim();

    if (StringUtils.isBlank(result))
        return textToFind;

    return result;

}

From source file:org.bibsonomy.lucene.search.LuceneResourceSearch.java

License:Open Source License

/** 
 * Analyzes the given input parameter.
 * 
 * @param fieldName the name of the field
 * @param param the value of the field
 * @return the analyzed string
 * @throws IOException
 */
protected String parseToken(final String fieldName, final String param) throws IOException {
    if (present(param)) {
        // use lucene's new token stream api (see org.apache.lucene.analysis' javadoc at package level)
        final TokenStream ts = this.getAnalyzer().tokenStream(fieldName, new StringReader(param));
        final TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        ts.reset();

        // analyze the parameter - that is: concatenate its normalized tokens
        final StringBuilder analyzedString = new StringBuilder();
        while (ts.incrementToken()) {
            analyzedString.append(" ").append(termAtt.term());
        }

        return analyzedString.toString().trim();
    }

    return "";
}
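
Note the attribute-handling pattern: ts.addAttribute(TermAttribute.class) registers (or retrieves) the attribute once, before the stream is consumed, whereas several earlier examples call getAttribute() inside the loop. Both return the same per-stream attribute instance, but getAttribute() throws an IllegalArgumentException if the attribute was never added, so calling addAttribute() before the loop is the safer, more idiomatic choice.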