Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#reset() from open-source projects.

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using incrementToken().
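
The contract is easiest to see in isolation. Below is a minimal, self-contained sketch of the consumer workflow (the class name and the no-argument StandardAnalyzer constructor, available since Lucene 5, are assumptions, not taken from the examples below): reset() must be called once before the first incrementToken(), end() after the last token has been consumed, and close() before the same Analyzer may hand out another stream.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("text", "Lucene token stream lifecycle");
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            try {
                stream.reset();                   // mandatory before the first incrementToken()
                while (stream.incrementToken()) {
                    System.out.println(termAtt.toString());
                }
                stream.end();                     // finalize state, e.g. the final offset
            } finally {
                stream.close();                   // required before the analyzer can be reused
            }
        }
    }
}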

Usage

From source file:org.apdplat.word.lucene.ChineseWordAnalyzer.java

License:Open Source License

public static void main(String args[]) throws IOException {
    Analyzer analyzer = new ChineseWordAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("text",
            "??APDPlat???");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text",
            "word????????ysc");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text", "5?");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);

        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();
}
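
Note how the example closes each stream before requesting the next one from the same analyzer: Analyzer.tokenStream() reuses per-field components, and in recent Lucene versions skipping the close() (or the initial reset()) fails fast with a "TokenStream contract violation" IllegalStateException.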

From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java

License:Open Source License

@Test
public void test1() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text",
                "??APDPlat???");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[杨尚川, 是, apdplat, 应用级, 产品, 开发, 平台, 的, 作者]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[杨尚川, 是, apdplat, 应用, 级, 产品, 开发, 平台, 的, 作者]";
        }
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}

From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java

License:Open Source License

@Test
public void test2() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , , , , , ]";
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}

From source file:org.archive.porky.TokenizeTextUDF.java

License:Apache License

public String exec(Tuple input) throws IOException {

    String emptyString = "";
    if (input == null || input.size() == 0) {
        return emptyString;
    }
    try {
        String textString = (String) input.get(0);
        if (textString == null) {
            return emptyString;
        }
        if (stopSet == null) {
            //initialize
            List<String> stopWords = new ArrayList<String>();
            //read in stop words file
            // Open the file as a local file.
            FileReader fr = new FileReader(stopWordsFile);
            BufferedReader d = new BufferedReader(fr);
            String line;
            while ((line = d.readLine()) != null) {
                stopWords.add(line);
            }
            fr.close();
            stopSet = new CharArraySet(Version.LUCENE_45, stopWords, true);
        }

        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_45, new StringReader(textString));
        tokenStream = new StopFilter(Version.LUCENE_45, tokenStream, stopSet);
        StringBuilder sb = new StringBuilder();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            sb.append(term + " ");
        }
        tokenStream.end();   // consumer contract: end() then close() when done with the stream
        tokenStream.close();
        return sb.toString();

    } catch (Exception e) {
        return emptyString;
    }
}
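
Two details worth noting in this UDF: the Version.LUCENE_45 constants pin the tokenizer and stop-filter behavior to a specific release (analyzer and filter constructors dropped these Version arguments in Lucene 5), and the stop set is built lazily on the first call, so the stop-words file is read only once per UDF instance.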

From source file:org.betaconceptframework.astroboa.model.impl.query.xpath.XPathUtils.java

License:Open Source License

private static String analyzeTextToFind(String textToFind) throws IOException {
    // Filter textToFind through GreekAnalyzer
    TokenStream stream = greekAnalyzer.tokenStream("", new StringReader(textToFind));
    stream.reset();

    StringBuilder analyzedTextTofind = new StringBuilder();

    try {
        while (stream.incrementToken()) {

            String term = stream.getAttribute(TermAttribute.class).term();

            analyzedTextTofind.append(term);
            analyzedTextTofind.append(" ");

        }
    } catch (IOException e) {
        e.printStackTrace();

        analyzedTextTofind.append(textToFind);
    } finally {
        stream.end();
        stream.close();

    }

    String result = analyzedTextTofind.toString().trim();

    if (StringUtils.isBlank(result))
        return textToFind;

    return result;

}
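
This example predates Lucene 4.0: TermAttribute and its term() accessor were deprecated in 3.1 and removed in 4.0 in favor of CharTermAttribute. The end()-then-close() sequence in the finally block, though, is exactly the workflow the TokenStream javadoc prescribes.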

From source file:org.bibsonomy.lucene.search.LuceneResourceSearch.java

License:Open Source License

/** 
 * analyzes the given input parameter
 * 
 * @param fieldName the name of the field
 * @param param the value of the field
 * @return the analyzed string
 * @throws IOException
 */
protected String parseToken(final String fieldName, final String param) throws IOException {
    if (present(param)) {
        // use lucene's new token stream api (see org.apache.lucene.analysis' javadoc at package level)
        final TokenStream ts = this.getAnalyzer().tokenStream(fieldName, new StringReader(param));
        final TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        ts.reset();

        // analyze the parameter - that is: concatenate its normalized tokens
        final StringBuilder analyzedString = new StringBuilder();
        while (ts.incrementToken()) {
            analyzedString.append(" ").append(termAtt.term());
        }

        return analyzedString.toString().trim();
    }

    return "";
}
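
For comparison, here is a hedged sketch of the same method against the post-4.0 attribute API (assuming the surrounding class still provides getAnalyzer(); the present() helper is replaced by a plain null/empty check). Only the attribute class changes; the reset()-before-incrementToken() discipline is identical, and try-with-resources supplies the close() that the original omits:

protected String parseToken(final String fieldName, final String param) throws IOException {
    if (param == null || param.isEmpty()) {
        return "";
    }
    final StringBuilder analyzedString = new StringBuilder();
    // TokenStream is Closeable, so try-with-resources guarantees the close()
    try (TokenStream ts = this.getAnalyzer().tokenStream(fieldName, param)) {
        final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            analyzedString.append(' ').append(termAtt); // CharTermAttribute is a CharSequence
        }
        ts.end();
    }
    return analyzedString.toString().trim();
}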

From source file:org.chombo.util.BasicUtils.java

License:Apache License

/**
 * Analyzes text with the given analyzer and returns the analyzed text
 * @param text
 * @param analyzer
 * @return
 * @throws IOException
 */
public static String analyze(String text, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    StringBuilder stBld = new StringBuilder();

    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        String token = termAttribute.toString();
        stBld.append(token).append(" ");
    }
    stream.end();
    stream.close();
    return stBld.toString();
}
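
A possible call site for the helper above (SimpleAnalyzer is just a convenient stand-in; the no-argument constructor assumes Lucene 5+):

try (Analyzer analyzer = new SimpleAnalyzer()) {   // lower-cases and splits on non-letters
    String normalized = BasicUtils.analyze("The Quick BROWN fox!", analyzer);
    // normalized is "the quick brown fox " -- note the trailing space from the append loop
}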

From source file:org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.java

License:Apache License

/** NOTE: this method closes the TokenStream, even on exception, which is awkward
 *  because really the caller who called {@link Analyzer#tokenStream} should close it,
 *  but when trying that there are recursion issues when we try to use the same
 *  TokenStream twice in the same recursion... */
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
    int numTokens = 0;
    boolean success = false;
    try {
        stream.reset();
        consumer.reset(stream);
        while (stream.incrementToken()) {
            consumer.nextToken();
            numTokens++;
        }
        consumer.end();
        success = true;
    } finally {
        if (success) {
            stream.close();
        } else {
            IOUtils.closeWhileHandlingException(stream);
        }
    }
    return numTokens;
}
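
The success flag is the point of this example: on a normal run the stream is closed directly, but if reset(), incrementToken(), or the consumer throws, IOUtils.closeWhileHandlingException(stream) still closes it while suppressing any secondary exception from close() itself, so the original failure is the one that propagates.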

From source file:org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizerTest.java

License:Apache License

@Test
public void testNullSynonyms() throws Exception {
    Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1);
    TokenStream stream = a.tokenStream("f", new StringReader("あいうえおか"));
    stream.reset();
    assertTokenStream(stream, "あ,0,1,1/い,1,2,1/う,2,3,1/え,3,4,1/お,4,5,1/か,5,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(2);
    stream = a.tokenStream("f", new StringReader("あいうえおか"));
    stream.reset();
    assertTokenStream(stream, "あい,0,2,1/いう,1,3,1/うえ,2,4,1/えお,3,5,1/おか,4,6,1");
    stream.close();
    stream = a.tokenStream("f", new StringReader("あ"));
    stream.reset();
    assertTokenStream(stream, "あ,0,1,1");
    stream.close();
    stream = a.tokenStream("f", new StringReader("あい"));
    stream.reset();
    assertTokenStream(stream, "あい,0,2,1");

    a = new NGramSynonymTokenizerTestAnalyzer(3);
    stream = a.tokenStream("f", new StringReader("あいうえおか"));
    stream.reset();
    assertTokenStream(stream, "あいう,0,3,1/いうえ,1,4,1/うえお,2,5,1/えおか,3,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(4);
    stream = a.tokenStream("f", new StringReader("あいうえおか"));
    stream.reset();
    assertTokenStream(stream, "あいうえ,0,4,1/いうえお,1,5,1/うえおか,2,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(5);
    stream = a.tokenStream("f", new StringReader("あいうえおか"));
    stream.reset();
    assertTokenStream(stream, "あいうえお,0,5,1/いうえおか,1,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(6);
    stream = a.tokenStream("f", new StringReader("あいうえおか"));
    stream.reset();
    assertTokenStream(stream, "あいうえおか,0,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(7);
    stream = a.tokenStream("f", new StringReader("あいうえおか"));
    stream.reset();
    assertTokenStream(stream, "あいうえおか,0,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(8);
    stream = a.tokenStream("f", new StringReader("あいうえおか"));
    stream.reset();
    assertTokenStream(stream, "あいうえおか,0,6,1");
}

From source file:org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizerTest.java

License:Apache License

@Test
public void testSingleSynonymIgnoreCase() throws Exception {
    Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, false, "A,AA,AAA");
    TokenStream stream = a.tokenStream("f", new StringReader("aaa"));
    stream.reset();
    assertTokenStream(stream, "aaa,0,3,1");
}