List of usage examples for org.apache.lucene.analysis.TokenStream#addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
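addAttribute returns the stream's single shared instance of the requested attribute class, creating and registering it on first use; calling it again with the same class returns the same instance, whose state is updated in place by each incrementToken(). A minimal, self-contained sketch of the usual consumption pattern (assuming a recent Lucene release, 5.x or later, where StandardAnalyzer has a no-argument constructor; the field name and sample text are placeholders):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AddAttributeExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("field", new StringReader("Hello token streams"));
            // addAttribute registers the attribute with the stream (if absent)
            // and returns the shared instance that incrementToken() updates.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            try {
                stream.reset(); // required before the first incrementToken()
                while (stream.incrementToken()) {
                    System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
                }
                stream.end(); // records end-of-stream state such as the final offset
            } finally {
                stream.close(); // releases resources; mandatory before the stream can be reused
            }
        }
    }
}

Several of the examples below were written against Lucene 3.x, where TermAttribute.term() played the role of CharTermAttribute.toString() and calling reset() before the first incrementToken() was not yet enforced; both points changed in Lucene 4.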
From source file:org.splevo.vpm.analyzer.semantic.lucene.LuceneCodeAnalyzer.java
License:Open Source License
/**
 * Stem a list of words with a configured stemmer.
 *
 * @param words
 *            The list of words to stem.
 * @param stemming
 *            The stemmer to be used.
 * @return The stemmed list of words.
 */
@SuppressWarnings("resource")
public static String[] stemWords(String[] words, Stemming stemming) {
    Set<String> stemmedStopWords = Sets.newHashSet();
    for (String word : words) {
        TokenStream tokenStream = new StandardTokenizer(LUCENE_VERSION, new StringReader(word));
        tokenStream = Stemming.wrapStemmingFilter(tokenStream, stemming);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                stemmedStopWords.add(term);
            }
        } catch (IOException e) {
            logger.error("Failed to stem a list of words", e);
        }
    }
    return stemmedStopWords.toArray(new String[] {});
}
From source file:org.talend.dataquality.standardization.index.SynonymIndexSearcher.java
License:Open Source License
private List<String> getTokensFromAnalyzer(String input) throws IOException {
    StandardTokenizer tokenStream = new StandardTokenizer(Version.LUCENE_30, new StringReader(input));
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    TermAttribute termAttribute = result.addAttribute(TermAttribute.class);
    List<String> termList = new ArrayList<String>();
    while (result.incrementToken()) {
        String term = termAttribute.term();
        termList.add(term);
    }
    return termList;
}
From source file:org.thiesen.jiffs.jobs.preprocessor.Preprocessor.java
License:Open Source License
private void preprocess(StoryDBO story, Analyzer analyzer) {
    final String cleanedText = new HtmlStripper().stripHtml(story.getFullText());
    try {
        final TokenStream tokenStream = analyzer.reusableTokenStream("dummy", new StringReader(cleanedText));
        final TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
        final Collection<String> tokens = Sets.newHashSet();
        while (tokenStream.incrementToken()) {
            final String token = termAtt.term();
            if (StringUtils.isNotBlank(token)) {
                tokens.add(token);
            }
        }
        final String tokenString = Joiner.on(',').join(tokens);
        story.setPreprocessedText(tokenString);
        _storyDAO.update(story);
    } catch (IOException e) {
        throw new RuntimeException("IOException on in memory operation", e);
    }
}
From source file:org.tightblog.service.indexer.AbstractTask.java
License:Apache License
/**
 * Create a Lucene term from the first token of the input string.
 *
 * @param field The Lucene document field to create a term with
 * @param input The input you wish to convert into a term
 * @return Lucene search term
 */
Term getTerm(String field, String input) {
    Term term = null;
    if (input != null && field != null) {
        try (Analyzer analyzer = manager.getAnalyzer()) {
            if (analyzer != null) {
                try {
                    TokenStream tokens = analyzer.tokenStream(field, new StringReader(input));
                    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
                    tokens.reset();
                    if (tokens.incrementToken()) {
                        String termt = termAtt.toString();
                        term = new Term(field, termt);
                    }
                } catch (IOException e) {
                    // ignored
                }
            }
        }
    }
    return term;
}
From source file:org.watermint.sourcecolon.org.opensolaris.opengrok.search.Summarizer.java
License:Apache License
private Token[] getTokens(String text) throws IOException {
    // FIXME somehow integrate the cycle below into getSummary to save the cloning and memory;
    // also, creating Tokens is suboptimal with 3.0.0, and this whole class could be replaced by the highlighter
    ArrayList<Token> result = new ArrayList<>();
    TokenStream ts = analyzer.tokenStream("full", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
        Token t = new Token(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
        result.add(t);
    }
    return result.toArray(new Token[result.size()]);
}
From source file:org.weborganic.flint.util.Fields.java
License:Artistic License 2.0
/**
 * Returns the terms for a field.
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 */
public static List<String> toTerms(String field, String text, Analyzer analyzer) {
    StringReader r = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(field, r);
    PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    List<String> terms = new ArrayList<String>();
    try {
        stream.reset();
        while (stream.incrementToken()) {
            String term = attribute.term();
            terms.add(term);
            // TODO Use increment for the phrase query
            // System.err.println(term + ":" + increment.getPositionIncrement());
        }
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
    return terms;
}
From source file:org.weborganic.flint.util.Queries.java
License:Artistic License 2.0
/**
 * Adds the terms produced by analyzing the given text to a phrase query.
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 * @param phrase   The phrase query the terms are added to
 */
private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery phrase) {
    StringReader r = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(field, r);
    PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    try {
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            // Respect position increments so that gaps (e.g. removed stop words) are preserved
            position += increment.getPositionIncrement();
            Term term = new Term(field, attribute.term());
            phrase.add(term, position);
        }
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
}
From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerCase.java
License:Apache License
public static ArrayList<String> getTopicWord(String str) {
    // Construct an IKAnalyzer in smart mode
    Analyzer analyzer = new IKAnalyzer(true);
    ArrayList<String> retData = new ArrayList<String>();
    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(str));
        // Offset attribute: start/end position of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream before consuming it
        ts.reset();
        // Iterate over all tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
            // Keep tokens longer than one character, plus purely numeric tokens
            if (term.toString().length() > 1 || term.toString().matches("^[0-9]*$")) {
                retData.add(term.toString());
            }
        }
        // Perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream to release resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return retData;
}
From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Construct an IKAnalyzer in smart mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader("???"));
        // Offset attribute: start/end position of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream before consuming it
        ts.reset();
        // Iterate over all tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream to release resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:org.wltea.analyzer.ik_analyzer5.IKAnalzyerTest.java
License:Apache License
@Test
public void testIK() {
    String text = "???";
    // Construct an IKAnalyzer in smart mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(text));
        // Offset attribute: start/end position of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream before consuming it
        ts.reset();
        // Iterate over all tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream to release resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        analyzer.close();
    }
}