Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

This page lists example usages of org.apache.lucene.analysis.TokenStream#addAttribute, collected from open-source projects.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value. This method first checks if an instance of the passed-in attribute class is already present in the AttributeSource and, if so, returns it; otherwise a new instance is created, added, and returned.
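Before the project examples below, here is a minimal, self-contained sketch of the usual addAttribute workflow, assuming a recent Lucene release (5.x or later) on the classpath; the field name "body" and the sample text are illustrative only. The attribute is registered before the stream is consumed, and the reset/incrementToken/end/close contract is followed:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("body", "Some sample text")) {
            // Register (or look up) the attribute BEFORE consuming the stream.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // text of the current token
            }
            stream.end(); // records end-of-stream state, e.g. the final offset
        } // try-with-resources closes the stream and the analyzer
    }
}

Because addAttribute returns the single per-stream instance of each attribute, the same CharTermAttribute object is updated in place on every incrementToken() call; callers must copy its contents (for example via toString()) if they need to keep a term.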

Usage

From source file:org.splevo.vpm.analyzer.semantic.lucene.LuceneCodeAnalyzer.java

License:Open Source License

/**
 * Stem a list of words with a configured stemmer.
 *
 * @param words
 *            The list of words to stem.
 * @param stemming
 *            The stemmer to be used.
 * @return The stemmed list of words.
 */
@SuppressWarnings("resource")
public static String[] stemWords(String[] words, Stemming stemming) {
    Set<String> stemmedStopWords = Sets.newHashSet();

    for (String word : words) {
        TokenStream tokenStream = new StandardTokenizer(LUCENE_VERSION, new StringReader(word));
        tokenStream = Stemming.wrapStemmingFilter(tokenStream, stemming);

        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                stemmedStopWords.add(term);
            }
        } catch (IOException e) {
            logger.error("Failed to stem a list of words", e);
        }
    }
    return stemmedStopWords.toArray(new String[] {});
}

From source file:org.talend.dataquality.standardization.index.SynonymIndexSearcher.java

License:Open Source License

private List<String> getTokensFromAnalyzer(String input) throws IOException {
    StandardTokenizer tokenStream = new StandardTokenizer(Version.LUCENE_30, new StringReader(input));
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    TermAttribute termAttribute = result.addAttribute(TermAttribute.class);

    List<String> termList = new ArrayList<String>();
    while (result.incrementToken()) {
        String term = termAttribute.term();
        termList.add(term);
    }
    return termList;
}
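Note that TermAttribute, used above, is the pre-Lucene-4 term API: it was deprecated in 3.1 and removed in 4.0 in favor of CharTermAttribute. On Lucene 4+ the equivalent lookup would be (a sketch, reusing the names above):

CharTermAttribute termAttribute = result.addAttribute(CharTermAttribute.class);
String term = termAttribute.toString(); // replaces termAttribute.term()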

From source file:org.thiesen.jiffs.jobs.preprocessor.Preprocessor.java

License:Open Source License

private void preprocess(StoryDBO story, Analyzer analyzer) {
    final String cleanedText = new HtmlStripper().stripHtml(story.getFullText());

    try {
        final TokenStream tokenStream = analyzer.reusableTokenStream("dummy", new StringReader(cleanedText));
        final TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);

        final Collection<String> tokens = Sets.newHashSet();
        while (tokenStream.incrementToken()) {
            final String token = termAtt.term();

            if (StringUtils.isNotBlank(token)) {
                tokens.add(token);
            }
        }

        final String tokenString = Joiner.on(',').join(tokens);

        story.setPreprocessedText(tokenString);

        _storyDAO.update(story);
    } catch (IOException e) {
        throw new RuntimeException("IOException during in-memory operation", e);
    }
}
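As with the previous example, this relies on a pre-Lucene-4 API: Analyzer.reusableTokenStream was removed in Lucene 4.0, where tokenStream itself handles stream reuse internally (via a ReuseStrategy), so on newer versions the plain tokenStream call shown in the other examples is the equivalent.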

From source file:org.tightblog.service.indexer.AbstractTask.java

License:Apache License

/**
 * Create a lucene term from the first token of the input string.
 *
 * @param field The lucene document field to create a term with
 * @param input The input you wish to convert into a term
 * @return Lucene search term
 */
Term getTerm(String field, String input) {
    Term term = null;

    if (input != null && field != null) {
        try (Analyzer analyzer = manager.getAnalyzer()) {
            if (analyzer != null) {
                try {
                    TokenStream tokens = analyzer.tokenStream(field, new StringReader(input));
                    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
                    tokens.reset();

                    if (tokens.incrementToken()) {
                        term = new Term(field, termAtt.toString());
                    }
                    tokens.end();
                    tokens.close(); // release the stream so the analyzer can be reused
                } catch (IOException e) {
                    // ignored
                }
            }
        }
    }
    return term;
}

From source file:org.watermint.sourcecolon.org.opensolaris.opengrok.search.Summarizer.java

License:Apache License

private Token[] getTokens(String text) throws IOException {
    //FIXME: integrate the cycle below into getSummary to save the cloning and memory;
    //creating Tokens is also suboptimal with 3.0.0 - this whole class could be replaced by a highlighter
    ArrayList<Token> result = new ArrayList<>();
    TokenStream ts = analyzer.tokenStream("full", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    ts.reset(); // required before consuming the stream
    while (ts.incrementToken()) {
        Token t = new Token(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
        result.add(t);
    }
    return result.toArray(new Token[result.size()]);
}

From source file:org.weborganic.flint.util.Fields.java

License:artistic-license-2.0

/**
 * Returns the terms for a field.
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 */
public static List<String> toTerms(String field, String text, Analyzer analyzer) {
    StringReader r = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(field, r);
    PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    List<String> terms = new ArrayList<String>();
    try {
        stream.reset();
        while (stream.incrementToken()) {
            String term = attribute.term();
            terms.add(term);
            // TODO Use increment for the phrase query
            //        System.err.println(term+":"+increment.getPositionIncrement());
        }
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
    return terms;
}

From source file:org.weborganic.flint.util.Queries.java

License:artistic-license-2.0

/**
 * Adds the terms for the given text to the phrase query.
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 * @param phrase   The phrase query to add the terms to
 */
private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery phrase) {
    StringReader r = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(field, r);
    PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    try {
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += increment.getPositionIncrement();
            Term term = new Term(field, attribute.term());
            phrase.add(term, position);
        }
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
}
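Unlike toTerms above, this method actually consumes the PositionIncrementAttribute, so any gaps the analyzer introduces (for example, where stopwords were removed) are preserved as position gaps in the resulting PhraseQuery.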

From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerCase.java

License:Apache License

public static ArrayList<String> getTopicWord(String str) {
     // Build the IK analyzer in smart segmentation mode
     Analyzer analyzer = new IKAnalyzer(true);
     ArrayList<String> retData = new ArrayList<String>();
     // Obtain a Lucene TokenStream object
     TokenStream ts = null;
     try {
         ts = analyzer.tokenStream("myfield", new StringReader(str));
         // Token offset attribute
         OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
         // Token term text attribute
         CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
         // Token type attribute
         TypeAttribute type = ts.addAttribute(TypeAttribute.class);

         // Reset the TokenStream (resets the underlying StringReader)
         ts.reset();
         // Iterate over the tokens
         while (ts.incrementToken()) {
             System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                     + " | " + type.type());
             if (term.toString().length() > 1 || term.toString().matches("^[0-9]*$")) {
                 retData.add(term.toString());
             }
         }
         ts.end(); // perform end-of-stream operations, e.g. set the final offset

     } catch (IOException e) {
         e.printStackTrace();
     } finally {
         // Release the TokenStream's resources
         if (ts != null) {
             try {
                 ts.close();
             } catch (IOException e) {
                 e.printStackTrace();
             }
         }
     }
     return retData;
 }

From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerDemo.java

License:Apache License

public static void main(String[] args) {
    // Build the IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    // Obtain a Lucene TokenStream object
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader("???")); // original Chinese sample text, garbled in this copy
        // Token offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Token term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        ts.end(); // perform end-of-stream operations, e.g. set the final offset

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the TokenStream's resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

From source file:org.wltea.analyzer.ik_analyzer5.IKAnalzyerTest.java

License:Apache License

@Test
public void testIK() {

    String text = "???"; // original Chinese sample text, garbled in this copy

    // Build the IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    // Obtain a Lucene TokenStream object
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(text));
        // Token offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Token term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        ts.end(); // perform end-of-stream operations, e.g. set the final offset

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the TokenStream's resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        analyzer.close();
    }

}