Example usage for org.apache.lucene.analysis TokenStream incrementToken


Introduction

On this page you can find example usage for the incrementToken method of org.apache.lucene.analysis.TokenStream.

Prototype

public abstract boolean incrementToken() throws IOException;


Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
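
In recent Lucene versions the stream must be reset() before the first call to incrementToken(), and end() and close() should be called once the stream is exhausted. A minimal, self-contained consumption loop as a sketch; the field name, analyzer choice and sample text are arbitrary illustrations, not taken from the examples below:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("field", "Hello token streams")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // required before the first incrementToken()
            while (stream.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(term.toString());
            }
            stream.end();                     // records the final token state
        }                                     // close() handled by try-with-resources
    }
}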

Usage

From source file: org.bitsofinfo.util.address.usps.ais.index.USPSRecordAnalyzer.java

License: Apache License

/**
 * Filters a string or word through the same filters applied when a document is indexed.
 *
 * @param words the string to filter
 * @return the analyzed words, separated by spaces
 */
public String filter(String words) {
    StringReader reader = new StringReader(words);
    TokenStream stream = tokenStream(null, reader);
    StringBuffer sb = new StringBuffer();

    try {
        while (stream.incrementToken()) {
            sb.append(stream.getAttribute(TermAttribute.class).term());
            sb.append(" ");
        }
    } catch (Exception e) {
        System.out.println("Error in MrmAnalyzer filter(): " + e);
    }

    return sb.toString().trim();
}
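
This example targets the Lucene 3.x attribute API (TermAttribute), which was later replaced by CharTermAttribute; newer versions also require reset() before the first incrementToken(). A hedged sketch of the same filter against a more recent Lucene API, assuming as in the original that the method lives inside an Analyzer subclass:

public String filter(String words) {
    StringBuilder sb = new StringBuilder();
    try (TokenStream stream = tokenStream(null, new StringReader(words))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                   // required before the first incrementToken()
        while (stream.incrementToken()) {
            sb.append(term.toString()).append(' ');
        }
        stream.end();
    } catch (IOException e) {
        // error handling is left to the surrounding application
    }
    return sb.toString().trim();
}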

From source file: org.chililog.server.common.TextTokenizer.java

License: Apache License

/**
 * <p>//  ww w  .  j  av a2s .c o m
 * Tokenizes text to get keywords
 * </p>
 * <p>
 * We use lucene <code>StandardAnalyzer</code> with a bit of spice. We want to break up domain names, class names
 * and emails so we have to do some extra parsing.
 * </p>
 * <p>
 * Lucene parsing:
 * <ul>
 * <li>"email@address.com" = ["email@address", "com"]</li>
 * <li>"com.chililog.server.common.ChiliLogExceptionTest" = ["com.chililog.server.common", "chililogexceptiontest"]</li>
 * </ul>
 * </p>
 * <p>
 * We have not used regular expression because it is slow. We have implemented this as a singleton so that in the
 * future we can allow user customization.
 * </p>
 * 
 * @param text
 *            Text to extract keywords
 * @param maxKeywords
 *            Maximum number of keywords to extract. If < 0, then no limit will be used.
 * @return Array of keywords
 * @throws IOException
 */
public ArrayList<String> tokenize(String text, long maxKeywords) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();

    if (StringUtils.isEmpty(text) || maxKeywords == 0) {
        return tokens;
    }

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    HashMap<String, String> lookup = new HashMap<String, String>();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    StringBuilder sb = new StringBuilder();
    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        char[] termBuffer = termAttribute.termBuffer();
        int length = termAttribute.termLength();

        boolean doSplit = true;

        // Check if we want to split
        if (Character.isDigit(termBuffer[0])) {
            doSplit = false;
        } else {
            for (int j = 0; j < length; j++) {
                char c = termBuffer[j];
                if (!Character.isLetterOrDigit(c) && c != '.' && c != '@') {
                    doSplit = false;
                    break;
                }
            }
        }

        if (doSplit) {
            sb.setLength(0);
            for (int i = 0; i < length; i++) {
                char c = termBuffer[i];
                if (c == '.' || c == '@') {
                    if (!addToken(tokens, lookup, sb.toString(), maxKeywords)) {
                        return tokens;
                    }
                    sb.setLength(0);
                } else {
                    sb.append(c);
                }
            }

            // Add last part
            if (!addToken(tokens, lookup, sb.toString(), maxKeywords)) {
                return tokens;
            }
        } else {
            // No splitting, just add term
            if (!addToken(tokens, lookup, termAttribute.term(), maxKeywords)) {
                return tokens;
            }
        }
    }

    return tokens;
}
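
A hypothetical call, assuming the singleton described above is reachable through an accessor; the accessor name getInstance() and the sample text are illustrative, not taken from the source:

ArrayList<String> keywords = TextTokenizer.getInstance().tokenize(
        "com.chililog.server.common.ChiliLogExceptionTest email@address.com", 20);
// Per the splitting rules above, the dotted package name and the email address are
// broken into individual keywords such as "com", "chililog", "email" and "address".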

From source file: org.chililog.server.common.TextTokenizerTest.java

License: Apache License

/**
 * Used for benchmarking ... basic tokenizing without regular expression
 *
 * @param text
 * @return
 * @throws IOException
 */
public List<String> basicTokenize(String text) throws IOException {
    List<String> tokens = new ArrayList<String>();

    if (StringUtils.isEmpty(text)) {
        return tokens;
    }

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    HashMap<String, String> lookup = new HashMap<String, String>();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        String term = termAttribute.term();
        if (!lookup.containsKey(term)) {
            tokens.add(term);
            lookup.put(term, null);
        }
    }

    return tokens;
}

From source file: org.chombo.util.BasicUtils.java

License: Apache License

/**
 * @param text
 * @param analyzer
 * @return
 * @throws IOException
 */
public static List<String> tokenize(String text, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    List<String> tokens = new ArrayList<String>();

    stream.reset(); // required before the first incrementToken() in recent Lucene versions
    CharTermAttribute termAttribute = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        String token = termAttribute.toString();
        tokens.add(token);
    }
    stream.end();
    stream.close();

    return tokens;
}

From source file: org.chombo.util.BasicUtils.java

License: Apache License

/**
 * Analyzes text and return analyzed text
 * @param text
 * @return
 * @throws IOException
 */
public static String analyze(String text, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    StringBuilder stBld = new StringBuilder();

    stream.reset();
    CharTermAttribute termAttribute = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        String token = termAttribute.toString();
        stBld.append(token).append(" ");
    }
    stream.end();
    stream.close();
    return stBld.toString();
}
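
A possible call, assuming a Lucene version whose StandardAnalyzer has a no-argument constructor (the sample text is illustrative). Note that the result keeps a trailing space, because the method appends one after every token:

Analyzer analyzer = new StandardAnalyzer();
String analyzed = BasicUtils.analyze("Apache Lucene TokenStream Example", analyzer);
// e.g. "apache lucene tokenstream example " (lower-cased terms joined by spaces)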

From source file: org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.java

License: Apache License

/** NOTE: this method closes the TokenStream, even on exception, which is awkward
 *  because really the caller who called {@link Analyzer#tokenStream} should close it,
 *  but when trying that there are recursion issues when we try to use the same
 *  TokenStream twice in the same recursion... */
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
    int numTokens = 0;
    boolean success = false;
    try {
        stream.reset();
        consumer.reset(stream);
        while (stream.incrementToken()) {
            consumer.nextToken();
            numTokens++;
        }
        consumer.end();
        success = true;
    } finally {
        if (success) {
            stream.close();
        } else {
            IOUtils.closeWhileHandlingException(stream);
        }
    }
    return numTokens;
}

From source file: org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizerTest.java

License: Apache License

private void assertTokenStream(TokenStream stream, String expectedStream) throws Exception {

    String[] expectedTokens = expectedStream.split("/");
    int count = 0;
    for (String expectedToken : expectedTokens) {
        String[] attrs = expectedToken.split(",");
        assertTrue(stream.incrementToken());

        String term = attrs[0];
        assertAttribute(count, "term", term, stream.getAttribute(CharTermAttribute.class).toString());

        if (attrs.length > 1) {
            int so = Integer.parseInt(attrs[1]);
            assertAttribute(count, "startOffset", so, stream.getAttribute(OffsetAttribute.class).startOffset());

            if (attrs.length > 2) {
                int eo = Integer.parseInt(attrs[2]);
                assertAttribute(count, "endOffset", eo, stream.getAttribute(OffsetAttribute.class).endOffset());

                if (attrs.length > 3) {
                    int pi = Integer.parseInt(attrs[3]);
                    assertAttribute(count, "posInc", pi,
                            stream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
                }
            }
        }
        count++;
    }
    assertFalse(stream.incrementToken());
}
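
The expected-stream string encodes one token per "/"-separated entry, and each entry may optionally add startOffset, endOffset and position increment, separated by commas. Hypothetical calls with illustrative values:

// terms only
assertTokenStream(stream, "ab/bc/cd");
// term, startOffset, endOffset, positionIncrement for each token
assertTokenStream(stream, "ab,0,2,1/bc,1,3,1/cd,2,4,1");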

From source file: org.cosmo.common.util.WordUtil.java

License: Apache License

public static void main(String[] args) throws Exception {

    StringReader reader = new StringReader(
            "CNN, CNN news, CNN.com, CNN TV, news, news online, breaking news, U.S. news, world news, weather, business, CNN Money, sports, politics, law, technology, entertainment, education, travel, health, special reports, autos, developing story, news video, CNN Intl");
    /*
    LetterTokenizer tokenizer = new LetterTokenizer(reader);
    AttributeSource filter = new StopFilter(true, tokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
            
    while (filter.hasAttributes()) {
       Attribute attribute = filter.captureState().
       System.out.println(attribute);
    }
    */
    StopAnalyzer analyzer = new StopAnalyzer(Index.Version);
    Set<String> uniqueTerms = new HashSet();
    TokenStream tokenStream = analyzer.reusableTokenStream("anyting", reader);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        TermAttribute term = tokenStream.getAttribute(TermAttribute.class);
        uniqueTerms.add(term.term());
    }
    tokenStream.end();
    tokenStream.close();

    System.out.println(Arrays.toString(uniqueTerms.toArray()));

}

From source file: org.dbpedia.spotlight.lucene.analysis.NGramAnalyzer.java

License: Apache License

public static void main(String[] args) throws IOException {
    String myString = "cancer";
    Analyzer analyzer = new NGramAnalyzer(3, 3);
    System.out.println("Analyzing: \"" + myString + "\"");
    StringReader reader = new StringReader(myString);
    TokenStream stream = analyzer.tokenStream("field", reader);
    //        TokenStream stream = new NGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK, 1,2);
    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println("token: " + stream);
    }

    stream.end();
    stream.close();
}
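
Printing the stream object relies on its default string representation. To print just the term text, one could fetch the CharTermAttribute after reset() and use it inside the loop (a small variation, not part of the original source):

CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
while (stream.incrementToken()) {
    System.out.println("token: " + termAtt.toString());
}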

From source file: org.dbpedia.spotlight.lucene.analysis.PhoneticAnalyzer.java

License: Apache License

public static void main(String[] args) throws IOException {
    String myString = "cancer";
    Analyzer analyzer = new PhoneticAnalyzer(Version.LUCENE_36, SpotlightConfiguration.DEFAULT_STOPWORDS);
    System.out.println("Analyzing: \"" + myString + "\"");
    StringReader reader = new StringReader(myString);
    TokenStream stream = analyzer.tokenStream("field", reader);
    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println("token: " + stream);
    }

    stream.end();
    stream.close();
}