Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value.
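
For orientation, here is a minimal, self-contained sketch of the call pattern (assumes a recent Lucene version where StandardAnalyzer needs no Version argument and TokenStream is AutoCloseable; the class name GetAttributeSketch is illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("field", new StringReader("Hello Lucene"))) {
            // getAttribute returns the CharTermAttribute instance registered by the
            // tokenizer; it throws IllegalArgumentException if the attribute is absent.
            CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end();
        }
    }
}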

Usage

From source file:org.betaconceptframework.astroboa.model.impl.query.xpath.XPathUtils.java

License:Open Source License

private static String analyzeTextToFind(String textToFind) throws IOException {
    // Filter textToFind through GreekAnalyzer
    TokenStream stream = greekAnalyzer.tokenStream("", new StringReader(textToFind));
    stream.reset();

    StringBuilder analyzedTextTofind = new StringBuilder();

    try {
        while (stream.incrementToken()) {

            String term = stream.getAttribute(TermAttribute.class).term();

            analyzedTextTofind.append(term);
            analyzedTextTofind.append(" ");

        }
    } catch (IOException e) {
        e.printStackTrace();

        analyzedTextTofind.append(textToFind);
    } finally {
        stream.end();
        stream.close();

    }

    String result = analyzedTextTofind.toString().trim();

    if (StringUtils.isBlank(result))
        return textToFind;

    return result;

}

From source file:org.bitsofinfo.util.address.usps.ais.index.USPSRecordAnalyzer.java

License:Apache License

/**
* Filters a string or word through the same filters used when a document is indexed.
*
* @param      words   input text to filter
* @return     the analyzed terms, joined by single spaces
*/
public String filter(String words) {
    StringReader reader = new StringReader(words);
    TokenStream stream = tokenStream(null, reader);
    StringBuffer sb = new StringBuffer();

    try {
        stream.reset(); // reset the stream before consuming it
        while (stream.incrementToken()) {
            sb.append(stream.getAttribute(TermAttribute.class).term());
            sb.append(" ");
        }
    } catch (Exception e) {
        System.out.println("Error in USPSRecordAnalyzer filter(): " + e);
    }

    return sb.toString().trim();
}

From source file:org.chililog.server.common.TextTokenizer.java

License:Apache License

/**
 * <p>
 * Tokenizes text to get keywords
 * </p>
 * <p>
 * We use lucene <code>StandardAnalyzer</code> with a bit of spice. We want to break up domain names, class names
 * and emails so we have to do some extra parsing.
 * </p>
 * <p>
 * Lucene parsing:
 * <ul>
 * <li>"email@address.com" = ["email@address", "com"]</li>
 * <li>"com.chililog.server.common.ChiliLogExceptionTest" = ["com.chililog.server.common", "chililogexceptiontest"]</li>
 * </ul>
 * </p>
 * <p>
 * We have not used regular expression because it is slow. We have implemented this as a singleton so that in the
 * future we can allow user customization.
 * </p>
 * 
 * @param text
 *            Text to extract keywords
 * @param maxKeywords
 *            Maximum number of keywords to extract. If < 0, then no limit will be used.
 * @return Array of keywords
 * @throws IOException
 */
public ArrayList<String> tokenize(String text, long maxKeywords) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();

    if (StringUtils.isEmpty(text) || maxKeywords == 0) {
        return tokens;
    }

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    HashMap<String, String> lookup = new HashMap<String, String>();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    StringBuilder sb = new StringBuilder();
    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
    stream.reset(); // reset the stream before consuming it
    while (stream.incrementToken()) {
        char[] termBuffer = termAttribute.termBuffer();
        int length = termAttribute.termLength();

        boolean doSplit = true;

        // Check if we want to split
        if (Character.isDigit(termBuffer[0])) {
            doSplit = false;
        } else {
            for (int j = 0; j < length; j++) {
                char c = termBuffer[j];
                if (!Character.isLetterOrDigit(c) && c != '.' && c != '@') {
                    doSplit = false;
                    break;
                }
            }
        }

        if (doSplit) {
            sb.setLength(0);
            for (int i = 0; i < length; i++) {
                char c = termBuffer[i];
                if (c == '.' || c == '@') {
                    if (!addToken(tokens, lookup, sb.toString(), maxKeywords)) {
                        return tokens;
                    }
                    sb.setLength(0);
                } else {
                    sb.append(c);
                }
            }

            // Add last part
            if (!addToken(tokens, lookup, sb.toString(), maxKeywords)) {
                return tokens;
            }
        } else {
            // No splitting, just add term
            if (!addToken(tokens, lookup, termAttribute.term(), maxKeywords)) {
                return tokens;
            }
        }
    }

    return tokens;
}
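
A hedged usage sketch (the expected keywords follow from the splitting rules in the javadoc above; the singleton accessor getInstance() is an assumption, as this excerpt does not show it):

    TextTokenizer tokenizer = TextTokenizer.getInstance(); // hypothetical singleton accessor
    // Per the rules above, "email@address.com" is split on '.' and '@',
    // yielding the keywords: email, address, com
    ArrayList<String> keywords = tokenizer.tokenize("email@address.com", -1); // -1 = no limit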

From source file:org.chililog.server.common.TextTokenizerTest.java

License:Apache License

/**
 * Used for benchmarking: basic tokenizing without regular expressions.
 *
 * @param text text to tokenize
 * @return list of unique terms
 * @throws IOException
 */
public List<String> basicTokenize(String text) throws IOException {
    List<String> tokens = new ArrayList<String>();

    if (StringUtils.isEmpty(text)) {
        return tokens;
    }

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    HashMap<String, String> lookup = new HashMap<String, String>();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
    stream.reset(); // reset the stream before consuming it
    while (stream.incrementToken()) {
        String term = termAttribute.term();
        if (!lookup.containsKey(term)) {
            tokens.add(term);
            lookup.put(term, null);
        }
    }

    return tokens;
}

From source file:org.chombo.util.BasicUtils.java

License:Apache License

/**
 * Tokenizes text with the given analyzer.
 *
 * @param text text to tokenize
 * @param analyzer Lucene analyzer to apply
 * @return list of tokens
 * @throws IOException
 */
public static List<String> tokenize(String text, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    List<String> tokens = new ArrayList<String>();

    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    stream.reset(); // required before incrementToken(); see analyze() below
    while (stream.incrementToken()) {
        String token = termAttribute.toString();
        tokens.add(token);
    }
    stream.end();
    stream.close();

    return tokens;
}

From source file:org.chombo.util.BasicUtils.java

License:Apache License

/**
 * Analyzes text and returns the analyzed text.
 *
 * @param text text to analyze
 * @param analyzer Lucene analyzer to apply
 * @return analyzed text, with tokens separated by spaces
 * @throws IOException
 */
public static String analyze(String text, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    StringBuilder stBld = new StringBuilder();

    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        String token = termAttribute.toString();
        stBld.append(token).append(" ");
    }
    stream.end();
    stream.close();
    return stBld.toString();
}

From source file:org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizerTest.java

License:Apache License

private void assertTokenStream(TokenStream stream, String expectedStream) throws Exception {

    String[] expectedTokens = expectedStream.split("/");
    int count = 0;
    for (String expectedToken : expectedTokens) {
        String[] attrs = expectedToken.split(",");
        assertTrue(stream.incrementToken());

        String term = attrs[0];
        assertAttribute(count, "term", term, stream.getAttribute(CharTermAttribute.class).toString());

        if (attrs.length > 1) {
            int so = Integer.parseInt(attrs[1]);
            assertAttribute(count, "startOffset", so, stream.getAttribute(OffsetAttribute.class).startOffset());

            if (attrs.length > 2) {
                int eo = Integer.parseInt(attrs[2]);
                assertAttribute(count, "endOffset", eo, stream.getAttribute(OffsetAttribute.class).endOffset());

                if (attrs.length > 3) {
                    int pi = Integer.parseInt(attrs[3]);
                    assertAttribute(count, "posInc", pi,
                            stream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
                }
            }
        }
        count++;
    }
    assertFalse(stream.incrementToken());
}
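
Each expected token in the slash-delimited string packs up to four comma-separated attributes: term, startOffset, endOffset, and positionIncrement. A hedged example call (token values are illustrative):

    // asserts two tokens: "ab" at offsets 0-2 and "bc" at offsets 1-3,
    // with a position increment of 1 on the second token
    assertTokenStream(stream, "ab,0,2/bc,1,3,1");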

From source file:org.cosmo.common.util.WordUtil.java

License:Apache License

public static void main(String[] args) throws Exception {

    StringReader reader = new StringReader(
            "CNN, CNN news, CNN.com, CNN TV, news, news online, breaking news, U.S. news, world news, weather, business, CNN Money, sports, politics, law, technology, entertainment, education, travel, health, special reports, autos, developing story, news video, CNN Intl");
    /*
    LetterTokenizer tokenizer = new LetterTokenizer(reader);
    AttributeSource filter = new StopFilter(true, tokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
            
    while (filter.hasAttributes()) {
       Attribute attribute = filter.captureState().
       System.out.println(attribute);
    }
    */
    StopAnalyzer analyzer = new StopAnalyzer(Index.Version);
    Set<String> uniqueTerms = new HashSet<String>();
    TokenStream tokenStream = analyzer.reusableTokenStream("anyting", reader);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        TermAttribute term = tokenStream.getAttribute(TermAttribute.class);
        uniqueTerms.add(term.term());
    }
    tokenStream.end();
    tokenStream.close();

    System.out.println(Arrays.toString(uniqueTerms.toArray()));

}

From source file:org.easynet.resource.queryparser.QueryParserBase.java

License:Apache License

protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    if (analyzerIn == null)
        analyzerIn = getAnalyzer();

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
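
A hedged illustration of the contract, as it might appear inside a QueryParserBase subclass (SimpleAnalyzer lowercases and must produce exactly one token here; the Version constant is an assumption for a Lucene 4.x classpath):

    // normalizes the constant part of a multi-term (e.g. wildcard) query
    BytesRef bytes = analyzeMultitermTerm("title", "FOO", new SimpleAnalyzer(Version.LUCENE_47));
    // bytes now holds "foo"; input that analyzes to zero or to multiple
    // tokens (e.g. "foo bar") triggers an IllegalArgumentException instead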

From source file:org.eclipse.help.internal.search.QueryBuilder.java

License:Open Source License

/**
 * Get a list of tokens corresponding to a search word or phrase
 *
 * @return List of String
 */
private List<String> analyzeText(Analyzer analyzer, String fieldName, String text) {
    List<String> words = new ArrayList<String>(1);
    Reader reader = new StringReader(text);
    TokenStream tStream = analyzer.tokenStream(fieldName, reader);

    CharTermAttribute termAttribute = tStream.getAttribute(CharTermAttribute.class);
    try {
        tStream.reset(); // reset the stream before consuming it
        while (tStream.incrementToken()) {
            String term = termAttribute.toString();
            words.add(term);
        }
        reader.close();
    } catch (IOException ioe) {
        // ignore: return whatever terms were collected before the failure
    }

    return words;
}