Example usage for org.apache.lucene.analysis TokenStream getAttribute

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#getAttribute from open-source projects.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed in Attribute contained in this AttributeSource. The caller must pass in a Class<? extends Attribute> value.
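Before the project examples below, here is a minimal, self-contained sketch of the call pattern (a sketch, not taken from any project on this page; it assumes Lucene 5.x or later, where StandardAnalyzer has a no-argument constructor, and the field name and sample text are placeholders). When the stream does not contain the requested attribute, getAttribute either returns null or throws IllegalArgumentException depending on the Lucene version, so guarding with hasAttribute (or using addAttribute, which registers the attribute if missing) is the defensive pattern:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("body", "hello token attributes");
        stream.reset(); // mandatory before the first incrementToken() call
        if (stream.hasAttribute(CharTermAttribute.class)) {
            // getAttribute returns the single shared instance for this stream;
            // its contents are overwritten on every incrementToken() call
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
        }
        stream.end();
        stream.close();
    }
}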

Usage

From source file: com.github.rnewson.couchdb.lucene.util.AnalyzersTest.java

License: Apache License

private String[] analyze(final String analyzerName, final String text) throws Exception {
    final Analyzer analyzer = Analyzers.getAnalyzer(analyzerName);
    final TokenStream stream = analyzer.tokenStream("default", new StringReader(text));
    stream.reset();
    final List<String> result = new ArrayList<String>();
    while (stream.incrementToken()) {
        final CharTermAttribute c = stream.getAttribute(CharTermAttribute.class);
        result.add(c.toString());
    }
    return result.toArray(new String[0]);
}

From source file: com.github.tteofili.looseen.MinHashClassifier.java

License: Apache License

private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(field, value);
    ts.reset();
    while (ts.incrementToken()) {
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}

From source file: com.globalsight.ling.tm2.lucene.LuceneUtil.java

License: Apache License

/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer.
 *
 * @param p_text fuzzy match format string
 * @return List of c.g.l.tm2.index.Tokens
 */
public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));

    tokenStream.reset();
    //GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class);
    //org.apache.lucene.analysis.Token luceneToken = null;
    List<String> tokens = new ArrayList<String>();

    while (tokenStream.incrementToken()) {
        // luceneToken = gsAtt.getToken();

        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());

    }
    tokenStream.close();
    return buildTokenList(tokens);
}

From source file: com.globalsight.ling.tm2.lucene.LuceneUtil.java

License: Apache License

/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer.  This method is suitable for use with TM3
 * fuzzy indices, and does two things differently than createGsTokens():
 * 1) It returns tokens in the order in which they appear
 * 2) It does not collapse duplicate tokens (and correspondingly does
 *    not return count information)
 *
 * @param p_text fuzzy match format string
 * @return List of Strings, each representing one token
 */
public static List<String> createTm3Tokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();

    return tokens;
}

From source file: com.globalsight.ling.tm2.lucene.LuceneUtil.java

License: Apache License

@SuppressWarnings("resource")
public static List<String> createTm3TokensNoStopWord(String p_text, GlobalSightLocale p_locale)
        throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale, false);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();

    return tokens;
}

From source file: com.grantingersoll.intell.index.BayesUpdateRequestProcessor.java

License: Apache License

public String[] tokenizeField(String input) throws IOException {
    ArrayList<String> tokenList = new ArrayList<String>(256);
    TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input));
    ts.reset(); // the stream must be reset before the first incrementToken() call
    while (ts.incrementToken()) {
        tokenList.add(ts.getAttribute(CharTermAttribute.class).toString());
    }
    ts.end();
    ts.close();
    return tokenList.toArray(new String[tokenList.size()]);
}

From source file: com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph.WordCountGraphTokenizer.java

License: Open Source License

public void parse(String s) {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(true);/*  w w w  .  j a  va 2 s .c o m*/
    DocumentBuilder builder;
    counts = new HashMap<String, Integer>();
    try {
        builder = factory.newDocumentBuilder();
        Document doc = builder.parse(new InputSource(new StringReader(s)));
        XPathFactory xfactory = XPathFactory.newInstance();
        XPath xpath = xfactory.newXPath();
        title = xpath.evaluate("//page/title/text()", doc);
        title = title.replaceAll("\\s", "_");
        // title = title.replaceAll("^[^a-zA-Z0-9]", "#");
        // title = title.replaceAll("[^a-zA-Z0-9.]", "_");
        id = xpath.evaluate("//page/id/text()", doc);
        String text = xpath.evaluate("//page/revision/text/text()", doc);

        if (!text.isEmpty()) {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
            TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
            stream.reset(); // reset before consuming, as later Lucene versions require
            while (stream.incrementToken()) {
                String token = stream.getAttribute(TermAttribute.class).term();

                if (dictionary != null && !dictionary.contains(token))
                    continue;

                if (counts.containsKey(token))
                    counts.put(token, counts.get(token) + 1);
                else
                    counts.put(token, 1);
            }
        }
    } catch (ParserConfigurationException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (XPathExpressionException e) {
        e.printStackTrace();
    }
}

From source file: com.isotrol.impe3.lucene.PortalSpanishAnalyzerTest.java

License: Open Source License

private void test(String name, Analyzer a, String text) throws IOException {
    final Reader r = new StringReader(text);
    final TokenStream s = a.tokenStream(null, r);
    List<String> list = Lists.newLinkedList();
    s.reset();
    while (s.incrementToken()) {
        if (s.hasAttribute(CharTermAttribute.class)) {
            list.add(s.getAttribute(CharTermAttribute.class).toString());
        }
    }
    System.out.printf("[%s] %s => %s\n", name, text, list);
}

From source file: com.isotrol.impe3.lucene.PrefixAnalyzedQueryParser.java

License: Open Source License

@Override
protected org.apache.lucene.search.Query getPrefixQuery(String field, String termStr) throws ParseException {
    try {
        TokenStream ts = analyzer.tokenStream(field, new StringReader(termStr));
        ts.reset(); // the stream must be reset before the first incrementToken() call
        if (ts.incrementToken() && ts.hasAttribute(CharTermAttribute.class)) {
            String term = ts.getAttribute(CharTermAttribute.class).toString();
            if (term != null) {
                return super.getPrefixQuery(field, term);
            }
        }
    } catch (IOException e) {
        // swallow and fall back to the unanalyzed term string below
    }
    return super.getPrefixQuery(field, termStr);
}

From source file: com.lou.simhasher.seg.WordsSegment.java

License: Open Source License

/**
 * Cuts the given string into words using IKAnalyzer.
 *
 * @param str the text to segment
 * @return the list of segmented words
 */
public static List<String> getCutWords(String str) {
    Analyzer analyzer = new IKAnalyzer();
    Reader r = new StringReader(str);
    TokenStream ts = analyzer.tokenStream("searchValue", r);
    ts.addAttribute(CharTermAttribute.class);

    List<String> list = new ArrayList<String>();
    try {
        ts.reset(); // the stream must be reset before the first incrementToken() call
        while (ts.incrementToken()) {
            CharTermAttribute ta = ts.getAttribute(CharTermAttribute.class);
            String word = ta.toString();
            list.add(word);
        }
    } catch (IOException e) {
        logger.error("IO exception while segmenting: " + e.getMessage());
    } finally {
        try { ts.close(); } catch (IOException ignored) { } // always release the stream
    }
    return list;
}
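A closing observation: every example above follows the same lifecycle, and several skip steps (reset, end, or close) that current Lucene versions enforce. As a hedged reference sketch (again assuming Lucene 5.x or later; the analyzer, field name, and text are placeholders), the recommended shape fetches the attribute once outside the loop and drives the stream through reset / incrementToken / end / close, which try-with-resources makes hard to get wrong:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class TokenStreamLifecycle {

    /** Collects all terms produced by the analyzer for the given field and text. */
    public static List<String> tokens(Analyzer analyzer, String field, String text) throws IOException {
        List<String> result = new ArrayList<String>();
        try (TokenStream ts = analyzer.tokenStream(field, text)) {
            // Fetch the attribute once; the same instance is refilled on every incrementToken().
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            ts.reset();                      // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                result.add(term.toString()); // copy the term out; the attribute is reused
            }
            ts.end();                        // records end-of-stream state (e.g. final offset)
        }                                    // try-with-resources closes the stream even on error
        return result;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(tokens(new StandardAnalyzer(), "f", "getAttribute returns a shared instance"));
    }
}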