List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; value naming the attribute to retrieve from the stream.
From source file:com.github.rnewson.couchdb.lucene.util.AnalyzersTest.java
License:Apache License
/**
 * Tokenizes {@code text} with the named analyzer and returns the emitted terms.
 *
 * @param analyzerName name resolved by {@code Analyzers.getAnalyzer}
 * @param text         input text to analyze
 * @return the produced terms, in token-stream order
 * @throws Exception if analysis fails
 */
private String[] analyze(final String analyzerName, final String text) throws Exception {
    final Analyzer analyzer = Analyzers.getAnalyzer(analyzerName);
    final TokenStream stream = analyzer.tokenStream("default", new StringReader(text));
    final List<String> result = new ArrayList<String>();
    try {
        stream.reset();
        // The same attribute instance is reused for every token, so look it up once.
        final CharTermAttribute c = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            result.add(c.toString());
        }
        stream.end(); // TokenStream contract: end() after the last incrementToken()
    } finally {
        stream.close(); // was missing — the stream leaked on every call
    }
    return result.toArray(new String[0]);
}
From source file:com.github.tteofili.looseen.MinHashClassifier.java
License:Apache License
private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException { ArrayList<String> tokens = new ArrayList<String>(); TokenStream ts = analyzer.tokenStream(field, value); ts.reset();// w ww .j a v a 2 s. c o m while (ts.incrementToken()) { CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class); String token = new String(termAttribute.buffer(), 0, termAttribute.length()); tokens.add(token); } ts.end(); ts.close(); return tokens; }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/** * Create GlobalSight TM tokens from a provided segment string * using GsAnalyzer./* w w w .j av a 2 s .c o m*/ * * @param p_text fuzzy match format string * @return List of c.g.l.tm2.index.Tokens */ public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset(); //GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class); //org.apache.lucene.analysis.Token luceneToken = null; List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { // luceneToken = gsAtt.getToken(); CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString()); } tokenStream.close(); return buildTokenList(tokens); }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/** * Create GlobalSight TM tokens from a provided segment string * using GsAnalyzer. This method is suitable for use with TM3 * fuzzy indices, and does two things differently than createGsTokens(): * 1) It returns tokens in the order in which they appear * 2) It does not collapse duplicate tokens (and correspondingly does * not return count information)/*from w w w .j a va 2 s.c o m*/ * * @param p_text fuzzy match format string * @return List of Strings, each representing one token */ public static List<String> createTm3Tokens(String p_text, GlobalSightLocale p_locale) throws Exception { GsAnalyzer analyzer = new GsAnalyzer(p_locale); TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text)); tokenStream.reset(); List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokens.add(termAtt.toString()); } tokenStream.close(); return tokens; }
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/**
 * Same as createTm3Tokens() but builds the GsAnalyzer without stop-word
 * filtering (second constructor argument {@code false}).
 *
 * @param p_text   fuzzy match format string
 * @param p_locale locale used to configure the GsAnalyzer
 * @return List of Strings, each representing one token
 * @throws Exception if tokenization fails
 */
public static List<String> createTm3TokensNoStopWord(String p_text, GlobalSightLocale p_locale)
        throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale, false);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    List<String> tokens = new ArrayList<String>();
    try {
        tokenStream.reset();
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        tokenStream.end(); // was missing
    } finally {
        // The stream is now closed on all paths, so the @SuppressWarnings("resource")
        // that hid the leak is no longer needed and has been removed.
        tokenStream.close();
    }
    return tokens;
}
From source file:com.grantingersoll.intell.index.BayesUpdateRequestProcessor.java
License:Apache License
/**
 * Tokenizes {@code input} with this processor's analyzer for {@code inputField}
 * and returns the terms as an array.
 *
 * @param input text to tokenize
 * @return the produced terms, in stream order
 * @throws IOException if the token stream fails
 */
public String[] tokenizeField(String input) throws IOException {
    ArrayList<String> tokenList = new ArrayList<String>(256);
    TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input));
    try {
        // reset() was missing; modern Lucene (4+) throws IllegalStateException
        // if incrementToken() is called before reset() — TODO confirm target version.
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            tokenList.add(termAtt.toString());
        }
        ts.end();
    } finally {
        ts.close(); // was never closed — leaked a stream per call
    }
    return tokenList.toArray(new String[tokenList.size()]);
}
From source file:com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph.WordCountGraphTokenizer.java
License:Open Source License
public void parse(String s) { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true);/* w w w . j a va 2 s .c o m*/ DocumentBuilder builder; counts = new HashMap<String, Integer>(); try { builder = factory.newDocumentBuilder(); Document doc = builder.parse(new InputSource(new StringReader(s))); XPathFactory xfactory = XPathFactory.newInstance(); XPath xpath = xfactory.newXPath(); title = xpath.evaluate("//page/title/text()", doc); title = title.replaceAll("\\s", "_"); // title = title.replaceAll("^[^a-zA-Z0-9]", "#"); // title = title.replaceAll("[^a-zA-Z0-9.]", "_"); id = xpath.evaluate("//page/id/text()", doc); String text = xpath.evaluate("//page/revision/text/text()", doc); if (!text.isEmpty()) { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); TokenStream stream = analyzer.tokenStream(null, new StringReader(text)); while (stream.incrementToken()) { String token = stream.getAttribute(TermAttribute.class).term(); if (dictionary != null && !dictionary.contains(token)) continue; if (counts.containsKey(token)) counts.put(token, counts.get(token) + 1); else counts.put(token, 1); } } } catch (ParserConfigurationException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (XPathExpressionException e) { e.printStackTrace(); } }
From source file:com.isotrol.impe3.lucene.PortalSpanishAnalyzerTest.java
License:Open Source License
/**
 * Diagnostic helper: tokenizes {@code text} with analyzer {@code a} and prints
 * "[name] text => tokens" to stdout.
 *
 * @param name label printed with the result
 * @param a    analyzer under test
 * @param text input to tokenize
 * @throws IOException if the token stream fails
 */
private void test(String name, Analyzer a, String text) throws IOException {
    final Reader r = new StringReader(text);
    final TokenStream s = a.tokenStream(null, r);
    List<String> list = Lists.newLinkedList();
    try {
        s.reset();
        // hasAttribute() is invariant for the life of the stream — check once,
        // not on every token.
        if (s.hasAttribute(CharTermAttribute.class)) {
            final CharTermAttribute term = s.getAttribute(CharTermAttribute.class);
            while (s.incrementToken()) {
                list.add(term.toString());
            }
        } else {
            // Drain the stream so side effects match the original loop.
            while (s.incrementToken()) {
            }
        }
        s.end(); // was missing
    } finally {
        s.close(); // was missing — stream leaked
    }
    System.out.printf("[%s] %s => %s\n", name, text, list);
}
From source file:com.isotrol.impe3.lucene.PrefixAnalyzedQueryParser.java
License:Open Source License
/**
 * Builds a prefix query whose prefix has been run through the analyzer, so the
 * prefix matches analyzed index terms. Falls back to the default (unanalyzed)
 * prefix query when analysis yields nothing or fails with an IOException.
 *
 * @param field   field being queried
 * @param termStr raw prefix text from the query string
 * @return the prefix query
 * @throws ParseException propagated from the superclass
 */
@Override
protected org.apache.lucene.search.Query getPrefixQuery(String field, String termStr)
        throws ParseException {
    try {
        TokenStream ts = analyzer.tokenStream(field, new StringReader(termStr));
        try {
            // reset() was missing; required before incrementToken() on Lucene 4+
            // — TODO confirm target version.
            ts.reset();
            if (ts.incrementToken() && ts.hasAttribute(CharTermAttribute.class)) {
                String term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term != null) {
                    return super.getPrefixQuery(field, term);
                }
            }
            ts.end();
        } finally {
            ts.close(); // was never closed — leaked per prefix query
        }
    } catch (IOException ignored) {
        // Analysis failure is deliberately non-fatal: fall through to the
        // superclass's unanalyzed prefix handling below.
    }
    return super.getPrefixQuery(field, termStr);
}
From source file:com.lou.simhasher.seg.WordsSegment.java
License:Open Source License
/**
 * Segments the input string into words using IKAnalyzer.
 *
 * @param str text to segment
 * @return the segmented words, in stream order; partial results are returned
 *         if an IOException occurs mid-stream (the error is logged)
 */
public static List<String> getCutWords(String str) {
    Analyzer analyzer = new IKAnalyzer();
    Reader r = new StringReader(str);
    TokenStream ts = analyzer.tokenStream("searchValue", r);
    // Ensure the char-term attribute exists, then hold one reference —
    // the same instance is reused for every token.
    CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
    List<String> list = new ArrayList<String>();
    try {
        try {
            while (ts.incrementToken()) {
                list.add(ta.toString());
            }
        } finally {
            ts.close(); // was never closed — stream leaked on every call
        }
    } catch (IOException e) {
        // NOTE(review): message text appears garbled by encoding loss in the
        // original source; preserved as-is. Consider logging the exception
        // object itself to keep the stack trace.
        logger.error("?IO" + e.getMessage());
    }
    return list;
}