List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:org.wltea.analyzer.sample.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) { //IK?smart??//from w w w . j a v a2s . c o m Analyzer analyzer = new IKAnalyzerP(true); //?LuceneTokenStream TokenStream ts = null; try { // ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA</html>HELLO")); ts = analyzer.tokenStream("myfield", new StringReader( "?????IKAnalyer can analysis english text too")); // ts = analyzer.tokenStream("myfield", new StringReader("???pinyin hanyu Contribute index to jpinyin development by creating an account on GitHub")); //??? OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); //?? CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); //?? TypeAttribute type = ts.addAttribute(TypeAttribute.class); //?TokenStream?StringReader ts.reset(); //?? while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); } //TokenStreamStringReader ts.end(); // Perform end-of-stream operations, e.g. set the final offset. } catch (IOException e) { e.printStackTrace(); } finally { //TokenStream? if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/**
 * Tokenizes a sample string with Lucene's {@code StandardTokenizer} and
 * prints each token as "term->startOffset-endOffset->type".
 */
public void testST() {
    Tokenizer tokenizer = new StandardTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        // FIX: preserve the cause instead of throwing a bare RuntimeException.
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    // FIX: try-with-resources guarantees the stream is closed (the original
    // never closed it).
    try (TokenStream ts = tsc.getTokenStream()) {
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        ts.reset(); // required before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset()
                    + "->" + type.type());
        }
        ts.end(); // record end-of-stream state (final offset)
    } catch (IOException e) {
        // FIX: keep the cause.
        throw new RuntimeException(e);
    }
}
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/** * ?ClassTokenizer/*from ww w . j a v a2 s .c o m*/ */ public void testCT() { Tokenizer tokenizer = new ClassicTokenizer(); try { tokenizer.setReader(new StringReader( "?????IKAnalyer can analysis english text too")); } catch (IOException e) { throw new RuntimeException(); } TokenStreamComponents tsc = new TokenStreamComponents(tokenizer); TokenStream ts = tsc.getTokenStream(); OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); TypeAttribute type = ts.addAttribute(TypeAttribute.class); try { ts.reset(); while (ts.incrementToken()) { System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset() + "->" + type.type()); } ts.end(); } catch (IOException e) { throw new RuntimeException(); } }
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/**
 * Tokenizes a sample string with Lucene's {@code NGramTokenizer}
 * (default min gram 1, max gram 2) and prints each token as
 * "term->startOffset-endOffset->type".
 */
public void testNT() {
    Tokenizer tokenizer = new NGramTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        // FIX: preserve the cause instead of throwing a bare RuntimeException.
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    // FIX: try-with-resources guarantees the stream is closed (the original
    // never closed it).
    try (TokenStream ts = tsc.getTokenStream()) {
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        ts.reset(); // required before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset()
                    + "->" + type.type());
        }
        ts.end(); // record end-of-stream state (final offset)
    } catch (IOException e) {
        // FIX: keep the cause.
        throw new RuntimeException(e);
    }
}
From source file:org.wltea.analyzer.sample.ThulacAnalzyerDemo.java
License:Apache License
public static void main(String[] args) { //Thulac?smart?? Analyzer analyzer = new ThulacAnalyzer(true); //?LuceneTokenStream TokenStream ts = null; try {//from w w w.j av a2 s .c o m long start = System.currentTimeMillis(); ts = analyzer.tokenStream("myfield", new StringReader( "?????IKAnalyer can analysis english text too")); //??? OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); //?? CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); //?? TypeAttribute type = ts.addAttribute(TypeAttribute.class); //?TokenStream?StringReader ts.reset(); //?? while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); } //TokenStreamStringReader ts.end(); // Perform end-of-stream operations, e.g. set the final offset. System.out.println("wast:" + (System.currentTimeMillis() - start)); } catch (IOException e) { e.printStackTrace(); } finally { //TokenStream? if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:org.wltea.analyzer.test.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) { //IK?smart??/*from ww w. j a v a 2s . com*/ Analyzer analyzer = new IKAnalyzer4PinYin(true); //?LuceneTokenStream TokenStream ts = null; try { ts = analyzer.tokenStream("myfield", new StringReader( "?????IKAnalyer can analysis english text too")); //??? OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); //?? CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); //?? TypeAttribute type = ts.addAttribute(TypeAttribute.class); //?TokenStream?StringReader ts.reset(); //?? while (ts.incrementToken()) { System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); } //TokenStreamStringReader ts.end(); // Perform end-of-stream operations, e.g. set the final offset. } catch (IOException e) { e.printStackTrace(); } finally { //TokenStream? if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:org.xbib.elasticsearch.test.AnalyzerUtils.java
License:Apache License
/**
 * Prints every token produced by {@code analyzer} for {@code text} with full
 * details: position (printed once per position increment), term text,
 * offsets, type, and payload when present.
 *
 * @param analyzer the analyzer used to tokenize the text
 * @param text     the text to analyze
 * @throws IOException if the token stream cannot be consumed
 */
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    // FIX: try-with-resources closes the stream (the original leaked it).
    try (TokenStream stream = analyzer.tokenStream("contents", new StringReader(text))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);
        int position = 0;
        // FIX: reset() is mandatory before the first incrementToken(); without
        // it modern Lucene throws IllegalStateException.
        stream.reset();
        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                // New position: start a new output line prefixed with it.
                position = position + increment;
                System.out.println();
                System.out.print(position + ":");
            }
            BytesRef pl = payload.getPayload();
            if (pl != null) {
                System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->"
                        + offset.endOffset() + ":" + type.type() + ":" + new String(pl.bytes) + "] ");
            } else {
                System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->"
                        + offset.endOffset() + ":" + type.type() + "] ");
            }
        }
        // FIX: end() records end-of-stream state (e.g. the final offset).
        stream.end();
        System.out.println();
    }
}
From source file:org.zenoss.zep.index.impl.lucene.LuceneQueryBuilder.java
License:Open Source License
/** * Tokenizes the given query using the same behavior as when the field is analyzed. * * @param fieldName The field name in the index. * @param analyzer The analyzer to use to tokenize the query. * @param query The query to tokenize. * @return The tokens from the query./* w w w . j av a2 s . c o m*/ * @throws ZepException If an exception occur. */ private static List<String> getTokens(String fieldName, Analyzer analyzer, String query) throws ZepException { final List<String> tokens = new ArrayList<String>(); try { TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(query)); CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); try { ts.reset(); while (ts.incrementToken()) { tokens.add(term.toString()); } ts.end(); } catch (IOException e) { throw new ZepException(e.getLocalizedMessage(), e); } finally { ts.close(); } } catch (IOException e) { throw new ZepException(e.getLocalizedMessage(), e); } return tokens; }
From source file:pl.litwiniuk.rowicki.modsynonyms.SlowSynonymFilterFactory.java
License:Apache License
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) throws IOException { StringReader reader = new StringReader(source); TokenStream ts = loadTokenizer(tokFactory, reader); List<String> tokList = new ArrayList<String>(); try {//www .j a v a 2 s . co m CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); while (ts.incrementToken()) { if (termAtt.length() > 0) tokList.add(termAtt.toString()); } } finally { reader.close(); } return tokList; }
From source file:practica2_1.Practica2_1.java
public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); String cad;/*from ww w .j a v a2s . com*/ try { TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); //OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class); CharTermAttribute cAtt = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { //cad = stream.getAttribute(CharTermAttribute.class).toString(); result.add(cAtt.toString()); } stream.close(); stream.end(); } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; }