Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Source Link

Document

Consumers (e.g., IndexWriter) use this method to advance the stream to the next token.

Usage

From source file:jp.mwsoft.cjkanalyzers.CJKAnalyzerNoSplitKatakana.java

License:Apache License

/**
 * Demo entry point: tokenizes a sample string with {@link CJKAnalyzer}
 * and prints each token on its own line.
 *
 * <p>Fixes over the original: the stream is {@code reset()} before
 * consumption (required by the TokenStream contract), iteration stops when
 * {@code incrementToken()} returns {@code false} instead of looping a fixed
 * 10 times, the token text is printed instead of the TokenStream object,
 * and the stream is always closed.
 */
public static void main(String[] args) throws Exception {

    // NOTE(review): the "??" literals look like mojibake from a lossy
    // re-encoding of Japanese text — recover the originals before using
    // these stop words for anything real.
    Set<String> stopWords = new HashSet<String>();
    stopWords.add("??");
    stopWords.add("??");

    java.io.StringReader reader = new java.io.StringReader("??????");

    CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_35);
    TokenStream stream = analyzer.tokenStream("test", reader);
    try {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Print the token text, not the TokenStream's toString().
            System.out.println(termAtt.toString());
        }
        stream.end();
    } finally {
        stream.close();
    }
}

From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java

License:Apache License

/**
 * Builds a prefix query for {@code termStr}, optionally running it through
 * the field's analyzer first.
 *
 * <p>If wildcard analysis is disabled, or the analyzer cannot be opened,
 * this falls back to the superclass's plain prefix query. Otherwise the
 * term is tokenized; a single resulting token yields one prefix query,
 * while multiple tokens yield a boolean query OR-ing a prefix query per
 * token.
 *
 * @param field   field the prefix query targets
 * @param termStr raw (unanalyzed) prefix text from the query string
 * @return the prefix query, or a boolean combination of prefix queries
 * @throws ParseException propagated from the superclass query builders
 */
private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (!analyzeWildcard) {
        // Analysis of wildcard/prefix terms is disabled: use the default path.
        return super.getPrefixQuery(field, termStr);
    }
    // get Analyzer from superclass and tokenize the term
    TokenStream source;
    try {
        source = getAnalyzer().tokenStream(field, termStr);
        source.reset();
    } catch (IOException e) {
        // Analyzer could not tokenize the term — fall back to the raw prefix.
        return super.getPrefixQuery(field, termStr);
    }
    List<String> tlist = new ArrayList<>();
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);

    // Drain the stream; an IOException mid-stream deliberately just stops
    // collection and we proceed with whatever tokens were gathered so far.
    while (true) {
        try {
            if (!source.incrementToken())
                break;
        } catch (IOException e) {
            break;
        }
        tlist.add(termAtt.toString());
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore — best-effort close; the tokens are already collected
    }

    if (tlist.size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0));
    } else {
        // build a boolean query with prefix on each one...
        // NOTE(review): when the analyzer produced zero tokens this builds a
        // boolean query with no clauses — presumably intentional, but verify
        // getBooleanQuery's handling of an empty clause list.
        List<BooleanClause> clauses = new ArrayList<>();
        for (String token : tlist) {
            clauses.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
        }
        return getBooleanQuery(clauses, true);

        //return super.getPrefixQuery(field, termStr);

        /* this means that the analyzer used either added or consumed
        * (common for a stemmer) tokens, and we can't build a PrefixQuery */
        //            throw new ParseException("Cannot build PrefixQuery with analyzer "
        //                    + getAnalyzer().getClass()
        //                    + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
    }

}

From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java

License:Apache License

/**
 * Builds a wildcard query for {@code termStr}, optionally analyzing the
 * literal (non-wildcard) fragments between {@code ?} and {@code *} with the
 * field's analyzer and reassembling them around the wildcard characters.
 *
 * <p>Fixes over the original: the ~20-line analyze-and-append sequence that
 * was duplicated verbatim (once inside the loop, once after it) is factored
 * into {@link #appendAnalyzedChunk}, and the token stream is now closed on
 * the error path too (the original leaked it when reset/incrementToken
 * threw, because close() sat on the success path only).
 *
 * @param field   field the wildcard query targets
 * @param termStr raw wildcard expression from the query string
 * @return the wildcard query over the (possibly analyzed) expression
 * @throws ParseException propagated from the superclass query builder
 */
private Query getPossiblyAnalyzedWildcardQuery(String field, String termStr) throws ParseException {
    if (!analyzeWildcard) {
        return super.getWildcardQuery(field, termStr);
    }
    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
    StringBuilder aggStr = new StringBuilder();
    StringBuilder tmp = new StringBuilder();
    for (int i = 0; i < termStr.length(); i++) {
        char c = termStr.charAt(i);
        if (c == '?' || c == '*') {
            if (isWithinToken) {
                // Flush the literal fragment collected so far.
                appendAnalyzedChunk(field, tmp, aggStr);
                tmp.setLength(0);
            }
            isWithinToken = false;
            aggStr.append(c);
        } else {
            tmp.append(c);
            isWithinToken = true;
        }
    }
    if (isWithinToken) {
        // Trailing literal fragment after the last wildcard character.
        appendAnalyzedChunk(field, tmp, aggStr);
    }

    return super.getWildcardQuery(field, aggStr.toString());
}

/**
 * Analyzes {@code chunk} with the field analyzer and appends the first
 * resulting non-empty token to {@code out}; if analysis fails or yields
 * nothing, appends the raw chunk unchanged (best-effort, mirroring the
 * original fallback behavior).
 */
private void appendAnalyzedChunk(String field, StringBuilder chunk, StringBuilder out) {
    try {
        TokenStream source = getAnalyzer().tokenStream(field, chunk.toString());
        try {
            source.reset();
            CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
            if (source.incrementToken() && termAtt.length() > 0) {
                out.append(termAtt.toString());
            } else {
                // No usable token — keep the raw fragment.
                out.append(chunk);
            }
        } finally {
            try {
                source.close();
            } catch (IOException ignored) {
                // best-effort close; the result has already been appended
            }
        }
    } catch (IOException e) {
        // Analyzer failure — keep the raw fragment.
        out.append(chunk);
    }
}

From source file:jp.sf.fess.solr.plugin.analysis.ja.TestJapaneseNumberFilter.java

License:Apache License

/**
 * Tokenizes everything read from {@code reader} with {@code analyzer} and
 * writes each token followed by a newline to {@code writer}.
 *
 * <p>Fix: the token stream is now always {@code end()}ed and closed via
 * try/finally — the original leaked it. The method still closes the caller's
 * reader and writer on success, as before.
 *
 * @throws IOException if tokenization or writing fails
 */
public void analyze(final Analyzer analyzer, final Reader reader, final Writer writer) throws IOException {
    final TokenStream stream = analyzer.tokenStream("dummy", reader);
    try {
        stream.reset();

        final CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

        while (stream.incrementToken()) {
            writer.write(termAttr.toString());
            writer.write("\n");
        }
        // Signal end-of-stream so final offset/state is recorded (contract).
        stream.end();
    } finally {
        stream.close(); // fix: original never closed the stream
    }

    reader.close();
    writer.close();
}

From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizerTest.java

License:Apache License

/**
 * Asserts that {@code stream} produces exactly the tokens described by
 * {@code expectedStream}, then is exhausted.
 *
 * <p>Expected format: tokens separated by {@code /}, each token being
 * {@code term[,startOffset[,endOffset[,posInc]]]} — trailing attributes
 * are optional and only checked when present.
 */
private void assertTokenStream(final TokenStream stream, final String expectedStream) throws Exception {

    final String[] expectedTokens = expectedStream.split("/");
    for (int count = 0; count < expectedTokens.length; count++) {
        final String[] attrs = expectedTokens[count].split(",");
        assertTrue(stream.incrementToken());

        assertAttribute(count, "term", attrs[0], stream.getAttribute(CharTermAttribute.class).toString());

        if (attrs.length > 1) {
            assertAttribute(count, "startOffset", Integer.parseInt(attrs[1]),
                    stream.getAttribute(OffsetAttribute.class).startOffset());
        }
        if (attrs.length > 2) {
            assertAttribute(count, "endOffset", Integer.parseInt(attrs[2]),
                    stream.getAttribute(OffsetAttribute.class).endOffset());
        }
        if (attrs.length > 3) {
            assertAttribute(count, "posInc", Integer.parseInt(attrs[3]),
                    stream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
        }
    }
    // The stream must be fully consumed — no extra tokens allowed.
    assertFalse(stream.incrementToken());
}

From source file:kafka.examples.Producer.java

License:Apache License

/**
 * Polls the Weibo public timeline in an endless loop, tokenizes each batch
 * with IKAnalyzer (printing the tokens to stdout), and publishes the raw
 * status payload to the Kafka topic.
 *
 * <p>NOTE(review): tight {@code while (true)} loop with no sleep/backoff —
 * this will hammer the remote API; confirm whether throttling is handled
 * inside {@code Timeline}.
 */
public void run() {
    while (true) {
        // SECURITY(review): hard-coded OAuth access token checked into
        // source — rotate this credential and load it from configuration.
        String access_token = "2.009F1d9BmHHChD7abcd6de0a0jui5Y";
        int count = 20;
        Timeline tm = new Timeline(access_token);
        Analyzer analyzer4 = new IKAnalyzer(false);// presumably false = fine-grained segmentation mode — original comment was garbled; confirm against IKAnalyzer docs

        try {
            StatusWapper status = tm.getPublicTimeline(count, 0);
            //-------------------------------------------
            try {
                TokenStream tokenstream = analyzer4.tokenStream("", new StringReader(status.toString()));
                CharTermAttribute termAttribute = tokenstream.addAttribute(CharTermAttribute.class);// term attribute of the current token

                tokenstream.reset();// reset before consuming, per TokenStream contract

                while (tokenstream.incrementToken()) {// iterate over each token
                    String prTxt = new String(termAttribute.buffer(), 0, termAttribute.length());
                    //producer.send(new KeyedMessage<Integer, String>(topic, ptTxt + " "));
                    System.out.print(prTxt + "  ");
                }
                //System.out.println();
                tokenstream.close();// release the TokenStream
            } catch (IOException e) {
                e.printStackTrace();
            }
            //-------------------------------------------
            producer.send(new KeyedMessage<Integer, String>(topic, status.toString()));
            Log.logInfo(status.toString());

        } catch (WeiboException e) {
            e.printStackTrace();
        }
    }
}

From source file:lia.analysis.CopyOfAnalyzerDemo.java

License:Apache License

/**
 * Runs {@code text} through every analyzer in the {@code analyzers} field
 * and prints each analyzer's name followed by its tokens in "[token] " form.
 *
 * <p>Fixes over the original: the three attribute locals that were declared
 * but never used (offset, type, position-increment) are removed, and the
 * stream is now {@code end()}ed and closed via try/finally instead of being
 * leaked.
 *
 * @param text the sample text to analyze
 * @throws IOException if tokenization fails
 */
private static void analyze(String text) throws IOException {
    System.out.println("Analyzing \"" + text + "\"");
    for (Analyzer analyzer : analyzers) {
        String name = analyzer.getClass().getSimpleName();
        System.out.println(name);
        TokenStream stream = analyzer.tokenStream("dummy", text);
        try {
            stream.reset();
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

            while (stream.incrementToken()) {
                System.out.print("[" + termAttr + "] ");
            }
            stream.end();
        } finally {
            stream.close(); // fix: original never closed the stream
        }
        System.out.println("");
    }
}

From source file:lia.analysis.i18n.ChineseDemo.java

License:Apache License

/**
 * Tokenizes {@code string} with {@code analyzer}, collects the tokens into
 * a "[token] " display string, and shows it in an AWT frame sized to fit.
 *
 * <p>Fixes over the original: {@code reset()} is now called before the
 * first {@code incrementToken()} (required by the TokenStream contract),
 * and the stream is {@code end()}ed and closed instead of being leaked.
 *
 * @throws IOException if tokenization fails
 */
private static void analyze(String string, Analyzer analyzer) throws IOException {
    StringBuffer buffer = new StringBuffer();

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(string));
    // NOTE(review): TermAttribute is deprecated in later Lucene versions;
    // CharTermAttribute is the modern replacement — confirm the Lucene
    // version in use before migrating.
    TermAttribute term = stream.addAttribute(TermAttribute.class);

    try {
        stream.reset(); // fix: reset() was missing before consumption
        while (stream.incrementToken()) { //C
            buffer.append("[");
            buffer.append(term.term());
            buffer.append("] ");
        }
        stream.end();
    } finally {
        stream.close(); // fix: release the analyzer's resources
    }

    String output = buffer.toString();

    Frame f = new Frame();
    f.setTitle(analyzer.getClass().getSimpleName() + " : " + string);
    f.setResizable(true);

    Font font = new Font(null, Font.PLAIN, 36);
    int width = getWidth(f.getFontMetrics(font), output);

    f.setSize((width < 250) ? 250 : width + 50, 75);

    // NOTE: if Label doesn't render the Chinese characters
    // properly, try using javax.swing.JLabel instead
    JLabel label = new JLabel(output); //D
    label.setSize(width, 75);
    //label.setAlignment(JLabel.CENTER);
    label.setFont(font);
    f.add(label);

    f.setVisible(true);
}

From source file:lia.chapter4.AnalyzerUtils.java

License:Apache License

/**
 * Prints every remaining token of {@code stream} to stdout as "[token] ".
 *
 * <p>The caller is responsible for resetting and closing the stream.
 *
 * @param stream a TokenStream positioned before its first token
 * @throws IOException if advancing the stream fails
 */
public static void displayTokens(TokenStream stream) throws IOException {
    final TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
    for (; stream.incrementToken(); ) {
        final String token = termAttr.getBytesRef().utf8ToString();
        System.out.print("[" + token + "] "); //B
    }
}

From source file:lia.chapter4.AnalyzerUtils.java

License:Apache License

/**
 * Analyzes {@code text} and prints its tokens grouped by position: each
 * time the position advances, a new line is started with "position: ",
 * followed by the "[token] " entries at that position (tokens with a zero
 * position increment, e.g. synonyms, stay on the same line).
 *
 * <p>Fixes over the original: {@code reset()} is called before consumption
 * (required by the TokenStream contract), and the stream is {@code end()}ed
 * and closed via try/finally instead of being leaked.
 *
 * @param analyzer analyzer used to tokenize the text
 * @param text     sample text to analyze
 * @throws IOException if tokenization fails
 */
public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    TermToBytesRefAttribute term = stream.addAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    try {
        stream.reset(); // fix: reset() was missing before incrementToken()
        int position = 0;
        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
                System.out.println();
                System.out.print(position + ": ");
            }

            System.out.print("[" + term.getBytesRef().utf8ToString() + "] ");
        }
        stream.end();
    } finally {
        stream.close(); // fix: original leaked the stream
    }
    System.out.println();
}