Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Source Link

Document

Consumers (e.g., IndexWriter) use this method to advance the stream to the next token.

Usage

From source file:jp.mwsoft.cjkanalyzers.CJKAnalyzerNoSplitKatakana.java

License:Apache License

/**
 * Demo entry point: tokenizes a sample string with {@link CJKAnalyzer}
 * and prints each token on its own line.
 *
 * <p>Fixes over the original: the stream is {@code reset()} before
 * consumption (required by the TokenStream contract), iteration stops when
 * {@code incrementToken()} returns {@code false} instead of looping a fixed
 * 10 times, the token text is printed instead of the TokenStream object,
 * and the stream is always closed.
 */
public static void main(String[] args) throws Exception {

    // NOTE(review): the "??" literals look like mojibake from a lossy
    // re-encoding of Japanese text — recover the originals before using
    // these stop words for anything real.
    Set<String> stopWords = new HashSet<String>();
    stopWords.add("??");
    stopWords.add("??");

    java.io.StringReader reader = new java.io.StringReader("??????");

    CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_35);
    TokenStream stream = analyzer.tokenStream("test", reader);
    try {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Print the token text, not the TokenStream's toString().
            System.out.println(termAtt.toString());
        }
        stream.end();
    } finally {
        stream.close();
    }
}

From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java

License:Apache License

/**
 * Builds a prefix query for {@code termStr}, optionally running it through
 * the field's analyzer first.
 *
 * <p>If wildcard analysis is disabled, or the analyzer cannot be opened,
 * this falls back to the superclass's plain prefix query. Otherwise the
 * term is tokenized; a single resulting token yields one prefix query,
 * while multiple tokens yield a boolean query OR-ing a prefix query per
 * token.
 *
 * @param field   field the prefix query targets
 * @param termStr raw (unanalyzed) prefix text from the query string
 * @return the prefix query, or a boolean combination of prefix queries
 * @throws ParseException propagated from the superclass query builders
 */
private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (!analyzeWildcard) {
        // Analysis of wildcard/prefix terms is disabled: use the default path.
        return super.getPrefixQuery(field, termStr);
    }
    // get Analyzer from superclass and tokenize the term
    TokenStream source;
    try {
        source = getAnalyzer().tokenStream(field, termStr);
        source.reset();
    } catch (IOException e) {
        // Analyzer could not tokenize the term — fall back to the raw prefix.
        return super.getPrefixQuery(field, termStr);
    }
    List<String> tlist = new ArrayList<>();
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);

    // Drain the stream; an IOException mid-stream deliberately just stops
    // collection and we proceed with whatever tokens were gathered so far.
    while (true) {
        try {
            if (!source.incrementToken())
                break;
        } catch (IOException e) {
            break;
        }
        tlist.add(termAtt.toString());
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore — best-effort close; the tokens are already collected
    }

    if (tlist.size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0));
    } else {
        // build a boolean query with prefix on each one...
        // NOTE(review): when the analyzer produced zero tokens this builds a
        // boolean query with no clauses — presumably intentional, but verify
        // getBooleanQuery's handling of an empty clause list.
        List<BooleanClause> clauses = new ArrayList<>();
        for (String token : tlist) {
            clauses.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
        }
        return getBooleanQuery(clauses, true);

        //return super.getPrefixQuery(field, termStr);

        /* this means that the analyzer used either added or consumed
        * (common for a stemmer) tokens, and we can't build a PrefixQuery */
        //            throw new ParseException("Cannot build PrefixQuery with analyzer "
        //                    + getAnalyzer().getClass()
        //                    + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
    }

}

From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java

License:Apache License

/**
 * Builds a wildcard query for {@code termStr}, optionally analyzing the
 * literal (non-wildcard) fragments between {@code ?} and {@code *} with the
 * field's analyzer and reassembling them around the wildcard characters.
 *
 * <p>Fixes over the original: the ~20-line analyze-and-append sequence that
 * was duplicated verbatim (once inside the loop, once after it) is factored
 * into {@link #appendAnalyzedChunk}, and the token stream is now closed on
 * the error path too (the original leaked it when reset/incrementToken
 * threw, because close() sat on the success path only).
 *
 * @param field   field the wildcard query targets
 * @param termStr raw wildcard expression from the query string
 * @return the wildcard query over the (possibly analyzed) expression
 * @throws ParseException propagated from the superclass query builder
 */
private Query getPossiblyAnalyzedWildcardQuery(String field, String termStr) throws ParseException {
    if (!analyzeWildcard) {
        return super.getWildcardQuery(field, termStr);
    }
    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
    StringBuilder aggStr = new StringBuilder();
    StringBuilder tmp = new StringBuilder();
    for (int i = 0; i < termStr.length(); i++) {
        char c = termStr.charAt(i);
        if (c == '?' || c == '*') {
            if (isWithinToken) {
                // Flush the literal fragment collected so far.
                appendAnalyzedChunk(field, tmp, aggStr);
                tmp.setLength(0);
            }
            isWithinToken = false;
            aggStr.append(c);
        } else {
            tmp.append(c);
            isWithinToken = true;
        }
    }
    if (isWithinToken) {
        // Trailing literal fragment after the last wildcard character.
        appendAnalyzedChunk(field, tmp, aggStr);
    }

    return super.getWildcardQuery(field, aggStr.toString());
}

/**
 * Analyzes {@code chunk} with the field analyzer and appends the first
 * resulting non-empty token to {@code out}; if analysis fails or yields
 * nothing, appends the raw chunk unchanged (best-effort, mirroring the
 * original fallback behavior).
 */
private void appendAnalyzedChunk(String field, StringBuilder chunk, StringBuilder out) {
    try {
        TokenStream source = getAnalyzer().tokenStream(field, chunk.toString());
        try {
            source.reset();
            CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
            if (source.incrementToken() && termAtt.length() > 0) {
                out.append(termAtt.toString());
            } else {
                // No usable token — keep the raw fragment.
                out.append(chunk);
            }
        } finally {
            try {
                source.close();
            } catch (IOException ignored) {
                // best-effort close; the result has already been appended
            }
        }
    } catch (IOException e) {
        // Analyzer failure — keep the raw fragment.
        out.append(chunk);
    }
}

From source file:jp.sf.fess.solr.plugin.analysis.ja.TestJapaneseNumberFilter.java

License:Apache License

/**
 * Tokenizes everything read from {@code reader} with {@code analyzer} and
 * writes each token followed by a newline to {@code writer}.
 *
 * <p>Fix: the token stream is now always {@code end()}ed and closed via
 * try/finally — the original leaked it. The method still closes the caller's
 * reader and writer on success, as before.
 *
 * @throws IOException if tokenization or writing fails
 */
public void analyze(final Analyzer analyzer, final Reader reader, final Writer writer) throws IOException {
    final TokenStream stream = analyzer.tokenStream("dummy", reader);
    try {
        stream.reset();

        final CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

        while (stream.incrementToken()) {
            writer.write(termAttr.toString());
            writer.write("\n");
        }
        // Signal end-of-stream so final offset/state is recorded (contract).
        stream.end();
    } finally {
        stream.close(); // fix: original never closed the stream
    }

    reader.close();
    writer.close();
}

From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizerTest.java

License:Apache License

/**
 * Asserts that {@code stream} produces exactly the tokens described by
 * {@code expectedStream}, then is exhausted.
 *
 * <p>Expected format: tokens separated by {@code /}, each token being
 * {@code term[,startOffset[,endOffset[,posInc]]]} — trailing attributes
 * are optional and only checked when present.
 */
private void assertTokenStream(final TokenStream stream, final String expectedStream) throws Exception {

    final String[] expectedTokens = expectedStream.split("/");
    for (int count = 0; count < expectedTokens.length; count++) {
        final String[] attrs = expectedTokens[count].split(",");
        assertTrue(stream.incrementToken());

        assertAttribute(count, "term", attrs[0], stream.getAttribute(CharTermAttribute.class).toString());

        if (attrs.length > 1) {
            assertAttribute(count, "startOffset", Integer.parseInt(attrs[1]),
                    stream.getAttribute(OffsetAttribute.class).startOffset());
        }
        if (attrs.length > 2) {
            assertAttribute(count, "endOffset", Integer.parseInt(attrs[2]),
                    stream.getAttribute(OffsetAttribute.class).endOffset());
        }
        if (attrs.length > 3) {
            assertAttribute(count, "posInc", Integer.parseInt(attrs[3]),
                    stream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
        }
    }
    // The stream must be fully consumed — no extra tokens allowed.
    assertFalse(stream.incrementToken());
}

From source file:kafka.examples.Producer.java

License:Apache License

/**
 * Polls the Weibo public timeline in an endless loop, tokenizes each batch
 * with IKAnalyzer (printing the tokens to stdout), and publishes the raw
 * status payload to the Kafka topic.
 *
 * <p>NOTE(review): tight {@code while (true)} loop with no sleep/backoff —
 * this will hammer the remote API; confirm whether throttling is handled
 * inside {@code Timeline}.
 */
public void run() {
    while (true) {
        // SECURITY(review): hard-coded OAuth access token checked into
        // source — rotate this credential and load it from configuration.
        String access_token = "2.009F1d9BmHHChD7abcd6de0a0jui5Y";
        int count = 20;
        Timeline tm = new Timeline(access_token);
        Analyzer analyzer4 = new IKAnalyzer(false);// presumably false = fine-grained segmentation mode — original comment was garbled; confirm against IKAnalyzer docs

        try {
            StatusWapper status = tm.getPublicTimeline(count, 0);
            //-------------------------------------------
            try {
                TokenStream tokenstream = analyzer4.tokenStream("", new StringReader(status.toString()));
                CharTermAttribute termAttribute = tokenstream.addAttribute(CharTermAttribute.class);// term attribute of the current token

                tokenstream.reset();// reset before consuming, per TokenStream contract

                while (tokenstream.incrementToken()) {// iterate over each token
                    String prTxt = new String(termAttribute.buffer(), 0, termAttribute.length());
                    //producer.send(new KeyedMessage<Integer, String>(topic, ptTxt + " "));
                    System.out.print(prTxt + "  ");
                }
                //System.out.println();
                tokenstream.close();// release the TokenStream
            } catch (IOException e) {
                e.printStackTrace();
            }
            //-------------------------------------------
            producer.send(new KeyedMessage<Integer, String>(topic, status.toString()));
            Log.logInfo(status.toString());

        } catch (WeiboException e) {
            e.printStackTrace();
        }
    }
}

From source file:lia.analysis.CopyOfAnalyzerDemo.java

License:Apache License

/**
 * Runs {@code text} through every analyzer in the {@code analyzers} field
 * and prints each analyzer's name followed by its tokens in "[token] " form.
 *
 * <p>Fixes over the original: the three attribute locals that were declared
 * but never used (offset, type, position-increment) are removed, and the
 * stream is now {@code end()}ed and closed via try/finally instead of being
 * leaked.
 *
 * @param text the sample text to analyze
 * @throws IOException if tokenization fails
 */
private static void analyze(String text) throws IOException {
    System.out.println("Analyzing \"" + text + "\"");
    for (Analyzer analyzer : analyzers) {
        String name = analyzer.getClass().getSimpleName();
        System.out.println(name);
        TokenStream stream = analyzer.tokenStream("dummy", text);
        try {
            stream.reset();
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

            while (stream.incrementToken()) {
                System.out.print("[" + termAttr + "] ");
            }
            stream.end();
        } finally {
            stream.close(); // fix: original never closed the stream
        }
        System.out.println("");
    }
}

From source file:lia.analysis.i18n.ChineseDemo.java

License:Apache License

/**
 * Tokenizes {@code string} with {@code analyzer}, collects the tokens into
 * a "[token] " display string, and shows it in an AWT frame sized to fit.
 *
 * <p>Fixes over the original: {@code reset()} is now called before the
 * first {@code incrementToken()} (required by the TokenStream contract),
 * and the stream is {@code end()}ed and closed instead of being leaked.
 *
 * @throws IOException if tokenization fails
 */
private static void analyze(String string, Analyzer analyzer) throws IOException {
    StringBuffer buffer = new StringBuffer();

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(string));
    // NOTE(review): TermAttribute is deprecated in later Lucene versions;
    // CharTermAttribute is the modern replacement — confirm the Lucene
    // version in use before migrating.
    TermAttribute term = stream.addAttribute(TermAttribute.class);

    try {
        stream.reset(); // fix: reset() was missing before consumption
        while (stream.incrementToken()) { //C
            buffer.append("[");
            buffer.append(term.term());
            buffer.append("] ");
        }
        stream.end();
    } finally {
        stream.close(); // fix: release the analyzer's resources
    }

    String output = buffer.toString();

    Frame f = new Frame();
    f.setTitle(analyzer.getClass().getSimpleName() + " : " + string);
    f.setResizable(true);

    Font font = new Font(null, Font.PLAIN, 36);
    int width = getWidth(f.getFontMetrics(font), output);

    f.setSize((width < 250) ? 250 : width + 50, 75);

    // NOTE: if Label doesn't render the Chinese characters
    // properly, try using javax.swing.JLabel instead
    JLabel label = new JLabel(output); //D
    label.setSize(width, 75);
    //label.setAlignment(JLabel.CENTER);
    label.setFont(font);
    f.add(label);

    f.setVisible(true);
}

From source file:lia.chapter4.AnalyzerUtils.java

License:Apache License

/**
 * Prints every remaining token of {@code stream} to stdout as "[token] ".
 *
 * <p>The caller is responsible for resetting and closing the stream.
 *
 * @param stream a TokenStream positioned before its first token
 * @throws IOException if advancing the stream fails
 */
public static void displayTokens(TokenStream stream) throws IOException {
    final TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
    for (; stream.incrementToken(); ) {
        final String token = termAttr.getBytesRef().utf8ToString();
        System.out.print("[" + token + "] "); //B
    }
}

From source file:lia.chapter4.AnalyzerUtils.java

License:Apache License

/**
 * Analyzes {@code text} and prints its tokens grouped by position: each
 * time the position advances, a new line is started with "position: ",
 * followed by the "[token] " entries at that position (tokens with a zero
 * position increment, e.g. synonyms, stay on the same line).
 *
 * <p>Fixes over the original: {@code reset()} is called before consumption
 * (required by the TokenStream contract), and the stream is {@code end()}ed
 * and closed via try/finally instead of being leaked.
 *
 * @param analyzer analyzer used to tokenize the text
 * @param text     sample text to analyze
 * @throws IOException if tokenization fails
 */
public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    TermToBytesRefAttribute term = stream.addAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    try {
        stream.reset(); // fix: reset() was missing before incrementToken()
        int position = 0;
        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
                System.out.println();
                System.out.print(position + ": ");
            }

            System.out.print("[" + term.getBytesRef().utf8ToString() + "] ");
        }
        stream.end();
    } finally {
        stream.close(); // fix: original leaked the stream
    }
    System.out.println();
}