Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token; it returns true if a new token was produced and false once the end of the stream has been reached.
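
A minimal, self-contained consumer sketch (not taken from the examples below; it assumes a recent Lucene where Analyzer.tokenStream(String, String) is available, and the StandardAnalyzer, field name, and sample text are purely illustrative):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {

    /** Collects the terms the analyzer produces for the given field and text. */
    static List<String> tokenize(Analyzer analyzer, String field, String text) throws IOException {
        List<String> terms = new ArrayList<>();
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                     // mandatory before the first incrementToken()
            while (stream.incrementToken()) {   // advances to the next token; false at end of stream
                terms.add(termAtt.toString());
            }
            stream.end();                       // records end-of-stream state (e.g. final offset)
        }                                       // try-with-resources closes the stream
        return terms;
    }

    public static void main(String[] args) throws IOException {
        // Prints [hello, token, streams] with the standard analyzer (illustrative values).
        System.out.println(tokenize(new StandardAnalyzer(), "body", "Hello token streams"));
    }
}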

Usage

From source file:org.elasticsearch.index.analysis.SmartChineseAnalysisTests.java

License:Apache License

/**
 * With the default analyzer or the default Chinese analyzer, you would get:
 * { "我", "是", "中", "国", "人" }.
 * <p>
 * But the SmartAnalyzer gets you the more desirable:
 * { "我", "是", "中国", "人" }.
 * That is, "中国" (China) is one token.
 */
@Test
public void analyzeSomeChineseText() throws Exception {
    Index index = new Index("test");

    Injector parentInjector = new ModulesBuilder().add(new SettingsModule(EMPTY_SETTINGS),
            new EnvironmentModule(new Environment(EMPTY_SETTINGS)), new IndicesAnalysisModule())
            .createInjector();
    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, EMPTY_SETTINGS), new IndexNameModule(index),
                    new AnalysisModule(EMPTY_SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class))
                            .addProcessor(new SmartChineseAnalysisBinderProcessor()))
            .createChildInjector(parentInjector);

    AnalysisService analysisService = injector.getInstance(AnalysisService.class);

    Analyzer analyzer = analysisService.analyzer("smartcn").analyzer();

    AllEntries allEntries = new AllEntries();
    allEntries.addText("message", "我是中国人", 1.0f);
    allEntries.reset();

    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    stream.reset();

    List<String> terms = new ArrayList<String>();
    while (stream.incrementToken()) {
        String tokText = termAtt.term();
        terms.add(tokText);
    }

    MatcherAssert.assertThat(terms.size(), equalTo(4));
    MatcherAssert.assertThat(terms, hasItems("我", "是", "中国", "人"));
}

From source file:org.elasticsearch.index.analysis.split.AnalysisTests.java

License:Apache License

@Test
public void testTokenFilter() throws IOException {
    String[] strings = new String[] { "<em>abc</em>def", "<em>this is just a NGram</em>abc<em>def</em>This is",
            "? dong ai hua <em>just</em> the NGram", "xsflsy02.sa.nhnsystem.com",
            "nomad::Job::ReturnAnswer:163", "2013-01-10 06:29:07 +0000", "<123>456",
            "<em>NNB</em>" + "=ULPUSFCVXBNFC " + "NB=GYYDSNJYGIYTEMBW npic="
                    + "<em>SFCuenZVHbx0RFZFoh+a0WALs7qRAYM/3vD26gfSTs4O8u/7rIqsl9I5OnJV9LgnCA</em> "
                    + "page_uid=RT+2zc5Y7tlssvof8wCsssssstZ-140745 " + "BMR= nid_inf=438366115 NID_MATCH_M=1 "
                    + "NID_AUT=<em>95R9DDUsQ6SpQrid2Qpfe0s5BsyH6VRO0jBmpZ/Nmq4TrgPddxY8gUzhTVFyhECwFBBH6tnpd8YslNUK+ARdKEOJSwxM7HOspmslEoVHHHDgTqdfF60lI8opO9JKWVFaAnswVnIFNHTdHUdaCeFvSQ<em> "
                    + "NID_SES=<em>AAABRQjdQk1opAW5ceebr50CEmXN7HrzMrImW4FrXlJACJ1QU2fYDyjIpO/cO/2/+iM0BwZTnLgX4EClbkFwar9MJDr/+0dfX91dMvXV+8WuyiCiWxCWd4FwrsCMHcXthwQGV+C1bCrbU+5C/qeOeGuJCGwVt769y8+Tuy+KBuTGbKDMuUF/SyRq5IwNQ3YL1pMGs+cAnFN2xqFplgJtZvlhI8+8f3GfMxZqEHlXmSSlSpCWkZZYzz9wx2WarU+WtU4WGpnW0Y+Kc347mW2mNaVIDq+AHf4HXE8JHsPqvlzNWlkyS5AHw3tc5bWFy0MhxngOnyG7VqTheb4yxPRhTY0D6fF4TDPr7fjsJ5tuA9oxH+BGuoy6uYIs8uoRI1+HULgI0WCQpiNeVtI1eskacsENBnqECJ3OOyAFzAcr9msv7pr8LYtx0TsNVlLWVS7ug1uH5w</em> "
                    + "ncvid=#vid#_118.217.45.216p3Lj WMONID=DEZg20K2BGS ncvc2=7c1ce4c094a2a31133c5b88ab14e2e56eda35ebba8bf21da60ba865aeeca2ee728d016cd172bbf93e37c2bf73b9136e8073a1f11e2d0ab9cf43394518fbf0ec3adaba8a9b6abb4aba4a0a3a4a1a6b615 nci4=0337dafeeaa7c87a25cb8c9b96771b78d997768ada8665b7478abf4dfaff3ac3c336f650f4ba5c697e8fb3613570e67cd88ff44bafb0f9e0ca00aa61b78337fa95b1bc9bba8bb9b7b691b485cdbeae8da997b3aba285a091e6919cbc98a9ea9c93b78ebff2838aad88b9878b82a580ce8083848988888b8cb9 JSESSIONID=E365D0634FED26492BFFD5DEEE789B66 personaconmain|ektmfrl645=AE8BC98FD74D619FF7B13C83191E1F5EAFCD0F25C43D6BDC693E26D777419A2F845E79DA02B04219 personacon|ektmfrl645= cafeCookieToken=5KCBru-K8k8aHwkbio4dPmLlMyK6WlPYqN0319U4UeImDS9UVPpo70IVLHK9eybq6eJc-rNfllMgB5Fk_i2j-rKM1mCuoOqZ ncu=82b94171693746ae8766724d5696dc1a83e17aed"

    };
    String[] expected = new String[] { "<em>abc</em><i>def</i>",
            "<em>this is just a NGram</em><i>abc</i><em>def</em>This is",
            "<i></i><i></i><i>?</i> <i>dong</i> <i>ai</i> <i>hua</i> <em>just</em> the <i>NGram</i>",
            "<i>xsflsy02</i>.<i>sa.nhnsystem.com</i>",
            "<i>nomad</i>::<i>Job</i>::<i>ReturnAnswer</i>:<i>163</i>",
            "<i>2013</i>-<i>01</i>-<i>10</i> <i>06</i>:<i>29</i>:<i>07</i> +<i>0000</i>",
            "&lt;<i>123</i>&gt;<i>456</i>",
            "<em>NNB</em>=<i>ULPUSFCVXBNFC</i> <i>NB</i>=<i>GYYDSNJYGIYTEMBW</i> <i>npic</i>=<em>SFCuenZVHbx0RFZFoh+a0WALs7qRAYM/3vD26gfSTs4O8u/7rIqsl9I5OnJV9LgnCA</em> <i>page_uid</i>=<i>RT</i>+<i>2zc5Y7tlssvof8wCsssssstZ</i>-<i>140745</i> <i>BMR</i>= <i>nid_inf</i>=<i>438366115</i> <i>NID_MATCH_M</i>=<i>1</i> <i>NID_AUT</i>= <i>ncvid</i>=#<i>vid</i>#<i>_118.217.45.216p3Lj</i> <i>WMONID</i>=<i>DEZg20K2BGS</i> <i>ncvc2</i>=<i>7c1ce4c094a2a31133c5b88ab14e2e56eda35ebba8bf21da60ba865aeeca2ee728d016cd172bbf93e37c2bf73b9136e8073a1f11e2d0ab9cf43394518fbf0ec3adaba8a9b6abb4aba4a0a3a4a1a6b615</i> <i>nci4</i>=<i>0337dafeeaa7c87a25cb8c9b96771b78d997768ada8665b7478abf4dfaff3ac3c336f650f4ba5c697e8fb3613570e67cd88ff44bafb0f9e0ca00aa61b78337fa95b1bc9bba8bb9b7b691b485cdbeae8da997b3aba285a091e6919cbc98a9ea9c93b78ebff2838aad88b9878b82a580ce8083848988888b8cb9</i> <i>JSESSIONID</i>=<i>E365D0634FED26492BFFD5DEEE789B66</i> <i>personaconmain</i>|<i>ektmfrl645</i>=<i>AE8BC98FD74D619FF7B13C83191E1F5EAFCD0F25C43D6BDC693E26D777419A2F845E79DA02B04219</i> <i>personacon</i>|<i>ektmfrl645</i>= <i>cafeCookieToken</i>=<i>5KCBru</i>-<i>K8k8aHwkbio4dPmLlMyK6WlPYqN0319U4UeImDS9UVPpo70IVLHK9eybq6eJc</i>-<i>rNfllMgB5Fk_i2j</i>-<i>rKM1mCuoOqZ</i> <i>ncu</i>=<i>82b94171693746ae8766724d5696dc1a83e17aed</i>"

    };

    Analyzer analyzer = new SplitAnalyzer(Lucene.ANALYZER_VERSION);

    for (int i = 0, len = strings.length; i < len; i++) {
        StringReader sr = new StringReader(strings[i]);
        TokenStream stream = analyzer.tokenStream("f", sr);
        stream.reset();
        List<String> list = new ArrayList<String>();
        while (stream.incrementToken()) {
            CharTermAttribute ta = stream.getAttribute(CharTermAttribute.class);
            list.add(ta.toString());
            System.out.println(ta.toString());
        }
        Joiner joiner = Joiner.on("");
        System.out.println("Result:" + joiner.join(list));
        Assert.assertEquals(joiner.join(list), expected[i]);
    }
}

From source file:org.elasticsearch.index.analysis.synonyms.SynonymsAnalysisTest.java

License:Apache License

private void match(String analyzerName, String source, String target) throws IOException {

    Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();

    AllEntries allEntries = new AllEntries();
    allEntries.addText("field", source, 1.0f);
    allEntries.reset();

    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    StringBuilder sb = new StringBuilder();
    while (stream.incrementToken()) {
        sb.append(termAtt.toString()).append(" ");
    }

    MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
}

From source file:org.elasticsearch.index.mapper.core.TokenCountFieldMapper.java

License:Apache License

/**
 * Count position increments in a token stream.  Package private for testing.
 * @param tokenStream token stream to count
 * @return number of position increments in a token stream
 * @throws IOException if tokenStream throws it
 */
static int countPositions(TokenStream tokenStream) throws IOException {
    try {
        int count = 0;
        PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            count += position.getPositionIncrement();
        }
        tokenStream.end();
        count += position.getPositionIncrement();
        return count;
    } finally {
        tokenStream.close();
    }
}

From source file:org.elasticsearch.index.mapper.date.LegacyDateMappingTests.java

License:Apache License

private void assertNumericTokensEqual(ParsedDocument doc, DocumentMapper defaultMapper, String fieldA,
        String fieldB) throws IOException {
    assertThat(doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.mappers().indexAnalyzer(), null),
            notNullValue());
    assertThat(doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null),
            notNullValue());

    TokenStream tokenStream = doc.rootDoc().getField(fieldA)
            .tokenStream(defaultMapper.mappers().indexAnalyzer(), null);
    tokenStream.reset();
    LegacyNumericTermAttribute nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class);
    List<Long> values = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        values.add(nta.getRawValue());
    }

    tokenStream = doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null);
    tokenStream.reset();
    nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class);
    int pos = 0;
    while (tokenStream.incrementToken()) {
        assertThat(values.get(pos++), equalTo(nta.getRawValue()));
    }
    assertThat(pos, equalTo(values.size()));
}

From source file:org.elasticsearch.index.mapper.date.SimpleDateMappingTests.java

License:Apache License

private void assertNumericTokensEqual(ParsedDocument doc, DocumentMapper defaultMapper, String fieldA,
        String fieldB) throws IOException {
    assertThat(doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.indexAnalyzer()), notNullValue());
    assertThat(doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.indexAnalyzer()), notNullValue());

    TokenStream tokenStream = doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.indexAnalyzer());
    tokenStream.reset();
    NumericTermAttribute nta = tokenStream.addAttribute(NumericTermAttribute.class);
    List<Long> values = new ArrayList<Long>();
    while (tokenStream.incrementToken()) {
        values.add(nta.getRawValue());
    }

    tokenStream = doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.indexAnalyzer());
    tokenStream.reset();
    nta = tokenStream.addAttribute(NumericTermAttribute.class);
    int pos = 0;
    while (tokenStream.incrementToken()) {
        assertThat(values.get(pos++), equalTo(nta.getRawValue()));
    }
    assertThat(pos, equalTo(values.size()));
}

From source file:org.elasticsearch.index.mapper.FeatureFieldMapperTests.java

License:Apache License

static int getFrequency(TokenStream tk) throws IOException {
    TermFrequencyAttribute freqAttribute = tk.addAttribute(TermFrequencyAttribute.class);
    tk.reset();
    assertTrue(tk.incrementToken());
    int freq = freqAttribute.getTermFrequency();
    assertFalse(tk.incrementToken());
    return freq;
}

From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java

License:Apache License

@Override
public Query fieldQuery(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanQuery q = new BooleanQuery();
    try {
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            q.add(new TermQuery(term), BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        q = null;
    }
    return q;
}

From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java

License:Apache License

@Override
public Filter fieldFilter(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanFilter f = new BooleanFilter();
    try {
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            f.add(new TermFilter(term), BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        f = null;
    }
    return f;
}

From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java

License:Apache License

@Override
public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method,
        @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanQuery q = new BooleanQuery();
    try {
        int remainingSize = sizeIsVariable ? 0 : sizeValue; // note: prefixes are not included
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            if (termAtt.length() < 1 + chunkLength) {
                if (remainingSize > 0) { // implies size is fixed
                    if (remainingSize < chunkLength)
                        q.add(new PrefixLengthQuery(term, 1 + remainingSize, 1 + remainingSize),
                                BooleanClause.Occur.MUST);
                    else
                        q.add(new PrefixLengthQuery(term, 1 + chunkLength, 1 + chunkLength),
                                BooleanClause.Occur.MUST);
                } else { // varying size: only limit to the chunkLength
                    q.add(new PrefixLengthQuery(term, 0, 1 + chunkLength), BooleanClause.Occur.MUST);
                }
            } else {
                q.add(new TermQuery(term), BooleanClause.Occur.MUST);
            }
            remainingSize -= termAtt.length() - 1; // termAtt contains the prefix, remainingSize doesn't take it into account
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        q = null;
    }
    return q;
}