List of usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
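Every example below follows the same lifecycle: call reset() once before the first incrementToken(), then end() and close() after the last token. As a reference point, here is a minimal self-contained sketch of that contract; printTokens and the choice of StandardAnalyzer are illustrative only (they do not come from the sources below), and the sketch assumes Lucene 5+, where Analyzer.tokenStream accepts a String and StandardAnalyzer has a no-argument constructor.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical helper illustrating the reset()/incrementToken()/end()/close() contract.
static void printTokens(String text) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(); // any Analyzer works here
    try (TokenStream stream = analyzer.tokenStream("field", text)) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                    // required before the first incrementToken()
        while (stream.incrementToken()) {  // returns false once the stream is exhausted
            System.out.println(term.toString());
        }
        stream.end();                      // records end-of-stream attribute state
    }                                      // try-with-resources invokes close()
}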
From source file:org.elasticsearch.index.analysis.split.AnalysisTests.java
License:Apache License
@Test
public void testTokenFilter() throws IOException {
    String[] strings = new String[] {
            "<em>abc</em>def",
            "<em>this is just a NGram</em>abc<em>def</em>This is",
            "? dong ai hua <em>just</em> the NGram",
            "xsflsy02.sa.nhnsystem.com",
            "nomad::Job::ReturnAnswer:163",
            "2013-01-10 06:29:07 +0000",
            "<123>456",
            "<em>NNB</em>" + "=ULPUSFCVXBNFC " + "NB=GYYDSNJYGIYTEMBW npic="
                    + "<em>SFCuenZVHbx0RFZFoh+a0WALs7qRAYM/3vD26gfSTs4O8u/7rIqsl9I5OnJV9LgnCA</em> "
                    + "page_uid=RT+2zc5Y7tlssvof8wCsssssstZ-140745 "
                    + "BMR= nid_inf=438366115 NID_MATCH_M=1 "
                    + "NID_AUT=<em>95R9DDUsQ6SpQrid2Qpfe0s5BsyH6VRO0jBmpZ/Nmq4TrgPddxY8gUzhTVFyhECwFBBH6tnpd8YslNUK+ARdKEOJSwxM7HOspmslEoVHHHDgTqdfF60lI8opO9JKWVFaAnswVnIFNHTdHUdaCeFvSQ<em> "
                    + "NID_SES=<em>AAABRQjdQk1opAW5ceebr50CEmXN7HrzMrImW4FrXlJACJ1QU2fYDyjIpO/cO/2/+iM0BwZTnLgX4EClbkFwar9MJDr/+0dfX91dMvXV+8WuyiCiWxCWd4FwrsCMHcXthwQGV+C1bCrbU+5C/qeOeGuJCGwVt769y8+Tuy+KBuTGbKDMuUF/SyRq5IwNQ3YL1pMGs+cAnFN2xqFplgJtZvlhI8+8f3GfMxZqEHlXmSSlSpCWkZZYzz9wx2WarU+WtU4WGpnW0Y+Kc347mW2mNaVIDq+AHf4HXE8JHsPqvlzNWlkyS5AHw3tc5bWFy0MhxngOnyG7VqTheb4yxPRhTY0D6fF4TDPr7fjsJ5tuA9oxH+BGuoy6uYIs8uoRI1+HULgI0WCQpiNeVtI1eskacsENBnqECJ3OOyAFzAcr9msv7pr8LYtx0TsNVlLWVS7ug1uH5w</em> "
                    + "ncvid=#vid#_118.217.45.216p3Lj WMONID=DEZg20K2BGS ncvc2=7c1ce4c094a2a31133c5b88ab14e2e56eda35ebba8bf21da60ba865aeeca2ee728d016cd172bbf93e37c2bf73b9136e8073a1f11e2d0ab9cf43394518fbf0ec3adaba8a9b6abb4aba4a0a3a4a1a6b615 nci4=0337dafeeaa7c87a25cb8c9b96771b78d997768ada8665b7478abf4dfaff3ac3c336f650f4ba5c697e8fb3613570e67cd88ff44bafb0f9e0ca00aa61b78337fa95b1bc9bba8bb9b7b691b485cdbeae8da997b3aba285a091e6919cbc98a9ea9c93b78ebff2838aad88b9878b82a580ce8083848988888b8cb9 JSESSIONID=E365D0634FED26492BFFD5DEEE789B66 personaconmain|ektmfrl645=AE8BC98FD74D619FF7B13C83191E1F5EAFCD0F25C43D6BDC693E26D777419A2F845E79DA02B04219 personacon|ektmfrl645= cafeCookieToken=5KCBru-K8k8aHwkbio4dPmLlMyK6WlPYqN0319U4UeImDS9UVPpo70IVLHK9eybq6eJc-rNfllMgB5Fk_i2j-rKM1mCuoOqZ ncu=82b94171693746ae8766724d5696dc1a83e17aed" };
    String[] expected = new String[] {
            "<em>abc</em><i>def</i>",
            "<em>this is just a NGram</em><i>abc</i><em>def</em>This is",
            "<i></i><i></i><i>?</i> <i>dong</i> <i>ai</i> <i>hua</i> <em>just</em> the <i>NGram</i>",
            "<i>xsflsy02</i>.<i>sa.nhnsystem.com</i>",
            "<i>nomad</i>::<i>Job</i>::<i>ReturnAnswer</i>:<i>163</i>",
            "<i>2013</i>-<i>01</i>-<i>10</i> <i>06</i>:<i>29</i>:<i>07</i> +<i>0000</i>",
            "<<i>123</i>><i>456</i>",
            "<em>NNB</em>=<i>ULPUSFCVXBNFC</i> <i>NB</i>=<i>GYYDSNJYGIYTEMBW</i> <i>npic</i>=<em>SFCuenZVHbx0RFZFoh+a0WALs7qRAYM/3vD26gfSTs4O8u/7rIqsl9I5OnJV9LgnCA</em> <i>page_uid</i>=<i>RT</i>+<i>2zc5Y7tlssvof8wCsssssstZ</i>-<i>140745</i> <i>BMR</i>= <i>nid_inf</i>=<i>438366115</i> <i>NID_MATCH_M</i>=<i>1</i> <i>NID_AUT</i>= <i>ncvid</i>=#<i>vid</i>#<i>_118.217.45.216p3Lj</i> <i>WMONID</i>=<i>DEZg20K2BGS</i> <i>ncvc2</i>=<i>7c1ce4c094a2a31133c5b88ab14e2e56eda35ebba8bf21da60ba865aeeca2ee728d016cd172bbf93e37c2bf73b9136e8073a1f11e2d0ab9cf43394518fbf0ec3adaba8a9b6abb4aba4a0a3a4a1a6b615</i> <i>nci4</i>=<i>0337dafeeaa7c87a25cb8c9b96771b78d997768ada8665b7478abf4dfaff3ac3c336f650f4ba5c697e8fb3613570e67cd88ff44bafb0f9e0ca00aa61b78337fa95b1bc9bba8bb9b7b691b485cdbeae8da997b3aba285a091e6919cbc98a9ea9c93b78ebff2838aad88b9878b82a580ce8083848988888b8cb9</i> <i>JSESSIONID</i>=<i>E365D0634FED26492BFFD5DEEE789B66</i> <i>personaconmain</i>|<i>ektmfrl645</i>=<i>AE8BC98FD74D619FF7B13C83191E1F5EAFCD0F25C43D6BDC693E26D777419A2F845E79DA02B04219</i> <i>personacon</i>|<i>ektmfrl645</i>= <i>cafeCookieToken</i>=<i>5KCBru</i>-<i>K8k8aHwkbio4dPmLlMyK6WlPYqN0319U4UeImDS9UVPpo70IVLHK9eybq6eJc</i>-<i>rNfllMgB5Fk_i2j</i>-<i>rKM1mCuoOqZ</i> <i>ncu</i>=<i>82b94171693746ae8766724d5696dc1a83e17aed</i>" };
    Analyzer analyzer = new SplitAnalyzer(Lucene.ANALYZER_VERSION);
    for (int i = 0, len = strings.length; i < len; i++) {
        StringReader sr = new StringReader(strings[i]);
        TokenStream stream = analyzer.tokenStream("f", sr);
        stream.reset();
        List<String> list = new ArrayList<String>();
        while (stream.incrementToken()) {
            CharTermAttribute ta = stream.getAttribute(CharTermAttribute.class);
            list.add(ta.toString());
            System.out.println(ta.toString());
        }
        stream.end();   // complete the TokenStream lifecycle before the next iteration
        stream.close();
        Joiner joiner = Joiner.on("");
        System.out.println("Result:" + joiner.join(list));
        Assert.assertEquals(expected[i], joiner.join(list)); // JUnit takes the expected value first
    }
}
From source file:org.elasticsearch.index.analysis.synonyms.SynonymsAnalysisTest.java
License:Apache License
private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();
    AllEntries allEntries = new AllEntries();
    allEntries.addText("field", source, 1.0f);
    allEntries.reset();
    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringBuilder sb = new StringBuilder();
    while (stream.incrementToken()) {
        sb.append(termAtt.toString()).append(" ");
    }
    stream.end();   // complete the TokenStream lifecycle
    stream.close();
    MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
}
From source file:org.elasticsearch.index.mapper.core.TokenCountFieldMapper.java
License:Apache License
/**
 * Count position increments in a token stream. Package private for testing.
 *
 * @param tokenStream token stream to count
 * @return number of position increments in a token stream
 * @throws IOException if tokenStream throws it
 */
static int countPositions(TokenStream tokenStream) throws IOException {
    try {
        int count = 0;
        PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            count += position.getPositionIncrement();
        }
        tokenStream.end();
        count += position.getPositionIncrement();
        return count;
    } finally {
        tokenStream.close();
    }
}
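TokenStream implements Closeable, so the try/finally in countPositions can equally be written with try-with-resources. A behavior-equivalent sketch, assuming the same imports as the method above plus org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute:

static int countPositions(TokenStream tokenStream) throws IOException {
    try (TokenStream ts = tokenStream) { // close() runs even if reset() or incrementToken() throws
        int count = 0;
        PositionIncrementAttribute position = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            count += position.getPositionIncrement();
        }
        ts.end(); // end() sets one final position increment, counted below
        count += position.getPositionIncrement();
        return count;
    }
}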
From source file:org.elasticsearch.index.mapper.date.LegacyDateMappingTests.java
License:Apache License
private void assertNumericTokensEqual(ParsedDocument doc, DocumentMapper defaultMapper, String fieldA, String fieldB) throws IOException {
    assertThat(doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.mappers().indexAnalyzer(), null), notNullValue());
    assertThat(doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null), notNullValue());
    TokenStream tokenStream = doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.mappers().indexAnalyzer(), null);
    tokenStream.reset();
    LegacyNumericTermAttribute nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class);
    List<Long> values = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        values.add(nta.getRawValue());
    }
    tokenStream = doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null);
    tokenStream.reset();
    nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class);
    int pos = 0;
    while (tokenStream.incrementToken()) {
        assertThat(values.get(pos++), equalTo(nta.getRawValue()));
    }
    assertThat(pos, equalTo(values.size()));
}
From source file:org.elasticsearch.index.mapper.date.SimpleDateMappingTests.java
License:Apache License
private void assertNumericTokensEqual(ParsedDocument doc, DocumentMapper defaultMapper, String fieldA, String fieldB) throws IOException {
    assertThat(doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.indexAnalyzer()), notNullValue());
    assertThat(doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.indexAnalyzer()), notNullValue());
    TokenStream tokenStream = doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.indexAnalyzer());
    tokenStream.reset();
    NumericTermAttribute nta = tokenStream.addAttribute(NumericTermAttribute.class);
    List<Long> values = new ArrayList<Long>();
    while (tokenStream.incrementToken()) {
        values.add(nta.getRawValue());
    }
    tokenStream = doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.indexAnalyzer());
    tokenStream.reset();
    nta = tokenStream.addAttribute(NumericTermAttribute.class);
    int pos = 0;
    while (tokenStream.incrementToken()) {
        assertThat(values.get(pos++), equalTo(nta.getRawValue()));
    }
    assertThat(pos, equalTo(values.size()));
}
From source file:org.elasticsearch.index.mapper.FeatureFieldMapperTests.java
License:Apache License
static int getFrequency(TokenStream tk) throws IOException {
    TermFrequencyAttribute freqAttribute = tk.addAttribute(TermFrequencyAttribute.class);
    tk.reset();
    assertTrue(tk.incrementToken());
    int freq = freqAttribute.getTermFrequency();
    assertFalse(tk.incrementToken());
    return freq;
}
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Query fieldQuery(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanQuery q = new BooleanQuery();
    try {
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            q.add(new TermQuery(term), BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        q = null;
    }
    return q;
}
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Filter fieldFilter(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanFilter f = new BooleanFilter();
    try {
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            f.add(new TermFilter(term), BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        f = null;
    }
    return f;
}
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanQuery q = new BooleanQuery();
    try {
        int remainingSize = sizeIsVariable ? 0 : sizeValue; // note: prefixes are not included
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            if (termAtt.length() < 1 + chunkLength) {
                if (remainingSize > 0) { // implies size is fixed
                    if (remainingSize < chunkLength)
                        q.add(new PrefixLengthQuery(term, 1 + remainingSize, 1 + remainingSize), BooleanClause.Occur.MUST);
                    else
                        q.add(new PrefixLengthQuery(term, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.MUST);
                } else { // varying size: only limit to the chunkLength
                    q.add(new PrefixLengthQuery(term, 0, 1 + chunkLength), BooleanClause.Occur.MUST);
                }
            } else {
                q.add(new TermQuery(term), BooleanClause.Occur.MUST);
            }
            remainingSize -= termAtt.length() - 1; // termAtt contains the prefix, remainingSize doesn't take it into account
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        q = null;
    }
    return q;
}
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Filter prefixFilter(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real filter
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanFilter f = new BooleanFilter();
    try {
        int remainingSize = sizeIsVariable ? 0 : sizeValue; // note: prefixes are not included
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            if (termAtt.length() < 1 + chunkLength) {
                if (remainingSize > 0) { // implies size is fixed
                    if (remainingSize < chunkLength)
                        f.add(new PrefixLengthFilter(term, 1 + remainingSize, 1 + remainingSize), BooleanClause.Occur.MUST);
                    else
                        f.add(new PrefixLengthFilter(term, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.MUST);
                } else { // varying size: only limit to the chunkLength
                    f.add(new PrefixLengthFilter(term, 0, 1 + chunkLength), BooleanClause.Occur.MUST);
                }
            } else {
                f.add(new TermFilter(term), BooleanClause.Occur.MUST);
            }
            remainingSize -= termAtt.length() - 1; // termAtt contains the prefix, remainingSize doesn't take it into account
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        f = null;
    }
    return f;
}
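The four HashSplitterFieldMapper overrides above all begin with the same analyze-and-collect step before assembling a BooleanQuery or BooleanFilter. As a summary, here is a minimal sketch of that shared step; analyzeToTerms is a hypothetical helper, not part of the mapper, and it uses the modern Analyzer.tokenStream(String, String) in place of the deprecated reusableTokenStream seen in the originals.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;

// Hypothetical helper: analyze `value` and collect one Term per emitted token.
static List<Term> analyzeToTerms(Analyzer analyzer, String fieldName, String value) throws IOException {
    List<Term> terms = new ArrayList<>();
    try (TokenStream tok = analyzer.tokenStream(fieldName, value)) {
        CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class);
        tok.reset();
        while (tok.incrementToken()) {
            terms.add(new Term(fieldName, termAtt.toString()));
        }
        tok.end();
    }
    return terms;
}

The caller can then wrap each returned Term in a TermQuery or TermFilter and combine them with BooleanClause.Occur.MUST, exactly as the overrides above do.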