List of usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
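Every example below follows the same lifecycle: call reset() once before the first incrementToken(), then end() and close() after the last token. As a reference point, here is a minimal self-contained sketch of that contract; printTokens and the choice of StandardAnalyzer are illustrative only (they do not come from the sources below), and the sketch assumes Lucene 5+, where Analyzer.tokenStream accepts a String and StandardAnalyzer has a no-argument constructor.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical helper illustrating the reset()/incrementToken()/end()/close() contract.
static void printTokens(String text) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(); // any Analyzer works here
    try (TokenStream stream = analyzer.tokenStream("field", text)) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                    // required before the first incrementToken()
        while (stream.incrementToken()) {  // returns false once the stream is exhausted
            System.out.println(term.toString());
        }
        stream.end();                      // records end-of-stream attribute state
    }                                      // try-with-resources invokes close()
}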
From source file:org.elasticsearch.index.analysis.split.AnalysisTests.java
License:Apache License
@Test
public void testTokenFilter() throws IOException {
    String[] strings = new String[] {
            "<em>abc</em>def",
            "<em>this is just a NGram</em>abc<em>def</em>This is",
            "? dong ai hua <em>just</em> the NGram",
            "xsflsy02.sa.nhnsystem.com",
            "nomad::Job::ReturnAnswer:163",
            "2013-01-10 06:29:07 +0000",
            "<123>456",
            "<em>NNB</em>" + "=ULPUSFCVXBNFC " + "NB=GYYDSNJYGIYTEMBW npic="
                    + "<em>SFCuenZVHbx0RFZFoh+a0WALs7qRAYM/3vD26gfSTs4O8u/7rIqsl9I5OnJV9LgnCA</em> "
                    + "page_uid=RT+2zc5Y7tlssvof8wCsssssstZ-140745 "
                    + "BMR= nid_inf=438366115 NID_MATCH_M=1 "
                    + "NID_AUT=<em>95R9DDUsQ6SpQrid2Qpfe0s5BsyH6VRO0jBmpZ/Nmq4TrgPddxY8gUzhTVFyhECwFBBH6tnpd8YslNUK+ARdKEOJSwxM7HOspmslEoVHHHDgTqdfF60lI8opO9JKWVFaAnswVnIFNHTdHUdaCeFvSQ<em> "
                    + "NID_SES=<em>AAABRQjdQk1opAW5ceebr50CEmXN7HrzMrImW4FrXlJACJ1QU2fYDyjIpO/cO/2/+iM0BwZTnLgX4EClbkFwar9MJDr/+0dfX91dMvXV+8WuyiCiWxCWd4FwrsCMHcXthwQGV+C1bCrbU+5C/qeOeGuJCGwVt769y8+Tuy+KBuTGbKDMuUF/SyRq5IwNQ3YL1pMGs+cAnFN2xqFplgJtZvlhI8+8f3GfMxZqEHlXmSSlSpCWkZZYzz9wx2WarU+WtU4WGpnW0Y+Kc347mW2mNaVIDq+AHf4HXE8JHsPqvlzNWlkyS5AHw3tc5bWFy0MhxngOnyG7VqTheb4yxPRhTY0D6fF4TDPr7fjsJ5tuA9oxH+BGuoy6uYIs8uoRI1+HULgI0WCQpiNeVtI1eskacsENBnqECJ3OOyAFzAcr9msv7pr8LYtx0TsNVlLWVS7ug1uH5w</em> "
                    + "ncvid=#vid#_118.217.45.216p3Lj WMONID=DEZg20K2BGS ncvc2=7c1ce4c094a2a31133c5b88ab14e2e56eda35ebba8bf21da60ba865aeeca2ee728d016cd172bbf93e37c2bf73b9136e8073a1f11e2d0ab9cf43394518fbf0ec3adaba8a9b6abb4aba4a0a3a4a1a6b615 nci4=0337dafeeaa7c87a25cb8c9b96771b78d997768ada8665b7478abf4dfaff3ac3c336f650f4ba5c697e8fb3613570e67cd88ff44bafb0f9e0ca00aa61b78337fa95b1bc9bba8bb9b7b691b485cdbeae8da997b3aba285a091e6919cbc98a9ea9c93b78ebff2838aad88b9878b82a580ce8083848988888b8cb9 JSESSIONID=E365D0634FED26492BFFD5DEEE789B66 personaconmain|ektmfrl645=AE8BC98FD74D619FF7B13C83191E1F5EAFCD0F25C43D6BDC693E26D777419A2F845E79DA02B04219 personacon|ektmfrl645= cafeCookieToken=5KCBru-K8k8aHwkbio4dPmLlMyK6WlPYqN0319U4UeImDS9UVPpo70IVLHK9eybq6eJc-rNfllMgB5Fk_i2j-rKM1mCuoOqZ ncu=82b94171693746ae8766724d5696dc1a83e17aed" };
    String[] expected = new String[] {
            "<em>abc</em><i>def</i>",
            "<em>this is just a NGram</em><i>abc</i><em>def</em>This is",
            "<i></i><i></i><i>?</i> <i>dong</i> <i>ai</i> <i>hua</i> <em>just</em> the <i>NGram</i>",
            "<i>xsflsy02</i>.<i>sa.nhnsystem.com</i>",
            "<i>nomad</i>::<i>Job</i>::<i>ReturnAnswer</i>:<i>163</i>",
            "<i>2013</i>-<i>01</i>-<i>10</i> <i>06</i>:<i>29</i>:<i>07</i> +<i>0000</i>",
            "<<i>123</i>><i>456</i>",
            "<em>NNB</em>=<i>ULPUSFCVXBNFC</i> <i>NB</i>=<i>GYYDSNJYGIYTEMBW</i> <i>npic</i>=<em>SFCuenZVHbx0RFZFoh+a0WALs7qRAYM/3vD26gfSTs4O8u/7rIqsl9I5OnJV9LgnCA</em> <i>page_uid</i>=<i>RT</i>+<i>2zc5Y7tlssvof8wCsssssstZ</i>-<i>140745</i> <i>BMR</i>= <i>nid_inf</i>=<i>438366115</i> <i>NID_MATCH_M</i>=<i>1</i> <i>NID_AUT</i>= <i>ncvid</i>=#<i>vid</i>#<i>_118.217.45.216p3Lj</i> <i>WMONID</i>=<i>DEZg20K2BGS</i> <i>ncvc2</i>=<i>7c1ce4c094a2a31133c5b88ab14e2e56eda35ebba8bf21da60ba865aeeca2ee728d016cd172bbf93e37c2bf73b9136e8073a1f11e2d0ab9cf43394518fbf0ec3adaba8a9b6abb4aba4a0a3a4a1a6b615</i> <i>nci4</i>=<i>0337dafeeaa7c87a25cb8c9b96771b78d997768ada8665b7478abf4dfaff3ac3c336f650f4ba5c697e8fb3613570e67cd88ff44bafb0f9e0ca00aa61b78337fa95b1bc9bba8bb9b7b691b485cdbeae8da997b3aba285a091e6919cbc98a9ea9c93b78ebff2838aad88b9878b82a580ce8083848988888b8cb9</i> <i>JSESSIONID</i>=<i>E365D0634FED26492BFFD5DEEE789B66</i> <i>personaconmain</i>|<i>ektmfrl645</i>=<i>AE8BC98FD74D619FF7B13C83191E1F5EAFCD0F25C43D6BDC693E26D777419A2F845E79DA02B04219</i> <i>personacon</i>|<i>ektmfrl645</i>= <i>cafeCookieToken</i>=<i>5KCBru</i>-<i>K8k8aHwkbio4dPmLlMyK6WlPYqN0319U4UeImDS9UVPpo70IVLHK9eybq6eJc</i>-<i>rNfllMgB5Fk_i2j</i>-<i>rKM1mCuoOqZ</i> <i>ncu</i>=<i>82b94171693746ae8766724d5696dc1a83e17aed</i>" };
    Analyzer analyzer = new SplitAnalyzer(Lucene.ANALYZER_VERSION);
    for (int i = 0, len = strings.length; i < len; i++) {
        StringReader sr = new StringReader(strings[i]);
        TokenStream stream = analyzer.tokenStream("f", sr);
        stream.reset();
        List<String> list = new ArrayList<String>();
        while (stream.incrementToken()) {
            CharTermAttribute ta = stream.getAttribute(CharTermAttribute.class);
            list.add(ta.toString());
            System.out.println(ta.toString());
        }
        stream.end();   // complete the TokenStream lifecycle before the next iteration
        stream.close();
        Joiner joiner = Joiner.on("");
        System.out.println("Result:" + joiner.join(list));
        Assert.assertEquals(expected[i], joiner.join(list)); // JUnit takes the expected value first
    }
}
From source file:org.elasticsearch.index.analysis.synonyms.SynonymsAnalysisTest.java
License:Apache License
private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();
    AllEntries allEntries = new AllEntries();
    allEntries.addText("field", source, 1.0f);
    allEntries.reset();
    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringBuilder sb = new StringBuilder();
    while (stream.incrementToken()) {
        sb.append(termAtt.toString()).append(" ");
    }
    stream.end();   // complete the TokenStream lifecycle
    stream.close();
    MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
}
From source file:org.elasticsearch.index.mapper.core.TokenCountFieldMapper.java
License:Apache License
/**
 * Count position increments in a token stream. Package private for testing.
 *
 * @param tokenStream token stream to count
 * @return number of position increments in a token stream
 * @throws IOException if tokenStream throws it
 */
static int countPositions(TokenStream tokenStream) throws IOException {
    try {
        int count = 0;
        PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            count += position.getPositionIncrement();
        }
        tokenStream.end();
        count += position.getPositionIncrement();
        return count;
    } finally {
        tokenStream.close();
    }
}
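TokenStream implements Closeable, so the try/finally in countPositions can equally be written with try-with-resources. A behavior-equivalent sketch, assuming the same imports as the method above plus org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute:

static int countPositions(TokenStream tokenStream) throws IOException {
    try (TokenStream ts = tokenStream) { // close() runs even if reset() or incrementToken() throws
        int count = 0;
        PositionIncrementAttribute position = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            count += position.getPositionIncrement();
        }
        ts.end(); // end() sets one final position increment, counted below
        count += position.getPositionIncrement();
        return count;
    }
}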
From source file:org.elasticsearch.index.mapper.date.LegacyDateMappingTests.java
License:Apache License
private void assertNumericTokensEqual(ParsedDocument doc, DocumentMapper defaultMapper, String fieldA, String fieldB) throws IOException {
    assertThat(doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.mappers().indexAnalyzer(), null), notNullValue());
    assertThat(doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null), notNullValue());
    TokenStream tokenStream = doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.mappers().indexAnalyzer(), null);
    tokenStream.reset();
    LegacyNumericTermAttribute nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class);
    List<Long> values = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        values.add(nta.getRawValue());
    }
    tokenStream = doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null);
    tokenStream.reset();
    nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class);
    int pos = 0;
    while (tokenStream.incrementToken()) {
        assertThat(values.get(pos++), equalTo(nta.getRawValue()));
    }
    assertThat(pos, equalTo(values.size()));
}
From source file:org.elasticsearch.index.mapper.date.SimpleDateMappingTests.java
License:Apache License
private void assertNumericTokensEqual(ParsedDocument doc, DocumentMapper defaultMapper, String fieldA, String fieldB) throws IOException {
    assertThat(doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.indexAnalyzer()), notNullValue());
    assertThat(doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.indexAnalyzer()), notNullValue());
    TokenStream tokenStream = doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.indexAnalyzer());
    tokenStream.reset();
    NumericTermAttribute nta = tokenStream.addAttribute(NumericTermAttribute.class);
    List<Long> values = new ArrayList<Long>();
    while (tokenStream.incrementToken()) {
        values.add(nta.getRawValue());
    }
    tokenStream = doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.indexAnalyzer());
    tokenStream.reset();
    nta = tokenStream.addAttribute(NumericTermAttribute.class);
    int pos = 0;
    while (tokenStream.incrementToken()) {
        assertThat(values.get(pos++), equalTo(nta.getRawValue()));
    }
    assertThat(pos, equalTo(values.size()));
}
From source file:org.elasticsearch.index.mapper.FeatureFieldMapperTests.java
License:Apache License
static int getFrequency(TokenStream tk) throws IOException {
    TermFrequencyAttribute freqAttribute = tk.addAttribute(TermFrequencyAttribute.class);
    tk.reset();
    assertTrue(tk.incrementToken());
    int freq = freqAttribute.getTermFrequency();
    assertFalse(tk.incrementToken());
    return freq;
}
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Query fieldQuery(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanQuery q = new BooleanQuery();
    try {
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            q.add(new TermQuery(term), BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        q = null;
    }
    return q;
}
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Filter fieldFilter(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanFilter f = new BooleanFilter();
    try {
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            f.add(new TermFilter(term), BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        f = null;
    }
    return f;
}
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanQuery q = new BooleanQuery();
    try {
        int remainingSize = sizeIsVariable ? 0 : sizeValue; // note: prefixes are not included
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            if (termAtt.length() < 1 + chunkLength) {
                if (remainingSize > 0) { // implies size is fixed
                    if (remainingSize < chunkLength)
                        q.add(new PrefixLengthQuery(term, 1 + remainingSize, 1 + remainingSize), BooleanClause.Occur.MUST);
                    else
                        q.add(new PrefixLengthQuery(term, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.MUST);
                } else { // varying size: only limit to the chunkLength
                    q.add(new PrefixLengthQuery(term, 0, 1 + chunkLength), BooleanClause.Occur.MUST);
                }
            } else {
                q.add(new TermQuery(term), BooleanClause.Occur.MUST);
            }
            remainingSize -= termAtt.length() - 1; // termAtt contains the prefix, remainingSize doesn't take it into account
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        q = null;
    }
    return q;
}
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
@Override
public Filter prefixFilter(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real filter
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanFilter f = new BooleanFilter();
    try {
        int remainingSize = sizeIsVariable ? 0 : sizeValue; // note: prefixes are not included
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            if (termAtt.length() < 1 + chunkLength) {
                if (remainingSize > 0) { // implies size is fixed
                    if (remainingSize < chunkLength)
                        f.add(new PrefixLengthFilter(term, 1 + remainingSize, 1 + remainingSize), BooleanClause.Occur.MUST);
                    else
                        f.add(new PrefixLengthFilter(term, 1 + chunkLength, 1 + chunkLength), BooleanClause.Occur.MUST);
                } else { // varying size: only limit to the chunkLength
                    f.add(new PrefixLengthFilter(term, 0, 1 + chunkLength), BooleanClause.Occur.MUST);
                }
            } else {
                f.add(new TermFilter(term), BooleanClause.Occur.MUST);
            }
            remainingSize -= termAtt.length() - 1; // termAtt contains the prefix, remainingSize doesn't take it into account
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        f = null;
    }
    return f;
}
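The four HashSplitterFieldMapper overrides above all begin with the same analyze-and-collect step before assembling a BooleanQuery or BooleanFilter. As a summary, here is a minimal sketch of that shared step; analyzeToTerms is a hypothetical helper, not part of the mapper, and it uses the modern Analyzer.tokenStream(String, String) in place of the deprecated reusableTokenStream seen in the originals.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;

// Hypothetical helper: analyze `value` and collect one Term per emitted token.
static List<Term> analyzeToTerms(Analyzer analyzer, String fieldName, String value) throws IOException {
    List<Term> terms = new ArrayList<>();
    try (TokenStream tok = analyzer.tokenStream(fieldName, value)) {
        CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class);
        tok.reset();
        while (tok.incrementToken()) {
            terms.add(new Term(fieldName, termAtt.toString()));
        }
        tok.end();
    }
    return terms;
}

The caller can then wrap each returned Term in a TermQuery or TermFilter and combine them with BooleanClause.Occur.MUST, exactly as the overrides above do.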