Usage examples for org.apache.lucene.analysis.TokenStream.reset()
public void reset() throws IOException
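All of the examples below follow the same consumer contract: add the attributes you need, call reset() once, loop on incrementToken(), then call end() and close(). The following is a minimal sketch of that workflow, assuming a StandardAnalyzer; the field name "body" and the helper method are illustrative, not taken from the examples below.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Minimal sketch of the reset()/incrementToken()/end()/close() contract.
static void printTokens(String text) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    try (TokenStream stream = analyzer.tokenStream("body", text)) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                       // must precede the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        stream.end();                         // consume end-of-stream state (final offset etc.)
    }                                         // try-with-resources calls close()
}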
From source file:org.elasticsearch.analysis.common.CompoundAnalysisTests.java
License:Apache License
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
    AnalysisModule analysisModule = createAnalysisModule(settings);
    IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
    TokenStream stream = analyzer.tokenStream("", text);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    List<String> terms = new ArrayList<>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}
From source file:org.elasticsearch.analysis.common.SynonymsAnalysisTests.java
License:Apache License
private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
    TokenStream stream = analyzer.tokenStream("", source);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringBuilder sb = new StringBuilder();
    while (stream.incrementToken()) {
        sb.append(termAtt.toString()).append(" ");
    }
    MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
}
From source file:org.elasticsearch.analysis.common.UniqueTokenFilterTests.java
License:Apache License
public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };
    TokenStream test = analyzer.tokenStream("test", "this test with test");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("this"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("test"));
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("with"));
    assertThat(test.incrementToken(), equalTo(false));
}
From source file:org.elasticsearch.common.lucene.all.SimpleAllTests.java
License:Apache License
@Test
public void testBoostOnEagerTokenizer() throws Exception {
    AllEntries allEntries = new AllEntries();
    allEntries.addText("field1", "all", 2.0f);
    allEntries.addText("field2", "your", 1.0f);
    allEntries.addText("field1", "boosts", 0.5f);
    allEntries.reset();
    // whitespace analyzer's tokenizer reads characters eagerly on the contrary to the standard tokenizer
    final TokenStream ts = AllTokenStream.allTokenStream("any", allEntries, new WhitespaceAnalyzer(Lucene.VERSION));
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    ts.reset();
    for (int i = 0; i < 3; ++i) {
        assertTrue(ts.incrementToken());
        final String term;
        final float boost;
        switch (i) {
        case 0:
            term = "all";
            boost = 2;
            break;
        case 1:
            term = "your";
            boost = 1;
            break;
        case 2:
            term = "boosts";
            boost = 0.5f;
            break;
        default:
            throw new AssertionError();
        }
        assertEquals(term, termAtt.toString());
        final BytesRef payload = payloadAtt.getPayload();
        if (payload == null || payload.length == 0) {
            assertEquals(boost, 1f, 0.001f);
        } else {
            assertEquals(4, payload.length);
            final float b = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            assertEquals(boost, b, 0.001f);
        }
    }
    assertFalse(ts.incrementToken());
}
From source file:org.elasticsearch.docvalues.string.DVStringFieldMapper.java
License:Apache License
@Override
protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
    // luckily this is single thread access and we dont need a thread local.
    hasDocValsNow = false;
    super.parseCreateField(context, fields);
    hasDocValsNow = true;
    String value = null;
    if (context.externalValueSet()) {
        value = (String) context.externalValue();
    } else {
        for (Field f : fields) {
            Class<?> fClass = f.getClass();
            if (fClass == Field.class || fClass == TextField.class || fClass == StringField.class) {
                value = f.stringValue();
                break;
            }
        }
    }
    if (value != null) {
        TokenStream stream = docValuesAnalyzer.analyzer().tokenStream(null, new StringReader(value));
        CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String token = cattr.toString();
            // take the first token and make it a doc value
            fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(token)));
            break;
        }
        stream.end();
        stream.close();
    }
}
From source file:org.elasticsearch.index.analysis.AnalysisRegistryTests.java
License:Apache License
public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    Settings indexSettings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
            .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList())
            .getAnalysisRegistry().build(idxSettings);
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
        assertNotNull(custom_analyser);
        TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        List<String> token = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            token.add(charTermAttribute.toString());
        }
        assertEquals(token.toString(), 2, token.size());
        assertEquals("j2se", token.get(0));
        assertEquals("j2ee", token.get(1));
    }
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
        assertNotNull(custom_analyser);
        TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        List<String> token = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            token.add(charTermAttribute.toString());
        }
        assertEquals(token.toString(), 6, token.size());
        assertEquals("j", token.get(0));
        assertEquals("2", token.get(1));
        assertEquals("se", token.get(2));
        assertEquals("j", token.get(3));
        assertEquals("2", token.get(4));
        assertEquals("ee", token.get(5));
    }
}
From source file:org.elasticsearch.index.analysis.CompoundAnalysisTests.java
License:Apache License
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    Index index = new Index("test");
    Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
            new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, settings), new IndexNameModule(index),
                    new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)))
            .createChildInjector(parentInjector);
    AnalysisService analysisService = injector.getInstance(AnalysisService.class);
    Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();
    AllEntries allEntries = new AllEntries();
    allEntries.addText("field1", text, 1.0f);
    allEntries.reset();
    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    List<String> terms = new ArrayList<String>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}
From source file:org.elasticsearch.index.analysis.CustomWBAnalysisTests.java
License:Apache License
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    assertThat(termAttr, notNullValue());
    int i = 0;
    while (stream.incrementToken()) {
        assertThat(expected.length, greaterThan(i));
        assertThat("expected different term at index " + i, expected[i++], equalTo(termAttr.toString()));
    }
    assertThat("not all tokens produced", i, equalTo(expected.length));
}
From source file:org.elasticsearch.index.analysis.morphology.SimpleMorphologyAnalysisTests.java
License:Apache License
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        Assert.assertTrue("got extra term: " + termAttr.toString(), i < expected.length);
        Assert.assertEquals("expected different term at index " + i, termAttr.toString(), expected[i]);
        i++;
    }
    Assert.assertEquals("not all tokens produced", i, expected.length);
}
From source file:org.elasticsearch.index.analysis.NumericAnalyzerTests.java
License:Apache License
@Test
public void testAttributeEqual() throws IOException {
    final int precisionStep = 8;
    final double value = randomDouble();
    NumericDoubleAnalyzer analyzer = new NumericDoubleAnalyzer(precisionStep);
    final TokenStream ts1 = analyzer.tokenStream("dummy", String.valueOf(value));
    final NumericTokenStream ts2 = new NumericTokenStream(precisionStep);
    ts2.setDoubleValue(value);
    // the *2 attributes must come from ts2, otherwise the test compares ts1 with itself
    final NumericTermAttribute numTerm1 = ts1.addAttribute(NumericTermAttribute.class);
    final NumericTermAttribute numTerm2 = ts2.addAttribute(NumericTermAttribute.class);
    final PositionIncrementAttribute posInc1 = ts1.addAttribute(PositionIncrementAttribute.class);
    final PositionIncrementAttribute posInc2 = ts2.addAttribute(PositionIncrementAttribute.class);
    ts1.reset();
    ts2.reset();
    while (ts1.incrementToken()) {
        assertThat(ts2.incrementToken(), is(true));
        assertThat(posInc1, equalTo(posInc2));
        // can't use equalTo directly on the numeric attribute cause it doesn't implement equals (LUCENE-5070)
        assertThat(numTerm1.getRawValue(), equalTo(numTerm2.getRawValue()));
        assertThat(numTerm1.getShift(), equalTo(numTerm2.getShift()));
    }
    assertThat(ts2.incrementToken(), is(false));
    ts1.end();
    ts2.end();
}