List of usage examples for the method org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
From source file:org.elasticsearch.analysis.common.SynonymsAnalysisTests.java
License:Apache License
/**
 * Analyzes {@code source} with the named analyzer and asserts that the
 * space-joined sequence of produced terms equals {@code target}.
 *
 * @param analyzerName key into {@code indexAnalyzers}
 * @param source       text to analyze
 * @param target       expected terms joined by single spaces
 * @throws IOException if token-stream consumption fails
 */
private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
    // try-with-resources releases the stream even when an assertion or I/O
    // error interrupts consumption (the original never closed it).
    try (TokenStream stream = analyzer.tokenStream("", source)) {
        // Per the TokenStream consumer contract, attributes are added
        // before reset().
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        StringBuilder sb = new StringBuilder();
        while (stream.incrementToken()) {
            sb.append(termAtt.toString()).append(" ");
        }
        stream.end(); // required final step of the consumer workflow
        MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
    }
}
From source file:org.elasticsearch.analysis.common.UniqueTokenFilterTests.java
License:Apache License
/**
 * Verifies that UniqueTokenFilter removes duplicate terms:
 * "this test with test" must produce exactly this, test, with.
 */
public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };
    // try-with-resources closes the stream even on assertion failure
    // (the original leaked it and never called end()).
    try (TokenStream test = analyzer.tokenStream("test", "this test with test")) {
        CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
        test.reset();
        assertThat(test.incrementToken(), equalTo(true));
        assertThat(termAttribute.toString(), equalTo("this"));
        assertThat(test.incrementToken(), equalTo(true));
        assertThat(termAttribute.toString(), equalTo("test"));
        assertThat(test.incrementToken(), equalTo(true));
        assertThat(termAttribute.toString(), equalTo("with"));
        // the duplicate "test" is filtered out, so the stream is exhausted
        assertThat(test.incrementToken(), equalTo(false));
        test.end();
    }
}
From source file:org.elasticsearch.common.lucene.all.SimpleAllTests.java
License:Apache License
@Test public void testBoostOnEagerTokenizer() throws Exception { AllEntries allEntries = new AllEntries(); allEntries.addText("field1", "all", 2.0f); allEntries.addText("field2", "your", 1.0f); allEntries.addText("field1", "boosts", 0.5f); allEntries.reset();//from w w w . j a v a2s . c o m // whitespace analyzer's tokenizer reads characters eagerly on the contrary to the standard tokenizer final TokenStream ts = AllTokenStream.allTokenStream("any", allEntries, new WhitespaceAnalyzer(Lucene.VERSION)); final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); ts.reset(); for (int i = 0; i < 3; ++i) { assertTrue(ts.incrementToken()); final String term; final float boost; switch (i) { case 0: term = "all"; boost = 2; break; case 1: term = "your"; boost = 1; break; case 2: term = "boosts"; boost = 0.5f; break; default: throw new AssertionError(); } assertEquals(term, termAtt.toString()); final BytesRef payload = payloadAtt.getPayload(); if (payload == null || payload.length == 0) { assertEquals(boost, 1f, 0.001f); } else { assertEquals(4, payload.length); final float b = PayloadHelper.decodeFloat(payload.bytes, payload.offset); assertEquals(boost, b, 0.001f); } } assertFalse(ts.incrementToken()); }
From source file:org.elasticsearch.docvalues.string.DVStringFieldMapper.java
License:Apache License
@Override protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException { // luckily this is single thread access and we dont need a thread local. hasDocValsNow = false;// w w w . j a v a 2 s . c om super.parseCreateField(context, fields); hasDocValsNow = true; String value = null; if (context.externalValueSet()) { value = (String) context.externalValue(); } else { for (Field f : fields) { Class<?> fClass = f.getClass(); if (fClass == Field.class || fClass == TextField.class || fClass == StringField.class) { value = f.stringValue(); break; } } } if (value != null) { TokenStream stream = docValuesAnalyzer.analyzer().tokenStream(null, new StringReader(value)); CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { String token = cattr.toString(); // take the first token and make it a doc value fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(token))); break; } stream.end(); stream.close(); } }
From source file:org.elasticsearch.index.analysis.AnalysisRegistryTests.java
License:Apache License
/**
 * Checks that a custom "word_delimiter" filter configured with
 * split_on_numerics=false keeps tokens like "j2se" intact, while the default
 * "word_delimiter" filter splits them into letter/digit runs.
 */
public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
            .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList())
            .getAnalysisRegistry().build(idxSettings);
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
        assertNotNull(custom_analyser);
        // split_on_numerics=false: camel-case/number tokens survive intact
        List<String> token = analyzeToTerms(custom_analyser, "J2SE j2ee");
        assertEquals(token.toString(), 2, token.size());
        assertEquals("j2se", token.get(0));
        assertEquals("j2ee", token.get(1));
    }
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
        assertNotNull(custom_analyser);
        // default word_delimiter: tokens split on letter/digit boundaries
        List<String> token = analyzeToTerms(custom_analyser, "J2SE j2ee");
        assertEquals(token.toString(), 6, token.size());
        assertEquals("j", token.get(0));
        assertEquals("2", token.get(1));
        assertEquals("se", token.get(2));
        assertEquals("j", token.get(3));
        assertEquals("2", token.get(4));
        assertEquals("ee", token.get(5));
    }
}

/**
 * Runs {@code text} through {@code analyzer} and returns all produced terms
 * in order. Closes the token stream (the original test leaked it and never
 * called end()).
 */
private static List<String> analyzeToTerms(Analyzer analyzer, String text) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream("foo", text)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        List<String> terms = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
        return terms;
    }
}
From source file:org.elasticsearch.index.analysis.CompoundAnalysisTests.java
License:Apache License
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException { Index index = new Index("test"); Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector(); Injector injector = new ModulesBuilder() .add(new IndexSettingsModule(index, settings), new IndexNameModule(index), new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class))) .createChildInjector(parentInjector); AnalysisService analysisService = injector.getInstance(AnalysisService.class); Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer(); AllEntries allEntries = new AllEntries(); allEntries.addText("field1", text, 1.0f); allEntries.reset();// www .j av a 2 s . c o m TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer); stream.reset(); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); List<String> terms = new ArrayList<String>(); while (stream.incrementToken()) { String tokText = termAtt.toString(); terms.add(tokText); } return terms; }
From source file:org.elasticsearch.index.analysis.CustomWBAnalysisTests.java
License:Apache License
/**
 * Drains {@code stream} and asserts that its terms match {@code expected},
 * in order and in full. The caller owns the stream's lifecycle.
 */
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
    assertThat(term, notNullValue());
    int index = 0;
    while (stream.incrementToken()) {
        // guard against the stream producing more tokens than expected
        assertThat(expected.length, greaterThan(index));
        assertThat("expected different term at index " + index, expected[index], equalTo(term.toString()));
        index++;
    }
    // and against it producing fewer
    assertThat("not all tokens produced", index, equalTo(expected.length));
}
From source file:org.elasticsearch.index.analysis.morphology.SimpleMorphologyAnalysisTests.java
License:Apache License
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException { stream.reset();// w ww .j a va 2s . c o m CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); Assert.assertNotNull(termAttr); int i = 0; while (stream.incrementToken()) { Assert.assertTrue("got extra term: " + termAttr.toString(), i < expected.length); Assert.assertEquals("expected different term at index " + i, termAttr.toString(), expected[i]); i++; } Assert.assertEquals("not all tokens produced", i, expected.length); }
From source file:org.elasticsearch.index.analysis.NumericAnalyzerTests.java
License:Apache License
@Test public void testAttributeEqual() throws IOException { final int precisionStep = 8; final double value = randomDouble(); NumericDoubleAnalyzer analyzer = new NumericDoubleAnalyzer(precisionStep); final TokenStream ts1 = analyzer.tokenStream("dummy", String.valueOf(value)); final NumericTokenStream ts2 = new NumericTokenStream(precisionStep); ts2.setDoubleValue(value);/*from w w w . j a va 2 s.co m*/ final NumericTermAttribute numTerm1 = ts1.addAttribute(NumericTermAttribute.class); final NumericTermAttribute numTerm2 = ts1.addAttribute(NumericTermAttribute.class); final PositionIncrementAttribute posInc1 = ts1.addAttribute(PositionIncrementAttribute.class); final PositionIncrementAttribute posInc2 = ts1.addAttribute(PositionIncrementAttribute.class); ts1.reset(); ts2.reset(); while (ts1.incrementToken()) { assertThat(ts2.incrementToken(), is(true)); assertThat(posInc1, equalTo(posInc2)); // can't use equalTo directly on the numeric attribute cause it doesn't implement equals (LUCENE-5070) assertThat(numTerm1.getRawValue(), equalTo(numTerm2.getRawValue())); assertThat(numTerm2.getShift(), equalTo(numTerm2.getShift())); } assertThat(ts2.incrementToken(), is(false)); ts1.end(); ts2.end(); }
From source file:org.elasticsearch.index.analysis.PaodingAnalysisTests.java
License:Apache License
public List getname(String param) throws IOException { System.setProperty("paoding.dic.home.config-first", "D:/Projects/Java Related/ElasticSearch/plugins/elasticsearch-analysis-paoding/config/paoding/dic"); //?(??)//from w w w. j a va2s . c o m Analyzer ika = new PaodingAnalyzer(); List<String> keys = new ArrayList<String>(); TokenStream ts = null; try { Reader r = new StringReader(param); ts = ika.tokenStream("TestField", r); CharTermAttribute termAtt = (CharTermAttribute) ts.getAttribute(CharTermAttribute.class); TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class); String key = null; while (ts.incrementToken()) { if ("word".equals(typeAtt.type())) { key = termAtt.toString(); if (key.length() >= 2) { keys.add(key); } } } } catch (IOException e) { e.printStackTrace(); } finally { if (ts != null) { ts.close(); } } Map<String, Integer> keyMap = new HashMap<String, Integer>(); Integer $ = null; //?? for (String key : keys) { keyMap.put(key, ($ = keyMap.get(key)) == null ? 1 : $ + 1); } List<Map.Entry<String, Integer>> keyList = new ArrayList<Map.Entry<String, Integer>>(keyMap.entrySet()); //? Collections.sort(keyList, new Comparator<Map.Entry<String, Integer>>() { public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) { return (o2.getValue() - o1.getValue()); } }); //?? String id = null; String str = ""; List list = new ArrayList(); if (keyList.size() > 0) { for (int i = 0; i < keyList.size(); i++) { id = keyList.get(i).toString(); String[] strs = id.split("\\="); str = strs[0]; list.add(strs[0]); System.out.println("id:" + id); } } return list; }