List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file: org.elasticsearch.index.analysis.RSLPTokenFilterTests.java
License: Apache License
@Test public void testRSLPPhrases() throws Exception { Index index = new Index("test"); Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put("path.home", createTempDir()).put("index.analysis.analyzer.myAnalyzer.type", "custom") .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard") .put("index.analysis.analyzer.myAnalyzer.filter", "br_rslp").build(); AnalysisService analysisService = createAnalysisService(index, settings); Analyzer analyzer = analysisService.analyzer("myAnalyzer"); Map<String, List<String>> phrases = buildPhraseList(); for (String phrase : phrases.keySet()) { List<String> outputWords = phrases.get(phrase); TokenStream ts = analyzer.tokenStream("test", phrase); CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class); ts.reset();//www. j a va2s. co m for (String expected : outputWords) { assertThat(ts.incrementToken(), equalTo(true)); assertThat(term1.toString(), equalTo(expected)); } ts.close(); } }
From source file: org.elasticsearch.index.analysis.SimpleIcuCollationTokenFilterTests.java
License: Apache License
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException { CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class); CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class); stream1.reset();//ww w . j a v a2s.c om stream2.reset(); assertThat(stream1.incrementToken(), equalTo(true)); assertThat(stream2.incrementToken(), equalTo(true)); assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison))); assertThat(stream1.incrementToken(), equalTo(false)); assertThat(stream2.incrementToken(), equalTo(false)); stream1.end(); stream2.end(); stream1.close(); stream2.close(); }
From source file: org.elasticsearch.index.analysis.SimplePolishTokenFilterTests.java
License: Apache License
/**
 * Feeds {@code source} through a keyword tokenizer and the "polish_stem" filter
 * and asserts the single resulting term equals {@code expected}.
 *
 * @param source   input text (treated as one keyword token)
 * @param expected expected stemmed term
 * @throws IOException if tokenization fails
 */
private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test");
    Settings settings = Settings.settingsBuilder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir())
            .put("index.analysis.filter.myStemmer.type", "polish_stem")
            .build();
    AnalysisService analysisService = createAnalysisService(index, settings);
    TokenFilterFactory filterFactory = analysisService.tokenFilter("myStemmer");
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream ts = filterFactory.create(tokenizer);
    try {
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(expected));
        // TokenStream contract: end() after consumption; close() releases the reader.
        ts.end();
    } finally {
        ts.close();
    }
}
From source file: org.elasticsearch.index.analysis.SimplePolishTokenFilterTests.java
License: Apache License
private void testAnalyzer(String source, String... expected_terms) throws IOException { Index index = new Index("test"); Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put("path.home", createTempDir()).build(); AnalysisService analysisService = createAnalysisService(index, settings); Analyzer analyzer = analysisService.analyzer("polish").analyzer(); TokenStream ts = analyzer.tokenStream("test", source); CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class); ts.reset();//from w w w .j ava 2s . c o m for (String expected : expected_terms) { assertThat(ts.incrementToken(), equalTo(true)); assertThat(term1.toString(), equalTo(expected)); } }
From source file: org.elasticsearch.index.analysis.SimpleUkrainianAnalyzerTests.java
License: Apache License
/**
 * Analyzes {@code source} with the "ukrainian" analyzer from the plugin and
 * asserts the emitted terms match {@code expected_terms} exactly, with no
 * trailing tokens.
 *
 * @param source         input text to analyze
 * @param expected_terms expected terms, in emission order
 * @throws IOException if analysis fails
 */
private static void testAnalyzer(String source, String... expected_terms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY,
            new AnalysisUkrainianPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("ukrainian").analyzer();
    TokenStream ts = analyzer.tokenStream("test", source);
    try {
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        for (String expected : expected_terms) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term1.toString(), equalTo(expected));
        }
        // Stream must be exhausted: no tokens beyond the expected ones.
        assertThat(ts.incrementToken(), equalTo(false));
        // TokenStream contract: end() after consumption; close() in finally.
        ts.end();
    } finally {
        ts.close();
    }
}
From source file: org.elasticsearch.index.analysis.SimpleVietnameseAnalyzerTests.java
License: Apache License
private static void testAnalyzer(String source, String... expected_terms) throws IOException { TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY, new AnalysisVietnamesePlugin()); Analyzer analyzer = analysis.indexAnalyzers.get("vietnamese").analyzer(); TokenStream ts = analyzer.tokenStream("test", source); CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class); ts.reset();//from w w w . j a va 2 s . c o m for (String expected : expected_terms) { assertThat(ts.incrementToken(), equalTo(true)); assertThat(term1.toString(), equalTo(expected)); } assertThat(ts.incrementToken(), equalTo(false)); }
From source file: org.elasticsearch.index.analysis.SmartChineseAnalysisTests.java
License: Apache License
/**
 * Analyzes a Chinese sentence with the "smartcn" analyzer and asserts it is
 * segmented into 4 word-level tokens (the SmartAnalyzer keeps multi-character
 * words such as "China" as single tokens, unlike the default per-character
 * Chinese analyzers which would produce 5).
 *
 * NOTE(review): the original CJK string literals were garbled to "" during
 * extraction and cannot be recovered here — restore them from the upstream
 * test before relying on this method.
 */
@Test
public void analyzeSomeChineseText() throws Exception {
    Index index = new Index("test");
    Injector parentInjector = new ModulesBuilder()
            .add(new SettingsModule(EMPTY_SETTINGS),
                    new EnvironmentModule(new Environment(EMPTY_SETTINGS)),
                    new IndicesAnalysisModule())
            .createInjector();
    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, EMPTY_SETTINGS), new IndexNameModule(index),
                    new AnalysisModule(EMPTY_SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class))
                            .addProcessor(new SmartChineseAnalysisBinderProcessor()))
            .createChildInjector(parentInjector);
    AnalysisService analysisService = injector.getInstance(AnalysisService.class);
    Analyzer analyzer = analysisService.analyzer("smartcn").analyzer();
    AllEntries allEntries = new AllEntries();
    allEntries.addText("message", "", 1.0f);
    allEntries.reset();
    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    try {
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        // TokenStream contract requires reset() before the first incrementToken().
        stream.reset();
        List<String> terms = new ArrayList<String>();
        while (stream.incrementToken()) {
            terms.add(termAtt.term());
        }
        stream.end();
        MatcherAssert.assertThat(terms.size(), equalTo(4));
        MatcherAssert.assertThat(terms, hasItems("", "", "", ""));
    } finally {
        stream.close();
    }
}
From source file: org.elasticsearch.index.analysis.synonyms.SynonymsAnalysisTest.java
License: Apache License
/**
 * Analyzes {@code source} with the named analyzer and asserts the
 * space-joined terms equal {@code target}.
 *
 * @param analyzerName name of the analyzer registered with the analysis service
 * @param source       input text to analyze
 * @param target       expected space-separated term string
 * @throws IOException if analysis fails
 */
private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();

    AllEntries allEntries = new AllEntries();
    allEntries.addText("field", source, 1.0f);
    allEntries.reset();

    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    try {
        stream.reset();
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

        StringBuilder sb = new StringBuilder();
        while (stream.incrementToken()) {
            sb.append(termAtt.toString()).append(" ");
        }
        // TokenStream contract: end() after the last token; close() in finally.
        stream.end();

        MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
    } finally {
        stream.close();
    }
}
From source file: org.elasticsearch.index.mapper.core.TokenCountFieldMapper.java
License: Apache License
/** * Count position increments in a token stream. Package private for testing. * @param tokenStream token stream to count * @return number of position increments in a token stream * @throws IOException if tokenStream throws it *///from www .ja v a2 s . c om static int countPositions(TokenStream tokenStream) throws IOException { try { int count = 0; PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { count += position.getPositionIncrement(); } tokenStream.end(); count += position.getPositionIncrement(); return count; } finally { tokenStream.close(); } }
From source file: org.elasticsearch.index.mapper.date.LegacyDateMappingTests.java
License: Apache License
private void assertNumericTokensEqual(ParsedDocument doc, DocumentMapper defaultMapper, String fieldA, String fieldB) throws IOException { assertThat(doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.mappers().indexAnalyzer(), null), notNullValue());// w w w .java 2 s.c o m assertThat(doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null), notNullValue()); TokenStream tokenStream = doc.rootDoc().getField(fieldA) .tokenStream(defaultMapper.mappers().indexAnalyzer(), null); tokenStream.reset(); LegacyNumericTermAttribute nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class); List<Long> values = new ArrayList<>(); while (tokenStream.incrementToken()) { values.add(nta.getRawValue()); } tokenStream = doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null); tokenStream.reset(); nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class); int pos = 0; while (tokenStream.incrementToken()) { assertThat(values.get(pos++), equalTo(nta.getRawValue())); } assertThat(pos, equalTo(values.size())); }