List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:org.elasticsearch.common.lucene.all.SimpleAllTests.java
License:Apache License
@Test public void testBoostOnEagerTokenizer() throws Exception { AllEntries allEntries = new AllEntries(); allEntries.addText("field1", "all", 2.0f); allEntries.addText("field2", "your", 1.0f); allEntries.addText("field1", "boosts", 0.5f); allEntries.reset();/* ww w. j ava2 s.co m*/ // whitespace analyzer's tokenizer reads characters eagerly on the contrary to the standard tokenizer final TokenStream ts = AllTokenStream.allTokenStream("any", allEntries, new WhitespaceAnalyzer(Lucene.VERSION)); final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); ts.reset(); for (int i = 0; i < 3; ++i) { assertTrue(ts.incrementToken()); final String term; final float boost; switch (i) { case 0: term = "all"; boost = 2; break; case 1: term = "your"; boost = 1; break; case 2: term = "boosts"; boost = 0.5f; break; default: throw new AssertionError(); } assertEquals(term, termAtt.toString()); final BytesRef payload = payloadAtt.getPayload(); if (payload == null || payload.length == 0) { assertEquals(boost, 1f, 0.001f); } else { assertEquals(4, payload.length); final float b = PayloadHelper.decodeFloat(payload.bytes, payload.offset); assertEquals(boost, b, 0.001f); } } assertFalse(ts.incrementToken()); }
From source file:org.elasticsearch.docvalues.string.DVStringFieldMapper.java
License:Apache License
@Override protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException { // luckily this is single thread access and we dont need a thread local. hasDocValsNow = false;// w w w .j a va2 s . c o m super.parseCreateField(context, fields); hasDocValsNow = true; String value = null; if (context.externalValueSet()) { value = (String) context.externalValue(); } else { for (Field f : fields) { Class<?> fClass = f.getClass(); if (fClass == Field.class || fClass == TextField.class || fClass == StringField.class) { value = f.stringValue(); break; } } } if (value != null) { TokenStream stream = docValuesAnalyzer.analyzer().tokenStream(null, new StringReader(value)); CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { String token = cattr.toString(); // take the first token and make it a doc value fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(token))); break; } stream.end(); stream.close(); } }
From source file:org.elasticsearch.index.analysis.Analysis.java
License:Apache License
/**
 * Check whether the provided token stream is able to provide character terms.
 * <p>Although most analyzers generate character terms (CharTermAttribute),
 * some token streams only contain binary terms (BinaryTermAttribute,
 * CharTermAttribute being a special type of BinaryTermAttribute), such as
 * {@link NumericTokenStream}, and are unsuitable for highlighting and
 * more-like-this queries which expect character terms.</p>
 */
public static boolean isCharacterTokenStream(TokenStream tokenStream) {
    boolean providesCharacterTerms;
    try {
        // attribute factories for binary-only streams reject CharTermAttribute
        tokenStream.addAttribute(CharTermAttribute.class);
        providesCharacterTerms = true;
    } catch (IllegalArgumentException e) {
        providesCharacterTerms = false;
    }
    return providesCharacterTerms;
}
From source file:org.elasticsearch.index.analysis.AnalysisRegistryTests.java
License:Apache License
/**
 * Checks that a custom word_delimiter filter configured with
 * split_on_numerics=false keeps tokens like "j2se" whole, while the default
 * word_delimiter filter splits them into letter/number parts.
 *
 * Fix: the duplicated tokenization loop was extracted into a helper, and the
 * token streams are now closed (try-with-resources) with end() called.
 */
public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    Settings indexSettings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
            .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList())
            .getAnalysisRegistry().build(idxSettings);
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
        assertNotNull(custom_analyser);
        // split_on_numerics=false: camel-case/number tokens stay intact
        List<String> token = tokenize(custom_analyser, "J2SE j2ee");
        assertEquals(token.toString(), 2, token.size());
        assertEquals("j2se", token.get(0));
        assertEquals("j2ee", token.get(1));
    }
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
        assertNotNull(custom_analyser);
        // default word_delimiter splits letter/number boundaries
        List<String> token = tokenize(custom_analyser, "J2SE j2ee");
        assertEquals(token.toString(), 6, token.size());
        assertEquals("j", token.get(0));
        assertEquals("2", token.get(1));
        assertEquals("se", token.get(2));
        assertEquals("j", token.get(3));
        assertEquals("2", token.get(4));
        assertEquals("ee", token.get(5));
    }
}

/** Runs {@code text} through {@code analyzer} and collects the emitted terms in order. */
private static List<String> tokenize(NamedAnalyzer analyzer, String text) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream("foo", text)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        List<String> tokens = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            tokens.add(charTermAttribute.toString());
        }
        tokenStream.end();
        return tokens;
    }
}
From source file:org.elasticsearch.index.analysis.CompoundAnalysisTests.java
License:Apache License
/**
 * Builds an AnalysisService from the given settings, runs {@code text} through
 * the named analyzer via an "_all" token stream, and returns the emitted terms
 * in order.
 *
 * Fix: the token stream is now ended and closed (previously leaked).
 *
 * @param settings     index/analysis settings to build the injectors from
 * @param analyzerName name of the analyzer to look up in the AnalysisService
 * @param text         input text to analyze
 * @return the analyzed terms in emission order
 * @throws IOException if tokenization fails
 */
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    Index index = new Index("test");
    Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
            new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, settings), new IndexNameModule(index),
                    new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)))
            .createChildInjector(parentInjector);
    AnalysisService analysisService = injector.getInstance(AnalysisService.class);
    Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();
    AllEntries allEntries = new AllEntries();
    allEntries.addText("field1", text, 1.0f);
    allEntries.reset();
    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    try {
        stream.reset();
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        List<String> terms = new ArrayList<String>();
        while (stream.incrementToken()) {
            terms.add(termAtt.toString());
        }
        stream.end();
        return terms;
    } finally {
        stream.close(); // fix: the stream was never closed before
    }
}
From source file:org.elasticsearch.index.analysis.NumericAnalyzerTests.java
License:Apache License
@Test public void testAttributeEqual() throws IOException { final int precisionStep = 8; final double value = randomDouble(); NumericDoubleAnalyzer analyzer = new NumericDoubleAnalyzer(precisionStep); final TokenStream ts1 = analyzer.tokenStream("dummy", String.valueOf(value)); final NumericTokenStream ts2 = new NumericTokenStream(precisionStep); ts2.setDoubleValue(value);/*w ww .j a va2 s. co m*/ final NumericTermAttribute numTerm1 = ts1.addAttribute(NumericTermAttribute.class); final NumericTermAttribute numTerm2 = ts1.addAttribute(NumericTermAttribute.class); final PositionIncrementAttribute posInc1 = ts1.addAttribute(PositionIncrementAttribute.class); final PositionIncrementAttribute posInc2 = ts1.addAttribute(PositionIncrementAttribute.class); ts1.reset(); ts2.reset(); while (ts1.incrementToken()) { assertThat(ts2.incrementToken(), is(true)); assertThat(posInc1, equalTo(posInc2)); // can't use equalTo directly on the numeric attribute cause it doesn't implement equals (LUCENE-5070) assertThat(numTerm1.getRawValue(), equalTo(numTerm2.getRawValue())); assertThat(numTerm2.getShift(), equalTo(numTerm2.getShift())); } assertThat(ts2.incrementToken(), is(false)); ts1.end(); ts2.end(); }
From source file:org.elasticsearch.index.analysis.PatternTokenizerTests.java
License:Apache License
/** * TODO: rewrite tests not to use string comparison. *///from w w w. j av a 2 s . c o m private static String tsToString(TokenStream in) throws IOException { StringBuilder out = new StringBuilder(); CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); // extra safety to enforce, that the state is not preserved and also // assign bogus values in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); in.reset(); while (in.incrementToken()) { if (out.length() > 0) out.append(' '); out.append(termAtt.toString()); in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); } in.close(); return out.toString(); }
From source file:org.elasticsearch.index.analysis.PreBuiltAnalyzerTests.java
License:Apache License
@Test public void testThatDefaultAndStandardAnalyzerChangedIn10Beta1() throws IOException { Analyzer currentStandardAnalyzer = PreBuiltAnalyzers.STANDARD.getAnalyzer(Version.V_1_0_0_Beta1); Analyzer currentDefaultAnalyzer = PreBuiltAnalyzers.DEFAULT.getAnalyzer(Version.V_1_0_0_Beta1); // special case, these two are the same instance assertThat(currentDefaultAnalyzer, is(currentStandardAnalyzer)); PreBuiltAnalyzers.DEFAULT.getAnalyzer(Version.V_1_0_0_Beta1); final int n = atLeast(10); Version version = Version.CURRENT;/*from w w w. ja v a 2 s.com*/ for (int i = 0; i < n; i++) { if (version.equals(Version.V_1_0_0_Beta1)) { assertThat(currentDefaultAnalyzer, is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))); } else { assertThat(currentDefaultAnalyzer, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version)))); } Analyzer analyzer = PreBuiltAnalyzers.DEFAULT.getAnalyzer(version); TokenStream ts = analyzer.tokenStream("foo", "This is it Dude"); ts.reset(); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); List<String> list = new ArrayList<String>(); while (ts.incrementToken()) { list.add(charTermAttribute.toString()); } if (version.onOrAfter(Version.V_1_0_0_Beta1)) { assertThat(list.size(), is(4)); assertThat(list, contains("this", "is", "it", "dude")); } else { assertThat(list.size(), is(1)); assertThat(list, contains("dude")); } ts.close(); version = randomVersion(); } }
From source file:org.elasticsearch.index.analysis.PreBuiltAnalyzerTests.java
License:Apache License
/**
 * Checks that the PATTERN and STANDARD_HTML_STRIP pre-built analyzers return
 * their cached 1.0.0.RC1 instances for that version, differ from DEFAULT for
 * other versions, and produce version-dependent token output.
 */
@Test
public void testAnalyzerChangedIn10RC1() throws IOException {
    Analyzer pattern = PreBuiltAnalyzers.PATTERN.getAnalyzer(Version.V_1_0_0_RC1);
    Analyzer standardHtml = PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(Version.V_1_0_0_RC1);
    final int iterations = atLeast(10);
    Version version = Version.CURRENT;
    for (int iteration = 0; iteration < iterations; iteration++) {
        if (version.equals(Version.V_1_0_0_RC1)) {
            assertThat(pattern, is(PreBuiltAnalyzers.PATTERN.getAnalyzer(version)));
            assertThat(standardHtml, is(PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version)));
        } else {
            assertThat(pattern, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
            assertThat(standardHtml, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
        }
        // either of the two analyzers under test, chosen at random
        Analyzer analyzer = randomBoolean() ? PreBuiltAnalyzers.PATTERN.getAnalyzer(version)
                : PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version);
        TokenStream tokenStream = analyzer.tokenStream("foo", "This is it Dude");
        tokenStream.reset();
        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        List<String> terms = new ArrayList<String>();
        while (tokenStream.incrementToken()) {
            terms.add(termAttribute.toString());
        }
        if (version.onOrAfter(Version.V_1_0_0_RC1)) {
            assertThat(terms.toString(), terms.size(), is(4));
            assertThat(terms, contains("this", "is", "it", "dude"));
        } else {
            assertThat(terms.size(), is(1));
            assertThat(terms, contains("dude"));
        }
        tokenStream.close();
        version = randomVersion();
    }
}
From source file:org.elasticsearch.index.analysis.RSLPTokenFilterTests.java
License:Apache License
/**
 * Feeds every word of the test word list through a br_rslp stemmer token
 * filter built from index settings, asserting each word stems to the expected
 * form from the list.
 */
@Test
public void testRSLPRules() throws Exception {
    Index index = new Index("test");
    Settings settings = Settings.settingsBuilder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir())
            .put("index.analysis.filter.myStemmer.type", "br_rslp")
            .build();
    AnalysisService analysisService = createAnalysisService(index, settings);
    TokenFilterFactory filterFactory = analysisService.tokenFilter("myStemmer");
    Tokenizer tokenizer = new KeywordTokenizer();
    Map<String, String> expectedStems = buildWordList();
    for (String input : expectedStems.keySet()) {
        // reuse the keyword tokenizer for each word; the filter wraps it anew
        tokenizer.setReader(new StringReader(input));
        TokenStream stream = filterFactory.create(tokenizer);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        assertThat(stream.incrementToken(), equalTo(true));
        assertThat(term.toString(), equalTo(expectedStems.get(input)));
        stream.close();
    }
}