List of usage examples for org.apache.lucene.analysis.standard.StandardTokenizer
public StandardTokenizer()
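The no-argument constructor creates a tokenizer with the default attribute factory; the input is attached afterwards via setReader(). Before the project examples below, here is a minimal standalone sketch (the class name StandardTokenizerDemo and the sample sentence are illustrative, not taken from any of the sources):

import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StandardTokenizerDemo {
    public static void main(String[] args) throws Exception {
        StandardTokenizer tokenizer = new StandardTokenizer(); // default AttributeFactory
        tokenizer.setReader(new StringReader("Lucene is a search library"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset(); // required before the first incrementToken()
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // Lucene / is / a / search / library
        }
        tokenizer.end();
        tokenizer.close();
    }
}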
From source file:org.elasticsearch.action.termvectors.AbstractTermVectorsTestCase.java
License:Apache License
protected DirectoryReader indexDocsWithLucene(TestDoc[] testDocs) throws IOException {
    Map<String, Analyzer> mapping = new HashMap<>();
    for (TestFieldSetting field : testDocs[0].fieldSettings) {
        if (field.storedPayloads) {
            mapping.put(field.name, new Analyzer() {
                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    Tokenizer tokenizer = new StandardTokenizer();
                    TokenFilter filter = new LowerCaseFilter(tokenizer);
                    filter = new TypeAsPayloadTokenFilter(filter);
                    return new TokenStreamComponents(tokenizer, filter);
                }
            });
        }
    }
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            mapping);

    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(wrapper);
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, conf);

    for (TestDoc doc : testDocs) {
        Document d = new Document();
        d.add(new Field("id", doc.id, StringField.TYPE_STORED));
        for (int i = 0; i < doc.fieldContent.length; i++) {
            FieldType type = new FieldType(TextField.TYPE_STORED);
            TestFieldSetting fieldSetting = doc.fieldSettings[i];
            type.setStoreTermVectorOffsets(fieldSetting.storedOffset);
            type.setStoreTermVectorPayloads(fieldSetting.storedPayloads);
            type.setStoreTermVectorPositions(
                    fieldSetting.storedPositions || fieldSetting.storedPayloads || fieldSetting.storedOffset);
            type.setStoreTermVectors(true);
            type.freeze();
            d.add(new Field(fieldSetting.name, doc.fieldContent[i], type));
        }
        writer.updateDocument(new Term("id", doc.id), d);
        writer.commit();
    }
    writer.close();
    return DirectoryReader.open(dir);
}
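Since the method above stores term vectors, a natural follow-up is reading one back from the returned reader. A hedged sketch, not part of the original test; the doc id 0 and the field name "field1" are illustrative assumptions (uses org.apache.lucene.index.Terms/TermsEnum and org.apache.lucene.util.BytesRef):

static void dumpTermVector(DirectoryReader reader) throws IOException {
    // Term vector of the first document's "field1" field -- both values illustrative.
    Terms vector = reader.getTermVector(0, "field1");
    if (vector != null) {
        TermsEnum termsEnum = vector.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            System.out.println(term.utf8ToString() + " freq=" + termsEnum.totalTermFreq());
        }
    }
}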
From source file:org.elasticsearch.analysis.common.CJKFilterFactoryTests.java
License:Apache License
public void testDefault() throws IOException {
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_bigram");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち",
            "ちた" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
From source file:org.elasticsearch.analysis.common.CJKFilterFactoryTests.java
License:Apache License
public void testNoFlags() throws IOException {
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_no_flags");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち",
            "ちた" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
From source file:org.elasticsearch.analysis.common.CJKFilterFactoryTests.java
License:Apache License
public void testHanOnly() throws IOException {
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
From source file:org.elasticsearch.analysis.common.CJKFilterFactoryTests.java
License:Apache License
public void testHanUnigramOnly() throws IOException {
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち",
            "た" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
From source file:org.elasticsearch.analysis.common.CJKFilterFactoryTests.java
License:Apache License
public void testDisableGraph() throws IOException {
    TokenFilterFactory allFlagsFactory = analysis.tokenFilter.get("cjk_all_flags");
    TokenFilterFactory hanOnlyFactory = analysis.tokenFilter.get("cjk_han_only");
    String source = "多くの学生が試験に落ちた。";
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    try (TokenStream tokenStream = allFlagsFactory.create(tokenizer)) {
        // This config outputs ngrams of different sizes, so graph analysis is disabled
        assertTrue(tokenStream.hasAttribute(DisableGraphAttribute.class));
    }
    tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    try (TokenStream tokenStream = hanOnlyFactory.create(tokenizer)) {
        // This config emits only bigrams, so graph analysis stays enabled
        assertFalse(tokenStream.hasAttribute(DisableGraphAttribute.class));
    }
}
From source file:org.elasticsearch.analysis.common.KeepTypesFilterFactoryTests.java
License:Apache License
public void testKeepTypes() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.keep_numbers.type", "keep_types")
            .putList("index.analysis.filter.keep_numbers.types", new String[] { "<NUM>", "<SOMETHINGELSE>" })
            .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings,
            new CommonAnalysisPlugin());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
    assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
    String source = "Hello 123 world";
    String[] expected = new String[] { "123" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 2 });
}
From source file:org.elasticsearch.analysis.common.SnowballAnalyzer.java
License:Apache License
/** Constructs a {@link StandardTokenizer} filtered by a {@link LowerCaseFilter}, a {@link StopFilter},
 *  and a {@link SnowballFilter} */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream result = tokenizer;
    // remove the possessive 's for English stemmers
    if (name.equals("English") || name.equals("Porter") || name.equals("Lovins"))
        result = new EnglishPossessiveFilter(result);
    // Use a special lowercase filter for Turkish; the stemmer expects it.
    if (name.equals("Turkish"))
        result = new TurkishLowerCaseFilter(result);
    else
        result = new LowerCaseFilter(result);
    if (stopSet != null)
        result = new StopFilter(result, stopSet);
    result = new SnowballFilter(result, name);
    return new TokenStreamComponents(tokenizer, result);
}
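A chain built by createComponents() is consumed through Analyzer.tokenStream(). A minimal hedged sketch; the analyzer variable, field name, and sample text are illustrative, and the printed stems assume the "English" Snowball stemmer:

// Assumes "analyzer" is an instance of the SnowballAnalyzer above, built for "English".
try (TokenStream ts = analyzer.tokenStream("body", "running quickly")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString()); // expected roughly: "run", "quick"
    }
    ts.end();
}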
From source file:org.elasticsearch.analysis.hunspell.cs.CzechHunspellAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(String field) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StopFilter(source, stopwords);
    if (!this.stemExclusionTable.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionTable);
    }
    result = new HunspellStemFilter(result, dictionary);
    result = new LowerCaseFilter(result);
    return new TokenStreamComponents(source, result);
}
From source file:org.elasticsearch.analysis.hunspell.fr.FrenchHunspellAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(String field) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new ElisionFilter(source, FrenchAnalyzer.DEFAULT_ARTICLES);
    result = new StopFilter(result, stopwords);
    if (!this.stemExclusionTable.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionTable);
    }
    result = new HunspellStemFilter(result, dictionary);
    result = new LowerCaseFilter(result);
    return new TokenStreamComponents(source, result);
}