List of usage examples for the org.apache.lucene.analysis.MockTokenizer constructor
public MockTokenizer()
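Before the project examples below, here is a minimal, self-contained sketch of the typical no-argument MockTokenizer workflow in a test: construct the tokenizer, attach a Reader, then reset, consume, end, and close the stream. The class name MockTokenizerBasicUsage and the input string "foo bar baz" are illustrative assumptions, not taken from any of the source files listed here.

import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class MockTokenizerBasicUsage {
    public static void main(String[] args) throws Exception {
        // The no-arg constructor gives a whitespace-based tokenizer from the Lucene test framework.
        Tokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(new StringReader("foo bar baz")); // hypothetical input
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset(); // MockTokenizer checks the full reset/incrementToken/end/close workflow
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // prints each whitespace-separated token
        }
        tokenizer.end();
        tokenizer.close();
    }
}

MockTokenizer lives in the lucene-test-framework artifact, so this sketch assumes that jar is on the classpath.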
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
public void test() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer();
            if (fieldName.contains("payloadsFixed")) {
                TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
                return new TokenStreamComponents(tokenizer, filter);
            } else if (fieldName.contains("payloadsVariable")) {
                TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer);
                return new TokenStreamComponents(tokenizer, filter);
            } else {
                return new TokenStreamComponents(tokenizer);
            }
        }
    };
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    // TODO we could actually add more fields implemented with different PFs
    // or, just put this test into the usual rotation?
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsOnlyType.setStoreTermVectors(true);
    docsOnlyType.setIndexOptions(IndexOptions.DOCS);
    FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsAndFreqsType.setStoreTermVectors(true);
    docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn these on for a cross-check
    positionsType.setStoreTermVectors(true);
    positionsType.setStoreTermVectorPositions(true);
    positionsType.setStoreTermVectorOffsets(true);
    positionsType.setStoreTermVectorPayloads(true);
    FieldType offsetsType = new FieldType(positionsType);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field field1 = new Field("field1docs", "", docsOnlyType);
    Field field2 = new Field("field2freqs", "", docsAndFreqsType);
    Field field3 = new Field("field3positions", "", positionsType);
    Field field4 = new Field("field4offsets", "", offsetsType);
    Field field5 = new Field("field5payloadsFixed", "", positionsType);
    Field field6 = new Field("field6payloadsVariable", "", positionsType);
    Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
    Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
    doc.add(field1);
    doc.add(field2);
    doc.add(field3);
    doc.add(field4);
    doc.add(field5);
    doc.add(field6);
    doc.add(field7);
    doc.add(field8);
    for (int i = 0; i < MAXDOC; i++) {
        String stringValue = Integer.toString(i) + " verycommon "
                + English.intToEnglish(i).replace('-', ' ') + " "
                + TestUtil.randomSimpleString(random());
        field1.setStringValue(stringValue);
        field2.setStringValue(stringValue);
        field3.setStringValue(stringValue);
        field4.setStringValue(stringValue);
        field5.setStringValue(stringValue);
        field6.setStringValue(stringValue);
        field7.setStringValue(stringValue);
        field8.setStringValue(stringValue);
        iw.addDocument(doc);
    }
    iw.close();
    verify(dir);
    TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge
    iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    iwc.setOpenMode(OpenMode.APPEND);
    IndexWriter iw2 = new IndexWriter(dir, iwc);
    iw2.forceMerge(1);
    iw2.close();
    verify(dir);
    dir.close();
}
From source file:org.elasticsearch.analysis.common.CommonAnalysisPluginTests.java
License:Apache License
/**
 * Check that the deprecated name "nGram" issues a deprecation warning for indices created since 6.4.0
 */
public void testNGramDeprecationWarning() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
            .put(IndexMetaData.SETTING_VERSION_CREATED,
                    VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, Version.CURRENT))
            .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
        Map<String, TokenFilterFactory> tokenFilters =
                createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
        TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
        Tokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        assertNotNull(tokenFilterFactory.create(tokenizer));
        assertWarnings("The [nGram] token filter name is deprecated and will be removed in a future version. "
                + "Please change the filter name to [ngram] instead.");
    }
}
From source file:org.elasticsearch.analysis.common.CommonAnalysisPluginTests.java
License:Apache License
/**
 * Check that the deprecated name "nGram" does NOT issue a deprecation warning for indices created before 6.4.0
 */
public void testNGramNoDeprecationWarningPre6_4() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
            .put(IndexMetaData.SETTING_VERSION_CREATED,
                    VersionUtils.randomVersionBetween(random(), Version.V_5_0_0, Version.V_6_3_0))
            .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
        Map<String, TokenFilterFactory> tokenFilters =
                createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
        TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
        Tokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        assertNotNull(tokenFilterFactory.create(tokenizer));
    }
}
From source file:org.elasticsearch.analysis.common.CommonAnalysisPluginTests.java
License:Apache License
/**
 * Check that the deprecated name "edgeNGram" issues a deprecation warning for indices created since 6.4.0
 */
public void testEdgeNGramDeprecationWarning() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
            .put(IndexMetaData.SETTING_VERSION_CREATED,
                    VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, Version.CURRENT))
            .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
        Map<String, TokenFilterFactory> tokenFilters =
                createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
        TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
        Tokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        assertNotNull(tokenFilterFactory.create(tokenizer));
        assertWarnings("The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
                + "Please change the filter name to [edge_ngram] instead.");
    }
}
From source file:org.elasticsearch.analysis.common.CommonAnalysisPluginTests.java
License:Apache License
/**
 * Check that the deprecated name "edgeNGram" does NOT issue a deprecation warning for indices created before 6.4.0
 */
public void testEdgeNGramNoDeprecationWarningPre6_4() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
            .put(IndexMetaData.SETTING_VERSION_CREATED,
                    VersionUtils.randomVersionBetween(random(), Version.V_5_0_0, Version.V_6_3_0))
            .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
        Map<String, TokenFilterFactory> tokenFilters =
                createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
        TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
        Tokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        assertNotNull(tokenFilterFactory.create(tokenizer));
    }
}
From source file:org.elasticsearch.analysis.common.NGramTokenizerFactoryTests.java
License:Apache License
public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        final Index index = new Index("test", "_na_");
        final String name = "ngr";
        Version v = randomVersion(random());
        Builder builder = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3);
        boolean reverse = random().nextBoolean();
        if (reverse) {
            builder.put("side", "back");
        }
        Settings settings = builder.build();
        Settings indexSettings = newAnalysisSettingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, v.id)
                .build();
        Tokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream edgeNGramTokenFilter = new EdgeNGramTokenFilterFactory(
                IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
                        .create(tokenizer);
        if (reverse) {
            assertThat(edgeNGramTokenFilter, instanceOf(ReverseStringFilter.class));
        } else {
            assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class));
        }
    }
}
From source file:org.elasticsearch.index.analysis.AnalysisPolishFactoryTests.java
License:Apache License
private void testThreadSafety(TokenFilterFactory factory) throws IOException {
    final Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer();
            return new TokenStreamComponents(tokenizer, factory.create(tokenizer));
        }
    };
    BaseTokenStreamTestCase.checkRandomData(random(), analyzer, 100);
}
From source file:org.elasticsearch.index.mapper.core.LegacyTokenCountFieldMapperTests.java
License:Apache License
public void testCountPositions() throws IOException {
    // We're looking to make sure that we:
    Token t1 = new Token();      // Don't count tokens without an increment
    t1.setPositionIncrement(0);
    Token t2 = new Token();
    t2.setPositionIncrement(1);  // Count normal tokens with one increment
    Token t3 = new Token();
    t3.setPositionIncrement(2);  // Count funny tokens with more than one increment
    int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
    Token[] tokens = new Token[] { t1, t2, t3 };
    Collections.shuffle(Arrays.asList(tokens), random());
    final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
    // TODO: we have no CannedAnalyzer?
    Analyzer analyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new MockTokenizer(), tokenStream);
        }
    };
    assertThat(LegacyTokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
}
From source file:org.elasticsearch.index.mapper.TokenCountFieldMapperTests.java
License:Apache License
public void testCountPositions() throws IOException {
    // We're looking to make sure that we:
    Token t1 = new Token();      // Don't count tokens without an increment
    t1.setPositionIncrement(0);
    Token t2 = new Token();
    t2.setPositionIncrement(1);  // Count normal tokens with one increment
    Token t3 = new Token();
    t3.setPositionIncrement(2);  // Count funny tokens with more than one increment
    int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
    Token[] tokens = new Token[] { t1, t2, t3 };
    Collections.shuffle(Arrays.asList(tokens), random());
    final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
    // TODO: we have no CannedAnalyzer?
    Analyzer analyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new MockTokenizer(), tokenStream);
        }
    };
    assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
}
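In both countPositions examples above, the expected value 7 is the sum of the position increments fed to the canned stream: 0 (t1) + 1 (t2) + 2 (t3) + 4 (the final token increment) = 7, regardless of the shuffled token order.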