List of usage examples for org.apache.lucene.analysis.core.WhitespaceTokenizer#setReader
public final void setReader(Reader input)
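setReader(Reader) is inherited from org.apache.lucene.analysis.Tokenizer: it attaches the input the tokenizer will consume, and it must be called before reset(). Before the project examples below, here is a minimal, self-contained sketch of that call pattern; the class name and input string are illustrative, and it assumes a Lucene version where WhitespaceTokenizer has a no-argument constructor:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WhitespaceTokenizerSetReaderDemo {
    public static void main(String[] args) throws IOException {
        try (WhitespaceTokenizer tokenizer = new WhitespaceTokenizer()) {
            // Attach the input; setReader() must precede reset().
            tokenizer.setReader(new StringReader("one two three"));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println(term); // prints: one, two, three (one per line)
            }
            tokenizer.end();
        }
    }
}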
From source file: org.elasticsearch.analysis.common.WhitespaceTokenizerFactoryTests.java
License: Apache License
public void testSimpleWhiteSpaceTokenizer() throws IOException {
    final Settings indexSettings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
    IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(new Index("test", "_na_"),
            indexSettings);
    WhitespaceTokenizer tokenizer = (WhitespaceTokenizer) new WhitespaceTokenizerFactory(indexProperties, null,
            "whitespace_maxlen", Settings.EMPTY).create();
    try (Reader reader = new StringReader("one, two, three")) {
        tokenizer.setReader(reader);
        assertTokenStreamContents(tokenizer, new String[] { "one,", "two,", "three" });
    }
}
From source file: org.elasticsearch.analysis.common.WhitespaceTokenizerFactoryTests.java
License: Apache License
public void testMaxTokenLength() throws IOException {
    final Settings indexSettings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
    IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(new Index("test", "_na_"),
            indexSettings);
    final Settings settings = Settings.builder().put(WhitespaceTokenizerFactory.MAX_TOKEN_LENGTH, 2).build();
    WhitespaceTokenizer tokenizer = (WhitespaceTokenizer) new WhitespaceTokenizerFactory(indexProperties, null,
            "whitespace_maxlen", settings).create();
    try (Reader reader = new StringReader("one, two, three")) {
        tokenizer.setReader(reader);
        assertTokenStreamContents(tokenizer, new String[] { "on", "e,", "tw", "o,", "th", "re", "e" });
    }

    final Settings defaultSettings = Settings.EMPTY;
    tokenizer = (WhitespaceTokenizer) new WhitespaceTokenizerFactory(indexProperties, null, "whitespace_maxlen",
            defaultSettings).create();
    String veryLongToken = RandomStrings.randomAsciiAlphanumOfLength(random(), 256);
    try (Reader reader = new StringReader(veryLongToken)) {
        tokenizer.setReader(reader);
        assertTokenStreamContents(tokenizer,
                new String[] { veryLongToken.substring(0, 255), veryLongToken.substring(255) });
    }

    final Settings tooLongSettings = Settings.builder()
            .put(WhitespaceTokenizerFactory.MAX_TOKEN_LENGTH, 1024 * 1024 + 1).build();
    IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
            () -> new WhitespaceTokenizerFactory(indexProperties, null, "whitespace_maxlen", tooLongSettings)
                    .create());
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 1048577", e.getMessage());

    final Settings negativeSettings = Settings.builder().put(WhitespaceTokenizerFactory.MAX_TOKEN_LENGTH, -1)
            .build();
    e = expectThrows(IllegalArgumentException.class,
            () -> new WhitespaceTokenizerFactory(indexProperties, null, "whitespace_maxlen", negativeSettings)
                    .create());
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
}
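Taken together, these assertions pin down the factory's contract: tokens longer than maxTokenLen are split into maxTokenLen-sized pieces rather than truncated (under default settings the 256-character token splits at 255), and the setting must be greater than 0 and less than 1048576, otherwise create() throws IllegalArgumentException.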
From source file: org.opensextant.solrtexttagger.ConcatenateFilterTest.java
License: Open Source License
public void testTypical() throws IOException {
    String NYC = "new york city";
    WhitespaceTokenizer stream = new WhitespaceTokenizer();
    stream.setReader(new StringReader(NYC));
    ConcatenateFilter filter = new ConcatenateFilter(stream);
    try {
        assertTokenStreamContents(filter, new String[] { NYC }, new int[] { 0 }, new int[] { NYC.length() },
                new String[] { "shingle" }, new int[] { 1 }, null, NYC.length(), true);
    } catch (AssertionError e) {
        // assertTokenStreamContents tries to test whether tokenStream.end() was implemented correctly.
        // Its manner of checking this is imperfect and incompatible with ConcatenateFilter: specifically,
        // it modifies a special attribute *after* incrementToken(), which is weird. To the best of my
        // ability, end() appears to be implemented correctly.
        if (!e.getMessage().equals("super.end()/clearAttributes() was not called correctly in end()"))
            throw e;
    }
}
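Unlike the factory-based tests above, this example constructs WhitespaceTokenizer directly with its no-argument constructor and attaches the input via setReader(new StringReader(...)) before handing the stream to a downstream filter; the same attach-before-reset rule applies.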