Usage examples for org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents:
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException
From source file: io.github.msurdi.redeye.core.lucene.RedeyeAnalizerTest.java
License: Apache License
@Test public void testGeneratedTokens() throws IOException { for (String[] testCase : testValues) { String testValue = testCase[0]; String expectedResult = testCase[1]; BaseTokenStreamTestCase.assertTokenStreamContents(analyzer.tokenStream(FIELD, testValue), expectedResult.split(" ")); }//from w w w . j av a2s . c om }
From source file: jp.co.mixi.rd.lucene.analysis.StandardPlusTokenizerFactoryTest.java
License: Apache License
@Test public void testStandardPlusTokenizer() throws Exception { try (Reader reader = new StringReader("Wha\u0301t's this thing do?")) { Map<String, String> args = new HashMap<String, String>(); args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, BaseTokenStreamTestCase.TEST_VERSION_CURRENT.toString()); StandardPlusTokenizerFactory factory = new StandardPlusTokenizerFactory(args); try (Tokenizer stream = factory.create(reader)) { BaseTokenStreamTestCase.assertTokenStreamContents(stream, new String[] { "Wha\u0301t's", "this", "thing", "do", "?" }); }//from w ww . j a v a 2 s. c o m } }
From source file: jp.co.mixi.rd.lucene.analysis.StandardPlusTokenizerFactoryTest.java
License: Apache License
@Test public void testStandardPlusTokenizerMaxTokenLength() throws Exception { StringBuilder builder = new StringBuilder(); for (int i = 0; i < 100; ++i) { builder.append("abcdefg"); // 7 * 100 = 700 char "word" }/*from w w w. j a v a 2 s . com*/ String longWord = builder.toString(); String content = "one two three " + longWord + " four five six"; try (Reader reader = new StringReader(content)) { Map<String, String> args = new HashMap<String, String>(); args.put("maxTokenLength", "1000"); args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, BaseTokenStreamTestCase.TEST_VERSION_CURRENT.toString()); StandardPlusTokenizerFactory factory = new StandardPlusTokenizerFactory(args); try (Tokenizer stream = factory.create(reader)) { BaseTokenStreamTestCase.assertTokenStreamContents(stream, new String[] { "one", "two", "three", longWord, "four", "five", "six" }); } } }
From source file: jp.co.mixi.rd.lucene.analysis.StandardPlusTokenizerFactoryTest.java
License: Apache License
@Test
public void testStandardPlusTokenizerNihongo() throws Exception {
    // NOTE(review): the string literals below appear mojibake'd ('?' placeholders
    // and empty strings) — the original Japanese text was likely lost in an
    // encoding conversion. They are preserved byte-for-byte here; TODO restore
    // the intended characters from the original source.
    try (Reader reader = new StringReader("??? ? ?")) {
        Map<String, String> args = new HashMap<>();
        args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM,
                BaseTokenStreamTestCase.TEST_VERSION_CURRENT.toString());
        StandardPlusTokenizerFactory factory = new StandardPlusTokenizerFactory(args);
        try (Tokenizer tokenizer = factory.create(reader)) {
            BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer,
                    new String[] { "", "?", "", "", "", "", "", "", "?", "?", "?", "?" });
        }
    }
}
From source file: jp.co.mixi.rd.lucene.analysis.StandardPlusTokenizerFactoryTest.java
License: Apache License
@Test public void testStandardPlusTokenizerSpace() throws Exception { try (Reader reader = new StringReader("?\n?")) { Map<String, String> args = new HashMap<String, String>(); args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, BaseTokenStreamTestCase.TEST_VERSION_CURRENT.toString()); StandardPlusTokenizerFactory factory = new StandardPlusTokenizerFactory(args); try (Tokenizer stream = factory.create(reader)) { BaseTokenStreamTestCase.assertTokenStreamContents(stream, new String[] { "?", "?" }); }// w w w . j a va 2 s . co m } }
From source file: org.apache.solr.analysis.ko.TestKoreanTokenizerFactory.java
License: Apache License
/**
 * Tokenizes the input, runs it through the Korean token filter, and asserts
 * that the resulting stream matches the expected tokens.
 */
private void assertFilter(String input, String... tokens) throws IOException {
    TokenStream filtered = filterK(tokenizerK(input));
    BaseTokenStreamTestCase.assertTokenStreamContents(filtered, tokens);
}
From source file: org.apache.solr.analysis.TestWordDelimiterFilterFactory.java
License: Apache License
@Test public void testCustomTypes() throws Exception { String testText = "I borrowed $5,400.00 at 25% interest-rate"; ResourceLoader loader = new SolrResourceLoader("solr/collection1"); Map<String, String> args = new HashMap<String, String>(); args.put("generateWordParts", "1"); args.put("generateNumberParts", "1"); args.put("catenateWords", "1"); args.put("catenateNumbers", "1"); args.put("catenateAll", "0"); args.put("splitOnCaseChange", "1"); /* default behavior */ WordDelimiterFilterFactory factoryDefault = new WordDelimiterFilterFactory(args); factoryDefault.inform(loader);//from ww w. ja v a 2 s . c o m TokenStream ts = factoryDefault .create(new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false)); BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "I", "borrowed", "5", "400", "00", "540000", "at", "25", "interest", "rate", "interestrate" }); ts = factoryDefault .create(new MockTokenizer(new StringReader("foo\u200Dbar"), MockTokenizer.WHITESPACE, false)); BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "foo", "bar", "foobar" }); /* custom behavior */ args = new HashMap<String, String>(); // use a custom type mapping args.put("generateWordParts", "1"); args.put("generateNumberParts", "1"); args.put("catenateWords", "1"); args.put("catenateNumbers", "1"); args.put("catenateAll", "0"); args.put("splitOnCaseChange", "1"); args.put("types", "wdftypes.txt"); WordDelimiterFilterFactory factoryCustom = new WordDelimiterFilterFactory(args); factoryCustom.inform(loader); ts = factoryCustom.create(new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false)); BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "rate", "interestrate" }); /* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */ ts = factoryCustom .create(new MockTokenizer(new StringReader("foo\u200Dbar"), 
MockTokenizer.WHITESPACE, false)); BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "foo\u200Dbar" }); }
From source file: org.elasticsearch.index.analysis.SimplePhoneticAnalysisTests.java
License: Apache License
public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
    // With no language hint, Beider-Morse emits every candidate phonetic form.
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilter");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("ABADIAS"));
    String[] expected = { "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio",
            "abadioS", "abadios", "abodia", "abodiaS", "abodias", "abodio", "abodioS",
            "abodios", "avadias", "avadios", "avodias", "avodios", "obadia", "obadiaS",
            "obadias", "obadio", "obadioS", "obadios", "obodia", "obodiaS", "obodias",
            "obodioS" };
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
From source file: org.elasticsearch.index.analysis.SimplePhoneticAnalysisTests.java
License: Apache License
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
    // The French-configured filter restricts the Beider-Morse candidate set.
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilterfrench");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("Rimbault"));
    String[] expected = { "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt",
            "rimbu", "rimbult", "rmbD", "rmbDlt", "rmba", "rmbalt", "rmbo", "rmbolt",
            "rmbu", "rmbult" };
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
From source file: org.elasticsearch.index.analysis.SimplePhoneticAnalysisTests.java
License: Apache License
public void testPhoneticTokenFilterDaitchMotokoff() throws IOException {
    // Daitch-Mokotoff soundex should yield the two fixed-length codes below.
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("daitch_mokotoff");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("chauptman"));
    String[] expected = { "473660", "573660" };
    // NOTE(review): create() is invoked twice over the same tokenizer — the first
    // instance is used only for the type check and is otherwise discarded.
    // Consider reusing a single instance for both assertions.
    assertThat(filterFactory.create(tokenizer), instanceOf(DaitchMokotoffSoundexFilter.class));
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}