List of usage examples for org.apache.lucene.analysis.MockTokenizer.KEYWORD

MockTokenizer.KEYWORD is a CharacterRunAutomaton constant defined in Lucene's test framework (lucene-test-framework). A MockTokenizer constructed with it emits the entire input as a single token, analogous to KeywordTokenizer. The following examples show how it is used across several projects.
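Before the project examples, here is a minimal, self-contained sketch (not taken from any source file below; the class name and input string are illustrative) of what the KEYWORD automaton does, namely emit the whole input as one token:

import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class KeywordTokenizerDemo {
    public static void main(String[] args) throws Exception {
        MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
        tokenizer.setReader(new StringReader("Hello Wide World"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term); // prints "Hello Wide World" exactly once
        }
        tokenizer.end();
        tokenizer.close();
    }
}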
From source file:org.apache.solr.analysis.MockTokenizerFactory.java
License:Apache License
/** Creates a new MockTokenizerFactory */
public MockTokenizerFactory(Map<String, String> args) {
    super(args);
    String patternArg = get(args, "pattern", Arrays.asList("keyword", "simple", "whitespace"));
    if ("keyword".equalsIgnoreCase(patternArg)) {
        pattern = MockTokenizer.KEYWORD;
    } else if ("simple".equalsIgnoreCase(patternArg)) {
        pattern = MockTokenizer.SIMPLE;
    } else {
        pattern = MockTokenizer.WHITESPACE;
    }
    enableChecks = getBoolean(args, "enableChecks", true);
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}
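A hedged sketch (not in the source file) of exercising this factory directly; it assumes only the constructor shown above and TokenizerFactory's no-argument create():

Map<String, String> args = new HashMap<>();
args.put("pattern", "keyword"); // also accepts "simple" or "whitespace"
MockTokenizerFactory factory = new MockTokenizerFactory(args); // consumes the "pattern" key
Tokenizer tokenizer = factory.create(); // a MockTokenizer built on the KEYWORD automaton

Any key left in args after construction triggers the IllegalArgumentException at the end of the constructor, so misspelled parameters fail fast.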
From source file:org.apache.solr.spelling.TestSuggestSpellingConverter.java
License:Apache License
public void testSimple() throws Exception {
    // lowercases only!
    converter.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.KEYWORD, true));
    assertConvertsTo("This is a test", new String[] { "this is a test" });
}
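The MockAnalyzer(random(), MockTokenizer.KEYWORD, true) combination yields exactly one token: the full input, lowercased. A hedged equivalent using the assertAnalyzesTo helper (assuming the test class extends Lucene's BaseTokenStreamTestCase):

MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);
// One token covering the whole input, lowercased:
assertAnalyzesTo(analyzer, "This is a test", new String[] { "this is a test" });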
From source file:org.elasticsearch.analysis.hunspell.TestStemming.java
License:Apache License
public void test() throws Exception {
    LineNumberReader reader = new LineNumberReader(IOUtils.getDecodingReader(
            getClass().getResourceAsStream("/stemming-data/" + language + ".txt"), StandardCharsets.UTF_8));
    dictionaryStream = getClass().getResourceAsStream("/" + language + "/" + language + ".dic");
    affixStream = getClass().getResourceAsStream("/" + language + "/" + language + ".aff");
    final Dictionary dictionary = new Dictionary(affixStream, dictionaryStream);
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String field) {
            MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
            HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, false);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    String line = null;
    while ((line = reader.readLine()) != null) {
        int comment = line.indexOf('#');
        if (comment >= 0) {
            line = line.substring(0, comment);
        }
        line = line.trim();
        if (line.isEmpty()) {
            continue;
        }
        String[] elements = line.split("\\s+");
        if (elements.length != 2) {
            throw new RuntimeException("Illegal number of elements in line: " + reader.getLineNumber());
        }
        String input = elements[0];
        String[] outputs = elements[1].split(",");
        compareStems(analyzer, input, outputs, reader.getLineNumber());
    }
    analyzer.close();
    reader.close();
}
From source file:org.elasticsearch.test.MockKeywordPlugin.java
License:Apache License
@Override
public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
    return singletonMap("keyword", (indexSettings, environment, name, settings) -> {
        class Factory implements TokenizerFactory {
            @Override
            public Tokenizer create() {
                return new MockTokenizer(MockTokenizer.KEYWORD, false);
            }
        }
        return new Factory();
    });
}
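In an Elasticsearch test, this plugin is typically switched on by listing it from the test class. A minimal sketch, assuming a test extending ESIntegTestCase:

@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
    // Makes the mock "keyword" tokenizer available on every test node.
    return Collections.singletonList(MockKeywordPlugin.class);
}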
From source file:org.tallison.lucene.queryparser.spans.TestAdvancedAnalyzers.java
License:Apache License
@BeforeClass
public static void beforeClass() throws Exception {
    lcMultiTermAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);

    Map<String, String> attrs = new HashMap<>();
    attrs.put("generateWordParts", "1");
    attrs.put("generateNumberParts", "1");
    attrs.put("catenateWords", "1");
    attrs.put("catenateNumbers", "1");
    attrs.put("catenateAll", "1");
    attrs.put("splitOnCaseChange", "1");
    attrs.put("preserveOriginal", "1");
    complexAnalyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(TestAdvancedAnalyzers.class))
            .withTokenizer("whitespace")
            .addTokenFilter("worddelimiter", attrs)
            .addTokenFilter("kstem")
            .addTokenFilter("removeduplicates")
            .build();

    synAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
            filter = new MockSynFilter(filter);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockNonWhitespaceFilter(new MockSynFilter(in));
        }
    };

    baseAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockNonWhitespaceFilter(new LowerCaseFilter(in));
        }
    };

    ucVowelAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }
    };

    ucVowelMTAnalyzer = new Analyzer() {
        @Override
        public TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };

    Analyzer tmpUCVowelAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }
    };

    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
            newIndexWriterConfig(baseAnalyzer)
                    .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
                    .setMergePolicy(newLogMergePolicy()));
    String[] docs = new String[] { "abc_def", "lmnop", "abc one", "abc two", "qrs one", "qrs two",
            "tuv one", "tuv two", "qrs tuv", "qrs_tuv" };
    for (int i = 0; i < docs.length; i++) {
        Document doc = new Document();
        doc.add(newTextField(FIELD1, docs[i], Field.Store.YES));
        TextField tf = new TextField(FIELD2, docs[i], Field.Store.YES);
        tf.setTokenStream(ucVowelAnalyzer.tokenStream(FIELD2, docs[i]));
        doc.add(tf);
        doc.add(newTextField(FIELD3, docs[i], Field.Store.YES));
        TextField tf4 = new TextField(FIELD4, docs[i], Field.Store.YES);
        tf4.setTokenStream(tmpUCVowelAnalyzer.tokenStream(FIELD4, docs[i]));
        doc.add(tf4);
        writer.addDocument(doc);
    }
    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
}
From source file:org.tallison.lucene.queryparser.spans.TestOverallSpanQueryParser.java
License:Apache License
@BeforeClass
public static void beforeClass() throws Exception {
    ANALYZER = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
    MULTITERM_ANALYZER = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);
    DIRECTORY = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), DIRECTORY,
            newIndexWriterConfig(ANALYZER)
                    .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
                    .setMergePolicy(newLogMergePolicy()));
    String[] f1Docs = new String[] {
            "quick brown AND fox", // 0
            "quick brown AND dog", // 1
            "quick brown dog", // 2
            "whan that aprile with its shoures perced", // 3
            "its shoures pierced", // 4
            "its shoures perced", // 5
            "#####", // before asterisk (6)
            "&&&&&", // after asterisk for range query (7)
            "ab*de", // 8
            "abcde", // 9
            "blah disco fever blah", // 10
            "blah bieber fever blah", // 11
            "blah dengue fever blah", // 12
            "blah saturday night fever with john travolta", // 13
            "understanding (span query)", // 14
            "understanding (sp'an query)", // 15
            "understanding something about (span query)", // 16
            "0 1 fox 3 4 5 fox 7 8 9 10 fox" // 17
    };
    String[] f2Docs = new String[] { "zero", "one", "two", "three", "four", "five", "six", "seven",
            "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
            "seventeen" };
    for (int i = 0; i < f1Docs.length; i++) {
        Document doc = new Document();
        doc.add(newTextField(FIELD1, f1Docs[i], Field.Store.YES));
        doc.add(newTextField(FIELD2, f2Docs[i], Field.Store.YES));
        writer.addDocument(doc);
    }
    READER = writer.getReader();
    SEARCHER = newSearcher(READER);
    writer.close();
    PARSER = new SpanQueryParser(FIELD1, ANALYZER, MULTITERM_ANALYZER);
}
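Passing the KEYWORD-based MULTITERM_ANALYZER as the third constructor argument means wildcard, prefix, and fuzzy terms are normalized whole rather than split into sub-tokens. A hedged usage sketch; the parse(String) call is assumed from SpanQueryParser's QueryParser-like API, and the query string is illustrative:

Query q = PARSER.parse("shoures perced~1"); // fuzzy term analyzed as a single unit
TopDocs hits = SEARCHER.search(q, 10);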
From source file:org.tallison.lucene.queryparser.spans.TestQPTestBaseSpanQuery.java
License:Apache License
public CommonQueryParserConfiguration getParserConfig(Analyzer a, Analyzer mtAnalyzer) throws Exception {
    if (a == null) {
        a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
    }
    if (mtAnalyzer == null) {
        mtAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);
    }
    SQPTestingConfig qp = new SQPTestingConfig(getDefaultField(), a, mtAnalyzer);
    qp.setDefaultOperator(QueryParserBase.OR_OPERATOR);
    qp.setAnalyzeRangeTerms(true);
    return qp;
}
From source file:org.tallison.lucene.queryparser.spans.TestSpanOnlyQueryParser.java
License:Apache License
@BeforeClass
public static void beforeClass() throws Exception {
    lcMultiTermAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);
    noopMultiTermAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);

    noStopAnalyzer = new Analyzer() {
        @Override
        public TokenStream normalize(String fieldName, TokenStream in) {
            return new MockStandardTokenizerFilter(new LowerCaseFilter(in));
        }

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
            TokenFilter filter = new MockStandardTokenizerFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };

    stopAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
            TokenFilter filter = new MockStandardTokenizerFilter(tokenizer);
            filter = new MockTokenFilter(filter, STOP_WORDS);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new LowerCaseFilter(in);
        }
    };

    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
            newIndexWriterConfig(stopAnalyzer)
                    .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
                    .setMergePolicy(newLogMergePolicy()));
    String[] docs = new String[] {
            "the quick brown fox ",
            "jumped over the lazy brown dog and the brown green cat",
            "quick green fox",
            "abcdefghijk",
            "over green lazy",
            // longish doc for recursion test
            "eheu fugaces postume postume labuntur anni nec "
                    + "pietas moram rugis et instanti senectae "
                    + "adferet indomitaeque morti",
            // non-whitespace language
            "\u666E \u6797 \u65AF \u987F \u5927 \u5B66",
            "reg/exp",
            "/regex/",
            "fuzzy~2",
            "wil*card",
            "wil?card",
            "prefi*",
            "single'quote" };
    for (int i = 0; i < docs.length; i++) {
        Document doc = new Document();
        doc.add(newTextField(FIELD, docs[i], Field.Store.YES));
        writer.addDocument(doc);
    }
    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
}