List of usage examples for org.apache.lucene.analysis CharArraySet CharArraySet
public CharArraySet(Collection<?> c, boolean ignoreCase)
From source file:org.apache.jena.query.text.filter.TestSelectiveFoldingFilter.java
License:Apache License
@Test
public void testEmptyInput() throws IOException {
    // A whitelist containing only the empty string, applied to empty input,
    // must produce no tokens at all.
    whitelisted = new CharArraySet(Arrays.asList(""), false);
    inputText = new StringReader("");
    List<String> actual = collectTokens(inputText, whitelisted);
    assertTrue(actual.equals(Collections.emptyList()));
}
From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java
License:Apache License
/**
 * Creates a tokenizer based on param values.
 *
 * @param content        the text to tokenize
 * @param tokenizer      the type of tokenizer to use, CLASSIC or DEFAULT
 * @param stopWords      a set of user defined stop words
 * @param addToDefault   if true, the stopWords are added to the Lucene default stop
 *                       set; if false, only the user provided words form the stop set
 * @param stemFilterType the stemming filter type to apply
 */
public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault,
        StemFilterType stemFilterType) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    if (addToDefault) {
        // Start from a mutable copy of Lucene's default stop set (the shared
        // STOP_WORDS_SET is immutable), then fold in the user supplied words.
        CharArraySet combined = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
        combined.addAll(stopWords);
        this.stopSet = combined;
    } else {
        // Use only the user provided words; 'true' lower-cases entries on add/lookup.
        this.stopSet = new CharArraySet(stopWords, true);
    }
    tokenStream = createTokenStream(content);
}
From source file:org.apache.solr.analysis.CommonGramsFilter.java
License:Apache License
/**
 * Build a CharArraySet from an array of common words, appropriate for passing
 * into the CommonGramsFilter constructor; case-sensitive if ignoreCase is
 * false.
 *
 * @param commonWords Array of common words which will be converted into the CharArraySet
 * @param ignoreCase If true, all words are lower cased first.
 * @return a Set containing the words
 * @deprecated construct a {@link CharArraySet} directly instead
 */
@Deprecated
public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
    // Presize to the word count; the set handles lower-casing when ignoreCase is true.
    CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
    commonSet.addAll(Arrays.asList(commonWords));
    return commonSet;
}
From source file:org.apache.solr.analysis.KeepWordFilter.java
License:Apache License
/** @deprecated Use {@link #KeepWordFilter(boolean, TokenStream, CharArraySet)} instead */ @Deprecated// www . jav a 2 s. c om public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase) { this(false, in, new CharArraySet(words, ignoreCase)); }
From source file:org.apache.solr.analysis.TestWordDelimiterFilter.java
License:Apache License
/**
 * Checks how WordDelimiterFilter handles position increments: with a plain
 * whitespace tokenizer, with an upstream filter producing large increments,
 * and with an upstream StopFilter; "NUTCH" is protected from splitting.
 */
public void testPositionIncrements() throws Exception {
    final CharArraySet protWords = new CharArraySet(new HashSet<String>(Arrays.asList("NUTCH")), false);

    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {
        public TokenStream tokenStream(String field, Reader reader) {
            return new WordDelimiterFilter(new WhitespaceTokenizer(reader), 1, 1, 0, 0, 1, 1, 0, 1, 1,
                    protWords);
        }
    };

    /* in this case, works as expected. */
    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 },
            new int[] { 6, 13 }, new int[] { 1, 1 });

    /* only in this case, posInc of 2 ?! */
    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
            new int[] { 0, 9, 12, 9 }, new int[] { 6, 12, 13, 13 }, new int[] { 1, 1, 1, 0 });

    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
            new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, new int[] { 1, 1, 1 });

    /* analyzer that will consume tokens with large position increments */
    Analyzer a2 = new Analyzer() {
        public TokenStream tokenStream(String field, Reader reader) {
            return new WordDelimiterFilter(new LargePosIncTokenFilter(new WhitespaceTokenizer(reader)), 1, 1,
                    0, 0, 1, 1, 0, 1, 1, protWords);
        }
    };

    /* increment of "largegap" is preserved */
    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
            new int[] { 0, 7, 16 }, new int[] { 6, 15, 20 }, new int[] { 1, 10, 1 });

    /* the "/" had a position increment of 10, where did it go?!?!! */
    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 },
            new int[] { 6, 13 }, new int[] { 1, 11 });

    /* in this case, the increment of 10 from the "/" is carried over */
    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
            new int[] { 0, 9, 12, 9 }, new int[] { 6, 12, 13, 13 }, new int[] { 1, 11, 1, 0 });

    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
            new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, new int[] { 1, 11, 1 });

    Analyzer a3 = new Analyzer() {
        public TokenStream tokenStream(String field, Reader reader) {
            StopFilter filter = new StopFilter(new WhitespaceTokenizer(reader),
                    StandardAnalyzer.STOP_WORDS_SET);
            filter.setEnablePositionIncrements(true);
            return new WordDelimiterFilter(filter, 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
        }
    };

    assertAnalyzesTo(a3, "lucene.solr", new String[] { "lucene", "solr", "lucenesolr" },
            new int[] { 0, 7, 0 }, new int[] { 6, 11, 11 }, new int[] { 1, 1, 0 });

    /* the stopword should add a gap here */
    assertAnalyzesTo(a3, "the lucene.solr", new String[] { "lucene", "solr", "lucenesolr" },
            new int[] { 4, 11, 4 }, new int[] { 10, 15, 15 }, new int[] { 2, 1, 0 });
}
From source file:org.crosswire.jsword.index.lucene.analysis.AnalyzerFactoryTest.java
License:Open Source License
public void testEngStemming() throws ParseException { AbstractBookAnalyzer myAnalyzer = new EnglishLuceneAnalyzer(); QueryParser parser = new QueryParser(Version.LUCENE_29, field, myAnalyzer); String testInput = "Surely will every man walketh"; Query query = parser.parse(testInput); // assertTrue(myAnalyzer instanceof SimpleLuceneAnalyzer); // After Diacritic filtering assertTrue(query.toString().indexOf(field + ":sure ") > -1); assertTrue(query.toString().indexOf(field + ":everi") > -1); myAnalyzer.setDoStemming(false);/* www . j a v a2s. co m*/ query = parser.parse(testInput); assertTrue(query.toString().indexOf(field + ":surely") > -1); assertTrue(query.toString().indexOf(field + ":every") > -1); // enable stop word myAnalyzer.setDoStopWords(true); query = parser.parse(testInput); assertTrue(query.toString().indexOf(field + ":will") == -1); // set custom stop word myAnalyzer.setDoStopWords(true); String[] stopWords = { "thy", "ye", "unto", "shalt" }; myAnalyzer.setStopWords(new CharArraySet(Arrays.asList(stopWords), false)); testInput = "Upon thy belly Shalt thou go"; query = parser.parse(testInput); assertTrue(query.toString().indexOf(field + ":shalt") == -1); assertTrue(query.toString().indexOf(field + ":thy") == -1); assertTrue(query.toString().indexOf(field + ":upon") > -1); System.out.println(query.toString()); }
From source file:org.crosswire.jsword.index.lucene.analysis.EnglishLuceneAnalyzerTest.java
License:Open Source License
/**
 * Toggling stop-word removal at runtime: once enabled, the default stop word
 * "will" and every word in a caller-supplied custom stop list must be removed
 * from parsed queries.
 */
public void testSetDoStopWords() throws ParseException {
    String testInput = "Surely will every man walketh";
    Query query = parser.parse(testInput);

    // enable stop word
    myAnalyzer.setDoStopWords(true);
    query = parser.parse(testInput);
    assertTrue(query.toString().indexOf(field + ":will") == -1);

    // set custom stop word
    myAnalyzer.setDoStopWords(true);
    String[] stopWords = { "thy", "ye", "unto", "shalt" };
    myAnalyzer.setStopWords(new CharArraySet(Arrays.asList(stopWords), false));
    testInput = "Upon thy belly Shalt thou go";
    query = parser.parse(testInput);
    // Custom stop words are gone; non-stop words ("upon") remain.
    assertTrue(query.toString().indexOf(field + ":shalt") == -1);
    assertTrue(query.toString().indexOf(field + ":thy") == -1);
    assertTrue(query.toString().indexOf(field + ":upon") > -1);
}
From source file:org.elasticsearch.analysis.common.KeywordMarkerTokenFilterFactory.java
License:Apache License
/**
 * Configures the keyword marker filter from settings: either a regex
 * ({@code keywords_pattern}) or an explicit keyword set ({@code keywords} /
 * {@code keywords_path}). The two modes are mutually exclusive; exactly one
 * of {@code keywordPattern} / {@code keywordLookup} ends up non-null.
 */
KeywordMarkerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
    String patternString = settings.get("keywords_pattern");
    if (patternString != null) {
        // a pattern for matching keywords is specified, as opposed to a
        // set of keyword strings to match against
        if (settings.get("keywords") != null || settings.get("keywords_path") != null) {
            throw new IllegalArgumentException(
                    "cannot specify both `keywords_pattern` and `keywords` or `keywords_path`");
        }
        keywordPattern = Pattern.compile(patternString);
        keywordLookup = null;
    } else {
        Set<?> rules = Analysis.getWordSet(env, settings, "keywords");
        if (rules == null) {
            throw new IllegalArgumentException("keyword filter requires either `keywords`, `keywords_path`, "
                    + "or `keywords_pattern` to be configured");
        }
        // a set of keywords (or a path to them) is specified
        keywordLookup = new CharArraySet(rules, ignoreCase);
        keywordPattern = null;
    }
}
From source file:org.elasticsearch.index.analysis.ESSolrSynonymParserTests.java
License:Apache License
/**
 * A lenient Solr synonym parser should tolerate rule terms that the analyzer's
 * stop set removes ("bar") instead of failing, keeping the surviving synonyms.
 */
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
    CharArraySet stopSet = new CharArraySet(1, true);
    stopSet.add("bar");
    ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
    String rules = "foo,bar,baz";
    StringReader rulesReader = new StringReader(rules);
    parser.parse(rulesReader);
    SynonymMap synonymMap = parser.build();
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
    TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
    // "bar" is stopped out of the stream; "baz" maps to "foo" via the synonym rule.
    assertTokenStreamContents(ts,
            new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
}
From source file:org.elasticsearch.index.analysis.ESWordnetSynonymParserTests.java
License:Apache License
/**
 * A lenient Wordnet synonym parser should tolerate rule terms that the
 * analyzer's stop set removes ("bar") instead of failing, keeping the
 * surviving synonyms.
 */
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
    CharArraySet stopSet = new CharArraySet(1, true);
    stopSet.add("bar");
    ESWordnetSynonymParser parser = new ESWordnetSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
    String rules = "s(100000001,1,'foo',v,1,0).\n" + "s(100000001,2,'bar',v,1,0).\n"
            + "s(100000001,3,'baz',v,1,0).";
    StringReader rulesReader = new StringReader(rules);
    parser.parse(rulesReader);
    SynonymMap synonymMap = parser.build();
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
    TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
    // "bar" is stopped out of the stream; "baz" maps to "foo" via the synonym rule.
    assertTokenStreamContents(ts,
            new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
}