Usage examples for the org.apache.lucene.analysis.CharArraySet method add
public boolean add(char[] text)
From source file:com.asimihsan.handytrowel.nlp.StopwordAnnotator.java
License:Open Source License
public static CharArraySet getStopWordList(Version luceneVersion, String stopwordList, boolean ignoreCase) { String[] terms = stopwordList.split(","); CharArraySet stopwordSet = new CharArraySet(luceneVersion, terms.length, ignoreCase); for (String term : terms) { stopwordSet.add(term); }//from w w w . java 2s. c o m return CharArraySet.unmodifiableSet(stopwordSet); }
From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java
License:Apache License
/**
 * Creates a tokenizer based on param values.
 *
 * @param content        the text to tokenize
 * @param tokenizer      the type of tokenizer to use, CLASSIC or DEFAULT
 * @param stopWords      a set of user defined stop words
 * @param addToDefault   if {@code true}, the stopWords are added to the Lucene default
 *                       stop set; if {@code false}, only the user provided words are
 *                       used as the stop set
 * @param stemFilterType the stem filter to apply to the token stream
 */
public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault,
        StemFilterType stemFilterType) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    if (addToDefault) {
        // Copy the default set first — StandardAnalyzer.STOP_WORDS_SET is immutable.
        // (Fixed: removed a stray empty statement left after this line.)
        CharArraySet combined = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
        for (String word : stopWords) {
            combined.add(word);
        }
        this.stopSet = combined;
    } else {
        // Case-insensitive set built solely from the user supplied words.
        // (Assign via this.stopSet for consistency with the branch above.)
        this.stopSet = new CharArraySet(stopWords, true);
    }
    tokenStream = createTokenStream(content);
}
From source file:org.elasticsearch.index.analysis.ESSolrSynonymParserTests.java
License:Apache License
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException { CharArraySet stopSet = new CharArraySet(1, true); stopSet.add("bar"); ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer(stopSet)); String rules = "foo,bar,baz"; StringReader rulesReader = new StringReader(rules); parser.parse(rulesReader);//from ww w.ja va 2s . c o m SynonymMap synonymMap = parser.build(); Tokenizer tokenizer = new StandardTokenizer(); tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz")); TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false); assertTokenStreamContents(ts, new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" }); }
From source file:org.elasticsearch.index.analysis.ESWordnetSynonymParserTests.java
License:Apache License
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException { CharArraySet stopSet = new CharArraySet(1, true); stopSet.add("bar"); ESWordnetSynonymParser parser = new ESWordnetSynonymParser(true, false, true, new StandardAnalyzer(stopSet)); String rules = "s(100000001,1,'foo',v,1,0).\n" + "s(100000001,2,'bar',v,1,0).\n" + "s(100000001,3,'baz',v,1,0)."; StringReader rulesReader = new StringReader(rules); parser.parse(rulesReader);//from ww w. j a va 2s.c o m SynonymMap synonymMap = parser.build(); Tokenizer tokenizer = new StandardTokenizer(); tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz")); TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false); assertTokenStreamContents(ts, new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" }); }
From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java
License:Apache License
private void processDoc(int docid, String fieldName, Set<String> selector, CharArraySet set) throws IOException { Terms terms = searcher.getIndexReader().getTermVector(docid, fieldName); if (terms != null) { TermsEnum te = terms.iterator(); BytesRef bytes = te.next();//from w w w . ja v a 2 s . c o m while (bytes != null) { set.add(bytes); } } else if (analyzer != null) { Document document = searcher.doc(docid, selector); IndexableField[] fields = document.getFields(fieldName); if (fields == null) { return; } for (IndexableField field : fields) { String s = field.stringValue(); //is this possible if (s == null) { continue; } processFieldEntry(fieldName, s, set); } } else { throw new IllegalArgumentException( "The field must have a term vector or the analyzer must" + " not be null."); } }
From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java
License:Apache License
private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException { TokenStream ts = analyzer.tokenStream(fieldName, s); CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class); ts.reset();//from w w w .j a va2s .co m while (ts.incrementToken()) { set.add(cattr.toString()); } ts.end(); ts.close(); }
From source file:reviews.indexing.IndexReviews.java
License:Apache License
/**
 * Reads one stop word per line from {@code filename} and merges them with
 * Lucene's default English stop words.
 *
 * <p>Best-effort: on any I/O failure the error is logged and the set built so
 * far (at minimum the defaults) is returned.
 *
 * @param filename path of the stop-word file, one term per line
 * @return a case-insensitive stop-word set
 */
private static CharArraySet readStopWords(String filename) {
    CharArraySet stopwords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    // BUGFIX: try-with-resources — the original never closed the reader (leak).
    // NOTE(review): FileReader uses the platform default charset; confirm the
    // stop-word files are always in that encoding.
    try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
        String line;
        while ((line = br.readLine()) != null) {
            stopwords.add(line.trim());
        }
    } catch (IOException e) {
        // FileNotFoundException is an IOException, so one catch covers both
        // original handlers; behavior (log and return defaults) is unchanged.
        e.printStackTrace();
    }
    return stopwords;
}