Usage examples for the org.apache.lucene.analysis.CharArraySet method add
public boolean add(char[] text)
From source file:com.asimihsan.handytrowel.nlp.StopwordAnnotator.java
License:Open Source License
public static CharArraySet getStopWordList(Version luceneVersion, String stopwordList, boolean ignoreCase) { String[] terms = stopwordList.split(","); CharArraySet stopwordSet = new CharArraySet(luceneVersion, terms.length, ignoreCase); for (String term : terms) { stopwordSet.add(term); }//from w w w . java 2s. c o m return CharArraySet.unmodifiableSet(stopwordSet); }
From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java
License:Apache License
/**
 * Creates a tokenizer based on param values.
 *
 * @param content        the text to tokenize
 * @param tokenizer      the type of tokenizer to use, CLASSIC or DEFAULT
 * @param stopWords      a set of user defined stop words
 * @param addToDefault   if {@code true}, the stopWords are added to the Lucene default
 *                       stop set; if {@code false}, only the user provided words are
 *                       used as the stop set
 * @param stemFilterType the stem filter to apply to the token stream
 */
public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault,
        StemFilterType stemFilterType) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    if (addToDefault) {
        // Copy the default set first — StandardAnalyzer.STOP_WORDS_SET is immutable.
        // (Fixed: removed a stray empty statement left after this line.)
        CharArraySet combined = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
        for (String word : stopWords) {
            combined.add(word);
        }
        this.stopSet = combined;
    } else {
        // Case-insensitive set built solely from the user supplied words.
        // (Assign via this.stopSet for consistency with the branch above.)
        this.stopSet = new CharArraySet(stopWords, true);
    }
    tokenStream = createTokenStream(content);
}
From source file:org.elasticsearch.index.analysis.ESSolrSynonymParserTests.java
License:Apache License
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException { CharArraySet stopSet = new CharArraySet(1, true); stopSet.add("bar"); ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer(stopSet)); String rules = "foo,bar,baz"; StringReader rulesReader = new StringReader(rules); parser.parse(rulesReader);//from ww w.ja va 2s . c o m SynonymMap synonymMap = parser.build(); Tokenizer tokenizer = new StandardTokenizer(); tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz")); TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false); assertTokenStreamContents(ts, new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" }); }
From source file:org.elasticsearch.index.analysis.ESWordnetSynonymParserTests.java
License:Apache License
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException { CharArraySet stopSet = new CharArraySet(1, true); stopSet.add("bar"); ESWordnetSynonymParser parser = new ESWordnetSynonymParser(true, false, true, new StandardAnalyzer(stopSet)); String rules = "s(100000001,1,'foo',v,1,0).\n" + "s(100000001,2,'bar',v,1,0).\n" + "s(100000001,3,'baz',v,1,0)."; StringReader rulesReader = new StringReader(rules); parser.parse(rulesReader);//from ww w. j a va 2s.c o m SynonymMap synonymMap = parser.build(); Tokenizer tokenizer = new StandardTokenizer(); tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz")); TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false); assertTokenStreamContents(ts, new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" }); }
From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java
License:Apache License
private void processDoc(int docid, String fieldName, Set<String> selector, CharArraySet set) throws IOException { Terms terms = searcher.getIndexReader().getTermVector(docid, fieldName); if (terms != null) { TermsEnum te = terms.iterator(); BytesRef bytes = te.next();//from w w w . ja v a 2 s . c o m while (bytes != null) { set.add(bytes); } } else if (analyzer != null) { Document document = searcher.doc(docid, selector); IndexableField[] fields = document.getFields(fieldName); if (fields == null) { return; } for (IndexableField field : fields) { String s = field.stringValue(); //is this possible if (s == null) { continue; } processFieldEntry(fieldName, s, set); } } else { throw new IllegalArgumentException( "The field must have a term vector or the analyzer must" + " not be null."); } }
From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java
License:Apache License
private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException { TokenStream ts = analyzer.tokenStream(fieldName, s); CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class); ts.reset();//from w w w .j a va2s .co m while (ts.incrementToken()) { set.add(cattr.toString()); } ts.end(); ts.close(); }
From source file:reviews.indexing.IndexReviews.java
License:Apache License
/**
 * Reads one stop word per line from {@code filename} and merges them with
 * Lucene's default English stop words.
 *
 * <p>Best-effort: on any I/O failure the error is logged and the set built so
 * far (at minimum the defaults) is returned.
 *
 * @param filename path of the stop-word file, one term per line
 * @return a case-insensitive stop-word set
 */
private static CharArraySet readStopWords(String filename) {
    CharArraySet stopwords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    // BUGFIX: try-with-resources — the original never closed the reader (leak).
    // NOTE(review): FileReader uses the platform default charset; confirm the
    // stop-word files are always in that encoding.
    try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
        String line;
        while ((line = br.readLine()) != null) {
            stopwords.add(line.trim());
        }
    } catch (IOException e) {
        // FileNotFoundException is an IOException, so one catch covers both
        // original handlers; behavior (log and return defaults) is unchanged.
        e.printStackTrace();
    }
    return stopwords;
}