Example usage for org.apache.lucene.analysis CharArraySet add

Introduction

On this page you can find example usages of org.apache.lucene.analysis.CharArraySet.add.

Prototype

public boolean add(char[] text) 

Document

Add this char[] directly to the set. If ignoreCase is true for this set, the text array will be directly modified; the caller should never modify the array after calling this method.
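
Before the per-project examples below, here is a minimal, self-contained sketch of the add overloads. It assumes a recent Lucene release (5.x or later), where CharArraySet lives in org.apache.lucene.analysis and offers a Version-free constructor; the class name and sample words are illustrative only.

import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetAddExample {
    public static void main(String[] args) {
        // initial capacity of 8, ignoreCase = true
        CharArraySet set = new CharArraySet(8, true);

        // add(char[]) stores the array directly; with ignoreCase enabled the
        // array is lowercased in place, so it must not be reused afterwards
        set.add("Foo".toCharArray());

        // the CharSequence overload copies the characters instead
        set.add("bar");

        System.out.println(set.contains("FOO")); // true (matching ignores case)
        System.out.println(set.contains("Bar")); // true
    }
}

Like java.util.Set#add, the method returns true only when the entry was not already present.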

Usage

From source file:com.asimihsan.handytrowel.nlp.StopwordAnnotator.java

License:Open Source License

public static CharArraySet getStopWordList(Version luceneVersion, String stopwordList, boolean ignoreCase) {
    String[] terms = stopwordList.split(",");
    CharArraySet stopwordSet = new CharArraySet(luceneVersion, terms.length, ignoreCase);
    for (String term : terms) {
        stopwordSet.add(term);
    }
    return CharArraySet.unmodifiableSet(stopwordSet);
}

From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java

License:Apache License

/**
 * Creates a tokenizer based on param values
 * @param content - The text to tokenize
 * @param tokenizer - the type of tokenizer to use, CLASSIC or DEFAULT
 * @param stopWords - Provide a set of user defined stop words
 * @param addToDefault - If set to true, the stopSet words will be added to the Lucene default stop set.
 * If false, then only the user provided words will be used as the stop set
 * @param stemFilterType - the type of stemming filter to apply to the tokens
 */
public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault,
        StemFilterType stemFilterType) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    if (addToDefault) {
        CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
        for (String word : stopWords) {
            stopSet.add(word);
        }
        this.stopSet = stopSet;
    } else {
        this.stopSet = new CharArraySet(stopWords, true);
    }
    tokenStream = createTokenStream(content);
}

From source file:org.elasticsearch.index.analysis.ESSolrSynonymParserTests.java

License:Apache License

public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
    CharArraySet stopSet = new CharArraySet(1, true);
    stopSet.add("bar");
    ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
    String rules = "foo,bar,baz";
    StringReader rulesReader = new StringReader(rules);
    parser.parse(rulesReader);
    SynonymMap synonymMap = parser.build();
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
    TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
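    // the StopFilter removes the stop word "bar", and the synonym map rewrites
    // "baz" (and "foo") to the first term of the rule, "foo"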
    assertTokenStreamContents(ts,
            new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
}

From source file:org.elasticsearch.index.analysis.ESWordnetSynonymParserTests.java

License:Apache License

public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
    CharArraySet stopSet = new CharArraySet(1, true);
    stopSet.add("bar");
    ESWordnetSynonymParser parser = new ESWordnetSynonymParser(true, false, true,
            new StandardAnalyzer(stopSet));
    String rules = "s(100000001,1,'foo',v,1,0).\n" + "s(100000001,2,'bar',v,1,0).\n"
            + "s(100000001,3,'baz',v,1,0).";
    StringReader rulesReader = new StringReader(rules);
    parser.parse(rulesReader);
    SynonymMap synonymMap = parser.build();
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
    TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
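    // as above: "bar" is filtered out as a stop word, and "baz" is rewritten to "foo"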
    assertTokenStreamContents(ts,
            new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
}

From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java

License:Apache License

private void processDoc(int docid, String fieldName, Set<String> selector, CharArraySet set)
        throws IOException {
    Terms terms = searcher.getIndexReader().getTermVector(docid, fieldName);
    if (terms != null) {
        TermsEnum te = terms.iterator();
        BytesRef bytes;
        // advance through every term in the vector; convert the UTF-8 term
        // bytes to a String so the set stores readable characters
        while ((bytes = te.next()) != null) {
            set.add(bytes.utf8ToString());
        }
    } else if (analyzer != null) {
        Document document = searcher.doc(docid, selector);
        IndexableField[] fields = document.getFields(fieldName);
        if (fields == null) {
            return;
        }
        for (IndexableField field : fields) {
            String s = field.stringValue();
            // stringValue() is null for fields without a string value (e.g. binary fields)
            if (s == null) {
                continue;
            }
            processFieldEntry(fieldName, s, set);
        }

    } else {
        throw new IllegalArgumentException(
                "The field must have a term vector or the analyzer must not be null.");
    }
}

From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java

License:Apache License

private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, s);
    CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        set.add(cattr.toString());
    }
    ts.end();
    ts.close();
}

From source file:reviews.indexing.IndexReviews.java

License:Apache License

private static CharArraySet readStopWords(String filename) {

    CharArraySet stopwords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);

    try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
        String line;
        while ((line = br.readLine()) != null) {
            stopwords.add(line.trim());
        }
    } catch (IOException e) {
        // FileNotFoundException is a subclass of IOException, so one catch suffices
        e.printStackTrace();
    }

    return stopwords;
}