Example usage for org.apache.lucene.analysis CharArraySet CharArraySet

Introduction

This page shows example usage for the org.apache.lucene.analysis CharArraySet constructor CharArraySet(Collection<?> c, boolean ignoreCase).

Prototype

public CharArraySet(Collection<?> c, boolean ignoreCase) 

Document

Creates a set from a Collection of objects.
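
Before the per-project examples, here is a minimal, self-contained sketch of the constructor itself. This is an illustrative demo, assuming a Lucene version (7.x or later) where this Version-free signature exists and CharArraySet lives directly in org.apache.lucene.analysis:

import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetDemo {
    public static void main(String[] args) {
        // ignoreCase = true: membership checks ignore case.
        CharArraySet stopWords = new CharArraySet(Arrays.asList("the", "And", "OR"), true);
        System.out.println(stopWords.contains("AND")); // true: case is ignored
        // Lookups also work against char[] slices, avoiding String allocation.
        System.out.println(stopWords.contains("or".toCharArray(), 0, 2)); // true

        // ignoreCase = false: the set is case-sensitive.
        CharArraySet exact = new CharArraySet(Arrays.asList("NUTCH"), false);
        System.out.println(exact.contains("nutch")); // false
    }
}

The ignoreCase flag is fixed at construction time; the char[]-based lookups are why analyzers prefer CharArraySet over a plain HashSet<String> on hot tokenization paths.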

Usage

From source file:org.apache.jena.query.text.filter.TestSelectiveFoldingFilter.java

License:Apache License

@Test
public void testEmptyInput() throws IOException {
    whitelisted = new CharArraySet(Arrays.asList(""), false);
    inputText = new StringReader("");
    List<String> tokens = collectTokens(inputText, whitelisted);
    List<String> expected = Collections.emptyList();
    assertEquals(expected, tokens);
}

From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java

License:Apache License

/**
 * Creates a tokenizer based on param values
 * @param content - The text to tokenize
 * @param tokenizer - the type of tokenizer to use: CLASSIC or DEFAULT
 * @param stopWords - a set of user-defined stop words
 * @param addToDefault - if true, the stopWords are added to the Lucene default stop set;
 * if false, only the user-provided words are used as the stop set
 * @param stemFilterType - the type of stem filter to apply
 */
public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault,
        StemFilterType stemFilterType) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    if (addToDefault) {
        CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
        for (String word : stopWords) {
            stopSet.add(word);
        }
        this.stopSet = stopSet;
    } else {
        this.stopSet = new CharArraySet(stopWords, true);
    }
    tokenStream = createTokenStream(content);
}
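
A hypothetical call site for this constructor, shown for orientation only; the TokenizerType and StemFilterType constants below are assumptions based on the javadoc above, not verified against the Nutch source:

// Illustrative only; the enum constant names below are assumed.
List<String> userStopWords = Arrays.asList("http", "www");
LuceneTokenizer tok = new LuceneTokenizer(
        "The quick brown fox",             // content to tokenize
        TokenizerType.CLASSIC,             // CLASSIC or DEFAULT, per the javadoc
        userStopWords,                     // user-defined stop words
        true,                              // also keep Lucene's default stop set
        StemFilterType.PORTERSTEM_FILTER); // assumed constant name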

From source file:org.apache.solr.analysis.CommonGramsFilter.java

License:Apache License

/**
 * Build a CharArraySet from an array of common words, appropriate for passing
 * into the CommonGramsFilter constructor, case-sensitive if ignoreCase is
 * false.
 * 
 * @param commonWords Array of common words which will be converted into the CharArraySet
 * @param ignoreCase If true, all words are lower cased first.
 * @return a Set containing the words
 * @deprecated create a CharArraySet directly instead
 */
@Deprecated
public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
    CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
    commonSet.addAll(Arrays.asList(commonWords));
    return commonSet;
}
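
Per the deprecation note, callers can build the set directly with the constructor this page documents; a minimal equivalent sketch:

String[] commonWords = { "the", "of", "and" };
// Same effect as makeCommonSet(commonWords, true), without the deprecated helper.
CharArraySet commonSet = new CharArraySet(Arrays.asList(commonWords), true);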

From source file:org.apache.solr.analysis.KeepWordFilter.java

License:Apache License

/** @deprecated Use {@link #KeepWordFilter(boolean, TokenStream, CharArraySet)} instead */
@Deprecated
public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase) {
    this(false, in, new CharArraySet(words, ignoreCase));
}

From source file:org.apache.solr.analysis.TestWordDelimiterFilter.java

License:Apache License

public void testPositionIncrements() throws Exception {
    final CharArraySet protWords = new CharArraySet(new HashSet<String>(Arrays.asList("NUTCH")), false);

    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {
        public TokenStream tokenStream(String field, Reader reader) {
            return new WordDelimiterFilter(new WhitespaceTokenizer(reader), 1, 1, 0, 0, 1, 1, 0, 1, 1,
                    protWords);
        }
    };

    /* in this case, works as expected. */
    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 },
            new int[] { 6, 13 }, new int[] { 1, 1 });

    /* only in this case, posInc of 2 ?! */
    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
            new int[] { 0, 9, 12, 9 }, new int[] { 6, 12, 13, 13 }, new int[] { 1, 1, 1, 0 });

    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
            new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, new int[] { 1, 1, 1 });

    /* analyzer that will consume tokens with large position increments */
    Analyzer a2 = new Analyzer() {
        public TokenStream tokenStream(String field, Reader reader) {
            return new WordDelimiterFilter(new LargePosIncTokenFilter(new WhitespaceTokenizer(reader)), 1, 1, 0,
                    0, 1, 1, 0, 1, 1, protWords);
        }
    };

    /* increment of "largegap" is preserved */
    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
            new int[] { 0, 7, 16 }, new int[] { 6, 15, 20 }, new int[] { 1, 10, 1 });

    /* the "/" had a position increment of 10, where did it go?!?!! */
    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 },
            new int[] { 6, 13 }, new int[] { 1, 11 });

    /* in this case, the increment of 10 from the "/" is carried over */
    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
            new int[] { 0, 9, 12, 9 }, new int[] { 6, 12, 13, 13 }, new int[] { 1, 11, 1, 0 });

    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
            new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, new int[] { 1, 11, 1 });

    Analyzer a3 = new Analyzer() {
        public TokenStream tokenStream(String field, Reader reader) {
            StopFilter filter = new StopFilter(new WhitespaceTokenizer(reader),
                    StandardAnalyzer.STOP_WORDS_SET);
            filter.setEnablePositionIncrements(true);
            return new WordDelimiterFilter(filter, 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
        }
    };

    assertAnalyzesTo(a3, "lucene.solr", new String[] { "lucene", "solr", "lucenesolr" }, new int[] { 0, 7, 0 },
            new int[] { 6, 11, 11 }, new int[] { 1, 1, 0 });

    /* the stopword should add a gap here */
    assertAnalyzesTo(a3, "the lucene.solr", new String[] { "lucene", "solr", "lucenesolr" },
            new int[] { 4, 11, 4 }, new int[] { 10, 15, 15 }, new int[] { 2, 1, 0 });
}

From source file:org.crosswire.jsword.index.lucene.analysis.AnalyzerFactoryTest.java

License:Open Source License

public void testEngStemming() throws ParseException {
    AbstractBookAnalyzer myAnalyzer = new EnglishLuceneAnalyzer();

    QueryParser parser = new QueryParser(Version.LUCENE_29, field, myAnalyzer);

    String testInput = "Surely will every man walketh";
    Query query = parser.parse(testInput);
    // assertTrue(myAnalyzer instanceof SimpleLuceneAnalyzer);

    // After Diacritic filtering
    assertTrue(query.toString().indexOf(field + ":sure ") > -1);
    assertTrue(query.toString().indexOf(field + ":everi") > -1);

    myAnalyzer.setDoStemming(false);
    query = parser.parse(testInput);
    assertTrue(query.toString().indexOf(field + ":surely") > -1);
    assertTrue(query.toString().indexOf(field + ":every") > -1);

    // enable stop word
    myAnalyzer.setDoStopWords(true);
    query = parser.parse(testInput);
    assertTrue(query.toString().indexOf(field + ":will") == -1);

    // set custom stop word
    myAnalyzer.setDoStopWords(true);
    String[] stopWords = { "thy", "ye", "unto", "shalt" };
    myAnalyzer.setStopWords(new CharArraySet(Arrays.asList(stopWords), false));
    testInput = "Upon thy belly Shalt thou go";
    query = parser.parse(testInput);
    assertTrue(query.toString().indexOf(field + ":shalt") == -1);
    assertTrue(query.toString().indexOf(field + ":thy") == -1);
    assertTrue(query.toString().indexOf(field + ":upon") > -1);

    System.out.println(query.toString());
}

From source file:org.crosswire.jsword.index.lucene.analysis.EnglishLuceneAnalyzerTest.java

License:Open Source License

public void testSetDoStopWords() throws ParseException {
    String testInput = "Surely will every man walketh";
    Query query = parser.parse(testInput);

    // enable stop word
    myAnalyzer.setDoStopWords(true);
    query = parser.parse(testInput);
    assertTrue(query.toString().indexOf(field + ":will") == -1);

    // set custom stop word
    myAnalyzer.setDoStopWords(true);
    String[] stopWords = { "thy", "ye", "unto", "shalt" };
    myAnalyzer.setStopWords(new CharArraySet(Arrays.asList(stopWords), false));
    testInput = "Upon thy belly Shalt thou go";
    query = parser.parse(testInput);
    // System.out.println("ParsedQuery- "+ query.toString());

    assertTrue(query.toString().indexOf(field + ":shalt") == -1);
    assertTrue(query.toString().indexOf(field + ":thy") == -1);
    assertTrue(query.toString().indexOf(field + ":upon") > -1);

}

From source file:org.elasticsearch.analysis.common.KeywordMarkerTokenFilterFactory.java

License:Apache License

KeywordMarkerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
    String patternString = settings.get("keywords_pattern");
    if (patternString != null) {
        // a pattern for matching keywords is specified, as opposed to a
        // set of keyword strings to match against
        if (settings.get("keywords") != null || settings.get("keywords_path") != null) {
            throw new IllegalArgumentException(
                    "cannot specify both `keywords_pattern` and `keywords` or `keywords_path`");
        }
        keywordPattern = Pattern.compile(patternString);
        keywordLookup = null;
    } else {
        Set<?> rules = Analysis.getWordSet(env, settings, "keywords");
        if (rules == null) {
            throw new IllegalArgumentException("keyword filter requires either `keywords`, `keywords_path`, "
                    + "or `keywords_pattern` to be configured");
        }
        // a set of keywords (or a path to them) is specified
        keywordLookup = new CharArraySet(rules, ignoreCase);
        keywordPattern = null;
    }
}

From source file:org.elasticsearch.index.analysis.ESSolrSynonymParserTests.java

License:Apache License

public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
    CharArraySet stopSet = new CharArraySet(1, true);
    stopSet.add("bar");
    ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
    String rules = "foo,bar,baz";
    StringReader rulesReader = new StringReader(rules);
    parser.parse(rulesReader);
    SynonymMap synonymMap = parser.build();
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
    TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
    assertTokenStreamContents(ts,
            new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
}

From source file:org.elasticsearch.index.analysis.ESWordnetSynonymParserTests.java

License:Apache License

public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
    CharArraySet stopSet = new CharArraySet(1, true);
    stopSet.add("bar");
    ESWordnetSynonymParser parser = new ESWordnetSynonymParser(true, false, true,
            new StandardAnalyzer(stopSet));
    String rules = "s(100000001,1,'foo',v,1,0).\n" + "s(100000001,2,'bar',v,1,0).\n"
            + "s(100000001,3,'baz',v,1,0).";
    StringReader rulesReader = new StringReader(rules);
    parser.parse(rulesReader);
    SynonymMap synonymMap = parser.build();
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
    TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
    assertTokenStreamContents(ts,
            new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
}