List of usage examples for org.apache.lucene.analysis CharArraySet CharArraySet
public CharArraySet(Collection<?> c, boolean ignoreCase)
From source file:org.apache.jena.query.text.filter.TestSelectiveFoldingFilter.java
License:Apache License
@Test
public void testEmptyInput() throws IOException {
    // A whitelist containing only the empty string, applied to empty input,
    // must produce no tokens at all.
    whitelisted = new CharArraySet(Arrays.asList(""), false);
    inputText = new StringReader("");
    List<String> actual = collectTokens(inputText, whitelisted);
    assertTrue(actual.equals(Collections.emptyList()));
}
From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java
License:Apache License
/**
 * Creates a tokenizer based on param values.
 *
 * @param content        the text to tokenize
 * @param tokenizer      the type of tokenizer to use, CLASSIC or DEFAULT
 * @param stopWords      a set of user defined stop words
 * @param addToDefault   if true, the stopWords are added to the Lucene default stop
 *                       set; if false, only the user provided words form the stop set
 * @param stemFilterType the stemming filter type to apply
 */
public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault,
        StemFilterType stemFilterType) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    if (addToDefault) {
        // Start from a mutable copy of Lucene's default stop set (the shared
        // STOP_WORDS_SET is immutable), then fold in the user supplied words.
        CharArraySet combined = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
        combined.addAll(stopWords);
        this.stopSet = combined;
    } else {
        // Use only the user provided words; 'true' lower-cases entries on add/lookup.
        this.stopSet = new CharArraySet(stopWords, true);
    }
    tokenStream = createTokenStream(content);
}
From source file:org.apache.solr.analysis.CommonGramsFilter.java
License:Apache License
/**
 * Build a CharArraySet from an array of common words, appropriate for passing
 * into the CommonGramsFilter constructor; case-sensitive if ignoreCase is
 * false.
 *
 * @param commonWords Array of common words which will be converted into the CharArraySet
 * @param ignoreCase If true, all words are lower cased first.
 * @return a Set containing the words
 * @deprecated construct a {@link CharArraySet} directly instead
 */
@Deprecated
public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
    // Presize to the word count; the set handles lower-casing when ignoreCase is true.
    CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
    commonSet.addAll(Arrays.asList(commonWords));
    return commonSet;
}
From source file:org.apache.solr.analysis.KeepWordFilter.java
License:Apache License
/** @deprecated Use {@link #KeepWordFilter(boolean, TokenStream, CharArraySet)} instead */ @Deprecated// www . jav a 2 s. c om public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase) { this(false, in, new CharArraySet(words, ignoreCase)); }
From source file:org.apache.solr.analysis.TestWordDelimiterFilter.java
License:Apache License
/**
 * Checks how WordDelimiterFilter handles position increments: with a plain
 * whitespace tokenizer, with an upstream filter producing large increments,
 * and with an upstream StopFilter; "NUTCH" is protected from splitting.
 */
public void testPositionIncrements() throws Exception {
    final CharArraySet protWords = new CharArraySet(new HashSet<String>(Arrays.asList("NUTCH")), false);

    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {
        public TokenStream tokenStream(String field, Reader reader) {
            return new WordDelimiterFilter(new WhitespaceTokenizer(reader), 1, 1, 0, 0, 1, 1, 0, 1, 1,
                    protWords);
        }
    };

    /* in this case, works as expected. */
    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 },
            new int[] { 6, 13 }, new int[] { 1, 1 });

    /* only in this case, posInc of 2 ?! */
    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
            new int[] { 0, 9, 12, 9 }, new int[] { 6, 12, 13, 13 }, new int[] { 1, 1, 1, 0 });

    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
            new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, new int[] { 1, 1, 1 });

    /* analyzer that will consume tokens with large position increments */
    Analyzer a2 = new Analyzer() {
        public TokenStream tokenStream(String field, Reader reader) {
            return new WordDelimiterFilter(new LargePosIncTokenFilter(new WhitespaceTokenizer(reader)), 1, 1,
                    0, 0, 1, 1, 0, 1, 1, protWords);
        }
    };

    /* increment of "largegap" is preserved */
    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
            new int[] { 0, 7, 16 }, new int[] { 6, 15, 20 }, new int[] { 1, 10, 1 });

    /* the "/" had a position increment of 10, where did it go?!?!! */
    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 },
            new int[] { 6, 13 }, new int[] { 1, 11 });

    /* in this case, the increment of 10 from the "/" is carried over */
    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
            new int[] { 0, 9, 12, 9 }, new int[] { 6, 12, 13, 13 }, new int[] { 1, 11, 1, 0 });

    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
            new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, new int[] { 1, 11, 1 });

    Analyzer a3 = new Analyzer() {
        public TokenStream tokenStream(String field, Reader reader) {
            StopFilter filter = new StopFilter(new WhitespaceTokenizer(reader),
                    StandardAnalyzer.STOP_WORDS_SET);
            filter.setEnablePositionIncrements(true);
            return new WordDelimiterFilter(filter, 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
        }
    };

    assertAnalyzesTo(a3, "lucene.solr", new String[] { "lucene", "solr", "lucenesolr" },
            new int[] { 0, 7, 0 }, new int[] { 6, 11, 11 }, new int[] { 1, 1, 0 });

    /* the stopword should add a gap here */
    assertAnalyzesTo(a3, "the lucene.solr", new String[] { "lucene", "solr", "lucenesolr" },
            new int[] { 4, 11, 4 }, new int[] { 10, 15, 15 }, new int[] { 2, 1, 0 });
}
From source file:org.crosswire.jsword.index.lucene.analysis.AnalyzerFactoryTest.java
License:Open Source License
public void testEngStemming() throws ParseException { AbstractBookAnalyzer myAnalyzer = new EnglishLuceneAnalyzer(); QueryParser parser = new QueryParser(Version.LUCENE_29, field, myAnalyzer); String testInput = "Surely will every man walketh"; Query query = parser.parse(testInput); // assertTrue(myAnalyzer instanceof SimpleLuceneAnalyzer); // After Diacritic filtering assertTrue(query.toString().indexOf(field + ":sure ") > -1); assertTrue(query.toString().indexOf(field + ":everi") > -1); myAnalyzer.setDoStemming(false);/* www . j a v a2s. co m*/ query = parser.parse(testInput); assertTrue(query.toString().indexOf(field + ":surely") > -1); assertTrue(query.toString().indexOf(field + ":every") > -1); // enable stop word myAnalyzer.setDoStopWords(true); query = parser.parse(testInput); assertTrue(query.toString().indexOf(field + ":will") == -1); // set custom stop word myAnalyzer.setDoStopWords(true); String[] stopWords = { "thy", "ye", "unto", "shalt" }; myAnalyzer.setStopWords(new CharArraySet(Arrays.asList(stopWords), false)); testInput = "Upon thy belly Shalt thou go"; query = parser.parse(testInput); assertTrue(query.toString().indexOf(field + ":shalt") == -1); assertTrue(query.toString().indexOf(field + ":thy") == -1); assertTrue(query.toString().indexOf(field + ":upon") > -1); System.out.println(query.toString()); }
From source file:org.crosswire.jsword.index.lucene.analysis.EnglishLuceneAnalyzerTest.java
License:Open Source License
/**
 * Toggling stop-word removal at runtime: once enabled, the default stop word
 * "will" and every word in a caller-supplied custom stop list must be removed
 * from parsed queries.
 */
public void testSetDoStopWords() throws ParseException {
    String testInput = "Surely will every man walketh";
    Query query = parser.parse(testInput);

    // enable stop word
    myAnalyzer.setDoStopWords(true);
    query = parser.parse(testInput);
    assertTrue(query.toString().indexOf(field + ":will") == -1);

    // set custom stop word
    myAnalyzer.setDoStopWords(true);
    String[] stopWords = { "thy", "ye", "unto", "shalt" };
    myAnalyzer.setStopWords(new CharArraySet(Arrays.asList(stopWords), false));
    testInput = "Upon thy belly Shalt thou go";
    query = parser.parse(testInput);
    // Custom stop words are gone; non-stop words ("upon") remain.
    assertTrue(query.toString().indexOf(field + ":shalt") == -1);
    assertTrue(query.toString().indexOf(field + ":thy") == -1);
    assertTrue(query.toString().indexOf(field + ":upon") > -1);
}
From source file:org.elasticsearch.analysis.common.KeywordMarkerTokenFilterFactory.java
License:Apache License
/**
 * Configures the keyword marker filter from settings: either a regex
 * ({@code keywords_pattern}) or an explicit keyword set ({@code keywords} /
 * {@code keywords_path}). The two modes are mutually exclusive; exactly one
 * of {@code keywordPattern} / {@code keywordLookup} ends up non-null.
 */
KeywordMarkerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
    String patternString = settings.get("keywords_pattern");
    if (patternString != null) {
        // a pattern for matching keywords is specified, as opposed to a
        // set of keyword strings to match against
        if (settings.get("keywords") != null || settings.get("keywords_path") != null) {
            throw new IllegalArgumentException(
                    "cannot specify both `keywords_pattern` and `keywords` or `keywords_path`");
        }
        keywordPattern = Pattern.compile(patternString);
        keywordLookup = null;
    } else {
        Set<?> rules = Analysis.getWordSet(env, settings, "keywords");
        if (rules == null) {
            throw new IllegalArgumentException("keyword filter requires either `keywords`, `keywords_path`, "
                    + "or `keywords_pattern` to be configured");
        }
        // a set of keywords (or a path to them) is specified
        keywordLookup = new CharArraySet(rules, ignoreCase);
        keywordPattern = null;
    }
}
From source file:org.elasticsearch.index.analysis.ESSolrSynonymParserTests.java
License:Apache License
/**
 * A lenient Solr synonym parser should tolerate rule terms that the analyzer's
 * stop set removes ("bar") instead of failing, keeping the surviving synonyms.
 */
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
    CharArraySet stopSet = new CharArraySet(1, true);
    stopSet.add("bar");
    ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
    String rules = "foo,bar,baz";
    StringReader rulesReader = new StringReader(rules);
    parser.parse(rulesReader);
    SynonymMap synonymMap = parser.build();
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
    TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
    // "bar" is stopped out of the stream; "baz" maps to "foo" via the synonym rule.
    assertTokenStreamContents(ts,
            new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
}
From source file:org.elasticsearch.index.analysis.ESWordnetSynonymParserTests.java
License:Apache License
/**
 * A lenient Wordnet synonym parser should tolerate rule terms that the
 * analyzer's stop set removes ("bar") instead of failing, keeping the
 * surviving synonyms.
 */
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
    CharArraySet stopSet = new CharArraySet(1, true);
    stopSet.add("bar");
    ESWordnetSynonymParser parser = new ESWordnetSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
    String rules = "s(100000001,1,'foo',v,1,0).\n" + "s(100000001,2,'bar',v,1,0).\n"
            + "s(100000001,3,'baz',v,1,0).";
    StringReader rulesReader = new StringReader(rules);
    parser.parse(rulesReader);
    SynonymMap synonymMap = parser.build();
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
    TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
    // "bar" is stopped out of the stream; "baz" maps to "foo" via the synonym rule.
    assertTokenStreamContents(ts,
            new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
}