Example usage for org.apache.lucene.analysis CharArraySet copy

Introduction

This page collects example usages of the copy method of org.apache.lucene.analysis.CharArraySet.

Prototype

public static CharArraySet copy(final Set<?> set)

Source Link

Document

Returns a copy of the given set as a CharArraySet.

Usage

From source file:org.apache.jena.query.text.filter.SelectiveFoldingFilter.java

License:Apache License

/**
 * Creates a filter over {@code input} that keeps its own read-only snapshot of the
 * whitelisted characters.
 *
 * @param input the token stream passed to the superclass constructor
 * @param whitelisted the whitelisted characters; must not be {@code null}
 * @throws NullPointerException if {@code whitelisted} is {@code null}
 */
public SelectiveFoldingFilter(TokenStream input, CharArraySet whitelisted) {
    super(input);
    Objects.requireNonNull(whitelisted, "You must provide the list of whitelisted characters.");
    // Copy first, then wrap: later changes to the caller's set cannot leak into this filter,
    // and the stored set cannot be modified through this reference.
    this.whitelisted = CharArraySet.unmodifiableSet(CharArraySet.copy(whitelisted));
}

From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java

License:Apache License

/**
 * Creates a tokenizer based on param values and immediately builds the token stream
 * for {@code content}.
 *
 * @param content - The text to tokenize
 * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT
 * @param stopWords - Provide a set of user defined stop words
 * @param addToDefault - If set to true, the stopSet words will be added to the Lucene default stop set.
 * If false, then only the user provided words will be used as the stop set
 * @param stemFilterType - the stem filter type stored on this instance
 */
public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault,
        StemFilterType stemFilterType) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    if (addToDefault) {
        // Start from a private copy of Lucene's default stop words, then add the user's words.
        CharArraySet mergedStopWords = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
        for (String word : stopWords) {
            mergedStopWords.add(word);
        }
        this.stopSet = mergedStopWords;
    } else {
        // Only the user's words; the second argument is CharArraySet's ignoreCase flag.
        this.stopSet = new CharArraySet(stopWords, true);
    }
    // The stop set is assigned before the stream is created.
    tokenStream = createTokenStream(content);
}

From source file:org.codelibs.elasticsearch.index.analysis.SnowballAnalyzer.java

License:Apache License

/** Builds the named analyzer with the given stop words. */
public SnowballAnalyzer(String name, CharArraySet stopWords) {
    this(name);// w w  w.j  a va  2 s .  c  o m
    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopWords));
}

From source file:org.elasticsearch.analysis.common.SnowballAnalyzer.java

License:Apache License

/** Builds the named analyzer with the given stop words. */
SnowballAnalyzer(String name, CharArraySet stopWords) {
    this(name);
    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopWords));
}

From source file:org.elasticsearch.analysis.common.WordDelimiterGraphTokenFilterFactory.java

License:Apache License

/**
 * Reads the word-delimiter configuration from {@code settings}: the optional
 * "type_table" character mapping, the individual behaviour flags, and the optional
 * "protected_words" set.
 */
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name,
        Settings settings) {
    super(indexSettings, name, settings);

    // Sample format for the "type_table" entries:
    //   $ => DIGIT
    //   % => DIGIT
    //   . => DIGIT
    //   \u002C => DIGIT
    //   \u200D => ALPHANUM
    final List<String> typeTableRules = Analysis.getWordList(env, settings, "type_table");
    // Without a configured table, use the iterator's default delimiter table.
    this.charTypeTable = typeTableRules == null
            ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE
            : parseTypes(typeTableRules);

    int bits = 0;
    // Generate word parts: "PowerShot" => "Power" "Shot"
    bits |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // Generate number subwords: "500-42" => "500" "42"
    bits |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // Catenate maximum runs of word parts: "wi-fi" => "wifi"
    bits |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // Catenate maximum runs of number parts: "500-42" => "50042"
    bits |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // Catenate all subword parts: "wi-fi-4000" => "wifi4000"
    bits |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // Split on case change: "PowerShot" becomes two tokens
    bits |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // Include the original word among the subwords: "500-42" => "500" "42" "500-42"
    bits |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // Split on letter/number transitions: "j2se" => "j" "2" "se"
    bits |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // Remove trailing "'s" from each subword: "O'Neil's" => "O", "Neil"
    bits |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);

    // Tokens to protect from being delimited; stays null when none are configured.
    final Set<?> protectedTokens = Analysis.getWordSet(env, settings, "protected_words");
    if (protectedTokens == null) {
        this.protoWords = null;
    } else {
        this.protoWords = CharArraySet.copy(protectedTokens);
    }
    this.flags = bits;
}

From source file:org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.java

License:Apache License

/**
 * Configures the word-delimiter filter from {@code settings}: an optional "type_table"
 * character mapping, the behaviour flags, and an optional "protected_words" set.
 */
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name,
        Settings settings) {
    super(indexSettings, name, settings);

    // Sample format for the "type_table" entries:
    //   $ => DIGIT
    //   % => DIGIT
    //   . => DIGIT
    //   \u002C => DIGIT
    //   \u200D => ALPHANUM
    final List<String> customTypeTable = Analysis.getWordList(env, settings, "type_table");
    // A missing setting selects the iterator's default delimiter table.
    this.charTypeTable = customTypeTable != null
            ? parseTypes(customTypeTable)
            : WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;

    int flagBits = 0;
    // Word parts are generated: "PowerShot" => "Power" "Shot"
    flagBits |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // Number subwords are generated: "500-42" => "500" "42"
    flagBits |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // Maximum runs of word parts are catenated: "wi-fi" => "wifi"
    flagBits |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // Maximum runs of number parts are catenated: "500-42" => "50042"
    flagBits |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // All subword parts are catenated: "wi-fi-4000" => "wifi4000"
    flagBits |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // Case changes split a token: "PowerShot" becomes two tokens
    flagBits |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // The original word is kept alongside the subwords: "500-42" => "500" "42" "500-42"
    flagBits |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // Letter/number transitions split a token: "j2se" => "j" "2" "se"
    flagBits |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // A trailing "'s" is removed from each subword: "O'Neil's" => "O", "Neil"
    flagBits |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);

    // Tokens to protect from being delimited; null when the setting yields no set.
    final Set<?> configuredProtected = Analysis.getWordSet(env, settings, "protected_words");
    if (configuredProtected != null) {
        this.protoWords = CharArraySet.copy(configuredProtected);
    } else {
        this.protoWords = null;
    }
    this.flags = flagBits;
}

From source file:org.elasticsearch.index.analysis.Kuromoji2AnalyzerProvider.java

License:Apache License

/**
 * Builds the {@link JapaneseAnalyzer} held by this provider from the stop words,
 * tokenizer mode and user dictionary resolved from {@code env} and {@code settings},
 * together with the analyzer's default stop tags.
 */
public Kuromoji2AnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    // The analyzer's default stop set is handed to parseStopWords, presumably as the fallback
    // when no stop words are configured (verify against Analysis.parseStopWords).
    final Set<?> resolvedStopWords = Analysis.parseStopWords(env, settings,
            JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizerMode = Kuromoji2TokenizerFactory.getMode(settings);
    final UserDictionary dictionary = Kuromoji2TokenizerFactory.getUserDictionary(env, settings);
    // Hand the analyzer its own CharArraySet copy of the resolved stop words.
    final CharArraySet stopWordCopy = CharArraySet.copy(resolvedStopWords);
    analyzer = new JapaneseAnalyzer(dictionary, tokenizerMode, stopWordCopy,
            JapaneseAnalyzer.getDefaultStopTags());
}

From source file:org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory.java

License:Apache License

/**
 * Loads the word-delimiter configuration from {@code settings}: the optional "type_table"
 * character mapping, the behaviour flags, and the optional "protected_words" set, which
 * is looked up with the index's creation version.
 */
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name,
        Settings settings) {
    super(indexSettings, name, settings);

    // Sample format for the "type_table" entries:
    //   $ => DIGIT
    //   % => DIGIT
    //   . => DIGIT
    //   \u002C => DIGIT
    //   \u200D => ALPHANUM
    final List<String> typeTableLines = Analysis.getWordList(env, settings, "type_table");
    // No configured table means the iterator's default delimiter table is used.
    this.charTypeTable = typeTableLines == null
            ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE
            : parseTypes(typeTableLines);

    int optionBits = 0;
    // Produce word parts: "PowerShot" => "Power" "Shot"
    optionBits |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // Produce number subwords: "500-42" => "500" "42"
    optionBits |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // Join maximum runs of word parts: "wi-fi" => "wifi"
    optionBits |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // Join maximum runs of number parts: "500-42" => "50042"
    optionBits |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // Join all subword parts: "wi-fi-4000" => "wifi4000"
    optionBits |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // Break tokens on case change: "PowerShot" becomes two tokens
    optionBits |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // Keep the original word with the subwords: "500-42" => "500" "42" "500-42"
    optionBits |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // Break tokens on letter/number transitions: "j2se" => "j" "2" "se"
    optionBits |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // Strip a trailing "'s" from each subword: "O'Neil's" => "O", "Neil"
    optionBits |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);

    // Tokens to protect from being delimited; left null when no set is returned.
    final Set<?> protectedTokens = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings,
            "protected_words");
    if (protectedTokens == null) {
        this.protoWords = null;
    } else {
        this.protoWords = CharArraySet.copy(protectedTokens);
    }
    this.flags = optionBits;
}