Example usage for org.apache.lucene.analysis.miscellaneous.LengthFilter (the LengthFilter constructor)

Introduction

On this page you can find example usages of the LengthFilter constructor from org.apache.lucene.analysis.miscellaneous.

Prototype

public LengthFilter(TokenStream in, int min, int max) 

Document

Create a new LengthFilter.
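
As a quick orientation, here is a minimal, self-contained sketch (not taken from any of the usage examples below) showing the constructor in action: tokens shorter than 3 or longer than 10 characters are dropped. The class name LengthFilterSketch, the sample sentence, and the bounds 3 and 10 are arbitrary illustration choices.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LengthFilterSketch {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("a quick brown fox"));
        // Keep only tokens whose length is between 3 and 10 characters (inclusive).
        try (TokenStream stream = new LengthFilter(tokenizer, 3, 10)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // prints "quick", "brown", "fox"; "a" is filtered out
            }
            stream.end();
        }
    }
}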

Usage

From source file: net.nunoachenriques.vader.text.TokenizerEnglishTest.java

License: Apache License

private List<String> splitWhitespaceLucene(String s) {
    StringReader reader = new StringReader(s);
    org.apache.lucene.analysis.Tokenizer whiteSpaceTokenizer = new WhitespaceTokenizer();
    whiteSpaceTokenizer.setReader(reader);
    ArrayList<String> tokenizedString = null;
    try (TokenStream tokenStream = new LengthFilter(whiteSpaceTokenizer, 2, Integer.MAX_VALUE)) {
        final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        tokenizedString = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            tokenizedString.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } catch (IOException ioe) {
        ioe.printStackTrace();
    }
    return tokenizedString;
}

From source file: net.nunoachenriques.vader.text.TokenizerEnglishTest.java

License: Apache License

private List<String> cleanPunctuationAndSplitWhitespaceLucene(String s) {
    StringReader reader = new StringReader(s);
    StandardTokenizer removePunctuationTokenizer = new StandardTokenizer();
    removePunctuationTokenizer.setReader(reader);
    ArrayList<String> tokenizedString = null;
    try (TokenStream tokenStream = new LengthFilter(removePunctuationTokenizer, 2, Integer.MAX_VALUE)) {
        final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        tokenizedString = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            tokenizedString.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } catch (IOException ioe) {
        ioe.printStackTrace();
    }
    return tokenizedString;
}

From source file: org.elasticsearch.analysis.common.CommonAnalysisPlugin.java

License: Apache License

@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
            input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false,
            input -> new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false,
            input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input -> new EdgeNGramTokenFilter(input,
            EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    // TODO deprecate edgeNGram
    filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input -> new EdgeNGramTokenFilter(input,
            EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    filters.add(PreConfiguredTokenFilter.singleton("elision", true,
            input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false,
            input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(
            PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false,
            input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton("limit", false,
            input -> new LimitTokenCountFilter(input, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                    LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
    // TODO deprecate nGram
    filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false,
            input -> new SnowballFilter(input, "Russian")));
    filters.add(
            PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true,
            ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /**
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false,
            input -> new SnowballFilter(input, "English")));
    filters.add(
            PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false,
            input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false,
            input -> new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS
                            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE,
                    null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false,
            input -> new WordDelimiterGraphFilter(input, WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                    | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                    | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                    | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}

From source file: org.elasticsearch.analysis.common.LengthTokenFilterFactory.java

License: Apache License

@Override
public TokenStream create(TokenStream tokenStream) {
    return new LengthFilter(tokenStream, min, max);
}

From source file: org.lambda3.indra.core.IndraAnalyzer.java

License: Open Source License

private TokenStream createStream(String lang, ModelMetadata metadata, Tokenizer tokenizer) {
    TokenStream stream = new StandardFilter(tokenizer);
    stream = new LengthFilter(stream, metadata.getMinWordLength(), metadata.getMaxWordLength());

    if (metadata.isApplyLowercase()) {
        stream = new LowerCaseFilter(stream);
    }

    if (metadata.isApplyStopWords()) {
        stream = getStopFilter(lang, metadata.getStopWords(), stream);
    }

    if (metadata.getApplyStemmer() > 0) {
        stream = getStemmerFilter(lang, metadata.getApplyStemmer(), stream);
    }

    if (metadata.isRemoveAccents()) {
        stream = new ASCIIFoldingFilter(stream);
    }

    return stream;
}

From source file: org.lambda3.indra.pp.StandardPreProcessorIterator.java

License: Open Source License

private TokenStream createStream(CorpusMetadata metadata, Tokenizer tokenizer) {
    TokenStream stream = new StandardFilter(tokenizer);
    stream = new LengthFilter(stream, (int) metadata.minTokenLength, (int) metadata.maxTokenLength);

    if (!metadata.stopWords.isEmpty()) {
        stream = getStopFilter(metadata.language, metadata.stopWords, stream);
    }

    if (metadata.applyStemmer > 0) {
        stream = getStemmerFilter(metadata.language, (int) metadata.applyStemmer, stream);
    }

    if (metadata.removeAccents) {
        stream = new ASCIIFoldingFilter(stream);
    }

    if (metadata.replaceNumbers) {
        stream = new PatternReplaceFilter(stream, NUMBER_PATTERN, NUMBER_PLACEHOLDER, false);
    }

    return stream;
}