Example usage for org.apache.lucene.analysis.shingle ShingleFilter ShingleFilter

List of usage examples for org.apache.lucene.analysis.shingle ShingleFilter ShingleFilter

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.shingle ShingleFilter ShingleFilter.

Prototype

public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) 

Source Link

Document

Constructs a ShingleFilter over the given TokenStream input with the specified minimum and maximum shingle sizes.

Usage

From source file:be.ugent.tiwi.sleroux.newsrec.recommendationstester.EnAnalyzer.java

License:Apache License

/**
 * Builds the analysis chain for this analyzer: standard tokenization,
 * standard filtering, lower-casing, trimming, ASCII folding, optional
 * stopword removal, English possessive stripping, Snowball stemming,
 * and finally 2-3 word shingles with no filler token.
 *
 * @param fieldName the field being analyzed (not used by this chain)
 * @param reader    the character source to tokenize
 * @return the tokenizer/filter chain components
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new StandardTokenizer(Config.LUCENE_VERSION, reader);

    TokenStream stream = new StandardFilter(Config.LUCENE_VERSION, tokenizer);
    stream = new LowerCaseFilter(Config.LUCENE_VERSION, stream);
    stream = new TrimFilter(Config.LUCENE_VERSION, stream);
    stream = new ASCIIFoldingFilter(stream);
    if (stopwords == null) {
        logger.warn("No stopwordsfile provided, no stopword removal");
    } else {
        stream = new StopFilter(Config.LUCENE_VERSION, stream, stopwords);
    }
    stream = new EnglishPossessiveFilter(Config.LUCENE_VERSION, stream);
    stream = new SnowballFilter(stream, new EnglishStemmer());

    ShingleFilter shingles = new ShingleFilter(stream, 2, 3);
    shingles.setFillerToken(null);
    return new TokenStreamComponents(tokenizer, shingles);
}

From source file:ci6226.NGramAnalyzer.java

/**
 * Builds an n-gram analysis chain: standard tokenization followed by
 * shingles of {@code minGram} to {@code maxGram} tokens.
 *
 * @param arg0   the field name (not used)
 * @param reader the character source to tokenize
 * @return the tokenizer/filter chain components
 */
@Override
protected TokenStreamComponents createComponents(String arg0, Reader reader) {
    Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_47, reader);
    TokenStream shingles = new ShingleFilter(tokenizer, minGram, maxGram);
    return new TokenStreamComponents(tokenizer, shingles);
}

From source file:com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java

License:Apache License

/**
 * Creates a token stream over {@code text} with the English analyzer,
 * using either the configured stopwords or Lucene's default English set,
 * and optionally wrapping the result in a ShingleFilter to emit n-grams.
 *
 * @param text the raw text to tokenize
 * @return the (possibly shingled) token stream
 */
protected TokenStream createTokenStream(String text) {
    final Set<?> luceneStopWords;
    if (this.stopWords == null) {
        luceneStopWords = EnglishAnalyzer.getDefaultStopSet();
    } else {
        luceneStopWords = StopFilter.makeStopSet(LUCENE_VERSION, stopWords);
    }
    Analyzer analyzer = new EnglishSpecialAnalyzer(LUCENE_VERSION, luceneStopWords, this.stemExclusionsSet);

    TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
    if (this.nGram) {
        stream = new ShingleFilter(stream, this.minNGram, this.maxNGram);
    }
    return stream;
}

From source file:com.NGramTokenBaseAnalyzer.java

/**
 * Wraps a token stream in a ShingleFilter using this analyzer's
 * configured minimum and maximum shingle sizes.
 *
 * @param tok     the input token stream
 * @param unigram whether single tokens are emitted alongside shingles
 * @return the configured shingle filter
 */
public static ShingleFilter filter(TokenStream tok, boolean unigram) {
    final ShingleFilter shingles =
            new ShingleFilter(tok, NGramTokenBaseAnalyzer.min, NGramTokenBaseAnalyzer.max);
    shingles.setOutputUnigrams(unigram);
    return shingles;
}

From source file:eu.edisonproject.utility.text.processing.NGramGenerator.java

/**
 * Extracts n-grams (2..maxNGrams words) from the description, joining the
 * words of each n-gram with underscores and separating n-grams by spaces.
 *
 * @return a space-separated list of underscore-joined n-grams; empty string
 *         if the description yields no n-grams
 * @throws IOException if tokenization fails
 */
private String getNGrams() throws IOException {
    Analyzer analyzer = new StandardAnalyzer(stopwords);
    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(getDescription()));
    StopFilter stopFilter = new StopFilter(tokenStream, stopwords);
    StringBuilder words = new StringBuilder();
    // Closing the ShingleFilter closes the wrapped stream chain as well.
    try (ShingleFilter sf = new ShingleFilter(stopFilter, 2, maxNGrams)) {
        sf.setOutputUnigrams(false); // emit only n-grams, not single tokens
        CharTermAttribute charTermAttribute = sf.addAttribute(CharTermAttribute.class);
        sf.reset();
        while (sf.incrementToken()) {
            String word = charTermAttribute.toString();
            // Shingle tokens separate words with spaces (and may contain
            // '_' filler); normalize to a single '_' between words.
            word = word.replaceAll("_", " ");
            word = word.replaceAll("\\s{2,}", " ");
            word = word.replaceAll(" ", "_");
            words.append(word).append(" ");
        }
        sf.end();
    }
    // BUGFIX: the original unconditionally deleted the last character, which
    // threw StringIndexOutOfBoundsException when no n-grams were produced.
    // (Also removed the no-op words.setLength(words.length()).)
    if (words.length() > 0) {
        words.deleteCharAt(words.length() - 1); // drop trailing space
    }
    return words.toString();
}

From source file:lucene.TestAnalyzer.java

/**
 * Builds the analysis chain: whitespace tokenization followed by
 * fixed-size two-word shingles.
 *
 * @param fieldName the field being analyzed (not used)
 * @param reader    the character source (not used here; this Lucene
 *                  version sets the tokenizer's reader via the framework)
 * @return the tokenizer/filter chain components
 */
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // Removed the unused local variable ("String token") from the original.
    Tokenizer source = new WhitespaceTokenizer();
    TokenStream result = new ShingleFilter(source, 2, 2);
    return new TokenStreamComponents(source, result);
}

From source file:nl.uva.p2psearch.Main.java

/**
 * Builds inverted-index entries for a metadata entry's description: one
 * entry per stemmed-token occurrence (pass 1), plus one per 2..maxNGrams
 * shingle occurrence over the stemmed text (pass 2). Each entry carries
 * the term's running frequency at the moment it was seen.
 *
 * NOTE(review): an entry is appended on EVERY occurrence, so the returned
 * list can contain several entries for the same term with increasing tf
 * values — presumably merged/deduplicated downstream; confirm with callers.
 *
 * @param e the metadata entry whose description is indexed
 * @return the inverted-index entries (unigrams first, then n-grams)
 * @throws IOException if tokenization fails
 */
private static List<InvertedIndexEntry> getInvertedIndexEntries(final MetadataEntry e) throws IOException {
    String text = e.getDescription();
    // Maps each term's hash to its running frequency within this document.
    Map<Number160, Integer> dictionary = new HashMap<>();
    List<InvertedIndexEntry> list = new ArrayList<>();

    Analyzer analyzer = new ArmenianAnalyzer(Version.LUCENE_42, Utils.getCharArrayStopwords());
    try (TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text))) {
        // Pass 1: stemmed unigrams. The stemmed tokens are also collected
        // into sb (space-separated) so pass 2 can shingle the stemmed text.
        PorterStemFilter psf = new PorterStemFilter(tokenStream);
        CharTermAttribute term = psf.addAttribute(CharTermAttribute.class);
        psf.reset();
        StringBuilder sb = new StringBuilder();
        while (psf.incrementToken()) {
            Integer tf;
            Number160 termKey = Number160.createHash(term.toString());
            if (dictionary.containsKey(termKey)) {
                tf = dictionary.get(termKey);
                tf++;
            } else {
                tf = 1;
            }
            dictionary.put(termKey, tf);
            sb.append(term.toString()).append(" ");
            // Each entry points back to this metadata entry's ID.
            List<Number160> ll = new ArrayList<>();
            ll.add(Number160.createHash(e.getID()));
            list.add(new InvertedIndexEntry(termKey.toString(), term.toString(), tf, ll));
        }
        // Pass 2: 2..maxNGrams shingles over the stemmed text from pass 1.
        StandardTokenizer source = new StandardTokenizer(Version.LUCENE_42, new StringReader(sb.toString()));
        TokenStream tokenStreamSF = new StandardFilter(Version.LUCENE_42, source);
        try (ShingleFilter sf = new ShingleFilter(tokenStreamSF, 2, maxNGrams)) {
            // Unigrams were already indexed in pass 1; emit n-grams only.
            sf.setOutputUnigrams(false);
            CharTermAttribute charTermAttribute = sf.addAttribute(CharTermAttribute.class);
            sf.reset();
            while (sf.incrementToken()) {
                String word = charTermAttribute.toString();

                // Shingles separate words with spaces; store with underscores.
                String ng = word.replaceAll(" ", "_");
                Integer tf;
                Number160 termKey = Number160.createHash(ng);
                if (dictionary.containsKey(termKey)) {
                    tf = dictionary.get(termKey);
                    tf++;
                } else {
                    tf = 1;
                }
                dictionary.put(termKey, tf);
                List<Number160> ll = new ArrayList<>();
                ll.add(Number160.createHash(e.getID()));
                list.add(new InvertedIndexEntry(termKey.toString(), ng, tf, ll));
            }
        }
    }
    return list;
}

From source file:nl.uva.sne.commons.SemanticUtils.java

/**
 * Extracts n-grams (2..maxNGrams words) from {@code text}, with stopwords
 * removed and each n-gram's words joined by underscores.
 *
 * @param text      the text to analyze
 * @param maxNGrams the maximum shingle size (inclusive)
 * @return the list of underscore-joined n-grams; empty list if none
 * @throws IOException if tokenization fails
 */
public static List<String> getNGrams(String text, int maxNGrams) throws IOException {
    List<String> words = new ArrayList<>();

    // BUGFIX: the Analyzer is Closeable and was previously leaked.
    try (Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42, CharArraySet.EMPTY_SET)) {
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        StopFilter stopFilter = new StopFilter(Version.LUCENE_42, tokenStream, getStopWords());
        stopFilter.setEnablePositionIncrements(false);

        // Closing the ShingleFilter closes the wrapped stream chain as well.
        try (ShingleFilter sf = new ShingleFilter(stopFilter, 2, maxNGrams)) {
            sf.setOutputUnigrams(false); // emit only n-grams, not single tokens
            CharTermAttribute charTermAttribute = sf.addAttribute(CharTermAttribute.class);
            sf.reset();
            while (sf.incrementToken()) {
                String word = charTermAttribute.toString();
                // Shingle tokens separate words with spaces; join with '_'.
                words.add(word.replaceAll(" ", "_"));
            }
            sf.end();
        }
    }
    return words;
}

From source file:org.apache.james.mailbox.lucene.search.LenientImapSearchAnalyzer.java

License:Apache License

/**
 * Builds the token stream for lenient IMAP search: whitespace
 * tokenization, upper-casing, then shingles of 2..maxTokenLength tokens.
 *
 * @param arg0   the field name (not used)
 * @param reader the character source to tokenize
 * @return the shingled token stream
 */
@Override
public TokenStream tokenStream(String arg0, Reader reader) {
    TokenStream tokens = new WhitespaceTokenizer(Version.LUCENE_31, reader);
    tokens = new UpperCaseFilter(tokens);
    return new ShingleFilter(tokens, 2, maxTokenLength);
}

From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java

License:Apache License

/**
 * Builds an n-gram token stream over {@code content}: standard
 * tokenization, lower-casing, the configured stemmer, then shingles of
 * {@code mingram}..{@code maxgram} tokens with unigrams suppressed.
 * The resulting stream is also stored in the {@code tokenStream} field.
 *
 * @param content the text to tokenize
 * @param mingram minimum shingle size
 * @param maxgram maximum shingle size
 * @return the shingled token stream
 */
private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(content));
    tokenStream = new LowerCaseFilter(tokenizer);
    tokenStream = applyStemmer(stemFilterType);
    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
    shingleFilter.setOutputUnigrams(false); // emit only n-grams
    // ShingleFilter already extends TokenStream; removed the redundant cast.
    tokenStream = shingleFilter;
    return tokenStream;
}