luceneprueba.CustomAnalyzers.ReviewAnalyzer.java Source code

Java tutorial

Introduction

Here is the source code for luceneprueba.CustomAnalyzers.ReviewAnalyzer.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package luceneprueba.CustomAnalyzers;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;

/**
 * Analyzer that chains {@link StandardTokenizer}, {@link StandardFilter},
 * {@link LowerCaseFilter} and {@link StopFilter}, using an English stop-word
 * list unless another one is supplied.
 *
 * @author Gonzalo
 */
public class ReviewAnalyzer extends StopwordAnalyzerBase {

    /** Maximum token length used when none has been set explicitly. */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 50;

    /** Default stop words: the English set exposed by {@link StopAnalyzer}. */
    public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    /** Current maximum token length handed to the tokenizer. */
    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    /**
     * Builds an analyzer that uses the default stop words
     * ({@link #STOP_WORDS_SET}).
     */
    public ReviewAnalyzer() {
        this(STOP_WORDS_SET);
    }

    /**
     * Builds an analyzer that uses the given stop words.
     *
     * @param stopWords set of stop words, passed unchanged to the superclass
     */
    public ReviewAnalyzer(CharArraySet stopWords) {
        super(stopWords);
    }

    /**
     * Builds an analyzer whose stop words are read from the given reader via
     * the inherited {@code loadStopwordSet(Reader)}.
     *
     * @param stopwords reader supplying the stop-word list
     * @throws IOException if the stop words cannot be read
     */
    public ReviewAnalyzer(Reader stopwords) throws IOException {
        this(loadStopwordSet(stopwords));
    }

    /**
     * Sets the maximum token length passed to the tokenizer.
     *
     * <p>Non-positive values are rejected here so that the error surfaces at
     * the offending call rather than later, when the value is handed to
     * {@link StandardTokenizer#setMaxTokenLength(int)}.
     *
     * @param length new maximum token length; must be at least 1
     * @throws IllegalArgumentException if {@code length} is less than 1
     */
    public void setMaxTokenLength(int length) {
        if (length < 1) {
            throw new IllegalArgumentException(
                    "maxTokenLength must be greater than zero: " + length);
        }
        maxTokenLength = length;
    }

    /**
     * Returns the maximum token length currently configured.
     *
     * @return the value last set through {@link #setMaxTokenLength(int)}, or
     *         {@link #DEFAULT_MAX_TOKEN_LENGTH} if it was never set
     */
    public int getMaxTokenLength() {
        return maxTokenLength;
    }

    /**
     * Creates the tokenizer and filter chain:
     * {@code StandardTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter}.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @return components wrapping the tokenizer and the end of the filter chain
     */
    @Override
    protected TokenStreamComponents createComponents(final String fieldName) {
        final StandardTokenizer src = new StandardTokenizer();
        src.setMaxTokenLength(maxTokenLength);

        TokenStream chain = new StandardFilter(src);
        chain = new LowerCaseFilter(chain);
        chain = new StopFilter(chain, stopwords);

        return new TokenStreamComponents(src, chain) {
            @Override
            protected void setReader(final Reader reader) {
                // Re-apply the analyzer's current limit before each new reader so
                // a value changed after these components were built is picked up.
                src.setMaxTokenLength(ReviewAnalyzer.this.maxTokenLength);
                super.setReader(reader);
            }
        };
    }

}