intelligentWebAlgorithms.algos.search.lucene.analyzer.CustomAnalyzer.java Source code

Introduction

Here is the source code for intelligentWebAlgorithms.algos.search.lucene.analyzer.CustomAnalyzer.java. The class is a Lucene Analyzer built on StopwordAnalyzerBase; it reproduces the StandardAnalyzer pipeline (StandardTokenizer, StandardFilter, LowerCaseFilter, StopFilter) while merging Lucene's standard English stop words with an additional domain-specific list.

Source

/*
 *   ________________________________________________________________________________________
 *   
 *   Y O O R E E K A
 *   A library for data mining, machine learning, soft computing, and mathematical analysis
 *   ________________________________________________________________________________________ 
 *    
 *   The Yooreeka project started with the code of the book "Algorithms of the Intelligent Web " 
 *   (Manning 2009). Although the term "Web" prevailed in the title, in essence, the algorithms 
 *   are valuable in any software application.
 *  
 *   Copyright (c) 2007-2009 Haralambos Marmanis & Dmitry Babenko
 *   Copyright (c) 2009-${year} Marmanis Group LLC and individual contributors as indicated by the @author tags.  
 * 
 *   Certain library functions depend on other Open Source software libraries, which are covered 
 *   by different license agreements. See the NOTICE file distributed with this work for additional 
 *   information regarding copyright ownership and licensing.
 * 
 *   Marmanis Group LLC licenses this file to You under the Apache License, Version 2.0 (the "License"); 
 *   you may not use this file except in compliance with the License.  
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software distributed under 
 *   the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
 *   either express or implied. See the License for the specific language governing permissions and
 *   limitations under the License.
 *   
 */
package intelligentWebAlgorithms.algos.search.lucene.analyzer;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

/**
 * 
 * @author <a href="mailto:babis@marmanis.com">Babis Marmanis</a>
 *
 */
public class CustomAnalyzer extends StopwordAnalyzerBase {

    /** Default maximum allowed token length */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    /**
     * An unmodifiable set containing some common English words that are usually
     * not useful for searching.
     */
    public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    private static final String[] ADDITIONAL_STOP_WORDS = { "should", "would", "from", "up", "i", "s", "it", "his",
            "has", "he", "she", "her", "said", "been", "being", "final", "now", "hour", "minute", "second", "stop",
            "start", "first", "third", "fast", "slow", "large", "small" };

    private static final CharArraySet MERGED_STOP_WORDS;

    static {
        final CharArraySet merged = new CharArraySet(Version.LUCENE_44,
                STOP_WORDS_SET.size() + ADDITIONAL_STOP_WORDS.length, true);
        merged.addAll(STOP_WORDS_SET);             // standard English stop words
        for (String word : ADDITIONAL_STOP_WORDS) {
            merged.add(word);                      // domain-specific additions
        }
        MERGED_STOP_WORDS = CharArraySet.unmodifiableSet(merged);
    }

    /** Builds an analyzer with the merged default stop-word set. */
    public CustomAnalyzer(Version matchVersion) {
        this(matchVersion, MERGED_STOP_WORDS);
    }

    /**
     * Builds an analyzer with the given stop words.
     * 
     * @param matchVersion
     *            Lucene version to match; see {@link org.apache.lucene.util.Version}
     * @param stopWords
     *            stop words
     */
    public CustomAnalyzer(Version matchVersion, CharArraySet stopWords) {

        super(matchVersion, stopWords);
    }

    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {

        final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
        src.setMaxTokenLength(maxTokenLength);
        // Same filter chain as StandardAnalyzer, but with the merged stop-word set:
        TokenStream tok = new StandardFilter(matchVersion, src);
        tok = new LowerCaseFilter(matchVersion, tok);       // lower-case before stop matching
        tok = new StopFilter(matchVersion, tok, stopwords); // drop the merged stop words
        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                // Re-apply the token-length cap whenever the reader is replaced.
                src.setMaxTokenLength(CustomAnalyzer.this.maxTokenLength);
                super.setReader(reader);
            }
        };
    }
}
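
Example

For reference, here is a minimal usage sketch; it is not part of the original listing. It assumes Lucene 4.4 on the classpath, and the class name CustomAnalyzerDemo and the field name "body" are illustrative placeholders. It prints the tokens the analyzer emits for a sample sentence.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

import intelligentWebAlgorithms.algos.search.lucene.analyzer.CustomAnalyzer;

public class CustomAnalyzerDemo {

    public static void main(String[] args) throws IOException {
        // Hypothetical demo class; CustomAnalyzerDemo and the "body" field
        // name are placeholders, not part of the Yooreeka sources.
        CustomAnalyzer analyzer = new CustomAnalyzer(Version.LUCENE_44);
        TokenStream ts = analyzer.tokenStream("body",
                new StringReader("The QUICK brown fox jumps over the lazy dog"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // Expected: quick, brown, fox, jumps, over, lazy, dog
            // ("The"/"the" is removed by the stop filter, "QUICK" is lower-cased).
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
        analyzer.close();
    }
}

The two-argument constructor accepts any CharArraySet, so a caller can substitute a different stop-word list when the merged defaults do not fit the corpus at hand.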