org.silverpeas.search.indexEngine.model.WAAnalyzer.java Source code

Introduction

Here is the source code for org.silverpeas.search.indexEngine.model.WAAnalyzer.java
Source

/**
 * Copyright (C) 2000 - 2013 Silverpeas
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * As a special exception to the terms and conditions of version 3.0 of the GPL, you may
 * redistribute this Program in connection with Free/Libre Open Source Software ("FLOSS")
 * applications as described in Silverpeas's FLOSS exception. You should have received a copy of the
 * text describing the FLOSS exception, and it is also available here:
 * "http://www.silverpeas.org/docs/core/legal/floss_exception.html"
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see <http://www.gnu.org/licenses/>.
 */
package org.silverpeas.search.indexEngine.model;

import com.silverpeas.util.StringUtil;
import com.silverpeas.util.i18n.I18NHelper;
import com.stratelia.silverpeas.silvertrace.SilverTrace;
import com.stratelia.webactiv.util.ResourceLocator;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.util.Version;
import org.silverpeas.search.indexEngine.analysis.SilverTokenizer;

import java.io.Reader;
import java.util.*;

/**
 * Extends lucene Analyzer : prunes from a tokens stream all the meaningless words (the stop words
 * of the analyzer's language) and prunes all the special characters (elisions, accents). When
 * enabled in the settings, the tokens are also stemmed with Snowball.
 * <p>
 * Instances are obtained through {@link #getAnalyzer(String)}, which caches one analyzer per
 * effective language. The state of an instance is never modified after its construction.
 */
public final class WAAnalyzer extends Analyzer {

    /**
     * Cache of the analyzers already built, keyed by their effective language. Every access is
     * guarded by the monitor of the map itself.
     */
    private static final Map<String, Analyzer> languageMap = new HashMap<String, Analyzer>();
    private static final ResourceLocator settings = new ResourceLocator(
            "org.silverpeas.util.indexEngine.IndexEngine", "");

    /**
     * The language this analyzer works with.
     */
    private final String language;
    /**
     * The words which are usually not useful for searching.
     */
    private final Set<?> stopWords;
    /**
     * The name of the Snowball stemmer handed to the {@link SnowballFilter}.
     */
    private final String stemmer;
    /**
     * Whether the tokens have to be stemmed with Snowball.
     */
    private final boolean snowballUsed;

    /**
     * Returns the analyzer to be used with texts of the given language. The analyzers are cached.
     * <p>
     * The given language is first resolved into the effective one (see
     * {@link #resolveLanguage(String)}), so that all the undefined or malformed language codes
     * share the single analyzer of the default language instead of each filling the cache with
     * their own copy.
     *
     * @param language the language code of the texts to analyze; when it is not defined or not
     * two characters long, the default language of the settings is used instead.
     * @return the cached analyzer for the effective language, built on first request.
     */
    public static Analyzer getAnalyzer(String language) {
        String effectiveLanguage = resolveLanguage(language);
        // HashMap is not safe for concurrent modifications: guard the whole check-then-put.
        synchronized (languageMap) {
            Analyzer analyzer = languageMap.get(effectiveLanguage);
            if (analyzer == null) {
                analyzer = new WAAnalyzer(effectiveLanguage);
                languageMap.put(effectiveLanguage, analyzer);
            }
            return analyzer;
        }
    }

    /**
     * Returns a tokens stream built on top of the given reader.
     * <p>
     * The order of the filters matters: stop words are removed from the lower-cased tokens before
     * the elisions are stripped and before the accents are folded, so they are compared with the
     * tokens as they are written in the text.
     *
     * @param reader the reader on the text to tokenize.
     * @return the stream of the filtered tokens.
     */
    public TokenStream tokenStream(Reader reader) {
        TokenStream result = new SilverTokenizer(reader);
        // Historically: remove 's and . from token.
        // NOTE(review): with Version.LUCENE_36 this filter may be a simple pass-through - confirm.
        result = new StandardFilter(Version.LUCENE_36, result);
        result = new LowerCaseFilter(Version.LUCENE_36, result);
        // remove some unexplicit terms according to the language
        result = new StopFilter(Version.LUCENE_36, result, stopWords);
        // remove [cdjlmnst-qu]' from token
        result = new ElisionFilter(Version.LUCENE_36, result);
        if (snowballUsed) {
            // Important! Strings given to Snowball filter must contains accents
            // so accents must be removed after stemmer have done the job
            // ignoring singular/plural, male/female and conjugated forms
            result = new SnowballFilter(result, stemmer);
        }
        // remove accents
        result = new ASCIIFoldingFilter(result);
        return result;
    }

    /**
     * Returns a tokens stream built on top of the given reader. The name of the field is ignored:
     * all the fields are analyzed the same way.
     *
     * @param arg0 the name of the field, unused.
     * @param reader the reader on the text to tokenize.
     * @return the stream of the filtered tokens.
     */
    @Override
    public TokenStream tokenStream(String arg0, Reader reader) {
        return tokenStream(reader);
    }

    /**
     * Returns the language this analyzer works with.
     *
     * @return the effective language of this analyzer.
     */
    public String getLanguage() {
        return language;
    }

    /**
     * The constructor is private: use {@link #getAnalyzer(String)}.
     *
     * @param lang the language code; resolved again here so that the instance is always built
     * with an effective language, whatever the caller passed.
     */
    private WAAnalyzer(String lang) {
        language = resolveLanguage(lang);
        stopWords = loadStopWords(language);
        stemmer = settings.getString("snowball.stemmer." + language, "French");
        snowballUsed = settings.getBoolean("snowball.active", false);
    }

    /**
     * Resolves the language actually used for the given language code.
     *
     * @param lang a language code, possibly null or empty.
     * @return the given code when it is defined and two characters long, otherwise the value of
     * the <code>analyzer.language.default</code> setting ("fr" when the setting is missing).
     */
    private static String resolveLanguage(String lang) {
        if (!StringUtil.isDefined(lang) || lang.length() != 2) {
            return settings.getString("analyzer.language.default", "fr");
        }
        return lang;
    }

    /**
     * Returns the set of words which are not usually useful for searching in the given language.
     * <p>
     * The words are the keys of the <code>org.silverpeas.search.indexEngine.StopWords</code>
     * resource of the language. When this resource is missing, a warning is traced and the words
     * collected so far (possibly none) are returned.
     *
     * @param language the language of the stop words; when not defined, the default language of
     * {@link I18NHelper} is used.
     * @return the stop words, as a {@link CharArraySet} built once so that {@link StopFilter} can
     * use it directly instead of copying the words for each tokens stream.
     */
    private static Set<?> loadStopWords(String language) {
        Set<String> words = new HashSet<String>();
        String currentLanguage = language;
        try {
            if (!StringUtil.isDefined(currentLanguage)) {
                currentLanguage = I18NHelper.defaultLanguage;
            }
            ResourceLocator resource = new ResourceLocator(
                    "org.silverpeas.search.indexEngine.StopWords", currentLanguage);
            Enumeration<String> keys = resource.getKeys();
            while (keys.hasMoreElements()) {
                words.add(keys.nextElement());
            }
        } catch (MissingResourceException e) {
            // Best effort: the analyzer still works, only without (all) its stop words.
            SilverTrace.warn("indexEngine", "WAAnalyzer",
                    "indexEngine.MSG_MISSING_STOPWORDS_DEFINITION", "language = " + currentLanguage, e);
        }
        // Same case-sensitive matching as when a plain Set is given to the StopFilter.
        return new CharArraySet(Version.LUCENE_36, words, false);
    }
}