org.silverpeas.core.index.indexing.model.WAAnalyzer.java Source code

Java tutorial

Introduction

Here is the source code for org.silverpeas.core.index.indexing.model.WAAnalyzer.java

Source

/*
 * Copyright (C) 2000 - 2018 Silverpeas
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * As a special exception to the terms and conditions of version 3.0 of
 * the GPL, you may redistribute this Program in connection with Free/Libre
 * Open Source Software ("FLOSS") applications as described in Silverpeas's
 * FLOSS exception.  You should have received a copy of the text describing
 * the FLOSS exception, and it is also available here:
 * "https://www.silverpeas.org/legal/floss_exception.html"
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.silverpeas.core.index.indexing.model;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.silverpeas.core.util.ResourceLocator;
import org.silverpeas.core.util.SettingBundle;
import org.silverpeas.core.util.StringUtil;

import java.util.HashMap;
import java.util.Map;

/**
 * Extends lucene Analyzer : prunes from a tokens stream all the meaningless words and prunes all
 * the special characters.
 */
public final class WAAnalyzer extends Analyzer {

    /** Expected length of a language code (e.g. "fr", "en"). */
    private static final int LANGUAGE_CODE_LENGTH = 2;
    /**
     * Cache of the analyzers already built, keyed by their effective (resolved) language code.
     * It is a plain HashMap: every access goes through the synchronized
     * {@link #getAnalyzer(String)} method.
     */
    private static final Map<String, Analyzer> languageMap = new HashMap<>();
    private static final SettingBundle settings = ResourceLocator
            .getSettingBundle("org.silverpeas.index.indexing.IndexEngine");
    /**
     * Name of the Snowball stemmer handed to the {@link SnowballFilter} when Snowball is active.
     */
    private final String stemmer;
    /**
     * Whether the Snowball stemming step is applied, as read from the "snowball.active" setting.
     */
    private final boolean snowballUsed;
    /**
     * The effective language code of this analyzer.
     */
    private final String language;

    /**
     * The constructor is private: instances are obtained through {@link #getAnalyzer(String)}.
     *
     * @param lang the requested language code; when it is not usable (see
     * {@link #resolveLanguage(String)}) the configured default language is taken instead.
     */
    private WAAnalyzer(String lang) {
        language = resolveLanguage(lang);
        // getStemmer() reads the language field, so it must be called once language is set
        stemmer = getStemmer();
        snowballUsed = settings.getBoolean("snowball.active", false);
    }

    /**
     * Returns the analyzer to be used with texts of the given language. The analyzers are cached
     * by their effective language: all the codes that fall back to the default language (null,
     * undefined or not made of exactly two characters) share a single analyzer instead of each
     * adding a new entry to the cache.
     * <p>
     * The method is synchronized because the cache is a static, non thread-safe map.
     *
     * @param language the code of the language of the texts to analyze; may be null or invalid,
     * in which case the analyzer of the default language is returned.
     * @return the cached analyzer for the effective language, created on first request.
     */
    public static synchronized Analyzer getAnalyzer(String language) {
        final String effectiveLanguage = resolveLanguage(language);
        Analyzer analyzer = languageMap.get(effectiveLanguage);

        if (analyzer == null) {
            analyzer = new WAAnalyzer(effectiveLanguage);
            languageMap.put(effectiveLanguage, analyzer);
        }

        return analyzer;
    }

    /**
     * Builds the chain of token filters applied to the texts to analyze: standard tokenization,
     * lower-casing, stop words removal, elision removal, optional Snowball stemming and finally
     * accents folding.
     * <p>
     * Note that the stop words and the elided articles are always the ones of
     * {@link FrenchAnalyzer}, whatever the language of this analyzer.
     *
     * @param s the name of the field for which the components are created; it is not used here.
     * @return the tokenizer and the filtered tokens stream built on top of it.
     */
    @Override
    protected TokenStreamComponents createComponents(final String s) {
        final Tokenizer source = new StandardTokenizer();
        // remove 's and . from token
        TokenStream result = new StandardFilter(source);
        result = new LowerCaseFilter(result);
        // remove some non-meaningful terms
        result = new StopFilter(result, FrenchAnalyzer.getDefaultStopSet());
        // remove [cdjlmnst-qu]' from token
        result = new ElisionFilter(result, FrenchAnalyzer.DEFAULT_ARTICLES);
        if (snowballUsed) {
            // Important! Strings given to the Snowball filter must contain their accents,
            // so accents must be removed only after the stemmer has done its job
            // (ignoring singular/plural, male/female and conjugated forms)
            result = new SnowballFilter(result, stemmer);
        }
        // remove accents
        result = new ASCIIFoldingFilter(result);
        return new TokenStreamComponents(source, result);
    }

    /**
     * Resolves the language code actually used for the requested one.
     *
     * @param lang the requested language code, possibly null.
     * @return the given code when {@link StringUtil#isDefined(String)} accepts it and it is
     * exactly {@value #LANGUAGE_CODE_LENGTH} characters long; otherwise the value of the
     * "analyzer.language.default" setting ("fr" when that setting is missing).
     */
    private static String resolveLanguage(String lang) {
        if (!StringUtil.isDefined(lang) || lang.length() != LANGUAGE_CODE_LENGTH) {
            return settings.getString("analyzer.language.default", "fr");
        }
        return lang;
    }

    /**
     * Reads the name of the Snowball stemmer configured for the language of this analyzer.
     *
     * @return the value of the "snowball.stemmer.&lt;language&gt;" setting, or "French" when no
     * stemmer is configured for the language.
     */
    private String getStemmer() {
        return settings.getString("snowball.stemmer." + language, "French");
    }

    /**
     * Gets the effective language of this analyzer.
     *
     * @return the language code resolved at construction time.
     */
    public String getLanguage() {
        return language;
    }

}