org.opencms.search.galleries.CmsGallerySearchAnalyzer.java Source code

Introduction

Here is the source code for org.opencms.search.galleries.CmsGallerySearchAnalyzer.java
Source

/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package org.opencms.search.galleries;

import org.opencms.search.CmsSearchIndex;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

/**
 * Special analyzer for multiple languages, used in the OpenCms gallery search index.<p>
 * 
 * The gallery search is done in one single index that may contain multiple languages.<p>
 * 
 * According to the Lucene JavaDocs (3.0 version), the Lucene {@link org.apache.lucene.analysis.standard.StandardAnalyzer} is already using
 * "a good tokenizer for most European-language documents". The only caveat is that a 
 * list of English only stop words is used.<p>
 * 
 * This extended analyzer uses a compound list of stop words compiled from the following languages:<ul>
 * <li>English
 * <li>German
 * <li>Spanish
 * <li>Italian
 * <li>French
 * <li>Portuguese
 * <li>Danish
 * <li>Dutch
 * <li>Catalan
 * <li>Czech
 * </ul>
 * 
 * The stop word list is read from the resource <code>stopwords_multilanguage.txt</code>,
 * which is resolved relative to this class.<p>
 * 
 * @since 8.0.0 
 */
public class CmsGallerySearchAnalyzer extends StopwordAnalyzerBase {

    /** Default maximum allowed token length. */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    /** Encoding used to decode the stop word list resource. */
    private static final String STOPWORDS_ENCODING = "UTF-8";

    /** Name of the stop word list resource, resolved relative to this class. */
    private static final String STOPWORDS_RESOURCE = "stopwords_multilanguage.txt";

    /**
     * Constructor with version parameter.<p>
     * 
     * The given version is handed to the superclass. Note that the stop word set itself is
     * created with {@link CmsSearchIndex#LUCENE_VERSION}, not with the given version.<p>
     * 
     * @param version the Lucene standard analyzer version to match
     * 
     * @throws IOException if the stop word list resource is missing or can not be read
     */
    public CmsGallerySearchAnalyzer(Version version) throws IOException {

        // initialize superclass with the multi-language stop word set
        super(version, loadStopWords());
    }

    /**
     * Reads the multi-language stop word list from the class path.<p>
     * 
     * <code>"#"</code> is passed to the word list loader as the comment marker.<p>
     * 
     * @return the set of stop words
     * 
     * @throws IOException if the stop word list resource is missing or can not be read
     */
    private static Set<?> loadStopWords() throws IOException {

        InputStream stream = CmsGallerySearchAnalyzer.class.getResourceAsStream(STOPWORDS_RESOURCE);
        if (stream == null) {
            // fail with a meaningful message instead of a NullPointerException inside the reader
            throw new IOException("Stop word list resource '"
                + STOPWORDS_RESOURCE
                + "' not found relative to "
                + CmsGallerySearchAnalyzer.class.getName());
        }
        try {
            // Decode with an explicit charset: the list contains non-ASCII words, so relying on
            // the platform default charset would yield different stop words on different hosts.
            // NOTE(review): assumes the resource file is stored as UTF-8 - confirm against the file.
            return WordlistLoader.getWordSet(
                new BufferedReader(new InputStreamReader(stream, STOPWORDS_ENCODING)),
                "#",
                CmsSearchIndex.LUCENE_VERSION);
        } finally {
            // release the stream here, so this does not depend on the loader closing the reader
            stream.close();
        }
    }

    /**
     * Builds the token stream chain: standard tokenizer, standard filter, lower case filter
     * and finally a stop filter that uses the stop words of this analyzer.<p>
     * 
     * This is taken from the Lucene StandardAnalyzer, which is final since 3.1.<p>
     * 
     * @param fieldName the name of the field to analyze (not used by this implementation)
     * @param reader the reader providing the text to analyze
     * 
     * @return the tokenizer together with the filter chain built on top of it
     * 
     * @see org.apache.lucene.analysis.ReusableAnalyzerBase#createComponents(java.lang.String, java.io.Reader)
     */
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {

        final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
        src.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
        TokenStream tok = new StandardFilter(matchVersion, src);
        tok = new LowerCaseFilter(matchVersion, tok);
        tok = new StopFilter(matchVersion, tok, stopwords);
        return new TokenStreamComponents(src, tok) {

            @Override
            protected boolean reset(final Reader r) throws IOException {

                // re-apply the token length limit whenever the components are reset with a new reader
                src.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
                return super.reset(r);
            }
        };
    }
}