com.bigdata.search.DefaultAnalyzerFactory.java Source code

Introduction

Here is the source code for com.bigdata.search.DefaultAnalyzerFactory.java
Source

/**
    
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.
    
Contact:
 SYSTAP, LLC DBA Blazegraph
 2501 Calvert ST NW #106
 Washington, DC 20008
 licenses@blazegraph.com
    
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Dec 21, 2010
 */

package com.bigdata.search;

import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.util.Version;

import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;

/**
 * This is the default implementation but should be regarded as legacy since
 * it fails to use the correct {@link Analyzer} for almost all languages (other than
 * English). It uses the correct natural language analyzer only for literals tagged with
 * certain three letter ISO 639 codes:
 * "por", "deu", "ger", "zho", "chi", "jpn", "kor", "ces", "cze", "dut", "nld", "gre", "ell",
 * "fra", "fre", "rus" and "tha". All other tags are treated as English.
 * These codes do not work if they are used with subtags, e.g. "ger-AT" is treated as English.
 * No two letter code works correctly, other than "en" (and that only because
 * English is the fallback for every unmatched tag): note that the W3C and
 * IETF recommend the use of the two letter forms instead of the three letter forms.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @deprecated Using {@link ConfigurableAnalyzerFactory} with 
 *    the {@link ConfigurableAnalyzerFactory.Options#NATURAL_LANGUAGE_SUPPORT} 
 *    uses the appropriate natural language analyzers for the two letter codes
 *    and for tags which include sub-tags.
 * @version $Id$
 */
public class DefaultAnalyzerFactory implements IAnalyzerFactory {

    /**
     * The index whose key builder is consulted for the default locale when no
     * language code is given. Never <code>null</code>.
     */
    private final FullTextIndex fullTextIndex;

    /**
     * @param fullTextIndex
     *            The full text index on whose behalf analyzers are created.
     * 
     * @throws IllegalArgumentException
     *             if the argument is <code>null</code>.
     */
    public DefaultAnalyzerFactory(final FullTextIndex fullTextIndex) {

        if (fullTextIndex == null) {
            throw new IllegalArgumentException();
        }

        this.fullTextIndex = fullTextIndex;

    }

    /**
     * Return a new {@link Analyzer} for the given language tag.
     * <p>
     * Lookup rules (these are the legacy rules described in the class
     * documentation and are intentionally preserved as-is):
     * <ul>
     * <li>A tag of three or more characters is looked up verbatim (case
     * sensitive, no sub-tag stripping) in the registered analyzers. Since only
     * keys of at most three characters are registered, in practice only exact
     * three letter codes can match.</li>
     * <li>A tag shorter than three characters is never looked up.</li>
     * <li>A <code>null</code> tag is resolved through the language of the
     * locale configured on the index's key builder, when that key builder
     * reports Unicode support.</li>
     * <li>Anything unmatched gets the analyzer registered under the empty
     * string (English).</li>
     * </ul>
     * NOTE(review): the lookup is deliberately NOT "fixed" to honour two
     * letter codes or sub-tags. Doing so would change which analyzer a tag
     * resolves to, which presumably would make queries disagree with text
     * that was tokenized under the old mapping - confirm before changing.
     * 
     * @param languageCode
     *            The language tag, or <code>null</code> to use the language of
     *            the index's configured locale.
     * @param filterStopwords
     *            When <code>true</code> the analyzer is created in its default
     *            configuration; when <code>false</code> it is created with an
     *            empty stopword set (analyzers that do not accept a stopword
     *            set ignore this flag).
     * 
     * @return A new analyzer instance.
     * 
     * @throws IllegalStateException
     *             if no analyzer is registered under the empty string.
     */
    public Analyzer getAnalyzer(final String languageCode, final boolean filterStopwords) {

        // Obtained up front (even when it ends up unused) to keep the original
        // call order against the index.
        final IKeyBuilder keyBuilder = fullTextIndex.getKeyBuilder();

        final Map<String, AnalyzerConstructor> map = getAnalyzers();

        AnalyzerConstructor ctor = null;

        if (languageCode == null) {

            if (keyBuilder.isUnicodeSupported()) {

                // The configured locale for the database.
                final Locale locale = ((KeyBuilder) keyBuilder).getSortKeyGenerator().getLocale();

                // The analyzer for that locale's language. Note that the
                // language is subject to the same lookup rules as an explicit
                // tag, so a code shorter than three characters resolves to
                // the default analyzer.
                final Analyzer a = getAnalyzer(locale.getLanguage(), filterStopwords);

                if (a != null) {
                    return a;
                }

            }

            // fall through to the default analyzer.

        } else if (languageCode.length() >= 3) {

            /*
             * Only tags of three or more characters are looked up, and always
             * under the complete tag. Shorter tags, and longer tags which are
             * not registered verbatim, use the default analyzer.
             */
            ctor = map.get(languageCode);

        }

        if (ctor == null) {

            // request the default analyzer.
            ctor = map.get("");

            if (ctor == null) {
                throw new IllegalStateException("No entry for empty string?");
            }

        }

        return ctor.newInstance(filterStopwords);

    }

    /**
     * Factory for one kind of {@link Analyzer}. A single instance may be
     * registered under several language codes.
     */
    abstract private static class AnalyzerConstructor {

        /**
         * @param filterStopwords
         *            <code>false</code> to request an analyzer that does not
         *            filter stopwords (implementations may ignore this).
         * 
         * @return A new analyzer.
         */
        abstract public Analyzer newInstance(final boolean filterStopwords);

    }

    /**
     * A map containing instances of the various kinds of analyzers that we know
     * about. Lazily created by {@link #getAnalyzers()}.
     * <p>
     * Note: There MUST be an entry under the empty string (""). This entry will
     * be requested when there is no entry for the specified language code.
     */
    private Map<String, AnalyzerConstructor> analyzers;

    /**
     * Initializes (on first call) and returns the various kinds of analyzers
     * that we know about.
     * <p>
     * Note: Each {@link Analyzer} is registered under both the 3 letter and the
     * 2 letter language codes. See <a
     * href="http://www.loc.gov/standards/iso639-2/php/code_list.php">ISO 639-2</a>.
     * The 2 letter entries are registered but are never matched by
     * {@link #getAnalyzer(String, boolean)}, which only looks up tags having
     * at least three characters.
     * 
     * @todo get some informed advice on which {@link Analyzer}s map onto which
     *       language codes.
     * 
     * @todo thread safety? Analyzers produce token processors so maybe there is
     *       no problem here once things are initialized. If so, maybe this
     *       could be static.
     * 
     * @todo configuration. Could be configured by a file containing a class
     *       name and a list of codes that are handled by that class.
     * 
     * @todo strip language code down to 2/3 characters during lookup.
     * 
     * @todo There are a lot of pidgins based on french, english, and other
     *       languages that are not being assigned here.
     */
    synchronized private Map<String, AnalyzerConstructor> getAnalyzers() {

        if (analyzers != null) {
            return analyzers;
        }

        // Populate a local map and publish it only once it is complete.
        final Map<String, AnalyzerConstructor> registry = new HashMap<String, AnalyzerConstructor>();

        final Set<?> emptyStopwords = Collections.emptySet();

        // Portuguese
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new BrazilianAnalyzer(Version.LUCENE_CURRENT)
                        : new BrazilianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        }, "por", "pt");

        /*
         * Claims to handle Chinese. Does single character extraction. Claims to
         * produce smaller indices as a result.
         * 
         * Note: you can not tokenize with the Chinese analyzer and then do
         * search using the CJK analyzer and vice versa.
         * 
         * Note: I have no idea whether this would work for Japanese and Korean
         * as well. I expect so, but no real clue.
         * 
         * Note: the filterStopwords flag is ignored for this analyzer.
         */
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return new ChineseAnalyzer();
            }
        }, "zho", "chi", "zh");

        /*
         * Claims to handle Chinese, Japanese, Korean. Does double character
         * extraction with overlap. The Chinese codes are registered with the
         * ChineseAnalyzer above rather than here.
         */
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new CJKAnalyzer(Version.LUCENE_CURRENT)
                        : new CJKAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        }, "jpn", "ja", "kor", "ko");

        // Czech
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new CzechAnalyzer(Version.LUCENE_CURRENT)
                        : new CzechAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        }, "ces", "cze", "cs");

        // Dutch
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new DutchAnalyzer(Version.LUCENE_CURRENT)
                        : new DutchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        }, "dut", "nld", "nl");

        // French
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new FrenchAnalyzer(Version.LUCENE_CURRENT)
                        : new FrenchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        }, "fra", "fre", "fr");

        /*
         * German.
         * 
         * Note: There are a lot of language codes for German variants that
         * might be useful here.
         */
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new GermanAnalyzer(Version.LUCENE_CURRENT)
                        : new GermanAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        }, "deu", "ger", "de");

        // Greek. Note: ancient greek has a different code (grc).
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new GreekAnalyzer(Version.LUCENE_CURRENT)
                        : new GreekAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        }, "gre", "ell", "el");

        // Russian. @todo what about other Cyrillic scripts?
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new RussianAnalyzer(Version.LUCENE_CURRENT)
                        : new RussianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        }, "rus", "ru");

        // Thai. Note: the filterStopwords flag is ignored for this analyzer.
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return new ThaiAnalyzer(Version.LUCENE_CURRENT);
            }
        }, "tha", "th");

        /*
         * English.
         * 
         * Note: There MUST be an entry under the empty string (""). This entry
         * will be requested when there is no entry for the specified language
         * code.
         */
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new StandardAnalyzer(Version.LUCENE_CURRENT)
                        : new StandardAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        }, "eng", "en", "");

        analyzers = registry;

        return analyzers;

    }

    /**
     * Registers the same analyzer constructor under each of the given codes.
     * 
     * @param registry
     *            The map being populated.
     * @param ctor
     *            The constructor to register.
     * @param codes
     *            The language codes (map keys) under which it is registered.
     */
    private static void register(final Map<String, AnalyzerConstructor> registry,
            final AnalyzerConstructor ctor, final String... codes) {

        for (String code : codes) {
            registry.put(code, ctor);
        }

    }

}