com.senseidb.plugin.analyzer.LucenePatternAnalyzerFactory.java Source code

Introduction

Here is the source code for com.senseidb.plugin.analyzer.LucenePatternAnalyzerFactory.java
Source

/**
 * This software is licensed to you under the Apache License, Version 2.0 (the
 * "Apache License").
 *
 * LinkedIn's contributions are made under the Apache License. If you contribute
 * to the Software, the contributions will be deemed to have been made under the
 * Apache License, unless you expressly indicate otherwise. Please do not make any
 * contributions that would be inconsistent with the Apache License.
 *
 * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, this software
 * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
 * License for the specific language governing permissions and limitations for the
 * software governed under the Apache License.
 *
 * © 2012 LinkedIn Corp. All Rights Reserved.
 */
package com.senseidb.plugin.analyzer;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.collections.MapUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
import org.apache.lucene.util.Version;

import com.senseidb.plugin.SenseiPluginFactory;
import com.senseidb.plugin.SenseiPluginRegistry;

/**
 * A {@code SenseiPluginFactory} that instantiates a {@link PatternAnalyzer}. It can be configured through
 * the following properties:
 *
 * <ul>
 *     <li><code>matchVersion</code>: the Lucene version compatibility (default: LUCENE_35). See {@link Version}.
 *     <li><code>pattern</code>: a regular expression delimiting tokens (default: "<code>\\s+</code>")
 *     <li><code>toLowerCase</code>: if true, tokens are converted to lower case (default: true)
 *     <li><code>stopWords</code>: a comma-separated list of stop words (defaults to an empty list).
 *         Whitespace around each entry is ignored and blank entries are dropped.
 * </ul>
 *
 * Example configuration:
 *
 * <pre>
 * sensei.index.analyzer.class=com.senseidb.plugin.analyzer.LucenePatternAnalyzerFactory
 * sensei.index.analyzer.pattern=[ -_./:]
 * </pre>
 *
 * @author jgrande
 *
 */
public class LucenePatternAnalyzerFactory implements SenseiPluginFactory<Analyzer> {

    /** Separator between <code>stopWords</code> entries: a comma with optional surrounding whitespace. */
    private static final Pattern STOP_WORD_SEPARATOR = Pattern.compile("\\s*,\\s*");

    /**
     * Builds a {@link PatternAnalyzer} from the given configuration properties.
     *
     * @param initProperties configuration properties; the keys read are <code>matchVersion</code>,
     *                       <code>pattern</code>, <code>toLowerCase</code> and <code>stopWords</code>.
     *                       Missing keys fall back to the defaults listed in the class documentation.
     * @param fullPrefix     not used by this factory
     * @param pluginRegistry not used by this factory
     * @return a new analyzer configured from {@code initProperties}
     * @throws IllegalArgumentException if <code>matchVersion</code> does not name a {@link Version} constant
     * @throws java.util.regex.PatternSyntaxException if <code>pattern</code> is not a valid regular expression
     */
    @Override
    public Analyzer getBean(Map<String, String> initProperties, String fullPrefix,
            SenseiPluginRegistry pluginRegistry) {
        Version matchVersion = Version.valueOf(MapUtils.getString(initProperties, "matchVersion", "LUCENE_35"));
        Pattern pattern = Pattern.compile(MapUtils.getString(initProperties, "pattern", "\\s+"));
        boolean toLowerCase = MapUtils.getBoolean(initProperties, "toLowerCase", true);
        Set<String> stopWords = parseStopWords(MapUtils.getString(initProperties, "stopWords", ""));
        return new PatternAnalyzer(matchVersion, pattern, toLowerCase, stopWords);
    }

    /**
     * Splits a comma-separated stop word list into a set.
     *
     * Splitting an empty string yields a single empty token, and leading or doubled commas
     * yield empty tokens as well; those are skipped so that an unset or blank property
     * produces a truly empty set rather than a set containing {@code ""}.
     *
     * @param stopWordsStr the raw property value; may be {@code null} or empty
     * @return a mutable set of the non-empty, whitespace-trimmed entries
     */
    private static Set<String> parseStopWords(String stopWordsStr) {
        Set<String> stopWords = new HashSet<String>();
        if (stopWordsStr == null) {
            return stopWords;
        }
        // Trim first: the separator pattern only strips whitespace adjacent to commas,
        // not whitespace at the very beginning or end of the value.
        for (String word : STOP_WORD_SEPARATOR.split(stopWordsStr.trim())) {
            if (word.length() > 0) {
                stopWords.add(word);
            }
        }
        return stopWords;
    }

}