de.jetwick.es.JetwickAnalyzer.java Source code

Java tutorial

Introduction

Here is the source code for de.jetwick.es.JetwickAnalyzer.java

Source

/*
 * Copyright 2011 Peter Karich, jetwick_@_pannous_._info.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.jetwick.es;

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

/**
 *
 * @author Peter Karich, jetwick_@_pannous_._info
 */
public class JetwickAnalyzer extends ReusableAnalyzerBase {

    /** 
     * Default maximum allowed token length 
     */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
    /**
     * Specifies whether deprecated acronyms should be replaced with HOST type.
     * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
     */
    private final boolean replaceInvalidAcronym;
    protected final Version matchVersion;
    private CharArraySet protectedWords = null;
    private int generateWordParts = 1;
    private int generateNumberParts = 1;
    private int catenateWords = 0;
    private int catenateNumbers = 0;
    private int catenateAll = 0;
    private int splitOnCaseChange = 0;
    private int splitOnNumerics = 1;
    private int preserveOriginal = 1;
    private int stemEnglishPossessive = 0;
    private String handleAsChar = "";
    private String handleAsDigit = "@#$?";

    public JetwickAnalyzer() {
        matchVersion = Version.LUCENE_31;
        replaceInvalidAcronym = true;
    }

    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
        final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
        src.setMaxTokenLength(maxTokenLength);
        src.setReplaceInvalidAcronym(replaceInvalidAcronym);
        TokenStream tok = JetwickFilterFactory.myCreate(src, handleAsChar, handleAsDigit, generateWordParts,
                generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange,
                preserveOriginal, splitOnNumerics, stemEnglishPossessive, protectedWords);
        tok = new LowerCaseFilter(matchVersion, tok);
        return new TokenStreamComponents(src, tok) {

            @Override
            protected boolean reset(final Reader reader) throws IOException {
                src.setMaxTokenLength(JetwickAnalyzer.this.maxTokenLength);
                return super.reset(reader);
            }
        };
    }
}