org.openedit.data.lucene.AnalyzingQueryParserWithStop.java Source code

Introduction

Here is the source code for org.openedit.data.lucene.AnalyzingQueryParserWithStop.java
Source

package org.openedit.data.lucene;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

/**
 * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
 * are also passed through the given analyzer, but wildcard characters <code>*</code> and
 * <code>?</code> don't get removed from the search terms.
 * 
 * <p><b>Warning:</b> This class should only be used with analyzers that do not use stopwords
 * or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer 
 * will turn <code>H&auml;user</code> into <code>hau</code>, but <code>H?user</code> will 
 * become <code>h?user</code> when using this parser and thus no match would be found (i.e.
 * using this parser will be no improvement over QueryParser in such cases). 
 */
public class AnalyzingQueryParserWithStop extends org.apache.lucene.queryparser.classic.QueryParser {
    // gobble escaped chars or find a wildcard character 
    private final Pattern wildcardPattern = Pattern.compile("(\\.)|([?*]+)");

    /**
     * @deprecated Use {@link #AnalyzingQueryParser(String, Analyzer)}
     */
    @Deprecated
    public AnalyzingQueryParserWithStop(Version matchVersion, String field, Analyzer analyzer) {
        super(matchVersion, field, analyzer);
        setAnalyzeRangeTerms(true);
    }

    public AnalyzingQueryParserWithStop(String field, Analyzer analyzer) {
        this(Version.LUCENE_46, field, analyzer);
    }

    @Override
    protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException {
        return super.getFuzzyQuery(field, termStr, minSimilarity);
    }

    protected Query XXgetWildcardQuery(String field, String termStr) throws ParseException {
        return super.getWildcardQuery(field, termStr);
    }

    /**
     * Called when parser parses an input term that contains one or more wildcard
     * characters (like <code>*</code>), but is not a prefix term (one that has
     * just a single <code>*</code> character at the end).
     * <p>
     * Example: will be called for <code>H?user</code> or for <code>H*user</code>.
     * <p>
     * Depending on analyzer and settings, a wildcard term may (most probably will)
     * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
     * <p>
     * Overrides super class, by passing terms through analyzer.
     *
     * @param  field   Name of the field query will use.
     * @param  termStr Term that contains one or more wildcard
     *                 characters (? or *), but is not simple prefix term
     *
     * @return Resulting {@link Query} built for the term
     */

    protected Query getWildcardQuery(String field, String termStr) throws ParseException {

        if (termStr == null) {
            //can't imagine this would ever happen
            throw new ParseException("Passed null value as term to getWildcardQuery");
        }
        if (!getAllowLeadingWildcard() && (termStr.startsWith("*") || termStr.startsWith("?"))) {
            throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"
                    + " unless getAllowLeadingWildcard() returns true");
        }

        Matcher wildcardMatcher = wildcardPattern.matcher(termStr);
        StringBuilder sb = new StringBuilder();
        int last = 0;

        while (wildcardMatcher.find()) {
            // continue if escaped char
            if (wildcardMatcher.group(1) != null) {
                continue;
            }

            if (wildcardMatcher.start() > 0) {
                String chunk = termStr.substring(last, wildcardMatcher.start());
                String analyzed = analyzeSingleChunk(field, termStr, chunk);
                if (analyzed == null) {
                    continue;
                }
                sb.append(analyzed);
            }
            //append the wildcard character
            sb.append(wildcardMatcher.group(2));

            last = wildcardMatcher.end();
        }
        if (last < termStr.length()) {
            String analyzed = analyzeSingleChunk(field, termStr, termStr.substring(last));
            if (analyzed != null) {
                sb.append(analyzed);
            }
        }
        return super.getWildcardQuery(field, sb.toString());
    }

    /**
     * Called when parser parses an input term
     * that uses prefix notation; that is, contains a single '*' wildcard
     * character as its last character. Since this is a special case
     * of generic wildcard term, and such a query can be optimized easily,
     * this usually results in a different query object.
     * <p>
     * Depending on analyzer and settings, a prefix term may (most probably will)
     * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
     * <p>
     * Overrides super class, by passing terms through analyzer.
     *
     * @param  field   Name of the field query will use.
     * @param  termStr Term to use for building term for the query
     *                 (<b>without</b> trailing '*' character!)
     *
     * @return Resulting {@link Query} built for the term
     */
    @Override
    protected Query getPrefixQuery(String field, String termStr) throws ParseException {
        String analyzed = analyzeSingleChunk(field, termStr, termStr);
        if (analyzed == null) {
            //was a stop word do *
            analyzed = termStr;
        }
        return super.getPrefixQuery(field, analyzed);
    }

    /**
     * Called when parser parses an input term that has the fuzzy suffix (~) appended.
     * <p>
     * Depending on analyzer and settings, a fuzzy term may (most probably will)
     * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
     * <p>
     * Overrides super class, by passing terms through analyzer.
     *
     * @param field Name of the field query will use.
     * @param termStr Term to use for building term for the query
     *
     * @return Resulting {@link Query} built for the term
     */
    /*
    protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
        throws ParseException {
         
      String analyzed = analyzeSingleChunk(field, termStr, termStr);
      return super.getFuzzyQuery(field, analyzed, minSimilarity);
    }
    */

    /**
     * Returns the analyzed form for the given chunk
     * 
     * If the analyzer produces more than one output token from the given chunk,
     * a ParseException is thrown.
     *
     * @param field The target field
     * @param termStr The full term from which the given chunk is excerpted
     * @param chunk The portion of the given termStr to be analyzed
     * @return The result of analyzing the given chunk
     * @throws ParseException when analysis returns other than one output token
     */
    protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException {
        String analyzed = null;
        TokenStream stream = null;
        try {
            stream = getAnalyzer().tokenStream(field, chunk);
            stream.reset();
            CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
            // get first and hopefully only output token
            if (stream.incrementToken()) {
                analyzed = termAtt.toString();

                // try to increment again, there should only be one output token
                StringBuilder multipleOutputs = null;
                while (stream.incrementToken()) {
                    if (null == multipleOutputs) {
                        multipleOutputs = new StringBuilder();
                        multipleOutputs.append('"');
                        multipleOutputs.append(analyzed);
                        multipleOutputs.append('"');
                    }
                    multipleOutputs.append(',');
                    multipleOutputs.append('"');
                    multipleOutputs.append(termAtt.toString());
                    multipleOutputs.append('"');
                }
                stream.end();
                if (null != multipleOutputs) {
                    throw new ParseException(String.format(getLocale(),
                            "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
                }
            } else {
                // nothing returned by analyzer.  Was it a stop word and the user accidentally
                // used an analyzer with stop words?
                stream.end();
                //Need to just ignore this
                return null;
                //throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
            }
        } catch (IOException e) {
            throw new ParseException(
                    String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
        } finally {
            IOUtils.closeWhileHandlingException(stream);
        }
        return analyzed;
    }
}