org.apache.lucene.queryparser.simple.SimpleQueryParser.java Source code

Introduction

Here is the source code for org.apache.lucene.queryparser.simple.SimpleQueryParser.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.queryparser.simple;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

import java.util.Collections;
import java.util.Map;

/**
 * SimpleQueryParser is used to parse human readable query syntax.
 * <p>
 * The main idea behind this parser is that a person should be able to type
 * whatever they want to represent a query, and this parser will do its best
 * to interpret what to search for no matter how poorly composed the request
 * may be. Tokens are considered to be any of a term, phrase, or subquery for the
 * operations described below.  Whitespace including ' ' '\n' '\r' and '\t'
 * and certain operators may be used to delimit tokens ( ) + | " .
 * <p>
 * Any errors in query syntax will be ignored and the parser will attempt
 * to decipher what it can; however, this may mean odd or unexpected results.
 * <p>
 * <b>Query Operators</b>
 * <ul>
 *  <li>'{@code +}' specifies {@code AND} operation: <tt>token1+token2</tt>
 *  <li>'{@code |}' specifies {@code OR} operation: <tt>token1|token2</tt>
 *  <li>'{@code -}' negates a single token: <tt>-token0</tt>
 *  <li>'{@code "}' creates phrases of terms: <tt>"term1 term2 ..."</tt>
 *  <li>'{@code *}' at the end of terms specifies prefix query: <tt>term*</tt>
 *  <li>'{@code ~}N' at the end of terms specifies fuzzy query: <tt>term~1</tt>
 *  <li>'{@code ~}N' at the end of phrases specifies near query: <tt>"term1 term2"~5</tt>
 *  <li>'{@code (}' and '{@code )}' specifies precedence: <tt>token1 + (token2 | token3)</tt>
 * </ul>
 * <p>
 * The {@link #setDefaultOperator default operator} is {@code OR} if no other operator is specified.
 * For example, the following will {@code OR} {@code token1} and {@code token2} together:
 * <tt>token1 token2</tt>
 * <p>
 * Operators have no relative precedence; they are applied in simple order from left to right.
 * For example, the following will evaluate {@code token1 OR token2} first,
 * then {@code AND} with {@code token3}:
 * <blockquote>token1 | token2 + token3</blockquote>
 * <b>Escaping</b>
 * <p>
 * An individual term may contain any possible character with certain characters
 * requiring escaping using a '{@code \}'.  The following characters will need to be escaped in
 * terms and phrases:
 * {@code + | " ( ) ' \}
 * <p>
 * The '{@code -}' operator is a special case.  On individual terms (not phrases) the first
 * character of a term that is {@code -} must be escaped; however, any '{@code -}' characters
 * beyond the first character do not need to be escaped.
 * For example:
 * <ul>
 *   <li>{@code -term1}   -- Specifies {@code NOT} operation against {@code term1}
 *   <li>{@code \-term1}  -- Searches for the term {@code -term1}.
 *   <li>{@code term-1}   -- Searches for the term {@code term-1}.
 *   <li>{@code term\-1}  -- Searches for the term {@code term-1}.
 * </ul>
 * <p>
 * The '{@code *}' operator is a special case. On individual terms (not phrases) the last
 * character of a term that is '{@code *}' must be escaped; however, any '{@code *}' characters
 * before the last character do not need to be escaped:
 * <ul>
 *   <li>{@code term1*}  --  Searches for the prefix {@code term1}
 *   <li>{@code term1\*} --  Searches for the term {@code term1*}
 *   <li>{@code term*1}  --  Searches for the term {@code term*1}
 *   <li>{@code term\*1} --  Searches for the term {@code term*1}
 * </ul>
 * <p>
 * Note that above examples consider the terms before text processing.
 */
public class SimpleQueryParser extends QueryBuilder {
    /** Map of fields to query against with their weights */
    protected final Map<String, Float> weights;
    /** flags to the parser (to turn features on/off) */
    protected final int flags;

    /** Enables {@code AND} operator (+) */
    public static final int AND_OPERATOR = 1 << 0;
    /** Enables {@code NOT} operator (-) */
    public static final int NOT_OPERATOR = 1 << 1;
    /** Enables {@code OR} operator (|) */
    public static final int OR_OPERATOR = 1 << 2;
    /** Enables {@code PREFIX} operator (*) */
    public static final int PREFIX_OPERATOR = 1 << 3;
    /** Enables {@code PHRASE} operator (") */
    public static final int PHRASE_OPERATOR = 1 << 4;
    /** Enables {@code PRECEDENCE} operators: {@code (} and {@code )} */
    public static final int PRECEDENCE_OPERATORS = 1 << 5;
    /** Enables {@code ESCAPE} operator (\) */
    public static final int ESCAPE_OPERATOR = 1 << 6;
    /** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */
    public static final int WHITESPACE_OPERATOR = 1 << 7;
    /** Enables {@code FUZZY} operators: (~) on single terms */
    public static final int FUZZY_OPERATOR = 1 << 8;
    /** Enables {@code NEAR} operators: (~) on phrases */
    public static final int NEAR_OPERATOR = 1 << 9;

    /** Operator applied between two tokens when none is given explicitly. */
    private BooleanClause.Occur defaultOperator = BooleanClause.Occur.SHOULD;

    /** Creates a new parser searching over a single field. */
    public SimpleQueryParser(Analyzer analyzer, String field) {
        this(analyzer, Collections.singletonMap(field, 1.0F));
    }

    /** Creates a new parser searching over multiple fields with different weights. */
    public SimpleQueryParser(Analyzer analyzer, Map<String, Float> weights) {
        // -1 has every bit set, so every operator is enabled
        this(analyzer, weights, -1);
    }

    /** Creates a new parser with custom flags used to enable/disable certain features. */
    public SimpleQueryParser(Analyzer analyzer, Map<String, Float> weights, int flags) {
        super(analyzer);
        this.weights = weights;
        this.flags = flags;
    }

    /**
     * Parses the query text and returns parsed query.
     *
     * @param queryText the raw query string; must not be {@code null}
     * @return a {@link MatchAllDocsQuery} when the trimmed text is exactly {@code *},
     *         a {@link MatchNoDocsQuery} when no clause could be built from the text,
     *         otherwise the root of the parsed query tree
     */
    public Query parse(String queryText) {
        if ("*".equals(queryText.trim())) {
            return new MatchAllDocsQuery();
        }

        // the buffer is shared by all (sub)states; it can never need more room than the input
        final char[] data = queryText.toCharArray();
        final char[] buffer = new char[data.length];

        final State state = new State(data, buffer, 0, data.length);
        parseSubQuery(state);
        if (state.top == null) {
            return new MatchNoDocsQuery("empty string passed to query parser");
        }
        return state.top;
    }

    /** Returns true if the given operator flag is set in {@link #flags}. */
    private boolean isEnabled(int operator) {
        return (flags & operator) != 0;
    }

    /** Returns true for the four characters treated as whitespace delimiters. */
    private static boolean isWhitespace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }

    /**
     * Main dispatch loop: walks the region described by {@code state} and hands each
     * character either to an operator handler or to one of the consume methods.
     */
    private void parseSubQuery(State state) {
        while (state.index < state.length) {
            final char c = state.data[state.index];

            if (c == '(' && isEnabled(PRECEDENCE_OPERATORS)) {
                // the beginning of a subquery has been found
                consumeSubQuery(state);
            } else if (c == ')' && isEnabled(PRECEDENCE_OPERATORS)) {
                // this is an extraneous character so it is ignored
                ++state.index;
            } else if (c == '"' && isEnabled(PHRASE_OPERATOR)) {
                // the beginning of a phrase has been found
                consumePhrase(state);
            } else if (c == '+' && isEnabled(AND_OPERATOR)) {
                // an and operation has been explicitly set
                // if an operation has already been set this one is ignored
                // if a term (or phrase or subquery) has not been found yet the
                // operation is also ignored since there is no previous
                // term (or phrase or subquery) to and with
                if (state.currentOperation == null && state.top != null) {
                    state.currentOperation = BooleanClause.Occur.MUST;
                }

                ++state.index;
            } else if (c == '|' && isEnabled(OR_OPERATOR)) {
                // an or operation has been explicitly set
                // if an operation has already been set this one is ignored
                // if a term (or phrase or subquery) has not been found yet the
                // operation is also ignored since there is no previous
                // term (or phrase or subquery) to or with
                if (state.currentOperation == null && state.top != null) {
                    state.currentOperation = BooleanClause.Occur.SHOULD;
                }

                ++state.index;
            } else if (c == '-' && isEnabled(NOT_OPERATOR)) {
                // a not operator has been found, so increase the not count
                // two not operators in a row negate each other
                ++state.not;
                ++state.index;

                // continue so the not operator is not reset
                // before the next character is determined
                continue;
            } else if (isWhitespace(c) && isEnabled(WHITESPACE_OPERATOR)) {
                // ignore any whitespace found as it may have already been
                // used a delimiter across a term (or phrase or subquery)
                // or is simply extraneous
                ++state.index;
            } else {
                // the beginning of a token has been found
                consumeToken(state);
            }

            // reset the not operator as even whitespace is not allowed when
            // specifying the not operation for a term (or phrase or subquery)
            state.not = 0;
        }
    }

    /**
     * Consumes a parenthesised subquery starting at the opening parenthesis under
     * {@code state.index}, parses it recursively and adds the result to the query tree.
     */
    private void consumeSubQuery(State state) {
        assert isEnabled(PRECEDENCE_OPERATORS);
        final int start = ++state.index;
        int precedence = 1;
        boolean escaped = false;

        while (state.index < state.length) {
            if (!escaped) {
                final char c = state.data[state.index];
                if (c == '\\' && isEnabled(ESCAPE_OPERATOR)) {
                    // an escape character has been found so
                    // whatever character is next will become
                    // part of the subquery unless the escape
                    // character is the last one in the data
                    escaped = true;
                    ++state.index;

                    continue;
                } else if (c == '(') {
                    // increase the precedence as there is a
                    // subquery in the current subquery
                    ++precedence;
                } else if (c == ')') {
                    --precedence;

                    if (precedence == 0) {
                        // this should be the end of the subquery
                        // all characters found will be used for
                        // creating the subquery
                        break;
                    }
                }
            }

            escaped = false;
            ++state.index;
        }

        if (state.index == state.length) {
            // a closing parenthesis was never found so the opening
            // parenthesis is considered extraneous and will be ignored
            state.index = start;
        } else if (state.index == start) {
            // a closing parenthesis was found immediately after the opening
            // parenthesis so the current operation is reset since it would
            // have been applied to this subquery
            state.currentOperation = null;

            ++state.index;
        } else {
            // a complete subquery has been found and is recursively parsed by
            // starting over with a new state object bounded by the closing parenthesis
            final State subState = new State(state.data, state.buffer, start, state.index);
            parseSubQuery(subState);
            buildQueryTree(state, subState.top);

            // step over the closing parenthesis
            ++state.index;
        }
    }

    /**
     * Consumes a double-quoted phrase starting at the opening quote under
     * {@code state.index}, including an optional {@code ~N} slop suffix when
     * {@link #NEAR_OPERATOR} is enabled, and adds the phrase query to the query tree.
     */
    private void consumePhrase(State state) {
        assert isEnabled(PHRASE_OPERATOR);
        final int start = ++state.index;
        int copied = 0;
        boolean escaped = false;
        boolean hasSlop = false;

        while (state.index < state.length) {
            if (!escaped) {
                final char c = state.data[state.index];
                if (c == '\\' && isEnabled(ESCAPE_OPERATOR)) {
                    // an escape character has been found so
                    // whatever character is next will become
                    // part of the phrase unless the escape
                    // character is the last one in the data
                    escaped = true;
                    ++state.index;

                    continue;
                } else if (c == '"') {
                    // this is the end of the phrase; if a tilde directly follows the
                    // closing quote, move onto it so the slop can be parsed below
                    if (state.length > (state.index + 1) && state.data[state.index + 1] == '~'
                            && isEnabled(NEAR_OPERATOR)) {
                        state.index++;
                        // a slop value is only looked for when something follows the tilde
                        if (state.length > (state.index + 1)) {
                            hasSlop = true;
                        }
                    }
                    break;
                }
            }

            escaped = false;
            state.buffer[copied++] = state.data[state.index++];
        }

        if (state.index == state.length) {
            // a closing double quote was never found so the opening
            // double quote is considered extraneous and will be ignored
            state.index = start;
        } else if (state.index == start) {
            // a closing double quote was found immediately after the opening
            // double quote so the current operation is reset since it would
            // have been applied to this phrase
            state.currentOperation = null;

            ++state.index;
        } else {
            // a complete phrase has been found and is parsed
            // through the analyzer from the given field
            final String phrase = new String(state.buffer, 0, copied);
            if (hasSlop) {
                // parseFuzziness consumes the '~' and the slop text and leaves the index on
                // the first unconsumed character (a delimiter or the end of the input).
                // That character must not be skipped here, otherwise an operator written
                // directly after the slop (e.g. "a b"~2+c) would be lost.
                buildQueryTree(state, newPhraseQuery(phrase, parseFuzziness(state)));
            } else {
                buildQueryTree(state, newPhraseQuery(phrase, 0));

                // step over the closing double quote (or a '~' that ends the input)
                ++state.index;
            }
        }
    }

    /**
     * Consumes a single term starting at {@code state.index} and adds a default, prefix
     * or fuzzy query for it to the query tree, depending on how the term ends.
     */
    private void consumeToken(State state) {
        int copied = 0;
        boolean escaped = false;
        boolean prefix = false;
        boolean fuzzy = false;

        while (state.index < state.length) {
            if (!escaped) {
                final char c = state.data[state.index];
                if (c == '\\' && isEnabled(ESCAPE_OPERATOR)) {
                    // an escape character has been found so
                    // whatever character is next will become
                    // part of the term unless the escape
                    // character is the last one in the data
                    escaped = true;
                    prefix = false;
                    ++state.index;

                    continue;
                } else if (tokenFinished(state)) {
                    // this should be the end of the term
                    // all characters found will be used for
                    // creating the term query
                    break;
                } else if (copied > 0 && c == '~' && isEnabled(FUZZY_OPERATOR)) {
                    // a tilde after at least one character starts the fuzziness suffix
                    fuzzy = true;
                    break;
                }

                // prefix tracks whether or not the last character
                // was a '*' operator that hasn't been escaped
                // there must be at least one valid character before
                // searching for a prefixed set of terms
                prefix = copied > 0 && c == '*' && isEnabled(PREFIX_OPERATOR);
            }

            escaped = false;
            state.buffer[copied++] = state.data[state.index++];
        }

        if (copied == 0) {
            return;
        }

        final Query branch;
        if (fuzzy) {
            // fuzzy is only ever set when FUZZY_OPERATOR is enabled
            final String token = new String(state.buffer, 0, copied);
            // edit distance has a maximum, limit to the maximum supported
            final int fuzziness =
                    Math.min(parseFuzziness(state), LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
            branch = fuzziness == 0 ? newDefaultQuery(token) : newFuzzyQuery(token, fuzziness);
        } else if (prefix) {
            // if a term is found with a closing '*' it is considered to be a prefix query;
            // the trailing '*' itself is not part of the prefix
            branch = newPrefixQuery(new String(state.buffer, 0, copied - 1));
        } else {
            // a standard term has been found so it will be run through
            // the entire analysis chain from the specified schema field
            branch = newDefaultQuery(new String(state.buffer, 0, copied));
        }

        buildQueryTree(state, branch);
    }

    /**
     * Returns a new boolean query holding every clause of {@code bq} (and its
     * minimum-should-match setting) followed by {@code query} with the given occur.
     */
    private static BooleanQuery addClause(BooleanQuery bq, Query query, BooleanClause.Occur occur) {
        BooleanQuery.Builder newBq = new BooleanQuery.Builder();
        newBq.setMinimumNumberShouldMatch(bq.getMinimumNumberShouldMatch());
        for (BooleanClause clause : bq) {
            newBq.add(clause);
        }
        newBq.add(query, occur);
        return newBq.build();
    }

    // buildQueryTree should be called after a term, phrase, or subquery
    // is consumed to be added to our existing query tree
    // this method will only add to the existing tree if the given branch is not null
    private void buildQueryTree(State state, Query branch) {
        if (branch == null) {
            // nothing to add; note that any pending operation is left in place
            return;
        }

        // modify our branch to a BooleanQuery wrapper for not
        // this is necessary any time a term, phrase, or subquery is negated
        if (state.not % 2 == 1) {
            BooleanQuery.Builder nq = new BooleanQuery.Builder();
            nq.add(branch, BooleanClause.Occur.MUST_NOT);
            nq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
            branch = nq.build();
        }

        if (state.top == null) {
            // first term (or phrase or subquery) found and will begin our query tree
            state.top = branch;
        } else {
            // more than one term (or phrase or subquery) found
            // set currentOperation to the default if no other operation is explicitly set
            if (state.currentOperation == null) {
                state.currentOperation = defaultOperator;
            }

            // operational change requiring a new parent node
            // this occurs if the previous operation is not the same as current operation
            // because the previous operation must be evaluated separately to preserve
            // the proper precedence and the current operation will take over as the top of the tree
            // (previousOperation starts out null, so the first pair always takes this path and
            // state.top is a BooleanQuery from then on, which makes the cast below safe)
            if (state.previousOperation != state.currentOperation) {
                BooleanQuery.Builder bq = new BooleanQuery.Builder();
                bq.add(state.top, state.currentOperation);
                state.top = bq.build();
            }

            // append the new branch to the top-level boolean query
            state.top = addClause((BooleanQuery) state.top, branch, state.currentOperation);
            state.previousOperation = state.currentOperation;
        }

        // the pending operation has been applied to this branch, so clear it
        state.currentOperation = null;
    }

    /**
     * Helper parsing fuzziness from parsing state.
     * <p>
     * Expects {@code state.index} to be on a '~'. On return the index is on the first
     * character that is not part of the slop/edit text (a token delimiter) or at the end
     * of the region being parsed.
     *
     * @return slop/edit distance: 2 when nothing follows the '~', 0 when the text is not
     *         a parsable or is a negative integer, or when the index is not on a '~'
     */
    private int parseFuzziness(State state) {
        if (state.data[state.index] != '~') {
            return 0;
        }

        // skip the tilde, then scan up to the next delimiter or the end of the region;
        // the text is read straight out of the query data, no scratch array is needed
        final int start = ++state.index;
        while (state.index < state.length && !tokenFinished(state)) {
            ++state.index;
        }

        final int slopLength = state.index - start;
        if (slopLength == 0) {
            // Use automatic fuzziness, ~2
            return 2;
        }

        try {
            // negative -> 0
            return Math.max(0, Integer.parseInt(new String(state.data, start, slopLength)));
        } catch (NumberFormatException ignored) {
            // an unparsable value deliberately falls back to "no fuzziness"
            return 0;
        }
    }

    /**
     * Helper returning true if the state has reached the end of token.
     */
    private boolean tokenFinished(State state) {
        final char c = state.data[state.index];
        return (c == '"' && isEnabled(PHRASE_OPERATOR))
                || (c == '|' && isEnabled(OR_OPERATOR))
                || (c == '+' && isEnabled(AND_OPERATOR))
                || ((c == '(' || c == ')') && isEnabled(PRECEDENCE_OPERATORS))
                || (isWhitespace(c) && isEnabled(WHITESPACE_OPERATOR));
    }

    /** Wraps the query in a {@link BoostQuery} unless the boost is exactly 1. */
    private static Query applyBoost(Query query, float boost) {
        return boost == 1f ? query : new BoostQuery(query, boost);
    }

    /**
     * Factory method to generate a standard query (no phrase or prefix operators).
     *
     * @return the per-field queries combined as {@code SHOULD} clauses and passed through
     *         {@link #simplify}; {@code null} if no field produced a query
     */
    protected Query newDefaultQuery(String text) {
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        for (Map.Entry<String, Float> entry : weights.entrySet()) {
            Query q = createBooleanQuery(entry.getKey(), text, defaultOperator);
            if (q != null) {
                bq.add(applyBoost(q, entry.getValue()), BooleanClause.Occur.SHOULD);
            }
        }
        return simplify(bq.build());
    }

    /**
     * Factory method to generate a fuzzy query.
     * The text is normalized by the analyzer per field, not tokenized.
     */
    protected Query newFuzzyQuery(String text, int fuzziness) {
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        for (Map.Entry<String, Float> entry : weights.entrySet()) {
            final String fieldName = entry.getKey();
            final BytesRef term = getAnalyzer().normalize(fieldName, text);
            final Query q = new FuzzyQuery(new Term(fieldName, term), fuzziness);
            bq.add(applyBoost(q, entry.getValue()), BooleanClause.Occur.SHOULD);
        }
        return simplify(bq.build());
    }

    /**
     * Factory method to generate a phrase query with slop.
     *
     * @return the per-field queries combined as {@code SHOULD} clauses and passed through
     *         {@link #simplify}; {@code null} if no field produced a query
     */
    protected Query newPhraseQuery(String text, int slop) {
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        for (Map.Entry<String, Float> entry : weights.entrySet()) {
            Query q = createPhraseQuery(entry.getKey(), text, slop);
            if (q != null) {
                bq.add(applyBoost(q, entry.getValue()), BooleanClause.Occur.SHOULD);
            }
        }
        return simplify(bq.build());
    }

    /**
     * Factory method to generate a prefix query.
     * The text is normalized by the analyzer per field, not tokenized.
     */
    protected Query newPrefixQuery(String text) {
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        for (Map.Entry<String, Float> entry : weights.entrySet()) {
            final String fieldName = entry.getKey();
            final BytesRef term = getAnalyzer().normalize(fieldName, text);
            final Query q = new PrefixQuery(new Term(fieldName, term));
            bq.add(applyBoost(q, entry.getValue()), BooleanClause.Occur.SHOULD);
        }
        return simplify(bq.build());
    }

    /**
     * Helper to simplify boolean queries with 0 or 1 clause.
     *
     * @return {@code null} for a query without clauses, the wrapped query when there is
     *         exactly one clause, otherwise {@code bq} itself
     */
    protected Query simplify(BooleanQuery bq) {
        if (bq.clauses().isEmpty()) {
            return null;
        } else if (bq.clauses().size() == 1) {
            return bq.clauses().iterator().next().getQuery();
        } else {
            return bq;
        }
    }

    /**
     * Returns the implicit operator setting, which will be
     * either {@code SHOULD} or {@code MUST}.
     */
    public BooleanClause.Occur getDefaultOperator() {
        return defaultOperator;
    }

    /**
     * Sets the implicit operator setting, which must be
     * either {@code SHOULD} or {@code MUST}.
     *
     * @throws IllegalArgumentException if any other operator is given
     */
    public void setDefaultOperator(BooleanClause.Occur operator) {
        if (operator != BooleanClause.Occur.SHOULD && operator != BooleanClause.Occur.MUST) {
            throw new IllegalArgumentException("invalid operator: only SHOULD or MUST are allowed");
        }
        this.defaultOperator = operator;
    }

    /** Mutable cursor and partial result for one (sub)query being parsed. */
    static class State {
        final char[] data; // the characters in the query string
        final char[] buffer; // a temporary buffer used to reduce necessary allocations
        int index; // position of the next character to examine in data
        int length; // exclusive end of the region of data being parsed

        BooleanClause.Occur currentOperation; // operation pending for the next branch, or null
        BooleanClause.Occur previousOperation; // operation used for the last branch added
        int not; // number of consecutive '-' operators seen before the current token

        Query top; // root of the query tree built so far, or null

        State(char[] data, char[] buffer, int index, int length) {
            this.data = data;
            this.buffer = buffer;
            this.index = index;
            this.length = length;
        }
    }
}