Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Adapted from Lucene 4.2 version to produce TextPattern instead of Query. * * Changed (blocks of) lines are marked with "// BL", except for lines where * the only change is to replace "Query" with "TextPattern". */ package org.allenai.blacklab.queryParser.lucene; // BL: changed package import java.io.IOException; import java.io.StringReader; import java.text.DateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.TimeZone; import org.allenai.blacklab.queryParser.lucene.helpers.TPMatchAllDocs; import org.allenai.blacklab.queryParser.lucene.helpers.TPMultiPhrase; import org.allenai.blacklab.queryParser.lucene.helpers.TPPhrase; import org.allenai.blacklab.queryParser.lucene.helpers.TPTermRange; import org.allenai.blacklab.search.TPBooleanClause; import org.allenai.blacklab.search.TextPattern; import org.allenai.blacklab.search.TextPatternBoolean; import org.allenai.blacklab.search.TextPatternFuzzy; import org.allenai.blacklab.search.TextPatternPrefix; import org.allenai.blacklab.search.TextPatternRegex; import 
org.allenai.blacklab.search.TextPatternTerm; import org.allenai.blacklab.search.TextPatternWildcard; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.QueryParser.Operator; import org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery.TooManyClauses; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; // BL imports /** This class is overridden by QueryParser in QueryParser.jj * and acts to separate the majority of the Java code from the .jj grammar file. */ @SuppressWarnings({ "all" }) // BL: "resource" because of possible resource leak public abstract class QueryParserBase implements CommonQueryParserConfiguration { /** Do not catch this exception in your code, it means you are using methods that you should no longer use. 
*/
    public static class MethodRemovedUseAnother extends Throwable { }

    // Conjunction codes: the "conj" argument of addClause(), i.e. how a clause
    // was joined to the clause before it.
    static final int CONJ_NONE = 0;
    static final int CONJ_AND = 1;
    static final int CONJ_OR = 2;

    // Modifier codes: the "mods" argument of addClause(). MOD_NOT marks a
    // prohibited clause, MOD_REQ a required one.
    static final int MOD_NONE = 0;
    static final int MOD_NOT = 10;
    static final int MOD_REQ = 11;

    // make it possible to call setDefaultOperator() without accessing
    // the nested class:
    /** Alternative form of QueryParser.Operator.AND */
    public static final Operator AND_OPERATOR = Operator.AND;
    /** Alternative form of QueryParser.Operator.OR */
    public static final Operator OR_OPERATOR = Operator.OR;

    /** The actual operator that parser uses to combine query terms */
    Operator operator = OR_OPERATOR;

    // Whether terms of wildcard, prefix, fuzzy and range queries are lower-cased
    // (see setLowercaseExpandedTerms()).
    boolean lowercaseExpandedTerms = true;
    // Only stored and returned by the accessors here; the new*Query() factories
    // have their setRewriteMethod() calls commented out ("BL: disabled").
    MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
    // When false, getWildcardQuery()/getPrefixQuery() reject terms starting with '*' (or '?').
    boolean allowLeadingWildcard = false;
    // When true, phrase patterns built by newFieldQuery() carry explicit token positions.
    boolean enablePositionIncrements = true;

    // Analyzer and default field; both are assigned by init().
    Analyzer analyzer;
    String field;
    // Default slop applied to phrase patterns built by newFieldQuery().
    int phraseSlop = 0;
    float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
    int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
    // Locale used for lower-casing expanded terms and for parsing range dates.
    Locale locale = Locale.getDefault();
    // Time zone used when an inclusive range end is pushed to the end of its day.
    TimeZone timeZone = TimeZone.getDefault();

    // the default date resolution
    DateTools.Resolution dateResolution = null;
    // maps field names to date resolutions
    Map<String, DateTools.Resolution> fieldToDateResolution = null;

    //Whether or not to analyze range terms when constructing RangeQuerys
    // (For example, analyzing terms into collation keys for locale-sensitive RangeQuery)
    boolean analyzeRangeTerms = false;

    // Assigned by init() via setAutoGeneratePhraseQueries(); see that setter for the meaning.
    boolean autoGeneratePhraseQueries;

    // So the generated QueryParser(CharStream) won't error out
    protected QueryParserBase() {
    }

    /** Initializes a query parser. Called by the QueryParser constructor
     * @param matchVersion Lucene version to match. See <a href="QueryParser.html#version">here</a>.
     * @param f the default field for query terms.
     * @param a used to find terms in the query text.
*/ public void init(Version matchVersion, String f, Analyzer a) { analyzer = a; field = f; if (matchVersion.onOrAfter(Version.LUCENE_31)) { setAutoGeneratePhraseQueries(false); } else { setAutoGeneratePhraseQueries(true); } } // the generated parser will create these in QueryParser public abstract void ReInit(CharStream stream); public abstract TextPattern TopLevelQuery(String field) throws ParseException; /** Parses a query string, returning a {@link TextPattern}. * @param query the query string to be parsed. * @throws ParseException if the parsing fails */ public TextPattern parse(String query) throws ParseException { ReInit(new FastCharStream(new StringReader(query))); try { // TopLevelQuery is a Query followed by the end-of-input (EOF) TextPattern res = TopLevelQuery(field); return res != null ? res : newBooleanQuery(false); } catch (ParseException tme) { // rethrow to include the original query: ParseException e = new ParseException("Cannot parse '" + query + "': " + tme.getMessage()); e.initCause(tme); throw e; } catch (TokenMgrError tme) { ParseException e = new ParseException("Cannot parse '" + query + "': " + tme.getMessage()); e.initCause(tme); throw e; } catch (BooleanQuery.TooManyClauses tmc) { ParseException e = new ParseException("Cannot parse '" + query + "': too many boolean clauses"); e.initCause(tmc); throw e; } } /** * @return Returns the analyzer. */ @Override public Analyzer getAnalyzer() { return analyzer; } /** * @return Returns the default field. */ public String getField() { return field; } /** * @see #setAutoGeneratePhraseQueries(boolean) */ public final boolean getAutoGeneratePhraseQueries() { return autoGeneratePhraseQueries; } /** * Set to true if phrase queries will be automatically generated * when the analyzer returns more than one term from whitespace * delimited text. * NOTE: this behavior may not be suitable for all languages. * <p> * Set to false if phrase queries should only be generated when * surrounded by double quotes. 
*/
    public final void setAutoGeneratePhraseQueries(boolean value) {
        this.autoGeneratePhraseQueries = value;
    }

    /**
     * Get the minimal similarity for fuzzy queries.
     *
     * @return the current minimum similarity
     */
    @Override
    public float getFuzzyMinSim() {
        return fuzzyMinSim;
    }

    /**
     * Set the minimum similarity for fuzzy queries.
     * The field starts out as {@link FuzzyQuery#defaultMinSimilarity}.
     *
     * @param fuzzyMinSim the minimum similarity to store
     */
    @Override
    public void setFuzzyMinSim(float fuzzyMinSim) {
        this.fuzzyMinSim = fuzzyMinSim;
    }

    /**
     * Get the prefix length for fuzzy queries.
     * @return Returns the fuzzyPrefixLength.
     */
    @Override
    public int getFuzzyPrefixLength() {
        return fuzzyPrefixLength;
    }

    /**
     * Set the prefix length for fuzzy queries.
     * The field starts out as {@link FuzzyQuery#defaultPrefixLength}.
     * @param fuzzyPrefixLength The fuzzyPrefixLength to set.
     */
    @Override
    public void setFuzzyPrefixLength(int fuzzyPrefixLength) {
        this.fuzzyPrefixLength = fuzzyPrefixLength;
    }

    /**
     * Sets the default slop for phrases. If zero, then exact phrase matches
     * are required. Default value is zero.
     *
     * @param phraseSlop the slop to apply to phrase patterns built by this parser
     */
    @Override
    public void setPhraseSlop(int phraseSlop) {
        this.phraseSlop = phraseSlop;
    }

    /**
     * Gets the default slop for phrases.
     *
     * @return the current default phrase slop
     */
    @Override
    public int getPhraseSlop() {
        return phraseSlop;
    }

    /**
     * Set to <code>true</code> to allow leading wildcard characters.
     * <p>
     * When set, <code>*</code> or <code>?</code> are allowed as
     * the first character of a PrefixQuery and WildcardQuery.
     * Note that this can produce very slow
     * queries on big indexes.
     * <p>
     * Default: false.
     *
     * @param allowLeadingWildcard true to accept terms that start with a wildcard
     */
    @Override
    public void setAllowLeadingWildcard(boolean allowLeadingWildcard) {
        this.allowLeadingWildcard = allowLeadingWildcard;
    }

    /**
     * @return whether leading wildcard characters are currently allowed
     * @see #setAllowLeadingWildcard(boolean)
     */
    @Override
    public boolean getAllowLeadingWildcard() {
        return allowLeadingWildcard;
    }

    /**
     * Set to <code>true</code> to enable position increments in result query.
     * <p>
     * When set, result phrase and multi-phrase queries will
     * be aware of position increments.
     * Useful when e.g. a StopFilter increases the position increment of
     * the token that follows an omitted token.
     * <p>
     * Default: true.
*/
    @Override
    public void setEnablePositionIncrements(boolean enable) {
        this.enablePositionIncrements = enable;
    }

    /**
     * @return whether position increments are currently enabled
     * @see #setEnablePositionIncrements(boolean)
     */
    @Override
    public boolean getEnablePositionIncrements() {
        return enablePositionIncrements;
    }

    /**
     * Sets the boolean operator of the QueryParser.
     * In default mode (<code>OR_OPERATOR</code>) terms without any modifiers
     * are considered optional: for example <code>capital of Hungary</code> is equal to
     * <code>capital OR of OR Hungary</code>.<br/>
     * In <code>AND_OPERATOR</code> mode terms are considered to be in conjunction: the
     * above mentioned query is parsed as <code>capital AND of AND Hungary</code>
     *
     * @param op the operator to use between terms that carry no explicit conjunction
     */
    public void setDefaultOperator(Operator op) {
        this.operator = op;
    }

    /**
     * Gets implicit operator setting, which will be either AND_OPERATOR
     * or OR_OPERATOR.
     *
     * @return the operator currently used for unmodified terms
     */
    public Operator getDefaultOperator() {
        return operator;
    }

    /**
     * Whether terms of wildcard, prefix, fuzzy and range queries are to be automatically
     * lower-cased or not.  Default is <code>true</code>.
     *
     * @param lowercaseExpandedTerms true to lower-case such terms using the parser's locale
     */
    @Override
    public void setLowercaseExpandedTerms(boolean lowercaseExpandedTerms) {
        this.lowercaseExpandedTerms = lowercaseExpandedTerms;
    }

    /**
     * @return whether expanded terms are currently lower-cased
     * @see #setLowercaseExpandedTerms(boolean)
     */
    @Override
    public boolean getLowercaseExpandedTerms() {
        return lowercaseExpandedTerms;
    }

    /**
     * By default QueryParser uses {@link org.apache.lucene.search.MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
     * when creating a {@link PrefixQuery}, {@link WildcardQuery} or {@link TermRangeQuery}. This implementation is generally preferable because it
     * a) Runs faster b) Does not have the scarcity of terms unduly influence score
     * c) avoids any {@link TooManyClauses} exception.
     * However, if your application really needs to use the
     * old-fashioned {@link BooleanQuery} expansion rewriting and the above
     * points are not relevant then use this to change
     * the rewrite method.
*/ @Override public void setMultiTermRewriteMethod(MultiTermQuery.RewriteMethod method) { multiTermRewriteMethod = method; } /** * @see #setMultiTermRewriteMethod */ @Override public MultiTermQuery.RewriteMethod getMultiTermRewriteMethod() { return multiTermRewriteMethod; } /** * Set locale used by date range parsing, lowercasing, and other * locale-sensitive operations. */ @Override public void setLocale(Locale locale) { this.locale = locale; } /** * Returns current locale, allowing access by subclasses. */ @Override public Locale getLocale() { return locale; } @Override public void setTimeZone(TimeZone timeZone) { this.timeZone = timeZone; } @Override public TimeZone getTimeZone() { return timeZone; } /** * Sets the default date resolution used by RangeQueries for fields for which no * specific date resolutions has been set. Field specific resolutions can be set * with {@link #setDateResolution(String, org.apache.lucene.document.DateTools.Resolution)}. * * @param dateResolution the default date resolution to set */ @Override public void setDateResolution(DateTools.Resolution dateResolution) { this.dateResolution = dateResolution; } /** * Sets the date resolution used by RangeQueries for a specific field. * * @param fieldName field for which the date resolution is to be set * @param dateResolution date resolution to set */ public void setDateResolution(String fieldName, DateTools.Resolution dateResolution) { if (fieldName == null) { throw new IllegalArgumentException("Field cannot be null."); } if (fieldToDateResolution == null) { // lazily initialize HashMap fieldToDateResolution = new HashMap<String, DateTools.Resolution>(); } fieldToDateResolution.put(fieldName, dateResolution); } /** * Returns the date resolution that is used by RangeQueries for the given field. * Returns null, if no default or field specific date resolution has been set * for the given field. 
* */ public DateTools.Resolution getDateResolution(String fieldName) { if (fieldName == null) { throw new IllegalArgumentException("Field cannot be null."); } if (fieldToDateResolution == null) { // no field specific date resolutions set; return default date resolution instead return this.dateResolution; } DateTools.Resolution resolution = fieldToDateResolution.get(fieldName); if (resolution == null) { // no date resolutions set for the given field; return default date resolution instead resolution = this.dateResolution; } return resolution; } /** * Set whether or not to analyze range terms when constructing {@link TermRangeQuery}s. * For example, setting this to true can enable analyzing terms into * collation keys for locale-sensitive {@link TermRangeQuery}. * * @param analyzeRangeTerms whether or not terms should be analyzed for RangeQuerys */ public void setAnalyzeRangeTerms(boolean analyzeRangeTerms) { this.analyzeRangeTerms = analyzeRangeTerms; } /** * @return whether or not to analyze range terms when constructing {@link TermRangeQuery}s. 
*/ public boolean getAnalyzeRangeTerms() { return analyzeRangeTerms; } // BL: BooleanClause -> TPBooleanClause protected void addClause(List<TPBooleanClause> clauses, int conj, int mods, TextPattern q) { boolean required, prohibited; // If this term is introduced by AND, make the preceding term required, // unless it's already prohibited if (clauses.size() > 0 && conj == CONJ_AND) { TPBooleanClause c = clauses.get(clauses.size() - 1); if (!c.isProhibited()) c.setOccur(BooleanClause.Occur.MUST); } if (clauses.size() > 0 && operator == AND_OPERATOR && conj == CONJ_OR) { // If this term is introduced by OR, make the preceding term optional, // unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b) // notice if the input is a OR b, first term is parsed as required; without // this modification a OR b would parsed as +a OR b TPBooleanClause c = clauses.get(clauses.size() - 1); if (!c.isProhibited()) c.setOccur(BooleanClause.Occur.SHOULD); } // We might have been passed a null query; the term might have been // filtered away by the analyzer. if (q == null) return; if (operator == OR_OPERATOR) { // We set REQUIRED if we're introduced by AND or +; PROHIBITED if // introduced by NOT or -; make sure not to set both. 
prohibited = (mods == MOD_NOT); required = (mods == MOD_REQ); if (conj == CONJ_AND && !prohibited) { required = true; } } else { // We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED // if not PROHIBITED and not introduced by OR prohibited = (mods == MOD_NOT); required = (!prohibited && conj != CONJ_OR); } if (required && !prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST)); else if (!required && !prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.SHOULD)); else if (!required && prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST_NOT)); else throw new RuntimeException("Clause cannot be both required and prohibited"); } /** * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ protected TextPattern getFieldQuery(String field, String queryText, boolean quoted) throws ParseException { return newFieldQuery(analyzer, field, queryText, quoted); } /** * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ protected TextPattern newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted) throws ParseException { // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or nothing based on the term count TokenStream source; try { source = analyzer.tokenStream(field, new StringReader(queryText)); source.reset(); } catch (IOException e) { ParseException p = new ParseException("Unable to initialize TokenStream to analyze query text"); p.initCause(e); throw p; } CachingTokenFilter buffer = new CachingTokenFilter(source); TermToBytesRefAttribute termAtt = null; PositionIncrementAttribute posIncrAtt = null; int numTokens = 0; buffer.reset(); if (buffer.hasAttribute(TermToBytesRefAttribute.class)) { termAtt = buffer.getAttribute(TermToBytesRefAttribute.class); } if (buffer.hasAttribute(PositionIncrementAttribute.class)) { posIncrAtt = 
buffer.getAttribute(PositionIncrementAttribute.class); } int positionCount = 0; boolean severalTokensAtSamePosition = false; boolean hasMoreTokens = false; if (termAtt != null) { try { hasMoreTokens = buffer.incrementToken(); while (hasMoreTokens) { numTokens++; int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1; if (positionIncrement != 0) { positionCount += positionIncrement; } else { severalTokensAtSamePosition = true; } hasMoreTokens = buffer.incrementToken(); } } catch (IOException e) { // ignore } } try { // rewind the buffer stream buffer.reset(); // close original stream - all tokens buffered source.close(); } catch (IOException e) { ParseException p = new ParseException("Cannot close TokenStream analyzing query text"); p.initCause(e); throw p; } BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef(); if (numTokens == 0) return null; else if (numTokens == 1) { try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; termAtt.fillBytesRef(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } return newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))); } else { if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) { if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) { // no phrase query: TextPatternBoolean q = newBooleanQuery(positionCount == 1); // BL: BooleanQuery -> TextPatternBoolean BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR ? 
BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD; for (int i = 0; i < numTokens; i++) { try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; termAtt.fillBytesRef(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } TextPattern currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))); q.add(currentQuery, occur); } return q; } else { // phrase query: TPMultiPhrase mpq = newMultiPhraseQuery(); // BL: MultiPhraseQuery -> TPMultiPhrase mpq.setSlop(phraseSlop); List<Term> multiTerms = new ArrayList<Term>(); int position = -1; for (int i = 0; i < numTokens; i++) { int positionIncrement = 1; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; termAtt.fillBytesRef(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens } if (positionIncrement > 0 && multiTerms.size() > 0) { if (enablePositionIncrements) { mpq.add(multiTerms.toArray(new Term[0]), position); } else { mpq.add(multiTerms.toArray(new Term[0])); } multiTerms.clear(); } position += positionIncrement; multiTerms.add(new Term(field, BytesRef.deepCopyOf(bytes))); } if (enablePositionIncrements) { mpq.add(multiTerms.toArray(new Term[0]), position); } else { mpq.add(multiTerms.toArray(new Term[0])); } return mpq; } } else { TPPhrase pq = newPhraseQuery(); // BL: PhraseQuery -> TPPhrase pq.setSlop(phraseSlop); int position = -1; for (int i = 0; i < numTokens; i++) { int positionIncrement = 1; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; termAtt.fillBytesRef(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens } if (enablePositionIncrements) { position += positionIncrement; pq.add(new Term(field, BytesRef.deepCopyOf(bytes)), position); } else { pq.add(new Term(field, 
BytesRef.deepCopyOf(bytes))); } } return pq; } } } /** * Base implementation delegates to {@link #getFieldQuery(String,String,boolean)}. * This method may be overridden, for example, to return * a SpanNearQuery instead of a PhraseQuery. * * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ protected TextPattern getFieldQuery(String field, String queryText, int slop) throws ParseException { TextPattern query = getFieldQuery(field, queryText, true); if (query instanceof TPPhrase) { // BL was: PhraseQuery ((TPPhrase) query).setSlop(slop); } if (query instanceof TPMultiPhrase) { // BL was: MultiPhraseQuery ((TPMultiPhrase) query).setSlop(slop); } return query; } protected TextPattern getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException { if (lowercaseExpandedTerms) { part1 = part1 == null ? null : part1.toLowerCase(locale); part2 = part2 == null ? null : part2.toLowerCase(locale); } DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, locale); df.setLenient(true); DateTools.Resolution resolution = getDateResolution(field); try { part1 = DateTools.dateToString(df.parse(part1), resolution); } catch (Exception e) { } try { Date d2 = df.parse(part2); if (endInclusive) { // The user can only specify the date, not the time, so make sure // the time is set to the latest possible time of that date to really // include all documents: Calendar cal = Calendar.getInstance(timeZone, locale); cal.setTime(d2); cal.set(Calendar.HOUR_OF_DAY, 23); cal.set(Calendar.MINUTE, 59); cal.set(Calendar.SECOND, 59); cal.set(Calendar.MILLISECOND, 999); d2 = cal.getTime(); } part2 = DateTools.dateToString(d2, resolution); } catch (Exception e) { } return newRangeQuery(field, part1, part2, startInclusive, endInclusive); } /** * Builds a new BooleanQuery instance * @param disableCoord disable coord * @return new BooleanQuery instance */ protected TextPatternBoolean 
newBooleanQuery(boolean disableCoord) { return new TextPatternBoolean(disableCoord); // BL was: BooleanQuery } /** * Builds a new BooleanClause instance * @param q sub query * @param occur how this clause should occur when matching documents * @return new BooleanClause instance */ protected TPBooleanClause newBooleanClause(TextPattern q, BooleanClause.Occur occur) { return new TPBooleanClause(q, occur); // BL was: BooleanClause } /** * Builds a new TermQuery instance * @param term term * @return new TermQuery instance */ protected TextPattern newTermQuery(Term term) { return new TextPatternTerm(term.text()); // BL was: TermQuery } /** * Builds a new PhraseQuery instance * @return new PhraseQuery instance */ protected TPPhrase newPhraseQuery() { return new TPPhrase(); // BL was: PhraseQuery } /** * Builds a new MultiPhraseQuery instance * @return new MultiPhraseQuery instance */ protected TPMultiPhrase newMultiPhraseQuery() { // BL: MultiPhraseQuery -> TPMultiPhrase return new TPMultiPhrase(); // BL was: MultiPhraseQuery } /** * Builds a new PrefixQuery instance * @param prefix Prefix term * @return new PrefixQuery instance */ protected TextPattern newPrefixQuery(Term prefix) { TextPattern query = new TextPatternPrefix(prefix.text()); // BL was: PrefixQuery //query.setRewriteMethod(multiTermRewriteMethod); // BL: disabled return query; } /** * Builds a new RegexpQuery instance * @param regexp Regexp term * @return new RegexpQuery instance */ protected TextPattern newRegexpQuery(Term regexp) { TextPattern query = new TextPatternRegex(regexp.text()); // BL was: RegexpQuery //query.setRewriteMethod(multiTermRewriteMethod); // BL: disabled return query; } /** * Builds a new FuzzyQuery instance * @param term Term * @param minimumSimilarity minimum similarity * @param prefixLength prefix length * @return new FuzzyQuery Instance */ protected TextPattern newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { // FuzzyQuery doesn't yet allow constant score 
rewrite String text = term.text(); int numEdits = FuzzyQuery.floatToEdits(minimumSimilarity, text.codePointCount(0, text.length())); return new TextPatternFuzzy(term.text(), numEdits, prefixLength); // BL was: FuzzyQuery } // TODO: Should this be protected instead? private BytesRef analyzeMultitermTerm(String field, String part) { return analyzeMultitermTerm(field, part, analyzer); } protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) { TokenStream source; if (analyzerIn == null) analyzerIn = analyzer; try { source = analyzerIn.tokenStream(field, new StringReader(part)); source.reset(); } catch (IOException e) { throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e); } TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); BytesRef bytes = termAtt.getBytesRef(); try { if (!source.incrementToken()) throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part); termAtt.fillBytesRef(); if (source.incrementToken()) throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part); } catch (IOException e) { throw new RuntimeException("error analyzing range part: " + part, e); } try { source.end(); source.close(); } catch (IOException e) { throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e); } return BytesRef.deepCopyOf(bytes); } /** * Builds a new {@link TermRangeQuery} instance * @param field Field * @param part1 min * @param part2 max * @param startInclusive true if the start of the range is inclusive * @param endInclusive true if the end of the range is inclusive * @return new {@link TermRangeQuery} instance */ protected TextPattern newRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) { final BytesRef start; final BytesRef end; if (part1 == null) { start = null; } else { start = 
analyzeRangeTerms ? analyzeMultitermTerm(field, part1) : new BytesRef(part1); } if (part2 == null) { end = null; } else { end = analyzeRangeTerms ? analyzeMultitermTerm(field, part2) : new BytesRef(part2); } // BL was: TermRangeQuery final TPTermRange query = new TPTermRange(field, start.utf8ToString(), end.utf8ToString(), startInclusive, endInclusive); //query.setRewriteMethod(multiTermRewriteMethod); // BL disabled return query; } /** * Builds a new MatchAllDocsQuery instance * @return new MatchAllDocsQuery instance */ protected TextPattern newMatchAllDocsQuery() { return new TPMatchAllDocs(); // BL was: MatchAllDocsQuery } /** * Builds a new WildcardQuery instance * @param t wildcard term * @return new WildcardQuery instance */ protected TextPattern newWildcardQuery(Term t) { TextPattern query = new TextPatternWildcard(t.text()); // BL was: WildcardQuery //query.setRewriteMethod(multiTermRewriteMethod); // BL disabled return query; } /** * Factory method for generating query, given a set of clauses. * By default creates a boolean query composed of clauses passed in. * * Can be overridden by extending classes, to modify query being * returned. * * @param clauses List that contains {@link org.apache.lucene.search.BooleanClause} instances * to join. * * @return Resulting {@link org.apache.lucene.search.Query} object. * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ protected TextPattern getBooleanQuery(List<TPBooleanClause> clauses) throws ParseException { // BL was: BooleanClause return getBooleanQuery(clauses, false); } /** * Factory method for generating query, given a set of clauses. * By default creates a boolean query composed of clauses passed in. * * Can be overridden by extending classes, to modify query being * returned. * * @param clauses List that contains {@link org.apache.lucene.search.BooleanClause} instances * to join. * @param disableCoord true if coord scoring should be disabled. 
* * @return Resulting {@link org.apache.lucene.search.Query} object. * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ protected TextPattern getBooleanQuery(List<TPBooleanClause> clauses, boolean disableCoord) // BL was: BooleanClause throws ParseException { if (clauses.size() == 0) { return null; // all clause words were filtered away by the analyzer. } TextPatternBoolean query = newBooleanQuery(disableCoord); // BL was: BooleanQuery for (final TPBooleanClause clause : clauses) { // BL was: BooleanClause query.add(clause); } return query; } /** * Factory method for generating a query. Called when parser * parses an input term token that contains one or more wildcard * characters (? and *), but is not a prefix term token (one * that has just a single * character at the end) *<p> * Depending on settings, prefix term may be lower-cased * automatically. It will not go through the default Analyzer, * however, since normal Analyzers are unlikely to work properly * with wildcard templates. *<p> * Can be overridden by extending classes, to provide custom handling for * wildcard queries, which may be necessary due to missing analyzer calls. * * @param field Name of the field query will use. * @param termStr Term token that contains one or more wild card * characters (? or *), but is not simple prefix term * * @return Resulting {@link org.apache.lucene.search.Query} built for the term * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ protected TextPattern getWildcardQuery(String field, String termStr) throws ParseException { if ("*".equals(field)) { if ("*".equals(termStr)) return newMatchAllDocsQuery(); } if (!allowLeadingWildcard && (termStr.startsWith("*") || termStr.startsWith("?"))) throw new ParseException("'*' or '?' 
not allowed as first character in WildcardQuery"); if (lowercaseExpandedTerms) { termStr = termStr.toLowerCase(locale); } Term t = new Term(field, termStr); return newWildcardQuery(t); } /** * Factory method for generating a query. Called when parser * parses an input term token that contains a regular expression * query. *<p> * Depending on settings, pattern term may be lower-cased * automatically. It will not go through the default Analyzer, * however, since normal Analyzers are unlikely to work properly * with regular expression templates. *<p> * Can be overridden by extending classes, to provide custom handling for * regular expression queries, which may be necessary due to missing analyzer * calls. * * @param field Name of the field query will use. * @param termStr Term token that contains a regular expression * * @return Resulting {@link org.apache.lucene.search.Query} built for the term * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ protected TextPattern getRegexpQuery(String field, String termStr) throws ParseException { if (lowercaseExpandedTerms) { termStr = termStr.toLowerCase(locale); } Term t = new Term(field, termStr); return newRegexpQuery(t); } /** * Factory method for generating a query (similar to * {@link #getWildcardQuery}). Called when parser parses an input term * token that uses prefix notation; that is, contains a single '*' wildcard * character as its last character. Since this is a special case * of generic wildcard term, and such a query can be optimized easily, * this usually results in a different query object. *<p> * Depending on settings, a prefix term may be lower-cased * automatically. It will not go through the default Analyzer, * however, since normal Analyzers are unlikely to work properly * with wildcard templates. *<p> * Can be overridden by extending classes, to provide custom handling for * wild card queries, which may be necessary due to missing analyzer calls. 
* * @param field Name of the field query will use. * @param termStr Term token to use for building term for the query * (<b>without</b> trailing '*' character!) * * @return Resulting {@link org.apache.lucene.search.Query} built for the term * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ protected TextPattern getPrefixQuery(String field, String termStr) throws ParseException { if (!allowLeadingWildcard && termStr.startsWith("*")) throw new ParseException("'*' not allowed as first character in PrefixQuery"); if (lowercaseExpandedTerms) { termStr = termStr.toLowerCase(locale); } Term t = new Term(field, termStr); return newPrefixQuery(t); } /** * Factory method for generating a query (similar to * {@link #getWildcardQuery}). Called when parser parses * an input term token that has the fuzzy suffix (~) appended. * * @param field Name of the field query will use. * @param termStr Term token to use for building term for the query * * @return Resulting {@link org.apache.lucene.search.Query} built for the term * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ protected TextPattern getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException { if (lowercaseExpandedTerms) { termStr = termStr.toLowerCase(locale); } Term t = new Term(field, termStr); return newFuzzyQuery(t, minSimilarity, fuzzyPrefixLength); } // extracted from the .jj grammar TextPattern handleBareTokenQuery(String qfield, Token term, Token fuzzySlop, boolean prefix, boolean wildcard, boolean fuzzy, boolean regexp) throws ParseException { TextPattern q; String termImage = discardEscapeChar(term.image); if (wildcard) { q = getWildcardQuery(qfield, term.image); } else if (prefix) { q = getPrefixQuery(qfield, discardEscapeChar(term.image.substring(0, term.image.length() - 1))); } else if (regexp) { q = getRegexpQuery(qfield, term.image.substring(1, term.image.length() - 
1)); } else if (fuzzy) { q = handleBareFuzzy(qfield, fuzzySlop, termImage); } else { q = getFieldQuery(qfield, termImage, false); } return q; } TextPattern handleBareFuzzy(String qfield, Token fuzzySlop, String termImage) throws ParseException { TextPattern q; float fms = fuzzyMinSim; try { fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) { } if (fms < 0.0f) { throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !"); } else if (fms >= 1.0f && fms != (int) fms) { throw new ParseException("Fractional edit distances are not allowed!"); } q = getFuzzyQuery(qfield, termImage, fms); return q; } // extracted from the .jj grammar TextPattern handleQuotedTerm(String qfield, Token term, Token fuzzySlop) throws ParseException { int s = phraseSlop; // default if (fuzzySlop != null) { try { s = Float.valueOf(fuzzySlop.image.substring(1)).intValue(); } catch (Exception ignored) { } } return getFieldQuery(qfield, discardEscapeChar(term.image.substring(1, term.image.length() - 1)), s); } // extracted from the .jj grammar TextPattern handleBoost(TextPattern q, Token boost) { /* if (boost != null) { float f = (float) 1.0; try { f = Float.valueOf(boost.image).floatValue(); } catch (Exception ignored) { // Should this be handled somehow? (defaults to "no boost", if // boost number is invalid) } // avoid boosting null queries, such as those caused by stop words if (q != null) { q.setBoost(f); } } */ return q; // BL: boost not supported } /** * Returns a String where the escape char has been * removed, or kept only once if there was a double escape. * * Supports escaped unicode characters, e. g. translates * <code>\\u0041</code> to <code>A</code>. * */ String discardEscapeChar(String input) throws ParseException { // Create char array to hold unescaped char sequence char[] output = new char[input.length()]; // The length of the output can be less than the input // due to discarded escape chars. 
This variable holds // the actual length of the output int length = 0; // We remember whether the last processed character was // an escape character boolean lastCharWasEscapeChar = false; // The multiplier the current unicode digit must be multiplied with. // E. g. the first digit must be multiplied with 16^3, the second with 16^2... int codePointMultiplier = 0; // Used to calculate the codepoint of the escaped unicode character int codePoint = 0; for (int i = 0; i < input.length(); i++) { char curChar = input.charAt(i); if (codePointMultiplier > 0) { codePoint += hexToInt(curChar) * codePointMultiplier; codePointMultiplier >>>= 4; if (codePointMultiplier == 0) { output[length++] = (char) codePoint; codePoint = 0; } } else if (lastCharWasEscapeChar) { if (curChar == 'u') { // found an escaped unicode character codePointMultiplier = 16 * 16 * 16; } else { // this character was escaped output[length] = curChar; length++; } lastCharWasEscapeChar = false; } else { if (curChar == '\\') { lastCharWasEscapeChar = true; } else { output[length] = curChar; length++; } } } if (codePointMultiplier > 0) { throw new ParseException("Truncated unicode escape sequence."); } if (lastCharWasEscapeChar) { throw new ParseException("Term can not end with escape character."); } return new String(output, 0, length); } /** Returns the numeric value of the hexadecimal character */ static final int hexToInt(char c) throws ParseException { if ('0' <= c && c <= '9') { return c - '0'; } else if ('a' <= c && c <= 'f') { return c - 'a' + 10; } else if ('A' <= c && c <= 'F') { return c - 'A' + 10; } else { throw new ParseException("Non-hex character in Unicode escape sequence: " + c); } } /** * Returns a String where those characters that QueryParser * expects to be escaped are escaped by a preceding <code>\</code>. 
*/
  public static String escape(String s) {
    // The characters that are part of the query syntax and must be escaped.
    final String syntaxChars = "\\+-!():^[]\"{}~*?|&/";
    final StringBuilder escaped = new StringBuilder();
    for (final char current : s.toCharArray()) {
      if (syntaxChars.indexOf(current) >= 0) {
        escaped.append('\\');
      }
      escaped.append(current);
    }
    return escaped.toString();
  }
}