Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * This parser was originally derived from DismaxQParser from Solr.
 * All changes are Copyright 2008, Lucid Imagination, Inc.
 */

package org.apache.solr.search;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.DefaultSolrParams;
import org.apache.solr.common.params.DisMaxParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.function.BoostedQuery;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;

/**
 * An advanced multi-field query parser.
 *
 * <p>This plugin builds one {@link TokenizerChain} per entry of the
 * {@code synonymAnalyzers} init argument and hands the resulting map to every
 * {@link SynonymExpandingExtendedDismaxQParser} it creates.
 *
 * @lucene.experimental
 */
public class SynonymExpandingExtendedDismaxQParserPlugin extends ExtendedDismaxQParserPlugin
        implements ResourceLoaderAware {

    public static final String NAME = "synonym_edismax";

    /** Raw init arguments, stored by {@link #init(NamedList)} and read by {@link #inform(ResourceLoader)}. */
    private NamedList<?> args;

    /** Analyzer name -> analyzer, populated by {@link #inform(ResourceLoader)}. */
    private Map<String, Analyzer> synonymAnalyzers;

    /**
     * Stores the init arguments; they are interpreted later in {@link #inform(ResourceLoader)}.
     *
     * @param args the plugin's init arguments
     */
    @SuppressWarnings("rawtypes")
    public void init(NamedList args) {
        this.args = (NamedList<?>) args;
    }

    /**
     * Creates a parser that shares this plugin's synonym analyzers.
     *
     * @param qstr        the query string
     * @param localParams the local params, passed through to the parser
     * @param params      the request params, passed through to the parser
     * @param req         the current request
     * @return a new {@link SynonymExpandingExtendedDismaxQParser}
     */
    @Override
    public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
        return new SynonymExpandingExtendedDismaxQParser(qstr, localParams, params, req, synonymAnalyzers);
    }

    /**
     * Copies every entry of {@code namedList} whose value is a {@link String} into a new map.
     * Entries with non-String values are dropped.
     *
     * @param namedList the list to convert
     * @return a new mutable map of the String-valued entries
     */
    private Map<String, String> convertNamedListToMap(NamedList<?> namedList) {
        Map<String, String> result = new HashMap<String, String>();
        for (Entry<String, ?> entry : namedList) {
            if (entry.getValue() instanceof String) {
                result.put(entry.getKey(), (String) entry.getValue());
            }
        }
        return result;
    }

    /**
     * Builds the synonym analyzers from the init arguments.
     *
     * <p>Expected arguments, as read by this method:
     * <ul>
     * <li>{@code luceneMatchVersion}: a String, required; it is added to the params of every
     * tokenizer and filter factory.</li>
     * <li>{@code synonymAnalyzers}: a NamedList mapping analyzer names to NamedLists. Each of those
     * contains children keyed {@code tokenizer} or {@code filter}, each itself a NamedList with at
     * least a {@code class} entry. Children that are not NamedLists, or that have no {@code class},
     * are skipped.</li>
     * </ul>
     *
     * @param loader used to instantiate the factories and passed on to resource-aware factories
     * @throws SolrException if {@code luceneMatchVersion} is missing or not a String, or if an
     *         analyzer ends up without a tokenizer or without any filter
     */
    public void inform(ResourceLoader loader) {
        // TODO it would be nice if the user didn't have to encode tokenizers/filters
        // as a NamedList. But for now this is the hack I'm using
        synonymAnalyzers = new HashMap<String, Analyzer>();

        Object luceneMatchVersion = args.get("luceneMatchVersion");
        // instanceof is false for null, so no separate null check is needed
        if (!(luceneMatchVersion instanceof String)) {
            throw new SolrException(ErrorCode.SERVER_ERROR,
                    "luceneMatchVersion must be defined for the synonym_edismax parser");
        }

        Object xmlSynonymAnalyzers = args.get("synonymAnalyzers");
        if (!(xmlSynonymAnalyzers instanceof NamedList)) {
            return;
        }

        NamedList<?> synonymAnalyzersList = (NamedList<?>) xmlSynonymAnalyzers;
        for (Entry<String, ?> entry : synonymAnalyzersList) {
            String analyzerName = entry.getKey();
            if (!(entry.getValue() instanceof NamedList)) {
                continue;
            }
            NamedList<?> analyzerAsNamedList = (NamedList<?>) entry.getValue();

            TokenizerFactory tokenizerFactory = null;
            List<TokenFilterFactory> filterFactories = new LinkedList<TokenFilterFactory>();

            for (Entry<String, ?> analyzerEntry : analyzerAsNamedList) {
                String key = analyzerEntry.getKey();
                // Check the child entry itself: it is the value cast below. (Checking the outer
                // entry here would let a non-NamedList child through to a ClassCastException.)
                if (!(analyzerEntry.getValue() instanceof NamedList)) {
                    continue;
                }
                Map<String, String> params = convertNamedListToMap((NamedList<?>) analyzerEntry.getValue());

                // add the lucene match version because it's usually required
                params.put("luceneMatchVersion", (String) luceneMatchVersion);

                if (!params.containsKey("class")) {
                    continue;
                }
                String className = params.get("class");

                if (key.equals("tokenizer")) {
                    tokenizerFactory = (TokenizerFactory) loader.newInstance(className);
                    tokenizerFactory.init(params);
                    if (tokenizerFactory instanceof ResourceLoaderAware) {
                        ((ResourceLoaderAware) tokenizerFactory).inform(loader);
                    }
                } else if (key.equals("filter")) {
                    TokenFilterFactory filterFactory = (TokenFilterFactory) loader.newInstance(className);
                    filterFactory.init(params);
                    if (filterFactory instanceof ResourceLoaderAware) {
                        ((ResourceLoaderAware) filterFactory).inform(loader);
                    }
                    filterFactories.add(filterFactory);
                }
            }

            if (tokenizerFactory == null) {
                throw new SolrException(ErrorCode.SERVER_ERROR,
                        "tokenizer must not be null for synonym analyzer: " + analyzerName);
            } else if (filterFactories.isEmpty()) {
                throw new SolrException(ErrorCode.SERVER_ERROR,
                        "filter factories must be defined for synonym analyzer: " + analyzerName);
            }

            TokenizerChain analyzer = new TokenizerChain(tokenizerFactory,
                    filterFactories.toArray(new TokenFilterFactory[filterFactories.size()]));
            synonymAnalyzers.put(analyzerName, analyzer);
        }
    }
}

/**
 * Extended dismax parser that, after the regular parse, rewrites the main user query so that
 * synonym-expanded variants of the query string are added as optional alternatives.
 */
class SynonymExpandingExtendedDismaxQParser extends ExtendedDismaxQParser {

    /**
     * Convenience class for parameters
     */
    public static class Params {
        public static final String SYNONYMS = "synonyms";
        public static final String SYNONYMS_ANALYZER = "synonyms.analyzer";
        public static final String SYNONYMS_ORIGINAL_BOOST = "synonyms.originalBoost";
        public static final String SYNONYMS_SYNONYM_BOOST = "synonyms.synonymBoost";
        public static final String SYNONYMS_DISABLE_PHRASE_QUERIES = "synonyms.disablePhraseQueries";
    }

    /**
     * Convenience class for calling constants.
     * @author nolan
     *
     */
    private static class Const {
        /**
         * A field we can't ever find in any schema, so we can safely tell
         * DisjunctionMaxQueryParser to use it as our defaultField, and map aliases
         * from it to any field in our schema.
         */
        static final String IMPOSSIBLE_FIELD_NAME = "\uFFFC\uFFFC\uFFFC";

        /** Matches query strings that synonym expansion currently refuses to handle. */
        static final Pattern COMPLEX_QUERY_OPERATORS_PATTERN =
                Pattern.compile("(?:\\*|\\b(?:OR|AND|-|\\+)\\b)");
    }

    /** shorten the class references for utilities */
    private static class U extends SolrPluginUtils {
        /* :NOOP */
    }

    /** Analyzer name -> analyzer, as supplied by the plugin. */
    private Map<String, Analyzer> synonymAnalyzers;

    /** The combined (original + synonyms) query, set once synonyms were applied; otherwise null. */
    private Query queryToHighlight;

    /**
     * @param qstr             the query string
     * @param localParams      the local params, may be null
     * @param params           the request params
     * @param req              the current request
     * @param synonymAnalyzers the available synonym analyzers, keyed by name
     */
    public SynonymExpandingExtendedDismaxQParser(String qstr, SolrParams localParams, SolrParams params,
            SolrQueryRequest req, Map<String, Analyzer> synonymAnalyzers) {
        super(qstr, localParams, params, req);
        this.synonymAnalyzers = synonymAnalyzers;
    }

    /**
     * Returns the synonym-expanded query if one was built by {@link #parse()}, otherwise the
     * superclass's highlight query.
     */
    @Override
    public Query getHighlightQuery() throws ParseException {
        return queryToHighlight != null ? queryToHighlight : super.getHighlightQuery();
    }

    /**
     * Parses the query with the superclass and then, if enabled, applies synonym expansion to it.
     *
     * <p>The query is returned unexpanded when {@code synonyms} is not true, when no analyzer can
     * be selected (none named and not exactly one configured, or the named one is unknown), or
     * when {@code synonyms.disablePhraseQueries} is true and the query string contains a
     * {@code "} character.
     *
     * @return the parsed query, modified in place when synonyms were applied
     * @throws ParseException if the superclass fails to parse the query
     */
    @Override
    public Query parse() throws ParseException {
        Query query = super.parse();

        SolrParams localParams = getLocalParams();
        SolrParams params = getParams();
        SolrParams solrParams = localParams == null ? params : new DefaultSolrParams(localParams, params);

        // disable/enable synonym handling altogether
        if (!solrParams.getBool(Params.SYNONYMS, false)) {
            return query;
        }

        // check to make sure the analyzer exists
        String analyzerName = solrParams.get(Params.SYNONYMS_ANALYZER, null);
        if (analyzerName == null) { // no synonym analyzer specified
            if (synonymAnalyzers.size() == 1) {
                // only one analyzer defined; just use that one
                analyzerName = synonymAnalyzers.keySet().iterator().next();
            } else {
                return query;
            }
        }

        Analyzer synonymAnalyzer = synonymAnalyzers.get(analyzerName);
        if (synonymAnalyzer == null) { // couldn't find analyzer
            return query;
        }

        if (solrParams.getBool(Params.SYNONYMS_DISABLE_PHRASE_QUERIES, false)
                && getString().indexOf('"') != -1) {
            // disable if a phrase query is detected, i.e. there's a '"'
            return query;
        }

        attemptToApplySynonymsToQuery(query, solrParams, synonymAnalyzer);

        return query;
    }

    /**
     * Generates the synonym queries for the current query string and merges them into
     * {@code query}. Does nothing if the query string contains complex operators or if no
     * synonym queries are produced.
     *
     * @param query           the parsed query to modify in place
     * @param solrParams      the effective params
     * @param synonymAnalyzer the analyzer used to expand synonyms
     */
    private void attemptToApplySynonymsToQuery(Query query, SolrParams solrParams, Analyzer synonymAnalyzer) {
        // TODO: support complex operators
        // Checked first so that no analysis/parsing work is done when the result would be discarded.
        boolean hasComplexQueryOperators = Const.COMPLEX_QUERY_OPERATORS_PATTERN.matcher(getString()).find();
        if (hasComplexQueryOperators) {
            return;
        }

        List<Query> synonymQueries = generateSynonymQueries(synonymAnalyzer, solrParams);
        if (synonymQueries.isEmpty()) {
            // didn't find more than 0 synonyms, i.e. it's just the original phrase
            return;
        }

        // TODO: EDisMax does not do minShouldMatch if complex query operators exist, and neither do we.
        // But in the future we might, so keep doMinShouldMatch separate for now
        boolean doMinShouldMatch = true;
        String minShouldMatch = solrParams.get(DisMaxParams.MM, "100%");

        float originalBoost = solrParams.getFloat(Params.SYNONYMS_ORIGINAL_BOOST, 1.0F);
        float synonymBoost = solrParams.getFloat(Params.SYNONYMS_SYNONYM_BOOST, 1.0F);

        applySynonymQueries(query, synonymQueries, originalBoost, synonymBoost, doMinShouldMatch, minShouldMatch);
    }

    /**
     * Find the main query and its surrounding clause, make it SHOULD instead of MUST and append a bunch
     * of other SHOULDs to it, then wrap it in a MUST
     *
     * E.g. +(text:dog) becomes
     * +((text:dog)^1.5 ((text:hound) (text:pooch))^1.2)
     *
     * <p>A {@link BoostedQuery} is unwrapped recursively; any query that is neither a
     * BoostedQuery nor a {@link BooleanQuery} is left untouched.
     *
     * @param query            the query to modify in place
     * @param synonymQueries   the alternate queries to add
     * @param originalBoost    boost set on the original main user query
     * @param synonymBoost     boost set on the group of synonym queries
     * @param doMinShouldMatch whether to apply {@code minShouldMatch} to boolean synonym queries
     * @param minShouldMatch   the min-should-match spec
     */
    private void applySynonymQueries(Query query, List<Query> synonymQueries, float originalBoost,
            float synonymBoost, boolean doMinShouldMatch, String minShouldMatch) {

        if (query instanceof BoostedQuery) {
            applySynonymQueries(((BoostedQuery) query).getQuery(), synonymQueries, originalBoost, synonymBoost,
                    doMinShouldMatch, minShouldMatch);
        } else if (query instanceof BooleanQuery) {
            BooleanQuery booleanQuery = (BooleanQuery) query;

            for (BooleanClause booleanClause : booleanQuery.getClauses()) {
                if (Occur.MUST == booleanClause.getOccur()) {
                    // standard 'must occur' clause - i.e. the main user query
                    Query mainUserQuery = booleanClause.getQuery();
                    mainUserQuery.setBoost(originalBoost);

                    // combine all synonym queries together with the same boost
                    BooleanQuery allSynonymQueries = new BooleanQuery();
                    for (Query synonymQuery : synonymQueries) {
                        if (doMinShouldMatch && synonymQuery instanceof BooleanQuery) {
                            U.setMinShouldMatch((BooleanQuery) synonymQuery, minShouldMatch);
                        }
                        allSynonymQueries.add(synonymQuery, Occur.SHOULD);
                    }
                    allSynonymQueries.setBoost(synonymBoost);

                    // now combine with the original main user query
                    BooleanQuery combinedQuery = new BooleanQuery();
                    combinedQuery.add(mainUserQuery, Occur.SHOULD);
                    combinedQuery.add(allSynonymQueries, Occur.SHOULD);
                    booleanClause.setQuery(combinedQuery);
                    queryToHighlight = combinedQuery;
                }
            }
        }
    }

    /**
     * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
     *
     * <p>The query string is run through the analyzer; every token whose type is not
     * {@code "shingle"} is recorded with its offsets, grouped by start offset, and the groups are
     * then combined into alternate query strings which are parsed into queries.
     *
     * @param synonymAnalyzer the analyzer that emits the original terms and their synonyms
     * @param solrParams      the effective params, used when parsing the alternate queries
     * @return the parsed alternate queries; empty if the analyzer produced nothing new
     * @throws RuntimeException if the token stream fails with an {@link IOException}
     */
    private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {

        // TODO: make the token stream reusable?
        TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
                new StringReader(getString()));

        SortedMap<Integer, SortedSet<TextInQuery>> startPosToTextsInQuery =
                new TreeMap<Integer, SortedSet<TextInQuery>>();

        try {
            // follow the TokenStream consumer workflow: reset, incrementToken..., end, close
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
                OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
                TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

                if (!typeAttribute.type().equals("shingle")) {
                    // ignore shingles; we only care about synonyms and the original text
                    // TODO: filter other types as well

                    TextInQuery textInQuery = new TextInQuery(term.toString(),
                            offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());

                    // brain-dead multimap logic... man, I wish we had Google Guava here
                    SortedSet<TextInQuery> existingList = startPosToTextsInQuery.get(offsetAttribute.startOffset());
                    if (existingList == null) {
                        existingList = new TreeSet<TextInQuery>();
                        startPosToTextsInQuery.put(offsetAttribute.startOffset(), existingList);
                    }
                    existingList.add(textInQuery);
                }
            }
            tokenStream.end();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        } finally {
            try {
                tokenStream.close();
            } catch (IOException ignored) {
                // Best-effort release of the stream; throwing here would mask an exception
                // raised while consuming tokens.
            }
        }

        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<List<TextInQuery>>(
                startPosToTextsInQuery.values().size());
        for (SortedSet<TextInQuery> sortedSet : startPosToTextsInQuery.values()) {
            sortedTextsInQuery.add(new ArrayList<TextInQuery>(sortedSet));
        }

        // have to use the start positions and end positions to figure out all possible combinations
        List<String> alternateQueries = buildUpAlternateQueries(sortedTextsInQuery);

        return createSynonymQueries(solrParams, alternateQueries);
    }

    /**
     * From a list of texts in the original query that were deemed to be interested (i.e. synonyms or the original text
     * itself), build up all possible alternate queries as strings.
     *
     * For instance, if the query is "dog bite" and the synonyms are dog -> [dog,hound,pooch] and bite -> [bite,nibble],
     * then the result will be:
     *
     * dog bite
     * hound bite
     * pooch bite
     * dog nibble
     * hound nibble
     * pooch nibble
     *
     * <p>Text of the original query that lies between, before, or after the tokens is copied
     * into each alternate query unchanged.
     *
     * @param textsInQueryLists one list per start position, in ascending start-position order
     * @return every alternate query string; empty if {@code textsInQueryLists} is empty
     */
    private List<String> buildUpAlternateQueries(List<List<TextInQuery>> textsInQueryLists) {

        String originalUserQuery = getString();

        if (textsInQueryLists.isEmpty()) {
            return Collections.emptyList();
        }

        // initialize results
        List<AlternateQuery> alternateQueries = new ArrayList<AlternateQuery>();
        for (TextInQuery textInQuery : textsInQueryLists.get(0)) {
            // add the text before the first user query token, e.g. a space or a "
            StringBuilder stringBuilder = new StringBuilder(
                    originalUserQuery.subSequence(0, textInQuery.getStartPosition()))
                    .append(textInQuery.getText());
            alternateQueries.add(new AlternateQuery(stringBuilder, textInQuery.getEndPosition()));
        }

        for (int i = 1; i < textsInQueryLists.size(); i++) {
            List<TextInQuery> textsInQuery = textsInQueryLists.get(i);

            // compute the length in advance, because we'll be adding new ones as we go
            int alternateQueriesLength = alternateQueries.size();

            for (int j = 0; j < alternateQueriesLength; j++) {
                AlternateQuery existingQuery = alternateQueries.get(j);

                // Snapshot of the query BEFORE anything is appended at this position. Every
                // candidate text must extend this same prefix; extending the already-mutated
                // object would both fail the overlap check below (its end position has moved
                // past the shared start position) and chain the candidates onto each other.
                AlternateQuery prefix = (AlternateQuery) existingQuery.clone();
                boolean usedFirst = false;

                for (TextInQuery textInQuery : textsInQuery) {
                    if (prefix.getEndPosition() > textInQuery.getStartPosition()) {
                        // overlaps text this alternate query already covers; cannot be appended
                        continue;
                    }

                    AlternateQuery target;
                    if (!usedFirst) {
                        // re-use the existing object
                        usedFirst = true;
                        target = existingQuery;
                    } else {
                        // need a fresh copy of the untouched prefix
                        target = (AlternateQuery) prefix.clone();
                        alternateQueries.add(target);
                    }

                    // text in the original query between the two tokens, usually a space
                    CharSequence betweenTokens = originalUserQuery.subSequence(
                            prefix.getEndPosition(), textInQuery.getStartPosition());
                    target.getStringBuilder().append(betweenTokens).append(textInQuery.getText());
                    target.setEndPosition(textInQuery.getEndPosition());
                }
            }
        }

        List<String> result = new ArrayList<String>(alternateQueries.size());
        for (AlternateQuery alternateQuery : alternateQueries) {
            // append whatever text followed the last token, e.g. '"'
            alternateQuery.getStringBuilder().append(
                    originalUserQuery.subSequence(alternateQuery.getEndPosition(), originalUserQuery.length()));
            result.add(alternateQuery.getStringBuilder().toString());
        }
        return result;
    }

    /**
     * From a list of alternate queries in text format, parse them using the default
     * ExtendedSolrQueryParser and return the queries.
     *
     * <p>An alternate text equal to the original query string is skipped. A text that fails to
     * parse is skipped as well, after its stack trace is printed to {@code System.err}.
     *
     * @param solrParams          the effective params ({@code qf}, {@code tie}, {@code qs} are read)
     * @param alternateQueryTexts the alternate query strings
     * @return the successfully parsed alternate queries
     */
    private List<Query> createSynonymQueries(SolrParams solrParams, List<String> alternateQueryTexts) {

        //
        // begin copied code from ExtendedDismaxQParser
        //

        // have to build up the queryFields again because in Solr 3.6.1 they made it private.
        Map<String, Float> queryFields = SolrPluginUtils.parseFieldBoosts(solrParams.getParams(DisMaxParams.QF));
        if (0 == queryFields.size()) {
            queryFields.put(req.getSchema().getDefaultSearchFieldName(), 1.0f);
        }

        float tiebreaker = solrParams.getFloat(DisMaxParams.TIE, 0.0f);
        int qslop = solrParams.getInt(DisMaxParams.QS, 0);
        ExtendedSolrQueryParser up = new ExtendedSolrQueryParser(this, Const.IMPOSSIBLE_FIELD_NAME);
        up.addAlias(Const.IMPOSSIBLE_FIELD_NAME, tiebreaker, queryFields);
        up.setPhraseSlop(qslop); // slop for explicit user phrase queries
        up.setAllowLeadingWildcard(true);

        //
        // end copied code
        //

        List<Query> result = new ArrayList<Query>();
        for (String alternateQueryText : alternateQueryTexts) {
            if (alternateQueryText.equals(getString())) {
                // alternate query is the same as what the user entered
                continue;
            }
            try {
                result.add(up.parse(alternateQueryText));
            } catch (ParseException e) {
                // TODO: better error handling - for now just bail out; ignore this synonym
                e.printStackTrace(System.err);
            }
        }
        return result;
    }

    /**
     * Simple POJO for representing a piece of text found in the original query or expanded using shingles/synonyms.
     * Instances are immutable and ordered by start position, then end position, then text.
     * @author nolan
     *
     */
    private static class TextInQuery implements Comparable<TextInQuery> {

        private final String text;
        private final int endPosition;
        private final int startPosition;

        public TextInQuery(String text, int startPosition, int endPosition) {
            this.text = text;
            this.startPosition = startPosition;
            this.endPosition = endPosition;
        }

        public String getText() {
            return text;
        }

        public int getEndPosition() {
            return endPosition;
        }

        public int getStartPosition() {
            return startPosition;
        }

        @Override
        public String toString() {
            return "TextInQuery [text=" + text + ", endPosition=" + endPosition + ", startPosition="
                    + startPosition + "]";
        }

        /**
         * Orders by start position, then end position, then text. Uses explicit comparisons
         * rather than subtraction so the result is correct for any int values.
         */
        public int compareTo(TextInQuery other) {
            if (this.startPosition != other.startPosition) {
                return this.startPosition < other.startPosition ? -1 : 1;
            } else if (this.endPosition != other.endPosition) {
                return this.endPosition < other.endPosition ? -1 : 1;
            }
            return this.text.compareTo(other.text);
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + endPosition;
            result = prime * result + startPosition;
            result = prime * result + ((text == null) ? 0 : text.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null) {
                return false;
            }
            if (getClass() != obj.getClass()) {
                return false;
            }
            TextInQuery other = (TextInQuery) obj;
            if (endPosition != other.endPosition) {
                return false;
            }
            if (startPosition != other.startPosition) {
                return false;
            }
            if (text == null) {
                if (other.text != null) {
                    return false;
                }
            } else if (!text.equals(other.text)) {
                return false;
            }
            return true;
        }
    }

    /**
     * Simple POJO for containing an alternate query that we're building up
     * @author nolan
     *
     */
    private static class AlternateQuery implements Cloneable {

        /** The alternate query text accumulated so far. */
        private final StringBuilder stringBuilder;

        /** Offset in the original query string up to which this alternate query has consumed text. */
        private int endPosition;

        public AlternateQuery(StringBuilder stringBuilder, int endPosition) {
            this.stringBuilder = stringBuilder;
            this.endPosition = endPosition;
        }

        public StringBuilder getStringBuilder() {
            return stringBuilder;
        }

        public int getEndPosition() {
            return endPosition;
        }

        public void setEndPosition(int endPosition) {
            this.endPosition = endPosition;
        }

        /** Returns an independent copy: the builder's contents are copied, not shared. */
        public Object clone() {
            return new AlternateQuery(new StringBuilder(stringBuilder), endPosition);
        }

        @Override
        public String toString() {
            return "AlternateQuery [stringBuilder=" + stringBuilder + ", endPosition=" + endPosition + "]";
        }
    }
}