/*
 * Hibernate Search, full-text search for your domain model
 *
 * License: GNU Lesser General Public License (LGPL), version 2.1 or later
 * See the lgpl.txt file in the root directory or <http://www.gnu.org/licenses/lgpl-2.1.html>.
 */
package org.hibernate.search.query.dsl.impl;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.UnicodeUtil;

import org.hibernate.search.analyzer.impl.LuceneAnalyzerReference;
import org.hibernate.search.annotations.Store;
import org.hibernate.search.bridge.FieldBridge;
import org.hibernate.search.bridge.builtin.NumericFieldBridge;
import org.hibernate.search.bridge.util.impl.ContextualExceptionBridgeHelper;
import org.hibernate.search.engine.impl.DocumentBuilderHelper;
import org.hibernate.search.engine.integration.impl.ExtendedSearchIntegrator;
import org.hibernate.search.engine.metadata.impl.DocumentFieldMetadata;
import org.hibernate.search.engine.spi.DocumentBuilderIndexedEntity;
import org.hibernate.search.exception.AssertionFailure;
import org.hibernate.search.query.engine.spi.EntityInfo;
import org.hibernate.search.query.engine.spi.HSQuery;
import org.hibernate.search.util.impl.PassThroughAnalyzer;
import org.hibernate.search.util.logging.impl.Log;
import org.hibernate.search.util.logging.impl.LoggerFactory;

import static org.hibernate.search.query.dsl.impl.ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE.ENTITY;
import static org.hibernate.search.query.dsl.impl.ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE.ID;

/**
 * Class inspired and code copied from Apache Lucene MoreLikeThis class.
 * Apache Lucene code copyright the Apache Software Foundation released under the
 * Apache Software License 2.0.
 *
 * @author Emmanuel Bernard
 */
public class MoreLikeThisBuilder<T> {

    private static final Log log = LoggerFactory.make();

    private final int minWordLen = MoreLikeThis.DEFAULT_MIN_WORD_LENGTH;
    private final int maxNumTokensParsed = MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED;
    private final int maxWordLen = MoreLikeThis.DEFAULT_MAX_WORD_LENGTH;
    private final Set<?> stopWords = MoreLikeThis.DEFAULT_STOP_WORDS;
    private final DocumentBuilderIndexedEntity documentBuilder;
    // We lower the min defaults to 1 because we don't merge the freq of *all* fields unlike the original MoreLikeThis
    // TODO: is that hurting performance? Could we guess "small fields" and only lower these?
    private final int minTermFreq = 1; //MoreLikeThis.DEFAULT_MIN_TERM_FREQ;
    private final int minDocFreq = 1; //MoreLikeThis.DEFAULT_MIN_DOC_FREQ;
    private final int maxDocFreq = MoreLikeThis.DEFAULT_MAX_DOC_FREQ;
    private final int maxQueryTerms = MoreLikeThis.DEFAULT_MAX_QUERY_TERMS;
    private boolean boost = MoreLikeThis.DEFAULT_BOOST;
    private float boostFactor = 1;
    private TFIDFSimilarity similarity;
    private Integer documentNumber;
    private String[] compatibleFieldNames;
    private IndexReader indexReader;
    private FieldsContext fieldsContext;
    private Object input;
    private QueryBuildingContext queryContext;
    private boolean excludeEntityCompared;
    private ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE inputType;
    private TermQuery findById;

    public MoreLikeThisBuilder(DocumentBuilderIndexedEntity documentBuilder,
            ExtendedSearchIntegrator searchIntegrator) {
        this.documentBuilder = documentBuilder;
        Similarity configuredSimilarity = searchIntegrator.getIndexBindings()
                .get(documentBuilder.getBeanClass())
                .getSimilarity();
        if (configuredSimilarity instanceof TFIDFSimilarity) {
            this.similarity = (TFIDFSimilarity) configuredSimilarity;
        } else {
            throw log.requireTFIDFSimilarity(documentBuilder.getBeanClass());
        }
    }

    public MoreLikeThisBuilder indexReader(IndexReader indexReader) {
        this.indexReader = indexReader;
        return this;
    }

    public MoreLikeThisBuilder compatibleFieldNames(String... compatibleFieldNames) {
        this.compatibleFieldNames = compatibleFieldNames;
        return this;
    }

    public MoreLikeThisBuilder otherMoreLikeThisContext(MoreLikeThisQueryContext moreLikeThisContext) {
        this.boost = moreLikeThisContext.isBoostTerms();
        this.boostFactor = moreLikeThisContext.getTermBoostFactor();
        this.excludeEntityCompared = moreLikeThisContext.isExcludeEntityUsedForComparison();
        return this;
    }

    /**
     * @return a query that will return docs like the passed Lucene document ID.
     */
    public Query createQuery() {
        try {
            documentNumber = getLuceneDocumentIdFromIdAsTermOrNull(documentBuilder);
            return maybeExcludeComparedEntity(createQuery(retrieveTerms()));
        } catch (IOException e) {
            throw log.ioExceptionOnIndexOfEntity(e, documentBuilder.getBeanClass());
        }
    }

    /**
     * Try and retrieve the document id from the input. If failing and a backup approach exists, returns null.
     */
    private Integer getLuceneDocumentIdFromIdAsTermOrNull(DocumentBuilderIndexedEntity documentBuilder) {
        String id;
        if (inputType == ID) {
            id = documentBuilder.getIdBridge().objectToString(input);
        } else if (inputType == ENTITY) {
            // Try and extract the id, if failing the id will be null
            try {
                // I expect a two way bridge to return null from a null input, correct?
                id = documentBuilder.getIdBridge().objectToString(documentBuilder.getId(input));
            } catch (IllegalStateException e) {
                id = null;
            }
        } else {
            throw new AssertionFailure("We don't support string and reader for MoreLikeThis");
        }
        if (id == null) {
            return null;
        }
        findById = new TermQuery(new Term(documentBuilder.getIdFieldName(), id));
        HSQuery query = queryContext.getFactory().createHSQuery(findById, queryContext.getEntityType());
        List<EntityInfo> entityInfos = query.maxResults(1).projection(HSQuery.DOCUMENT_ID).queryEntityInfos();
        if (entityInfos.isEmpty()) {
            if (inputType == ID) {
                throw log.entityWithIdNotFound(queryContext.getEntityType(), id);
            } else {
                return null;
            }
        }
        return (Integer) entityInfos.iterator().next().getProjection()[0];
    }

    private Query maybeExcludeComparedEntity(Query query) {
        // It would be better to attach a collector to exclude a document by its id
        // but at this stage we could have documents reordered and thus with a different id
        // Maybe a Filter would be more efficient?
        if (excludeEntityCompared && documentNumber != null) {
            return new BooleanQuery.Builder()
                    .add(query, BooleanClause.Occur.MUST)
                    .add(new ConstantScoreQuery(findById), BooleanClause.Occur.MUST_NOT)
                    .build();
        } else {
            return query;
        }
    }

    /**
     * Create the More Like This query from the list of per-field PriorityQueues.
     */
    private Query createQuery(List<PriorityQueue<Object[]>> q) {
        // In the original algorithm, the number of terms is limited to maxQueryTerms
        // In the current implementation, we do nbrOfFields * maxQueryTerms
        int length = fieldsContext.size();
        if (length == 0) {
            throw new AssertionFailure("Querying MoreLikeThis on 0 fields.");
        } else if (length == 1) {
            return createQuery(q.get(0), fieldsContext.getFirst());
        } else {
            BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
            // the fieldsContext indexes are aligned with the priority queue's
            Iterator<FieldContext> fieldsContextIterator = fieldsContext.iterator();
            for (PriorityQueue<Object[]> queue : q) {
                try {
                    queryBuilder.add(createQuery(queue, fieldsContextIterator.next()), BooleanClause.Occur.SHOULD);
                } catch (BooleanQuery.TooManyClauses ignore) {
                    break;
                }
            }
            return queryBuilder.build();
        }
    }

    private Query createQuery(PriorityQueue<Object[]> q, FieldContext fieldContext) {
        if (q == null) {
            final FieldBridge fieldBridge = fieldContext.getFieldBridge() != null
                    ? fieldContext.getFieldBridge()
                    : documentBuilder.getBridge(fieldContext.getField());
            if (fieldBridge instanceof NumericFieldBridge) {
                // we probably can do something here
                //TODO how to build the query where we don't have the value?
                throw log.numericFieldCannotBeUsedInMoreLikeThis(fieldContext.getField(),
                        documentBuilder.getBeanClass());
            }
            DocumentFieldMetadata fieldMetadata = documentBuilder.getTypeMetadata()
                    .getDocumentFieldMetadataFor(fieldContext.getField());
            if (fieldMetadata == null) {
                throw log.unknownFieldNameForMoreLikeThisQuery(fieldContext.getField(),
                        documentBuilder.getBeanClass().getName());
            }
            boolean hasTermVector = fieldMetadata.getTermVector() != Field.TermVector.NO;
            boolean isStored = fieldMetadata.getStore() != Store.NO;
            if (!(hasTermVector || isStored)) {
                throw log.fieldNotStoredNorTermVectorCannotBeUsedInMoreLikeThis(fieldContext.getField(),
                        documentBuilder.getBeanClass());
            }
            boolean isIdOrEmbeddedId = fieldMetadata.isId() || fieldMetadata.isIdInEmbedded();
            if (isIdOrEmbeddedId) {
                throw log.fieldIdCannotBeUsedInMoreLikeThis(fieldContext.getField(),
                        documentBuilder.getBeanClass());
            }
        }
        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
        Object cur;
        int qterms = 0;
        float bestScore = 0;
        while ((cur = q.pop()) != null) {
            Object[] ar = (Object[]) cur;
            TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
            if (boost) {
                if (qterms == 0) {
                    bestScore = ((Float) ar[2]);
                }
                float myScore = ((Float) ar[2]);
                tq.setBoost(boostFactor * myScore / bestScore);
            }
            try {
                queryBuilder.add(tq, BooleanClause.Occur.SHOULD);
            } catch (BooleanQuery.TooManyClauses ignore) {
                break;
            }
            qterms++;
            if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
                break;
            }
        }
        // Apply field adjustments
        return fieldContext.getFieldCustomizer().setWrappedQuery(queryBuilder.build()).createQuery();
    }

    /**
     * Find words for a more-like-this query former.
     * Store them per field name according to the order of field names defined in {@link #fieldsContext}.
     * If the field name is not compatible with term retrieval, the queue will be null for that index.
     */
    private List<PriorityQueue<Object[]>> retrieveTerms() throws IOException {
        int size = fieldsContext.size();
        Map<String, Map<String, Int>> termFreqMapPerFieldname = new HashMap<String, Map<String, Int>>(size);
        final Fields vectors;
        Document maybeDocument = null;
        if (documentNumber == null && size > 0) {
            // build the document from the entity instance
            // first build the list of fields we are interested in
            String[] fieldNames = new String[size];
            Iterator<FieldContext> fieldsContextIterator = fieldsContext.iterator();
            for (int index = 0; index < size; index++) {
                fieldNames[index] = fieldsContextIterator.next().getField();
            }
            //TODO should we keep the fieldToAnalyzerMap around to pass to the analyzer?
            Map<String, String> fieldToAnalyzerMap = new HashMap<String, String>();
            //FIXME by calling documentBuilder we don't honor .comparingField("foo").ignoreFieldBridge():
            //      probably not a problem in practice though
            maybeDocument = documentBuilder.getDocument(null, input, null, fieldToAnalyzerMap, null,
                    new ContextualExceptionBridgeHelper(), fieldNames);
            vectors = null;
        } else {
            vectors = indexReader.getTermVectors(documentNumber);
        }
        for (FieldContext fieldContext : fieldsContext) {
            String fieldName = fieldContext.getField();
            if (isCompatibleField(fieldName)) {
                Map<String, Int> termFreqMap = new HashMap<String, Int>();
                termFreqMapPerFieldname.put(fieldName, termFreqMap);
                final Terms vector;
                if (vectors != null) {
                    vector = vectors.terms(fieldName);
                } else {
                    vector = null;
                }
                // field does not store term vector info
                if (vector == null) {
                    if (maybeDocument == null) {
                        maybeDocument = indexReader.document(documentNumber);
                    }
                    IndexableField[] fields = maybeDocument.getFields(fieldName);
                    for (IndexableField field : fields) {
                        //TODO numbers
                        final String stringValue = DocumentBuilderHelper.extractStringFromFieldable(field);
                        if (stringValue != null) {
                            addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldContext);
                        }
                    }
                } else {
                    addTermFrequencies(termFreqMap, vector);
                }
            } else {
                // place null as the field is not compatible
                termFreqMapPerFieldname.put(fieldName, null);
            }
        }
        List<PriorityQueue<Object[]>> results = new ArrayList<PriorityQueue<Object[]>>(size);
        // iterate fieldsContext rather than the HashMap's entrySet (whose order is unspecified)
        // so that the result list stays aligned with the field order, as createQuery(List) assumes
        for (FieldContext fieldContext : fieldsContext) {
            String fieldName = fieldContext.getField();
            results.add(createQueue(fieldName, termFreqMapPerFieldname.get(fieldName)));
        }
        return results;
    }

    private boolean isCompatibleField(String fieldName) {
        for (String compatibleFieldName : compatibleFieldNames) {
            if (compatibleFieldName.equals(fieldName)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Create a PriorityQueue from a word->tf map.
     *
     * @param words a map of words keyed on the word(String) with Int objects as the values.
     */
    private PriorityQueue<Object[]> createQueue(String fieldName, Map<String, Int> words) throws IOException {
        if (words == null) {
            // incompatible field name
            return null;
        }
        // have collected all words in doc and their freqs
        int numDocs = indexReader.numDocs();
        FreqQ res = new FreqQ(words.size()); // will order words by score
        for (Map.Entry<String, Int> entry : words.entrySet()) { // for every word
            String word = entry.getKey();
            int tf = entry.getValue().x; // term freq in the source doc
            if (minTermFreq > 0 && tf < minTermFreq) {
                continue; // filter out words that don't occur enough times in the source
            }
            // The original algorithm looks for all field names and finds the top frequency
            // and only considers this field for the query
            // "go through all the fields and find the largest document frequency"
            Term term = new Term(fieldName, word);
            int freq = indexReader.docFreq(term);
            if (minDocFreq > 0 && freq < minDocFreq) {
                continue; // filter out words that don't occur in enough docs
            }
            if (freq > maxDocFreq) {
                continue; // filter out words that occur in too many docs
            }
            if (freq == 0) {
                continue; // index update problem?
            }
            float idf = similarity.idf(freq, numDocs);
            float score = tf * idf;
            // only really need 1st 3 entries, other ones are for troubleshooting
            res.insertWithOverflow(new Object[] {
                    word, // the word
                    fieldName, // the top field
                    score, // overall score
                    idf, // idf
                    freq, // freq in all docs
                    tf
            });
        }
        return res;
    }

    /**
     * Adds terms and frequencies found in vector into the Map termFreqMap
     *
     * @param termFreqMap a Map of terms and their frequencies
     * @param vector List of terms and their frequencies for a doc/field
     */
    private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
        final TermsEnum termsEnum = vector.iterator();
        char[] charBuffer = new char[0];
        CharsRef outputReference = new CharsRef();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            charBuffer = ArrayUtil.grow(charBuffer, text.length);
            final int stringLength = UnicodeUtil.UTF8toUTF16(text, charBuffer);
            outputReference.chars = charBuffer;
            outputReference.length = stringLength;
            final String term = outputReference.toString();
            if (isNoiseWord(term)) {
                continue;
            }
            final int freq = (int) termsEnum.totalTermFreq();
            // increment frequency
            Int cnt = termFreqMap.get(term);
            if (cnt == null) {
                cnt = new Int();
                termFreqMap.put(term, cnt);
                cnt.x = freq;
            } else {
                cnt.x += freq;
            }
        }
    }

    /**
     * Adds term frequencies found by tokenizing text from reader into the Map words
     *
     * @param r a source of text to be tokenized
     * @param termFreqMap a Map of terms and their frequencies
     * @param fieldContext carries the field name, used by the analyzer for any special per-field analysis
     */
    private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, FieldContext fieldContext)
            throws IOException {
        String fieldName = fieldContext.getField();
        Analyzer analyzer = queryContext.getQueryAnalyzerReference().unwrap(LuceneAnalyzerReference.class)
                .getAnalyzer();
        if (!fieldContext.applyAnalyzer()) {
            // essentially does the Reader to String conversion for us
            analyzer = PassThroughAnalyzer.INSTANCE;
        }
        TokenStream ts = analyzer.tokenStream(fieldName, r);
        try {
            int tokenCount = 0;
            // for every token
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                String word = termAtt.toString();
                tokenCount++;
                if (tokenCount > maxNumTokensParsed) {
                    break;
                }
                if (isNoiseWord(word)) {
                    continue;
                }
                // increment frequency
                Int cnt = termFreqMap.get(word);
                if (cnt == null) {
                    termFreqMap.put(word, new Int()); // Int starts at 1
                } else {
                    cnt.x++;
                }
            }
            ts.end();
        } finally {
            IOUtils.closeWhileHandlingException(ts);
        }
    }

    /**
     * Determines if the passed term is likely to be of interest in "more like" comparisons.
     *
     * @param term The word being considered
     *
     * @return true if the term should be ignored, false if it should be used in further analysis
     */
    private boolean isNoiseWord(String term) {
        int len = term.length();
        if (minWordLen > 0 && len < minWordLen) {
            return true;
        }
        if (maxWordLen > 0 && len > maxWordLen) {
            return true;
        }
        return stopWords != null && stopWords.contains(term);
    }

    public MoreLikeThisBuilder fieldsContext(FieldsContext fieldsContext) {
        this.fieldsContext = fieldsContext;
        return this;
    }

    public MoreLikeThisBuilder input(Object input) {
        this.input = input;
        return this;
    }

    public MoreLikeThisBuilder queryContext(QueryBuildingContext queryContext) {
        this.queryContext = queryContext;
        return this;
    }

    public MoreLikeThisBuilder idAsTerm(String idAsTerm) {
        return this;
    }

    public MoreLikeThisBuilder inputType(ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE inputType) {
        this.inputType = inputType;
        return this;
    }

    /**
     * PriorityQueue that orders words by score.
     */
    private static class FreqQ extends PriorityQueue<Object[]> {
        FreqQ(int s) {
            super(s);
        }

        @Override
        protected boolean lessThan(Object[] aa, Object[] bb) {
            Float fa = (Float) aa[2];
            Float fb = (Float) bb[2];
            return fa > fb;
        }
    }

    /**
     * Use for frequencies and to avoid renewing Integers.
     */
    private static class Int {
        int x;

        Int() {
            x = 1;
        }

        @Override
        public String toString() {
            return "Int{" + x + '}';
        }
    }
}
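For context: this class sits in an impl package and is not meant to be called directly. Applications reach it through the Hibernate Search query DSL, which wires the builder internally. Below is a minimal usage sketch of that public entry point; the Coffee entity, its "summary" and "description" fields, and the open FullTextSession are hypothetical assumptions for illustration, not part of the class above.

import java.util.List;

import org.apache.lucene.search.Query;
import org.hibernate.search.FullTextSession;
import org.hibernate.search.query.dsl.QueryBuilder;

public class MoreLikeThisUsageSketch {

    // Assumes a hypothetical indexed Coffee entity with "summary" and "description" fields.
    @SuppressWarnings("unchecked")
    public static List<Coffee> similarTo(FullTextSession fullTextSession, Integer coffeeId) {
        QueryBuilder qb = fullTextSession.getSearchFactory()
                .buildQueryBuilder()
                .forEntity(Coffee.class)
                .get();

        Query mltQuery = qb.moreLikeThis()
                .excludeEntityUsedForComparison()     // ends up in excludeEntityCompared above
                .favorSignificantTermsWithFactor(1f)  // ends up in boost/boostFactor above
                .comparingField("summary").boostedTo(10f)
                .andField("description")
                .toEntityWithId(coffeeId)             // exercises the INPUT_TYPE.ID branch
                .createQuery();

        return fullTextSession.createFullTextQuery(mltQuery, Coffee.class).list();
    }
}

Starting instead from a detached instance via toEntity(coffee) exercises the INPUT_TYPE.ENTITY branch of getLuceneDocumentIdFromIdAsTermOrNull(): the builder first tries to resolve the Lucene document for the entity's id, and falls back to analyzing the instance's own field values when no matching document is found.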