List of usage examples for org.apache.lucene.queries.mlt MoreLikeThis setBoostFactor
public void setBoostFactor(float boostFactor)
From source file:com.qwazr.search.query.MoreLikeThisQuery.java
License:Apache License
/**
 * Builds a Lucene "more like this" query for the document identified by {@code doc_num}.
 *
 * <p>Every optional tuning field that is non-null is copied onto the {@link MoreLikeThis}
 * instance; fields left null keep whatever the {@code MoreLikeThis} instance already has.
 *
 * @param queryContext supplies the index searcher (for its reader) and the analyzer
 * @return the query produced by {@code MoreLikeThis.like(doc_num)}
 * @throws NullPointerException if {@code doc_num} is null
 */
@Override
public final Query getQuery(QueryContext queryContext) throws IOException, ParseException {
    Objects.requireNonNull(doc_num, "The doc_num field is missing");

    final MoreLikeThis moreLikeThis = new MoreLikeThis(queryContext.indexSearcher.getIndexReader());

    // Term boosting.
    if (is_boost != null) {
        moreLikeThis.setBoost(is_boost);
    }
    if (boost_factor != null) {
        moreLikeThis.setBoostFactor(boost_factor);
    }

    // Fields to read terms from.
    if (fieldnames != null) {
        moreLikeThis.setFieldNames(fieldnames);
    }

    // Upper bounds.
    if (max_doc_freq != null) {
        moreLikeThis.setMaxDocFreq(max_doc_freq);
    }
    if (max_doc_freq_pct != null) {
        moreLikeThis.setMaxDocFreqPct(max_doc_freq_pct);
    }
    if (max_num_tokens_parsed != null) {
        moreLikeThis.setMaxNumTokensParsed(max_num_tokens_parsed);
    }
    if (max_query_terms != null) {
        moreLikeThis.setMaxQueryTerms(max_query_terms);
    }
    if (max_word_len != null) {
        moreLikeThis.setMaxWordLen(max_word_len);
    }

    // Lower bounds.
    if (min_doc_freq != null) {
        moreLikeThis.setMinDocFreq(min_doc_freq);
    }
    if (min_term_freq != null) {
        moreLikeThis.setMinTermFreq(min_term_freq);
    }
    if (min_word_len != null) {
        moreLikeThis.setMinWordLen(min_word_len);
    }

    if (stop_words != null) {
        moreLikeThis.setStopWords(stop_words);
    }

    moreLikeThis.setAnalyzer(queryContext.analyzer);
    return moreLikeThis.like(doc_num);
}
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.util.MoreLikeThisHelper.java
License:Apache License
/**
 * Builds a Lucene "more like this" query from a native MLT query string.
 *
 * <p>The string is a list of {@code key=value} pairs separated by {@code &}. Recognised keys:
 * {@code stream.body} (the text, or a path, to find similar content for), {@code mlt.fl}
 * (comma-separated field names), {@code mlt.mindf}, {@code mlt.mintf}, {@code mlt.boost},
 * {@code mlt.qf}, {@code mlt.maxdf}, {@code mlt.maxdfp}, {@code mlt.maxntp}, {@code mlt.maxqt},
 * {@code mlt.maxwl} and {@code mlt.minwl}. Unknown keys are ignored.
 *
 * <p>If the first field in {@code mlt.fl} is {@code FieldNames.PATH}, {@code stream.body} is
 * first looked up as a path term. When a document matches, the query is built from that
 * document using all of its fields except the path field; otherwise the body is analysed as
 * plain text against the first listed field.
 *
 * @param reader         index reader the query is built against
 * @param analyzer       analyzer used to tokenize the body text
 * @param mltQueryString the native query string described above
 * @return the generated query, or {@code null} when the string carries no {@code stream.body}
 * @throws RuntimeException if the string cannot be parsed or the query cannot be built; the
 *                          underlying failure is attached as the cause
 */
public static Query getMoreLikeThis(IndexReader reader, Analyzer analyzer, String mltQueryString) {
    Query moreLikeThisQuery = null;
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(analyzer);
    try {
        String text = null;
        String[] fields = {};
        for (String param : mltQueryString.split("&")) {
            String[] keyValuePair = param.split("=");
            // String.split never yields null elements, so the length check is sufficient.
            if (keyValuePair.length != 2) {
                throw new RuntimeException("Unparsable native Lucene MLT query: " + mltQueryString);
            }
            String key = keyValuePair[0];
            String value = keyValuePair[1];
            if ("stream.body".equals(key)) {
                text = value;
            } else if ("mlt.fl".equals(key)) {
                fields = value.split(",");
            } else if ("mlt.mindf".equals(key)) {
                mlt.setMinDocFreq(Integer.parseInt(value));
            } else if ("mlt.mintf".equals(key)) {
                mlt.setMinTermFreq(Integer.parseInt(value));
            } else if ("mlt.boost".equals(key)) {
                mlt.setBoost(Boolean.parseBoolean(value));
            } else if ("mlt.qf".equals(key)) {
                mlt.setBoostFactor(Float.parseFloat(value));
            } else if ("mlt.maxdf".equals(key)) {
                mlt.setMaxDocFreq(Integer.parseInt(value));
            } else if ("mlt.maxdfp".equals(key)) {
                mlt.setMaxDocFreqPct(Integer.parseInt(value));
            } else if ("mlt.maxntp".equals(key)) {
                mlt.setMaxNumTokensParsed(Integer.parseInt(value));
            } else if ("mlt.maxqt".equals(key)) {
                mlt.setMaxQueryTerms(Integer.parseInt(value));
            } else if ("mlt.maxwl".equals(key)) {
                mlt.setMaxWordLen(Integer.parseInt(value));
            } else if ("mlt.minwl".equals(key)) {
                mlt.setMinWordLen(Integer.parseInt(value));
            }
        }

        if (text != null) {
            // Both branches below read fields[0]; fail with a clear message instead of an
            // ArrayIndexOutOfBoundsException when no field list was supplied.
            if (fields.length == 0) {
                throw new IllegalArgumentException("MLT query has no mlt.fl field list: " + mltQueryString);
            }
            if (FieldNames.PATH.equals(fields[0])) {
                // The body may be the path of an indexed document: look it up first.
                IndexSearcher searcher = new IndexSearcher(reader);
                TermQuery q = new TermQuery(new Term(FieldNames.PATH, text));
                TopDocs top = searcher.search(q, 1);
                if (top.totalHits == 0) {
                    // No such path: treat the body as plain text.
                    mlt.setFieldNames(fields);
                    moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
                } else {
                    // Use every field of the matched document except the path itself.
                    ScoreDoc d = top.scoreDocs[0];
                    Document doc = reader.document(d.doc);
                    List<String> fieldNames = new ArrayList<String>();
                    for (IndexableField f : doc.getFields()) {
                        if (!FieldNames.PATH.equals(f.name())) {
                            fieldNames.add(f.name());
                        }
                    }
                    String[] docFields = fieldNames.toArray(new String[fieldNames.size()]);
                    mlt.setFieldNames(docFields);
                    moreLikeThisQuery = mlt.like(d.doc);
                }
            } else {
                mlt.setFieldNames(fields);
                moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
            }
        }
        return moreLikeThisQuery;
    } catch (Exception e) {
        // Keep the original failure as the cause so the real reason is not lost.
        throw new RuntimeException("could not handle MLT query " + mltQueryString, e);
    }
}
From source file:org.elasticsearch.common.lucene.search.MoreLikeThisQuery.java
License:Apache License
@Override public Query rewrite(IndexReader reader) throws IOException { MoreLikeThis mlt = new MoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity); mlt.setFieldNames(moreLikeFields);/* ww w .j ava 2 s . co m*/ mlt.setAnalyzer(analyzer); mlt.setMinTermFreq(minTermFrequency); mlt.setMinDocFreq(minDocFreq); mlt.setMaxDocFreq(maxDocFreq); mlt.setMaxQueryTerms(maxQueryTerms); mlt.setMinWordLen(minWordLen); mlt.setMaxWordLen(maxWordLen); mlt.setStopWords(stopWords); mlt.setBoost(boostTerms); mlt.setBoostFactor(boostTermsFactor); //LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field) BooleanQuery bq = (BooleanQuery) mlt.like(new FastStringReader(likeText), moreLikeFields[0]); BooleanClause[] clauses = bq.getClauses(); bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch)); bq.setBoost(getBoost()); return bq; }
From source file:uk.gov.nationalarchives.discovery.taxonomy.common.service.impl.TSetBasedCategoriserServiceImpl.java
License:Mozilla Public License
/** * run More Like This process on a document by comparing its description to * the description of all items of the training set<br/> * currently we get a fixed number of the top results * //from ww w. j a v a 2 s .co m * @param document * document being tested * @return * @throws IOException */ public List<TSetBasedCategorisationResult> runMlt(Document document) { Map<String, TSetBasedCategorisationResult> result = null; IndexSearcher searcher = null; try { trainingSetSearcherManager.maybeRefresh(); // Boolean wasRefreshed = trainingSetSearcherManager.maybeRefresh(); // if (wasRefreshed) { // logger.debug(".runMlt: training set searcher had to be refreshed"); // } searcher = trainingSetSearcherManager.acquire(); // TODO TSETBASED refresh reader/searcher: Use readermanager and // refresh it? MoreLikeThis moreLikeThis = new MoreLikeThis(this.trainingSetIndexReader); moreLikeThis.setMinTermFreq(minTermFreq); moreLikeThis.setMinDocFreq(minDocFreq); moreLikeThis.setAnalyzer(this.trainingSetAnalyser); moreLikeThis.setFieldNames(fieldsToAnalyse.split(",")); moreLikeThis.setBoost(true); BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); for (String fieldName : fieldsToAnalyse.split(",")) { String value = document.get(fieldName); if (value != null && !"null".equals(value)) { switch (InformationAssetViewFields.valueOf(fieldName)) { case DESCRIPTION: moreLikeThis.setBoostFactor(descBoostingFactor); break; case TITLE: moreLikeThis.setBoostFactor(titleBoostingFactor); break; case CONTEXTDESCRIPTION: moreLikeThis.setBoostFactor(contextDescBoostingFactor); break; default: case SUBJECTS: case CORPBODYS: case PERSON_FULLNAME: case PLACE_NAME: moreLikeThis.setBoostFactor(1); break; } Query query = moreLikeThis.like(fieldName, new StringReader(value)); queryBuilder.add(query, Occur.SHOULD); } } BooleanQuery fullQuery = queryBuilder.build(); TopDocs topDocs = searcher.search(fullQuery, this.maximumSimilarElements); logger.debug(".runMlt: found {} total hits, processed 
at maximum {} hits", topDocs.totalHits, this.maximumSimilarElements); result = new LinkedHashMap<String, TSetBasedCategorisationResult>(); int size = 0; if (topDocs.totalHits <= this.maximumSimilarElements) { size = topDocs.totalHits - 1; } else { size = this.maximumSimilarElements - 1; } for (int i = 0; i < size; i++) { ScoreDoc scoreDoc = topDocs.scoreDocs[i]; Float currrentScore = scoreDoc.score; if (currrentScore < this.mimimumScoreForMlt) { break; } Document hitDoc = searcher.doc(scoreDoc.doc); String category = hitDoc.get(InformationAssetViewFields.TAXONOMY.toString()); String docReference = hitDoc.get(InformationAssetViewFields.DOCREFERENCE.toString()); logger.debug(".runMlt: found doc, category: {}, score: {}, docreference: {}", category, currrentScore, docReference); TSetBasedCategorisationResult existingCategorisationResult = result.get(category); Float scoreToSet = currrentScore; Integer numberOfFoundDocuments = 1; // k nearest neighbour algorithm if (existingCategorisationResult != null) { scoreToSet += existingCategorisationResult.getScore(); numberOfFoundDocuments += existingCategorisationResult.getNumberOfFoundDocuments(); } result.put(category, new TSetBasedCategorisationResult(category, scoreToSet, numberOfFoundDocuments)); } } catch (IOException e) { throw new TaxonomyException(TaxonomyErrorType.LUCENE_IO_EXCEPTION, e); } finally { LuceneHelperTools.releaseSearcherManagerQuietly(trainingSetSearcherManager, searcher); } List<TSetBasedCategorisationResult> sortedResults = sortCategorisationResultsByScoreDescAndFilterByGlobalScore( new ArrayList<TSetBasedCategorisationResult>(result.values())); return sortedResults; }