Usage examples for org.apache.lucene.queries.mlt.MoreLikeThis#setMaxQueryTerms
public void setMaxQueryTerms(int maxQueryTerms)
From source file:com.qwazr.search.query.MoreLikeThisQuery.java
License:Apache License
@Override final public Query getQuery(QueryContext queryContext) throws IOException, ParseException { Objects.requireNonNull(doc_num, "The doc_num field is missing"); final MoreLikeThis mlt = new MoreLikeThis(queryContext.indexSearcher.getIndexReader()); if (is_boost != null) mlt.setBoost(is_boost);//from ww w . j av a2 s . c o m if (boost_factor != null) mlt.setBoostFactor(boost_factor); if (fieldnames != null) mlt.setFieldNames(fieldnames); if (max_doc_freq != null) mlt.setMaxDocFreq(max_doc_freq); if (max_doc_freq_pct != null) mlt.setMaxDocFreqPct(max_doc_freq_pct); if (max_num_tokens_parsed != null) mlt.setMaxNumTokensParsed(max_num_tokens_parsed); if (max_query_terms != null) mlt.setMaxQueryTerms(max_query_terms); if (max_word_len != null) mlt.setMaxWordLen(max_word_len); if (min_doc_freq != null) mlt.setMinDocFreq(min_doc_freq); if (min_term_freq != null) mlt.setMinTermFreq(min_term_freq); if (min_word_len != null) mlt.setMinWordLen(min_word_len); if (stop_words != null) mlt.setStopWords(stop_words); mlt.setAnalyzer(queryContext.analyzer); return mlt.like(doc_num); }
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.util.MoreLikeThisHelper.java
License:Apache License
/**
 * Builds a Lucene {@code MoreLikeThis} query from a native parameter string of the form
 * {@code key=value&key=value...}.
 * <p>
 * Recognised keys:
 * <ul>
 *   <li>{@code stream.body} - the text (or path, see below) to find similar documents for</li>
 *   <li>{@code mlt.fl} - comma-separated list of field names</li>
 *   <li>{@code mlt.mindf}, {@code mlt.mintf}, {@code mlt.maxdf}, {@code mlt.maxdfp},
 *       {@code mlt.maxntp}, {@code mlt.maxqt}, {@code mlt.maxwl}, {@code mlt.minwl} - integer
 *       settings passed to the matching {@code MoreLikeThis} setter</li>
 *   <li>{@code mlt.boost} - boolean, {@code mlt.qf} - float boost factor</li>
 * </ul>
 * Unrecognised keys are ignored.
 * <p>
 * When the first field is {@code FieldNames.PATH}, {@code stream.body} is looked up as a path
 * term: if a document is found, the query is built from that document using all of its fields
 * except the path field; otherwise the body is used as plain text. For any other first field
 * the body is always used as plain text against that first field.
 *
 * @param reader         index reader the query is built against
 * @param analyzer       analyzer used by {@code MoreLikeThis}
 * @param mltQueryString the {@code &}-separated parameter string
 * @return the generated query, or {@code null} when no {@code stream.body} parameter was given
 * @throws RuntimeException if the parameter string cannot be parsed or the query cannot be
 *                          built; the underlying failure is attached as the cause
 */
public static Query getMoreLikeThis(IndexReader reader, Analyzer analyzer, String mltQueryString) {
    Query moreLikeThisQuery = null;
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(analyzer);
    try {
        String text = null;
        String[] fields = {};
        for (String param : mltQueryString.split("&")) {
            String[] keyValuePair = param.split("=");
            if (keyValuePair.length != 2 || keyValuePair[0] == null || keyValuePair[1] == null) {
                throw new RuntimeException("Unparsable native Lucene MLT query: " + mltQueryString);
            }
            String key = keyValuePair[0];
            String value = keyValuePair[1];
            if ("stream.body".equals(key)) {
                text = value;
            } else if ("mlt.fl".equals(key)) {
                fields = value.split(",");
            } else if ("mlt.mindf".equals(key)) {
                mlt.setMinDocFreq(Integer.parseInt(value));
            } else if ("mlt.mintf".equals(key)) {
                mlt.setMinTermFreq(Integer.parseInt(value));
            } else if ("mlt.boost".equals(key)) {
                mlt.setBoost(Boolean.parseBoolean(value));
            } else if ("mlt.qf".equals(key)) {
                mlt.setBoostFactor(Float.parseFloat(value));
            } else if ("mlt.maxdf".equals(key)) {
                mlt.setMaxDocFreq(Integer.parseInt(value));
            } else if ("mlt.maxdfp".equals(key)) {
                mlt.setMaxDocFreqPct(Integer.parseInt(value));
            } else if ("mlt.maxntp".equals(key)) {
                mlt.setMaxNumTokensParsed(Integer.parseInt(value));
            } else if ("mlt.maxqt".equals(key)) {
                mlt.setMaxQueryTerms(Integer.parseInt(value));
            } else if ("mlt.maxwl".equals(key)) {
                mlt.setMaxWordLen(Integer.parseInt(value));
            } else if ("mlt.minwl".equals(key)) {
                mlt.setMinWordLen(Integer.parseInt(value));
            }
        }
        if (text != null) {
            // Every branch below needs at least one field name; fail with a clear message
            // instead of an ArrayIndexOutOfBoundsException when mlt.fl is missing.
            if (fields.length == 0) {
                throw new IllegalArgumentException("No fields (mlt.fl) given in MLT query: " + mltQueryString);
            }
            if (FieldNames.PATH.equals(fields[0])) {
                // The body is treated as a path: try to resolve it to an indexed document.
                IndexSearcher searcher = new IndexSearcher(reader);
                TermQuery q = new TermQuery(new Term(FieldNames.PATH, text));
                TopDocs top = searcher.search(q, 1);
                if (top.totalHits == 0) {
                    // No such document: fall back to treating the body as plain text.
                    mlt.setFieldNames(fields);
                    moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
                } else {
                    ScoreDoc d = top.scoreDocs[0];
                    Document doc = reader.document(d.doc);
                    // Compare on every field of the matched document except the path itself.
                    List<String> fieldNames = new ArrayList<String>();
                    for (IndexableField f : doc.getFields()) {
                        if (!FieldNames.PATH.equals(f.name())) {
                            fieldNames.add(f.name());
                        }
                    }
                    String[] docFields = fieldNames.toArray(new String[fieldNames.size()]);
                    mlt.setFieldNames(docFields);
                    moreLikeThisQuery = mlt.like(d.doc);
                }
            } else {
                mlt.setFieldNames(fields);
                moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
            }
        }
        return moreLikeThisQuery;
    } catch (Exception e) {
        // Keep the original failure as the cause so the real problem is not lost.
        throw new RuntimeException("could not handle MLT query " + mltQueryString, e);
    }
}
From source file:org.apache.solr.handler.RedbubbleMoreLikeThisHandler.java
License:Apache License
/**
 * Configures a {@link MoreLikeThis} instance from request parameters.
 * <p>
 * Each numeric setting is read from {@code params} and falls back to the matching
 * {@code MoreLikeThis.DEFAULT_*} constant when absent. Boosting falls back to {@code false}.
 *
 * @param params           request parameters to read the settings from
 * @param similarityFields field names to compare documents on
 * @param mlt              the instance to configure
 */
private void setMLTparams(SolrParams params, String[] similarityFields, MoreLikeThis mlt) {
    // Resolve every setting first, then apply them in one go.
    final int minTermFreq =
            params.getInt(MoreLikeThisParams.MIN_TERM_FREQ, MoreLikeThis.DEFAULT_MIN_TERM_FREQ);
    final int minDocFreq =
            params.getInt(MoreLikeThisParams.MIN_DOC_FREQ, MoreLikeThis.DEFAULT_MIN_DOC_FREQ);
    final int maxDocFreq =
            params.getInt(MoreLikeThisParams.MAX_DOC_FREQ, MoreLikeThis.DEFAULT_MAX_DOC_FREQ);
    final int minWordLen =
            params.getInt(MoreLikeThisParams.MIN_WORD_LEN, MoreLikeThis.DEFAULT_MIN_WORD_LENGTH);
    final int maxWordLen =
            params.getInt(MoreLikeThisParams.MAX_WORD_LEN, MoreLikeThis.DEFAULT_MAX_WORD_LENGTH);
    final int maxQueryTerms =
            params.getInt(MoreLikeThisParams.MAX_QUERY_TERMS, MoreLikeThis.DEFAULT_MAX_QUERY_TERMS);
    final int maxNumTokensParsed = params.getInt(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED,
            MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED);
    final boolean boost = params.getBool(MoreLikeThisParams.BOOST, false);

    mlt.setMinTermFreq(minTermFreq);
    mlt.setMinDocFreq(minDocFreq);
    mlt.setMaxDocFreq(maxDocFreq);
    mlt.setMinWordLen(minWordLen);
    mlt.setMaxWordLen(maxWordLen);
    mlt.setMaxQueryTerms(maxQueryTerms);
    mlt.setMaxNumTokensParsed(maxNumTokensParsed);
    mlt.setBoost(boost);
    mlt.setFieldNames(similarityFields);
}
From source file:org.cee.store.lucene.LuceneArticleStore.java
License:Apache License
private Query createRelatedArticlesQuery(List<EntityKey> sites, ArticleKey reference, IndexSearcher searcher, String language) throws IOException { Query articleQuery = createArticleQuery(reference); TopDocs topDocs = searcher.search(articleQuery, 1); if (topDocs.totalHits == 0) { return new BooleanQuery(true); }//w ww. j av a 2 s.c o m MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader()); mlt.setFieldNames(LuceneConstants.ARTICLE_RELATED_SEARCH_FIELDS); mlt.setMaxQueryTerms(20); mlt.setBoost(true); mlt.setMinTermFreq(0); mlt.setMinDocFreq(0); Query relatedQuery = boostRelatedQuery(mlt.like(topDocs.scoreDocs[0].doc)); BooleanQuery query = new BooleanQuery(); query.add(new BooleanClause(relatedQuery, Occur.MUST)); query.add(new BooleanClause(createQueryArticlesOfSites(sites), Occur.MUST)); return query; }
From source file:org.elasticsearch.common.lucene.search.morelikethis.XMoreLikeThisTests.java
License:Apache License
@Test public void testTopN() throws Exception { int numDocs = 100; int topN = 25; // add series of docs with terms of decreasing df Directory dir = newDirectory();// w w w . j a v a2 s . c o m RandomIndexWriter writer = new RandomIndexWriter(random(), dir); for (int i = 0; i < numDocs; i++) { addDoc(writer, generateStrSeq(0, i + 1)); } IndexReader reader = writer.getReader(); writer.close(); // setup MLT query MoreLikeThis mlt = new MoreLikeThis(reader); mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)); mlt.setMaxQueryTerms(topN); mlt.setMinDocFreq(1); mlt.setMinTermFreq(1); mlt.setMinWordLen(1); mlt.setFieldNames(new String[] { "text" }); // perform MLT query String likeText = ""; for (String text : generateStrSeq(0, numDocs)) { likeText += text + " "; } BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText)); // check best terms are topN of highest idf List<BooleanClause> clauses = query.clauses(); assertEquals("Expected" + topN + "clauses only!", topN, clauses.size()); Term[] expectedTerms = new Term[topN]; int idx = 0; for (String text : generateStrSeq(numDocs - topN, topN)) { expectedTerms[idx++] = new Term("text", text); } for (BooleanClause clause : clauses) { Term term = ((TermQuery) clause.getQuery()).getTerm(); assertTrue(Arrays.asList(expectedTerms).contains(term)); } // clean up reader.close(); dir.close(); }
From source file:org.elasticsearch.common.lucene.search.MoreLikeThisQuery.java
License:Apache License
@Override public Query rewrite(IndexReader reader) throws IOException { MoreLikeThis mlt = new MoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity); mlt.setFieldNames(moreLikeFields);//from w w w . j a v a2 s . co m mlt.setAnalyzer(analyzer); mlt.setMinTermFreq(minTermFrequency); mlt.setMinDocFreq(minDocFreq); mlt.setMaxDocFreq(maxDocFreq); mlt.setMaxQueryTerms(maxQueryTerms); mlt.setMinWordLen(minWordLen); mlt.setMaxWordLen(maxWordLen); mlt.setStopWords(stopWords); mlt.setBoost(boostTerms); mlt.setBoostFactor(boostTermsFactor); //LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field) BooleanQuery bq = (BooleanQuery) mlt.like(new FastStringReader(likeText), moreLikeFields[0]); BooleanClause[] clauses = bq.getClauses(); bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch)); bq.setBoost(getBoost()); return bq; }
From source file:org.eu.bitzone.Leia.java
License:Apache License
/** More Like this query from the current doc (or selected fields) */ public void actionMLT(final Object docNum, final Object docTable) { if (ir == null) { errorMsg(MSG_NOINDEX);// www . j a v a 2s . co m return; } int id = 0; try { id = Integer.parseInt(getString(docNum, "text")); } catch (final NumberFormatException nfe) { errorMsg("Invalid document number"); return; } final MoreLikeThis mlt = new MoreLikeThis(ir); try { mlt.setFieldNames(Util.fieldNames(ir, true).toArray(new String[0])); } catch (final Exception e) { errorMsg("Exception collecting field names: " + e.toString()); return; } mlt.setMinTermFreq(1); mlt.setMaxQueryTerms(50); final Analyzer a = createAnalyzer(find("srchOptTabs")); if (a == null) { return; } mlt.setAnalyzer(a); final Object[] rows = getSelectedItems(docTable); BooleanQuery similar = null; if (rows != null && rows.length > 0) { // collect text from fields final StringBuilder sb = new StringBuilder(); for (int i = 0; i < rows.length; i++) { final Field f = (Field) getProperty(rows[i], "field"); if (f == null) { continue; } final String s = f.stringValue(); if (s == null || s.trim().length() == 0) { continue; } if (sb.length() > 0) { sb.append(" "); } sb.append(s); } try { similar = (BooleanQuery) mlt.like(new StringReader(sb.toString()), "field"); } catch (final Exception e) { e.printStackTrace(); errorMsg("FAILED: " + e.getMessage()); return; } } else { try { similar = (BooleanQuery) mlt.like(id); } catch (final Exception e) { e.printStackTrace(); errorMsg("FAILED: " + e.getMessage()); return; } } if (similar.clauses() != null && similar.clauses().size() > 0) { // System.err.println("SIMILAR: " + similar); final Object tabpane = find("maintpane"); setInteger(tabpane, "selected", 2); final Object qField = find("qField"); setString(qField, "text", similar.toString()); } else { showStatus("WARN: empty query - check Analyzer settings"); } }
From source file:org.getopt.luke.Luke.java
License:Apache License
/** More Like this query from the current doc (or selected fields) */ public void actionMLT(Object docNum, Object docTable) { if (ir == null) { errorMsg(MSG_NOINDEX);// w w w. j a v a 2s . co m return; } int id = 0; try { id = Integer.parseInt(getString(docNum, "text")); } catch (NumberFormatException nfe) { errorMsg("Invalid document number"); return; } MoreLikeThis mlt = new MoreLikeThis(ir); try { mlt.setFieldNames((String[]) Util.fieldNames(ir, true).toArray(new String[0])); } catch (Exception e) { errorMsg("Exception collecting field names: " + e.toString()); return; } mlt.setMinTermFreq(1); mlt.setMaxQueryTerms(50); Analyzer a = createAnalyzer(find("srchOptTabs")); if (a == null) { return; } mlt.setAnalyzer(a); Object[] rows = getSelectedItems(docTable); BooleanQuery similar = null; if (rows != null && rows.length > 0) { // collect text from fields StringBuilder sb = new StringBuilder(); for (int i = 0; i < rows.length; i++) { Field f = (Field) getProperty(rows[i], "field"); if (f == null) { continue; } String s = f.stringValue(); if (s == null || s.trim().length() == 0) { continue; } if (sb.length() > 0) sb.append(" "); sb.append(s); } try { similar = (BooleanQuery) mlt.like(new StringReader(sb.toString()), "field"); } catch (Exception e) { e.printStackTrace(); errorMsg("FAILED: " + e.getMessage()); return; } } else { try { similar = (BooleanQuery) mlt.like(id); } catch (Exception e) { e.printStackTrace(); errorMsg("FAILED: " + e.getMessage()); return; } } if (similar.clauses() != null && similar.clauses().size() > 0) { //System.err.println("SIMILAR: " + similar); Object tabpane = find("maintpane"); setInteger(tabpane, "selected", 2); Object qField = find("qField"); setString(qField, "text", similar.toString()); } else { showStatus("WARN: empty query - check Analyzer settings"); } }
From source file:org.ohdsi.usagi.UsagiSearchEngine.java
License:Apache License
public List<ScoredConcept> search(String searchTerm, boolean useMlt, Collection<Integer> filterConceptIds, String filterDomain, String filterConceptClass, String filterVocabulary, boolean filterInvalid) { List<ScoredConcept> results = new ArrayList<ScoredConcept>(); try {/* w w w . j a v a2 s . c o m*/ Query query; if (useMlt) { MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader()); mlt.setMinTermFreq(1); mlt.setMinDocFreq(1); mlt.setMaxDocFreq(9999); mlt.setMinWordLen(1); mlt.setMaxWordLen(9999); mlt.setMaxDocFreqPct(100); mlt.setMaxNumTokensParsed(9999); mlt.setMaxQueryTerms(9999); mlt.setStopWords(null); mlt.setFieldNames(new String[] { "TERM" }); mlt.setAnalyzer(analyzer); query = mlt.like("TERM", new StringReader(searchTerm)); } else { try { query = keywordsQueryParser.parse(searchTerm); // if (query instanceof BooleanQuery) { // List<BooleanClause> clauses = ((BooleanQuery) query).clauses(); // BooleanClause lastClause = clauses.get(clauses.size() - 1); // lastClause.setQuery(new PrefixQuery(((TermQuery) lastClause.getQuery()).getTerm())); // } else if (query instanceof TermQuery) {// It's a single term // query = new PrefixQuery(((TermQuery) query).getTerm()); // } } catch (ParseException e) { return results; } } BooleanQuery booleanQuery = new BooleanQuery(); booleanQuery.add(query, Occur.SHOULD); booleanQuery.add(conceptQuery, Occur.MUST); if (filterConceptIds != null && filterConceptIds.size() > 0) { Query conceptIdQuery = conceptIdQueryParser.parse(StringUtilities.join(filterConceptIds, " OR ")); booleanQuery.add(conceptIdQuery, Occur.MUST); } if (filterDomain != null) { Query domainQuery = domainQueryParser.parse("\"" + filterDomain + "\""); booleanQuery.add(domainQuery, Occur.MUST); } if (filterConceptClass != null) { Query conceptClassQuery = conceptClassQueryParser .parse("\"" + filterConceptClass.toString() + "\""); booleanQuery.add(conceptClassQuery, Occur.MUST); } if (filterVocabulary != null) { Query vocabularyQuery = 
vocabularyQueryParser.parse("\"" + filterVocabulary.toString() + "\""); booleanQuery.add(vocabularyQuery, Occur.MUST); } if (filterInvalid) { Query invalidQuery = invalidQueryParser.parse("\"\""); booleanQuery.add(invalidQuery, Occur.MUST); } TopDocs topDocs = searcher.search(booleanQuery, 100); recomputeScores(topDocs.scoreDocs, query); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document document = reader.document(scoreDoc.doc); int conceptId = Integer.parseInt(document.get("CONCEPT_ID")); // If matchscore = 0 but it was the one concept that was automatically selected, still allow it: if (scoreDoc.score > 0 || (filterConceptIds != null && filterConceptIds.size() == 1 && filterConceptIds.contains(conceptId))) { TargetConcept targetConcept = new TargetConcept(); targetConcept.term = document.get("TERM"); targetConcept.conceptId = conceptId; targetConcept.conceptName = document.get("CONCEPT_NAME"); targetConcept.conceptClass = document.get("CONCEPT_CLASS"); targetConcept.vocabulary = document.get("VOCABULARY"); targetConcept.conceptCode = document.get("CONCEPT_CODE"); targetConcept.validStartDate = document.get("VALID_START_DATE"); targetConcept.validEndDate = document.get("VALID_END_DATE"); targetConcept.invalidReason = document.get("INVALID_REASON"); for (String domain : document.get("DOMAINS").split("\n")) targetConcept.domains.add(domain); targetConcept.additionalInformation = document.get("ADDITIONAL_INFORMATION"); results.add(new ScoredConcept(scoreDoc.score, targetConcept)); } } reorderTies(results); removeDuplicateConcepts(results); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); } return results; }