List of usage examples for org.apache.lucene.queries.mlt MoreLikeThis setMaxDocFreqPct
public void setMaxDocFreqPct(int maxPercentage)
From source file:com.qwazr.search.query.MoreLikeThisQuery.java
License:Apache License
/**
 * Builds a Lucene "more like this" query for the document identified by {@code doc_num}.
 *
 * <p>Every optional tuning field of this query object that is non-null is copied onto a fresh
 * {@link MoreLikeThis} instance; fields left null are not applied. The analyzer is always
 * taken from the query context.
 *
 * @param queryContext supplies the index searcher (for its reader) and the analyzer
 * @return the query produced by {@code MoreLikeThis.like(doc_num)}
 * @throws NullPointerException if {@code doc_num} is null
 * @throws IOException declared by the signature; may be raised by the underlying index access
 * @throws ParseException declared by the signature
 */
@Override
final public Query getQuery(QueryContext queryContext) throws IOException, ParseException {
    Objects.requireNonNull(doc_num, "The doc_num field is missing");

    final MoreLikeThis moreLikeThis = new MoreLikeThis(queryContext.indexSearcher.getIndexReader());

    // Boosting and target fields.
    if (is_boost != null) {
        moreLikeThis.setBoost(is_boost);
    }
    if (boost_factor != null) {
        moreLikeThis.setBoostFactor(boost_factor);
    }
    if (fieldnames != null) {
        moreLikeThis.setFieldNames(fieldnames);
    }

    // NOTE(review): the absolute limit is applied before the percentage limit on purpose;
    // both appear to drive the same underlying setting in Lucene, so keep this order — confirm
    // against the MoreLikeThis documentation before rearranging.
    if (max_doc_freq != null) {
        moreLikeThis.setMaxDocFreq(max_doc_freq);
    }
    if (max_doc_freq_pct != null) {
        moreLikeThis.setMaxDocFreqPct(max_doc_freq_pct);
    }

    // Remaining term-selection limits.
    if (max_num_tokens_parsed != null) {
        moreLikeThis.setMaxNumTokensParsed(max_num_tokens_parsed);
    }
    if (max_query_terms != null) {
        moreLikeThis.setMaxQueryTerms(max_query_terms);
    }
    if (max_word_len != null) {
        moreLikeThis.setMaxWordLen(max_word_len);
    }
    if (min_doc_freq != null) {
        moreLikeThis.setMinDocFreq(min_doc_freq);
    }
    if (min_term_freq != null) {
        moreLikeThis.setMinTermFreq(min_term_freq);
    }
    if (min_word_len != null) {
        moreLikeThis.setMinWordLen(min_word_len);
    }
    if (stop_words != null) {
        moreLikeThis.setStopWords(stop_words);
    }

    moreLikeThis.setAnalyzer(queryContext.analyzer);
    return moreLikeThis.like(doc_num);
}
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.util.MoreLikeThisHelper.java
License:Apache License
/**
 * Builds a Lucene {@link MoreLikeThis} query from a query string made of
 * {@code &}-separated {@code key=value} pairs.
 *
 * <p>Recognised keys: {@code stream.body} (the text, or a path when the first field is
 * {@code FieldNames.PATH}), {@code mlt.fl} (comma-separated field names), {@code mlt.mindf},
 * {@code mlt.mintf}, {@code mlt.boost}, {@code mlt.qf}, {@code mlt.maxdf}, {@code mlt.maxdfp},
 * {@code mlt.maxntp}, {@code mlt.maxqt}, {@code mlt.maxwl} and {@code mlt.minwl}. Unknown keys
 * are ignored. Parameters are applied in the order they appear in the string.
 *
 * <p>When the first field is {@code FieldNames.PATH}, the text is looked up as a path term; if
 * a document is found, the query is built from that document using all of its fields except
 * the path field. Otherwise the query is built from the text itself against the first field.
 *
 * @param reader index reader the query is computed against
 * @param analyzer analyzer handed to {@link MoreLikeThis}
 * @param mltQueryString the {@code key=value&key=value} parameter string
 * @return the resulting query, or {@code null} when no {@code stream.body} was supplied
 * @throws RuntimeException if the string cannot be parsed or the query cannot be built; the
 *         underlying failure is attached as the cause
 */
public static Query getMoreLikeThis(IndexReader reader, Analyzer analyzer, String mltQueryString) {
    Query moreLikeThisQuery = null;
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(analyzer);
    try {
        String text = null;
        String[] fields = {};
        for (String param : mltQueryString.split("&")) {
            String[] keyValuePair = param.split("=");
            // String.split never yields null elements, so only the arity needs checking.
            if (keyValuePair.length != 2) {
                throw new RuntimeException("Unparsable native Lucene MLT query: " + mltQueryString);
            }
            String key = keyValuePair[0];
            String value = keyValuePair[1];
            if ("stream.body".equals(key)) {
                text = value;
            } else if ("mlt.fl".equals(key)) {
                fields = value.split(",");
            } else if ("mlt.mindf".equals(key)) {
                mlt.setMinDocFreq(Integer.parseInt(value));
            } else if ("mlt.mintf".equals(key)) {
                mlt.setMinTermFreq(Integer.parseInt(value));
            } else if ("mlt.boost".equals(key)) {
                mlt.setBoost(Boolean.parseBoolean(value));
            } else if ("mlt.qf".equals(key)) {
                mlt.setBoostFactor(Float.parseFloat(value));
            } else if ("mlt.maxdf".equals(key)) {
                mlt.setMaxDocFreq(Integer.parseInt(value));
            } else if ("mlt.maxdfp".equals(key)) {
                mlt.setMaxDocFreqPct(Integer.parseInt(value));
            } else if ("mlt.maxntp".equals(key)) {
                mlt.setMaxNumTokensParsed(Integer.parseInt(value));
            } else if ("mlt.maxqt".equals(key)) {
                mlt.setMaxQueryTerms(Integer.parseInt(value));
            } else if ("mlt.maxwl".equals(key)) {
                mlt.setMaxWordLen(Integer.parseInt(value));
            } else if ("mlt.minwl".equals(key)) {
                mlt.setMinWordLen(Integer.parseInt(value));
            }
        }
        if (text != null) {
            // Both branches below read the first field; fail with a descriptive cause instead
            // of an ArrayIndexOutOfBoundsException when no usable mlt.fl was supplied.
            if (fields.length == 0) {
                throw new IllegalArgumentException(
                        "No field names (mlt.fl) supplied for MLT query: " + mltQueryString);
            }
            if (FieldNames.PATH.equals(fields[0])) {
                // The body is a path: try to resolve it to an indexed document first.
                IndexSearcher searcher = new IndexSearcher(reader);
                TermQuery q = new TermQuery(new Term(FieldNames.PATH, text));
                TopDocs top = searcher.search(q, 1);
                if (top.totalHits == 0) {
                    // No such document: fall back to treating the body as plain text.
                    mlt.setFieldNames(fields);
                    moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
                } else {
                    ScoreDoc d = top.scoreDocs[0];
                    Document doc = reader.document(d.doc);
                    // Compare on every field of the matched document except the path itself.
                    List<String> fieldNames = new ArrayList<String>();
                    for (IndexableField f : doc.getFields()) {
                        if (!FieldNames.PATH.equals(f.name())) {
                            fieldNames.add(f.name());
                        }
                    }
                    String[] docFields = fieldNames.toArray(new String[fieldNames.size()]);
                    mlt.setFieldNames(docFields);
                    moreLikeThisQuery = mlt.like(d.doc);
                }
            } else {
                mlt.setFieldNames(fields);
                moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
            }
        }
        return moreLikeThisQuery;
    } catch (Exception e) {
        // Keep the original failure as the cause so callers and logs can see what went wrong.
        throw new RuntimeException("could not handle MLT query " + mltQueryString, e);
    }
}
From source file:org.ohdsi.usagi.UsagiSearchEngine.java
License:Apache License
public List<ScoredConcept> search(String searchTerm, boolean useMlt, Collection<Integer> filterConceptIds, String filterDomain, String filterConceptClass, String filterVocabulary, boolean filterInvalid) { List<ScoredConcept> results = new ArrayList<ScoredConcept>(); try {/*from w w w. j a v a 2 s . co m*/ Query query; if (useMlt) { MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader()); mlt.setMinTermFreq(1); mlt.setMinDocFreq(1); mlt.setMaxDocFreq(9999); mlt.setMinWordLen(1); mlt.setMaxWordLen(9999); mlt.setMaxDocFreqPct(100); mlt.setMaxNumTokensParsed(9999); mlt.setMaxQueryTerms(9999); mlt.setStopWords(null); mlt.setFieldNames(new String[] { "TERM" }); mlt.setAnalyzer(analyzer); query = mlt.like("TERM", new StringReader(searchTerm)); } else { try { query = keywordsQueryParser.parse(searchTerm); // if (query instanceof BooleanQuery) { // List<BooleanClause> clauses = ((BooleanQuery) query).clauses(); // BooleanClause lastClause = clauses.get(clauses.size() - 1); // lastClause.setQuery(new PrefixQuery(((TermQuery) lastClause.getQuery()).getTerm())); // } else if (query instanceof TermQuery) {// It's a single term // query = new PrefixQuery(((TermQuery) query).getTerm()); // } } catch (ParseException e) { return results; } } BooleanQuery booleanQuery = new BooleanQuery(); booleanQuery.add(query, Occur.SHOULD); booleanQuery.add(conceptQuery, Occur.MUST); if (filterConceptIds != null && filterConceptIds.size() > 0) { Query conceptIdQuery = conceptIdQueryParser.parse(StringUtilities.join(filterConceptIds, " OR ")); booleanQuery.add(conceptIdQuery, Occur.MUST); } if (filterDomain != null) { Query domainQuery = domainQueryParser.parse("\"" + filterDomain + "\""); booleanQuery.add(domainQuery, Occur.MUST); } if (filterConceptClass != null) { Query conceptClassQuery = conceptClassQueryParser .parse("\"" + filterConceptClass.toString() + "\""); booleanQuery.add(conceptClassQuery, Occur.MUST); } if (filterVocabulary != null) { Query vocabularyQuery = 
vocabularyQueryParser.parse("\"" + filterVocabulary.toString() + "\""); booleanQuery.add(vocabularyQuery, Occur.MUST); } if (filterInvalid) { Query invalidQuery = invalidQueryParser.parse("\"\""); booleanQuery.add(invalidQuery, Occur.MUST); } TopDocs topDocs = searcher.search(booleanQuery, 100); recomputeScores(topDocs.scoreDocs, query); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document document = reader.document(scoreDoc.doc); int conceptId = Integer.parseInt(document.get("CONCEPT_ID")); // If matchscore = 0 but it was the one concept that was automatically selected, still allow it: if (scoreDoc.score > 0 || (filterConceptIds != null && filterConceptIds.size() == 1 && filterConceptIds.contains(conceptId))) { TargetConcept targetConcept = new TargetConcept(); targetConcept.term = document.get("TERM"); targetConcept.conceptId = conceptId; targetConcept.conceptName = document.get("CONCEPT_NAME"); targetConcept.conceptClass = document.get("CONCEPT_CLASS"); targetConcept.vocabulary = document.get("VOCABULARY"); targetConcept.conceptCode = document.get("CONCEPT_CODE"); targetConcept.validStartDate = document.get("VALID_START_DATE"); targetConcept.validEndDate = document.get("VALID_END_DATE"); targetConcept.invalidReason = document.get("INVALID_REASON"); for (String domain : document.get("DOMAINS").split("\n")) targetConcept.domains.add(domain); targetConcept.additionalInformation = document.get("ADDITIONAL_INFORMATION"); results.add(new ScoredConcept(scoreDoc.score, targetConcept)); } } reorderTies(results); removeDuplicateConcepts(results); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); } return results; }