List of usage examples for the org.apache.lucene.queries.mlt.MoreLikeThis constructor MoreLikeThis(IndexReader)
public MoreLikeThis(IndexReader ir)
From source file:org.apache.solr.search.mlt.CloudMLTQParser.java
License:Apache License
public Query parse() { String id = localParams.get(QueryParsing.V); // Do a Real Time Get for the document SolrDocument doc = getDocument(id);//from w w w .j a v a 2 s . c om MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader()); // TODO: Are the mintf and mindf defaults ok at 1/0 ? mlt.setMinTermFreq(localParams.getInt("mintf", 1)); mlt.setMinDocFreq(localParams.getInt("mindf", 0)); if (localParams.get("minwl") != null) mlt.setMinWordLen(localParams.getInt("minwl")); if (localParams.get("maxwl") != null) mlt.setMaxWordLen(localParams.getInt("maxwl")); mlt.setAnalyzer(req.getSchema().getIndexAnalyzer()); String[] qf = localParams.getParams("qf"); Map<String, Collection<Object>> filteredDocument = new HashMap(); if (qf != null) { mlt.setFieldNames(qf); for (String field : qf) { filteredDocument.put(field, doc.getFieldValues(field)); } } else { Map<String, SchemaField> fields = req.getSchema().getFields(); ArrayList<String> fieldNames = new ArrayList(); for (String field : doc.getFieldNames()) { // Only use fields that are stored and have an explicit analyzer. // This makes sense as the query uses tf/idf/.. for query construction. // We might want to relook and change this in the future though. if (fields.get(field).stored() && fields.get(field).getType().isExplicitAnalyzer()) { fieldNames.add(field); filteredDocument.put(field, doc.getFieldValues(field)); } } mlt.setFieldNames(fieldNames.toArray(new String[fieldNames.size()])); } try { return mlt.like(filteredDocument); } catch (IOException e) { e.printStackTrace(); throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Bad Request"); } }
From source file:org.apache.solr.search.mlt.SimpleMLTQParser.java
License:Apache License
public Query parse() { String defaultField = req.getSchema().getUniqueKeyField().getName(); String uniqueValue = localParams.get(QueryParsing.V); String[] qf = localParams.getParams("qf"); SolrIndexSearcher searcher = req.getSearcher(); Query docIdQuery = createIdQuery(defaultField, uniqueValue); try {//from w w w.j a v a 2 s. co m TopDocs td = searcher.search(docIdQuery, 1); if (td.totalHits != 1) throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Error completing MLT request. Could not fetch " + "document with id [" + uniqueValue + "]"); ScoreDoc[] scoreDocs = td.scoreDocs; MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader()); // TODO: Are the mintf and mindf defaults ok at '1' ? mlt.setMinTermFreq(localParams.getInt("mintf", 1)); mlt.setMinDocFreq(localParams.getInt("mindf", 1)); if (localParams.get("minwl") != null) mlt.setMinWordLen(localParams.getInt("minwl")); if (localParams.get("maxwl") != null) mlt.setMaxWordLen(localParams.getInt("maxwl")); ArrayList<String> fields = new ArrayList(); if (qf != null) { mlt.setFieldNames(qf); } else { Map<String, SchemaField> fieldNames = req.getSearcher().getSchema().getFields(); for (String fieldName : fieldNames.keySet()) { if (fieldNames.get(fieldName).indexed() && fieldNames.get(fieldName).stored()) if (fieldNames.get(fieldName).getType().getNumericType() == null) fields.add(fieldName); } mlt.setFieldNames(fields.toArray(new String[fields.size()])); } mlt.setAnalyzer(req.getSchema().getIndexAnalyzer()); return mlt.like(scoreDocs[0].doc); } catch (IOException e) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Error completing MLT request" + e.getMessage()); } }
From source file:org.cee.store.lucene.LuceneArticleStore.java
License:Apache License
/**
 * Creates a query matching articles that are similar to the reference
 * article and belong to one of the given sites.
 *
 * <p>If the reference article is not found in the index, a new
 * {@code BooleanQuery(true)} is returned instead.
 *
 * @param sites     sites passed to {@code createQueryArticlesOfSites}
 * @param reference key of the article to find related articles for
 * @param searcher  searcher used both for the lookup and as the MoreLikeThis reader source
 * @param language  not used by this method
 * @return the combined query (both parts are required)
 * @throws IOException if searching or MoreLikeThis query generation fails
 */
private Query createRelatedArticlesQuery(List<EntityKey> sites, ArticleKey reference, IndexSearcher searcher,
        String language) throws IOException {
    TopDocs referenceHits = searcher.search(createArticleQuery(reference), 1);
    if (referenceHits.totalHits == 0) {
        return new BooleanQuery(true);
    }
    int referenceDocId = referenceHits.scoreDocs[0].doc;

    MoreLikeThis moreLikeThis = new MoreLikeThis(searcher.getIndexReader());
    moreLikeThis.setFieldNames(LuceneConstants.ARTICLE_RELATED_SEARCH_FIELDS);
    moreLikeThis.setMaxQueryTerms(20);
    moreLikeThis.setBoost(true);
    moreLikeThis.setMinTermFreq(0);
    moreLikeThis.setMinDocFreq(0);

    Query similarArticles = boostRelatedQuery(moreLikeThis.like(referenceDocId));
    Query articlesOfSites = createQueryArticlesOfSites(sites);

    BooleanQuery combined = new BooleanQuery();
    combined.add(similarArticles, Occur.MUST);
    combined.add(articlesOfSites, Occur.MUST);
    return combined;
}
From source file:org.elasticsearch.common.lucene.search.morelikethis.XMoreLikeThisTests.java
License:Apache License
@Test public void testTopN() throws Exception { int numDocs = 100; int topN = 25; // add series of docs with terms of decreasing df Directory dir = newDirectory();/*ww w . j av a 2s . c o m*/ RandomIndexWriter writer = new RandomIndexWriter(random(), dir); for (int i = 0; i < numDocs; i++) { addDoc(writer, generateStrSeq(0, i + 1)); } IndexReader reader = writer.getReader(); writer.close(); // setup MLT query MoreLikeThis mlt = new MoreLikeThis(reader); mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)); mlt.setMaxQueryTerms(topN); mlt.setMinDocFreq(1); mlt.setMinTermFreq(1); mlt.setMinWordLen(1); mlt.setFieldNames(new String[] { "text" }); // perform MLT query String likeText = ""; for (String text : generateStrSeq(0, numDocs)) { likeText += text + " "; } BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText)); // check best terms are topN of highest idf List<BooleanClause> clauses = query.clauses(); assertEquals("Expected" + topN + "clauses only!", topN, clauses.size()); Term[] expectedTerms = new Term[topN]; int idx = 0; for (String text : generateStrSeq(numDocs - topN, topN)) { expectedTerms[idx++] = new Term("text", text); } for (BooleanClause clause : clauses) { Term term = ((TermQuery) clause.getQuery()).getTerm(); assertTrue(Arrays.asList(expectedTerms).contains(term)); } // clean up reader.close(); dir.close(); }
From source file:org.eu.bitzone.Leia.java
License:Apache License
/** More Like this query from the current doc (or selected fields) */ public void actionMLT(final Object docNum, final Object docTable) { if (ir == null) { errorMsg(MSG_NOINDEX);/*from w ww .ja va2 s . c o m*/ return; } int id = 0; try { id = Integer.parseInt(getString(docNum, "text")); } catch (final NumberFormatException nfe) { errorMsg("Invalid document number"); return; } final MoreLikeThis mlt = new MoreLikeThis(ir); try { mlt.setFieldNames(Util.fieldNames(ir, true).toArray(new String[0])); } catch (final Exception e) { errorMsg("Exception collecting field names: " + e.toString()); return; } mlt.setMinTermFreq(1); mlt.setMaxQueryTerms(50); final Analyzer a = createAnalyzer(find("srchOptTabs")); if (a == null) { return; } mlt.setAnalyzer(a); final Object[] rows = getSelectedItems(docTable); BooleanQuery similar = null; if (rows != null && rows.length > 0) { // collect text from fields final StringBuilder sb = new StringBuilder(); for (int i = 0; i < rows.length; i++) { final Field f = (Field) getProperty(rows[i], "field"); if (f == null) { continue; } final String s = f.stringValue(); if (s == null || s.trim().length() == 0) { continue; } if (sb.length() > 0) { sb.append(" "); } sb.append(s); } try { similar = (BooleanQuery) mlt.like(new StringReader(sb.toString()), "field"); } catch (final Exception e) { e.printStackTrace(); errorMsg("FAILED: " + e.getMessage()); return; } } else { try { similar = (BooleanQuery) mlt.like(id); } catch (final Exception e) { e.printStackTrace(); errorMsg("FAILED: " + e.getMessage()); return; } } if (similar.clauses() != null && similar.clauses().size() > 0) { // System.err.println("SIMILAR: " + similar); final Object tabpane = find("maintpane"); setInteger(tabpane, "selected", 2); final Object qField = find("qField"); setString(qField, "text", similar.toString()); } else { showStatus("WARN: empty query - check Analyzer settings"); } }
From source file:org.getopt.luke.Luke.java
License:Apache License
/** More Like this query from the current doc (or selected fields) */ public void actionMLT(Object docNum, Object docTable) { if (ir == null) { errorMsg(MSG_NOINDEX);//from w ww. j a v a 2 s . c o m return; } int id = 0; try { id = Integer.parseInt(getString(docNum, "text")); } catch (NumberFormatException nfe) { errorMsg("Invalid document number"); return; } MoreLikeThis mlt = new MoreLikeThis(ir); try { mlt.setFieldNames((String[]) Util.fieldNames(ir, true).toArray(new String[0])); } catch (Exception e) { errorMsg("Exception collecting field names: " + e.toString()); return; } mlt.setMinTermFreq(1); mlt.setMaxQueryTerms(50); Analyzer a = createAnalyzer(find("srchOptTabs")); if (a == null) { return; } mlt.setAnalyzer(a); Object[] rows = getSelectedItems(docTable); BooleanQuery similar = null; if (rows != null && rows.length > 0) { // collect text from fields StringBuilder sb = new StringBuilder(); for (int i = 0; i < rows.length; i++) { Field f = (Field) getProperty(rows[i], "field"); if (f == null) { continue; } String s = f.stringValue(); if (s == null || s.trim().length() == 0) { continue; } if (sb.length() > 0) sb.append(" "); sb.append(s); } try { similar = (BooleanQuery) mlt.like(new StringReader(sb.toString()), "field"); } catch (Exception e) { e.printStackTrace(); errorMsg("FAILED: " + e.getMessage()); return; } } else { try { similar = (BooleanQuery) mlt.like(id); } catch (Exception e) { e.printStackTrace(); errorMsg("FAILED: " + e.getMessage()); return; } } if (similar.clauses() != null && similar.clauses().size() > 0) { //System.err.println("SIMILAR: " + similar); Object tabpane = find("maintpane"); setInteger(tabpane, "selected", 2); Object qField = find("qField"); setString(qField, "text", similar.toString()); } else { showStatus("WARN: empty query - check Analyzer settings"); } }
From source file:org.ohdsi.usagi.UsagiSearchEngine.java
License:Apache License
public List<ScoredConcept> search(String searchTerm, boolean useMlt, Collection<Integer> filterConceptIds, String filterDomain, String filterConceptClass, String filterVocabulary, boolean filterInvalid) { List<ScoredConcept> results = new ArrayList<ScoredConcept>(); try {/*w ww . j a va 2s . c o m*/ Query query; if (useMlt) { MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader()); mlt.setMinTermFreq(1); mlt.setMinDocFreq(1); mlt.setMaxDocFreq(9999); mlt.setMinWordLen(1); mlt.setMaxWordLen(9999); mlt.setMaxDocFreqPct(100); mlt.setMaxNumTokensParsed(9999); mlt.setMaxQueryTerms(9999); mlt.setStopWords(null); mlt.setFieldNames(new String[] { "TERM" }); mlt.setAnalyzer(analyzer); query = mlt.like("TERM", new StringReader(searchTerm)); } else { try { query = keywordsQueryParser.parse(searchTerm); // if (query instanceof BooleanQuery) { // List<BooleanClause> clauses = ((BooleanQuery) query).clauses(); // BooleanClause lastClause = clauses.get(clauses.size() - 1); // lastClause.setQuery(new PrefixQuery(((TermQuery) lastClause.getQuery()).getTerm())); // } else if (query instanceof TermQuery) {// It's a single term // query = new PrefixQuery(((TermQuery) query).getTerm()); // } } catch (ParseException e) { return results; } } BooleanQuery booleanQuery = new BooleanQuery(); booleanQuery.add(query, Occur.SHOULD); booleanQuery.add(conceptQuery, Occur.MUST); if (filterConceptIds != null && filterConceptIds.size() > 0) { Query conceptIdQuery = conceptIdQueryParser.parse(StringUtilities.join(filterConceptIds, " OR ")); booleanQuery.add(conceptIdQuery, Occur.MUST); } if (filterDomain != null) { Query domainQuery = domainQueryParser.parse("\"" + filterDomain + "\""); booleanQuery.add(domainQuery, Occur.MUST); } if (filterConceptClass != null) { Query conceptClassQuery = conceptClassQueryParser .parse("\"" + filterConceptClass.toString() + "\""); booleanQuery.add(conceptClassQuery, Occur.MUST); } if (filterVocabulary != null) { Query vocabularyQuery = 
vocabularyQueryParser.parse("\"" + filterVocabulary.toString() + "\""); booleanQuery.add(vocabularyQuery, Occur.MUST); } if (filterInvalid) { Query invalidQuery = invalidQueryParser.parse("\"\""); booleanQuery.add(invalidQuery, Occur.MUST); } TopDocs topDocs = searcher.search(booleanQuery, 100); recomputeScores(topDocs.scoreDocs, query); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document document = reader.document(scoreDoc.doc); int conceptId = Integer.parseInt(document.get("CONCEPT_ID")); // If matchscore = 0 but it was the one concept that was automatically selected, still allow it: if (scoreDoc.score > 0 || (filterConceptIds != null && filterConceptIds.size() == 1 && filterConceptIds.contains(conceptId))) { TargetConcept targetConcept = new TargetConcept(); targetConcept.term = document.get("TERM"); targetConcept.conceptId = conceptId; targetConcept.conceptName = document.get("CONCEPT_NAME"); targetConcept.conceptClass = document.get("CONCEPT_CLASS"); targetConcept.vocabulary = document.get("VOCABULARY"); targetConcept.conceptCode = document.get("CONCEPT_CODE"); targetConcept.validStartDate = document.get("VALID_START_DATE"); targetConcept.validEndDate = document.get("VALID_END_DATE"); targetConcept.invalidReason = document.get("INVALID_REASON"); for (String domain : document.get("DOMAINS").split("\n")) targetConcept.domains.add(domain); targetConcept.additionalInformation = document.get("ADDITIONAL_INFORMATION"); results.add(new ScoredConcept(scoreDoc.score, targetConcept)); } } reorderTies(results); removeDuplicateConcepts(results); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); } return results; }
From source file:uk.gov.nationalarchives.discovery.taxonomy.common.service.impl.TSetBasedCategoriserServiceImpl.java
License:Mozilla Public License
/** * run More Like This process on a document by comparing its description to * the description of all items of the training set<br/> * currently we get a fixed number of the top results * /*from w w w .j av a2 s . c om*/ * @param document * document being tested * @return * @throws IOException */ public List<TSetBasedCategorisationResult> runMlt(Document document) { Map<String, TSetBasedCategorisationResult> result = null; IndexSearcher searcher = null; try { trainingSetSearcherManager.maybeRefresh(); // Boolean wasRefreshed = trainingSetSearcherManager.maybeRefresh(); // if (wasRefreshed) { // logger.debug(".runMlt: training set searcher had to be refreshed"); // } searcher = trainingSetSearcherManager.acquire(); // TODO TSETBASED refresh reader/searcher: Use readermanager and // refresh it? MoreLikeThis moreLikeThis = new MoreLikeThis(this.trainingSetIndexReader); moreLikeThis.setMinTermFreq(minTermFreq); moreLikeThis.setMinDocFreq(minDocFreq); moreLikeThis.setAnalyzer(this.trainingSetAnalyser); moreLikeThis.setFieldNames(fieldsToAnalyse.split(",")); moreLikeThis.setBoost(true); BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); for (String fieldName : fieldsToAnalyse.split(",")) { String value = document.get(fieldName); if (value != null && !"null".equals(value)) { switch (InformationAssetViewFields.valueOf(fieldName)) { case DESCRIPTION: moreLikeThis.setBoostFactor(descBoostingFactor); break; case TITLE: moreLikeThis.setBoostFactor(titleBoostingFactor); break; case CONTEXTDESCRIPTION: moreLikeThis.setBoostFactor(contextDescBoostingFactor); break; default: case SUBJECTS: case CORPBODYS: case PERSON_FULLNAME: case PLACE_NAME: moreLikeThis.setBoostFactor(1); break; } Query query = moreLikeThis.like(fieldName, new StringReader(value)); queryBuilder.add(query, Occur.SHOULD); } } BooleanQuery fullQuery = queryBuilder.build(); TopDocs topDocs = searcher.search(fullQuery, this.maximumSimilarElements); logger.debug(".runMlt: found {} total hits, processed 
at maximum {} hits", topDocs.totalHits, this.maximumSimilarElements); result = new LinkedHashMap<String, TSetBasedCategorisationResult>(); int size = 0; if (topDocs.totalHits <= this.maximumSimilarElements) { size = topDocs.totalHits - 1; } else { size = this.maximumSimilarElements - 1; } for (int i = 0; i < size; i++) { ScoreDoc scoreDoc = topDocs.scoreDocs[i]; Float currrentScore = scoreDoc.score; if (currrentScore < this.mimimumScoreForMlt) { break; } Document hitDoc = searcher.doc(scoreDoc.doc); String category = hitDoc.get(InformationAssetViewFields.TAXONOMY.toString()); String docReference = hitDoc.get(InformationAssetViewFields.DOCREFERENCE.toString()); logger.debug(".runMlt: found doc, category: {}, score: {}, docreference: {}", category, currrentScore, docReference); TSetBasedCategorisationResult existingCategorisationResult = result.get(category); Float scoreToSet = currrentScore; Integer numberOfFoundDocuments = 1; // k nearest neighbour algorithm if (existingCategorisationResult != null) { scoreToSet += existingCategorisationResult.getScore(); numberOfFoundDocuments += existingCategorisationResult.getNumberOfFoundDocuments(); } result.put(category, new TSetBasedCategorisationResult(category, scoreToSet, numberOfFoundDocuments)); } } catch (IOException e) { throw new TaxonomyException(TaxonomyErrorType.LUCENE_IO_EXCEPTION, e); } finally { LuceneHelperTools.releaseSearcherManagerQuietly(trainingSetSearcherManager, searcher); } List<TSetBasedCategorisationResult> sortedResults = sortCategorisationResultsByScoreDescAndFilterByGlobalScore( new ArrayList<TSetBasedCategorisationResult>(result.values())); return sortedResults; }