List of usage examples for org.apache.lucene.queries.mlt.MoreLikeThis#like with the signature:
public Query like(String fieldName, Reader... readers) throws IOException
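Before the project-specific examples, here is a minimal, self-contained sketch of the call pattern they all share. This sketch is not taken from any of the source files below; the index path, field names, analyzer, and sample output are illustrative assumptions.

import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class MoreLikeThisSketch {
    // Find documents whose "content" field resembles the given free text.
    // "/path/to/index", "content" and "title" are placeholder names.
    static void findSimilar(String likeText) throws IOException {
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            IndexSearcher searcher = new IndexSearcher(reader);

            MoreLikeThis mlt = new MoreLikeThis(reader);
            mlt.setAnalyzer(new StandardAnalyzer());       // required when the "like" input is a Reader
            mlt.setFieldNames(new String[] { "content" }); // fields whose statistics drive term selection
            mlt.setMinTermFreq(1);                         // relax defaults so short texts still yield terms
            mlt.setMinDocFreq(1);

            // like(String fieldName, Reader... readers): the readers supply the "like" text,
            // which is analyzed as if it belonged to the given field.
            Query query = mlt.like("content", new StringReader(likeText));
            TopDocs hits = searcher.search(query, 10);
            for (ScoreDoc hit : hits.scoreDocs) {
                System.out.println(searcher.doc(hit.doc).get("title") + " score=" + hit.score);
            }
        }
    }
}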
From source file:fr.univ_tours.etu.searcher.LikeThisTest.java
private void findSilimar(String searchForSimilar) throws IOException {
    IndexReader reader = DirectoryReader.open(indexDir);
    IndexSearcher indexSearcher = new IndexSearcher(reader);

    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setMinTermFreq(0);
    mlt.setMinDocFreq(0);
    mlt.setFieldNames(new String[] { "title", "content" });
    mlt.setAnalyzer(analyzer);

    Reader sReader = new StringReader(searchForSimilar);
    Query query = mlt.like("content", sReader);

    TopDocs topDocs = indexSearcher.search(query, 10);
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        Document aSimilar = indexSearcher.doc(scoreDoc.doc);
        String similarTitle = aSimilar.get("title");
        String similarContent = aSimilar.get("content");

        System.out.println("====similar finded====");
        System.out.println("title: " + similarTitle);
        System.out.println("content: " + similarContent);
    }
}
From source file:fr.univ_tours.etu.searcher.Searcher.java
public List<ResultObject> search(SearchQueriesRequest query) throws IOException, ParseException {
    Map<String, String> queriesDictionary = query.getQueriesDictionary();
    boolean useQueryExpansion = query.isUseQueryExpansion();
    List<Integer> docsToExpand = (useQueryExpansion) ? new ArrayList<>() : null;

    List<String> fsa = new ArrayList<>();
    List<String> qsa = new ArrayList<>();
    String contentLemmas = "";

    if (queriesDictionary.containsKey(DocFields.CONTENTS)) {
        regularTokenizer.tokenize(queriesDictionary.get(DocFields.CONTENTS), true);
        caselessTokenizer.tokenize(queriesDictionary.get(DocFields.CONTENTS), true);
        contentLemmas = caselessTokenizer.getLemmaString();
        System.out.println("Lemmas: " + caselessTokenizer.getLemmaList());

        String neString = "";
        if (caselessTokenizer.getNeList() != null && caselessTokenizer.getNeList().size() != 0) {
            neString = caselessTokenizer.getNeString(";", true);
            System.out.println("NE caseless: " + neString);
        }
        if (regularTokenizer.getNeList() != null && regularTokenizer.getNeList().size() != 0) {
            neString += ";" + regularTokenizer.getNeString(";", true);
            System.out.println("NE all: " + neString);
        }
        if (!"".equals(neString)) {
            fsa.add(DocFields.NAMED_ENTITIES);
            qsa.add(neString);
        }
    }

    for (Map.Entry<String, String> entry : queriesDictionary.entrySet()) {
        fsa.add(entry.getKey());
        if (entry.getKey().equals(DocFields.CONTENTS) || entry.getKey().equals(DocFields.SYNONYMS)) {
            qsa.add(contentLemmas);
        } else {
            qsa.add(entry.getValue());
        }
    }

    Query q = MultiFieldQueryParser.parse(qsa.toArray(new String[qsa.size()]),
            fsa.toArray(new String[fsa.size()]), analyzer);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs docs = searcher.search(q, this.numRetrievedDocs);
    ScoreDoc[] hits = docs.scoreDocs;

    List<ResultObject> resultObjects = new ArrayList<>();
    String result = "";
    for (int i = 0; i < hits.length; ++i) {
        int docId = hits[i].doc;
        if (useQueryExpansion) {
            docsToExpand.add(docId);
        }
        Document d = searcher.doc(docId);
        resultObjects.add(new ResultObject(docId, i, d.get(DocFields.TITLE), d.get(DocFields.AUTHOR),
                d.get(DocFields.FILE_PATH), d.get(DocFields.SUMMARY), d.get(DocFields.FILE_NAME)));
        result = d.get(DocFields.SUMMARY);
    }

    if (useQueryExpansion) {
        reader.close();
        this.reader = DirectoryReader.open(FSDirectory.open(new File(this.indexDir).toPath()));
        searcher = new IndexSearcher(reader);

        MoreLikeThis mlt = new MoreLikeThis(reader);
        mlt.setMinTermFreq(0);
        mlt.setMinDocFreq(0);
        mlt.setAnalyzer(analyzer);

        for (int i = 0; i < Math.min(docsToExpand.size(), 5); i++) {
            Reader r = new StringReader(resultObjects.get(i).getSummary());
            Query expandedQuery = mlt.like(DocFields.CONTENTS, r);
            TopDocs topDocs = searcher.search(expandedQuery, 5);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                if (!docsToExpand.contains(scoreDoc.doc)) {
                    docsToExpand.add(scoreDoc.doc);
                    Document aSimilar = searcher.doc(scoreDoc.doc);
                    resultObjects.add(new ResultObject(1, resultObjects.size(), aSimilar.get(DocFields.TITLE),
                            aSimilar.get(DocFields.AUTHOR), aSimilar.get(DocFields.FILE_PATH),
                            aSimilar.get(DocFields.SUMMARY), aSimilar.get(DocFields.FILE_NAME)));
                }
            }
        }
    }
    return resultObjects;
}
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.util.MoreLikeThisHelper.java
License:Apache License
public static Query getMoreLikeThis(IndexReader reader, Analyzer analyzer, String mltQueryString) {
    Query moreLikeThisQuery = null;
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(analyzer);
    try {
        String text = null;
        String[] fields = {};
        for (String param : mltQueryString.split("&")) {
            String[] keyValuePair = param.split("=");
            if (keyValuePair.length != 2 || keyValuePair[0] == null || keyValuePair[1] == null) {
                throw new RuntimeException("Unparsable native Lucene MLT query: " + mltQueryString);
            } else {
                if ("stream.body".equals(keyValuePair[0])) {
                    text = keyValuePair[1];
                } else if ("mlt.fl".equals(keyValuePair[0])) {
                    fields = keyValuePair[1].split(",");
                } else if ("mlt.mindf".equals(keyValuePair[0])) {
                    mlt.setMinDocFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.mintf".equals(keyValuePair[0])) {
                    mlt.setMinTermFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.boost".equals(keyValuePair[0])) {
                    mlt.setBoost(Boolean.parseBoolean(keyValuePair[1]));
                } else if ("mlt.qf".equals(keyValuePair[0])) {
                    mlt.setBoostFactor(Float.parseFloat(keyValuePair[1]));
                } else if ("mlt.maxdf".equals(keyValuePair[0])) {
                    mlt.setMaxDocFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxdfp".equals(keyValuePair[0])) {
                    mlt.setMaxDocFreqPct(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxntp".equals(keyValuePair[0])) {
                    mlt.setMaxNumTokensParsed(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxqt".equals(keyValuePair[0])) {
                    mlt.setMaxQueryTerms(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxwl".equals(keyValuePair[0])) {
                    mlt.setMaxWordLen(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.minwl".equals(keyValuePair[0])) {
                    mlt.setMinWordLen(Integer.parseInt(keyValuePair[1]));
                }
            }
        }
        if (text != null) {
            if (FieldNames.PATH.equals(fields[0])) {
                IndexSearcher searcher = new IndexSearcher(reader);
                TermQuery q = new TermQuery(new Term(FieldNames.PATH, text));
                TopDocs top = searcher.search(q, 1);
                if (top.totalHits == 0) {
                    mlt.setFieldNames(fields);
                    moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
                } else {
                    ScoreDoc d = top.scoreDocs[0];
                    Document doc = reader.document(d.doc);
                    List<String> fieldNames = new ArrayList<String>();
                    for (IndexableField f : doc.getFields()) {
                        if (!FieldNames.PATH.equals(f.name())) {
                            fieldNames.add(f.name());
                        }
                    }
                    String[] docFields = fieldNames.toArray(new String[fieldNames.size()]);
                    mlt.setFieldNames(docFields);
                    moreLikeThisQuery = mlt.like(d.doc);
                }
            } else {
                mlt.setFieldNames(fields);
                moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
            }
        }
        return moreLikeThisQuery;
    } catch (Exception e) {
        throw new RuntimeException("could not handle MLT query " + mltQueryString);
    }
}
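For illustration only (this is not part of the Oak sources above), a native MLT query string that getMoreLikeThis could parse might look like the call below. Here reader and analyzer are assumed to be an open IndexReader and the index's Analyzer, and the field names are hypothetical; only parameter keys recognized by the parser above are used.

// Hypothetical input: "like" text plus the fields to analyse and a few thresholds.
String mltQueryString = "stream.body=apache jackrabbit oak&mlt.fl=title,text&mlt.mindf=1&mlt.mintf=1&mlt.maxqt=25";
Query mltQuery = MoreLikeThisHelper.getMoreLikeThis(reader, analyzer, mltQueryString);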
From source file:org.elasticsearch.common.lucene.search.morelikethis.XMoreLikeThisTests.java
License:Apache License
@Test
public void testTopN() throws Exception {
    int numDocs = 100;
    int topN = 25;

    // add series of docs with terms of decreasing df
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    for (int i = 0; i < numDocs; i++) {
        addDoc(writer, generateStrSeq(0, i + 1));
    }
    IndexReader reader = writer.getReader();
    writer.close();

    // setup MLT query
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
    mlt.setMaxQueryTerms(topN);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] { "text" });

    // perform MLT query
    String likeText = "";
    for (String text : generateStrSeq(0, numDocs)) {
        likeText += text + " ";
    }
    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText));

    // check best terms are topN of highest idf
    List<BooleanClause> clauses = query.clauses();
    assertEquals("Expected" + topN + "clauses only!", topN, clauses.size());

    Term[] expectedTerms = new Term[topN];
    int idx = 0;
    for (String text : generateStrSeq(numDocs - topN, topN)) {
        expectedTerms[idx++] = new Term("text", text);
    }
    for (BooleanClause clause : clauses) {
        Term term = ((TermQuery) clause.getQuery()).getTerm();
        assertTrue(Arrays.asList(expectedTerms).contains(term));
    }

    // clean up
    reader.close();
    dir.close();
}
From source file:org.elasticsearch.common.lucene.search.MoreLikeThisQuery.java
License:Apache License
@Override
public Query rewrite(IndexReader reader) throws IOException {
    MoreLikeThis mlt = new MoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity);

    mlt.setFieldNames(moreLikeFields);
    mlt.setAnalyzer(analyzer);
    mlt.setMinTermFreq(minTermFrequency);
    mlt.setMinDocFreq(minDocFreq);
    mlt.setMaxDocFreq(maxDocFreq);
    mlt.setMaxQueryTerms(maxQueryTerms);
    mlt.setMinWordLen(minWordLen);
    mlt.setMaxWordLen(maxWordLen);
    mlt.setStopWords(stopWords);
    mlt.setBoost(boostTerms);
    mlt.setBoostFactor(boostTermsFactor);

    // LUCENE 4 UPGRADE: this maps the 3.6 behavior (only use the first field)
    BooleanQuery bq = (BooleanQuery) mlt.like(new FastStringReader(likeText), moreLikeFields[0]);
    BooleanClause[] clauses = bq.getClauses();

    bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
    bq.setBoost(getBoost());
    return bq;
}
From source file:org.eu.bitzone.Leia.java
License:Apache License
/** More Like this query from the current doc (or selected fields) */
public void actionMLT(final Object docNum, final Object docTable) {
    if (ir == null) {
        errorMsg(MSG_NOINDEX);
        return;
    }
    int id = 0;
    try {
        id = Integer.parseInt(getString(docNum, "text"));
    } catch (final NumberFormatException nfe) {
        errorMsg("Invalid document number");
        return;
    }
    final MoreLikeThis mlt = new MoreLikeThis(ir);
    try {
        mlt.setFieldNames(Util.fieldNames(ir, true).toArray(new String[0]));
    } catch (final Exception e) {
        errorMsg("Exception collecting field names: " + e.toString());
        return;
    }
    mlt.setMinTermFreq(1);
    mlt.setMaxQueryTerms(50);
    final Analyzer a = createAnalyzer(find("srchOptTabs"));
    if (a == null) {
        return;
    }
    mlt.setAnalyzer(a);
    final Object[] rows = getSelectedItems(docTable);
    BooleanQuery similar = null;
    if (rows != null && rows.length > 0) {
        // collect text from fields
        final StringBuilder sb = new StringBuilder();
        for (int i = 0; i < rows.length; i++) {
            final Field f = (Field) getProperty(rows[i], "field");
            if (f == null) {
                continue;
            }
            final String s = f.stringValue();
            if (s == null || s.trim().length() == 0) {
                continue;
            }
            if (sb.length() > 0) {
                sb.append(" ");
            }
            sb.append(s);
        }
        try {
            similar = (BooleanQuery) mlt.like(new StringReader(sb.toString()), "field");
        } catch (final Exception e) {
            e.printStackTrace();
            errorMsg("FAILED: " + e.getMessage());
            return;
        }
    } else {
        try {
            similar = (BooleanQuery) mlt.like(id);
        } catch (final Exception e) {
            e.printStackTrace();
            errorMsg("FAILED: " + e.getMessage());
            return;
        }
    }
    if (similar.clauses() != null && similar.clauses().size() > 0) {
        // System.err.println("SIMILAR: " + similar);
        final Object tabpane = find("maintpane");
        setInteger(tabpane, "selected", 2);
        final Object qField = find("qField");
        setString(qField, "text", similar.toString());
    } else {
        showStatus("WARN: empty query - check Analyzer settings");
    }
}
From source file:org.getopt.luke.Luke.java
License:Apache License
/** More Like this query from the current doc (or selected fields) */
public void actionMLT(Object docNum, Object docTable) {
    if (ir == null) {
        errorMsg(MSG_NOINDEX);
        return;
    }
    int id = 0;
    try {
        id = Integer.parseInt(getString(docNum, "text"));
    } catch (NumberFormatException nfe) {
        errorMsg("Invalid document number");
        return;
    }
    MoreLikeThis mlt = new MoreLikeThis(ir);
    try {
        mlt.setFieldNames((String[]) Util.fieldNames(ir, true).toArray(new String[0]));
    } catch (Exception e) {
        errorMsg("Exception collecting field names: " + e.toString());
        return;
    }
    mlt.setMinTermFreq(1);
    mlt.setMaxQueryTerms(50);
    Analyzer a = createAnalyzer(find("srchOptTabs"));
    if (a == null) {
        return;
    }
    mlt.setAnalyzer(a);
    Object[] rows = getSelectedItems(docTable);
    BooleanQuery similar = null;
    if (rows != null && rows.length > 0) {
        // collect text from fields
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < rows.length; i++) {
            Field f = (Field) getProperty(rows[i], "field");
            if (f == null) {
                continue;
            }
            String s = f.stringValue();
            if (s == null || s.trim().length() == 0) {
                continue;
            }
            if (sb.length() > 0) {
                sb.append(" ");
            }
            sb.append(s);
        }
        try {
            similar = (BooleanQuery) mlt.like(new StringReader(sb.toString()), "field");
        } catch (Exception e) {
            e.printStackTrace();
            errorMsg("FAILED: " + e.getMessage());
            return;
        }
    } else {
        try {
            similar = (BooleanQuery) mlt.like(id);
        } catch (Exception e) {
            e.printStackTrace();
            errorMsg("FAILED: " + e.getMessage());
            return;
        }
    }
    if (similar.clauses() != null && similar.clauses().size() > 0) {
        // System.err.println("SIMILAR: " + similar);
        Object tabpane = find("maintpane");
        setInteger(tabpane, "selected", 2);
        Object qField = find("qField");
        setString(qField, "text", similar.toString());
    } else {
        showStatus("WARN: empty query - check Analyzer settings");
    }
}
From source file:org.ohdsi.usagi.UsagiSearchEngine.java
License:Apache License
public List<ScoredConcept> search(String searchTerm, boolean useMlt, Collection<Integer> filterConceptIds,
        String filterDomain, String filterConceptClass, String filterVocabulary, boolean filterInvalid) {
    List<ScoredConcept> results = new ArrayList<ScoredConcept>();
    try {
        Query query;
        if (useMlt) {
            MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
            mlt.setMinTermFreq(1);
            mlt.setMinDocFreq(1);
            mlt.setMaxDocFreq(9999);
            mlt.setMinWordLen(1);
            mlt.setMaxWordLen(9999);
            mlt.setMaxDocFreqPct(100);
            mlt.setMaxNumTokensParsed(9999);
            mlt.setMaxQueryTerms(9999);
            mlt.setStopWords(null);
            mlt.setFieldNames(new String[] { "TERM" });
            mlt.setAnalyzer(analyzer);

            query = mlt.like("TERM", new StringReader(searchTerm));
        } else {
            try {
                query = keywordsQueryParser.parse(searchTerm);
                // if (query instanceof BooleanQuery) {
                //     List<BooleanClause> clauses = ((BooleanQuery) query).clauses();
                //     BooleanClause lastClause = clauses.get(clauses.size() - 1);
                //     lastClause.setQuery(new PrefixQuery(((TermQuery) lastClause.getQuery()).getTerm()));
                // } else if (query instanceof TermQuery) { // It's a single term
                //     query = new PrefixQuery(((TermQuery) query).getTerm());
                // }
            } catch (ParseException e) {
                return results;
            }
        }

        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(query, Occur.SHOULD);
        booleanQuery.add(conceptQuery, Occur.MUST);

        if (filterConceptIds != null && filterConceptIds.size() > 0) {
            Query conceptIdQuery = conceptIdQueryParser.parse(StringUtilities.join(filterConceptIds, " OR "));
            booleanQuery.add(conceptIdQuery, Occur.MUST);
        }
        if (filterDomain != null) {
            Query domainQuery = domainQueryParser.parse("\"" + filterDomain + "\"");
            booleanQuery.add(domainQuery, Occur.MUST);
        }
        if (filterConceptClass != null) {
            Query conceptClassQuery = conceptClassQueryParser.parse("\"" + filterConceptClass.toString() + "\"");
            booleanQuery.add(conceptClassQuery, Occur.MUST);
        }
        if (filterVocabulary != null) {
            Query vocabularyQuery = vocabularyQueryParser.parse("\"" + filterVocabulary.toString() + "\"");
            booleanQuery.add(vocabularyQuery, Occur.MUST);
        }
        if (filterInvalid) {
            Query invalidQuery = invalidQueryParser.parse("\"\"");
            booleanQuery.add(invalidQuery, Occur.MUST);
        }

        TopDocs topDocs = searcher.search(booleanQuery, 100);
        recomputeScores(topDocs.scoreDocs, query);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = reader.document(scoreDoc.doc);
            int conceptId = Integer.parseInt(document.get("CONCEPT_ID"));
            // If matchscore = 0 but it was the one concept that was automatically selected, still allow it:
            if (scoreDoc.score > 0 || (filterConceptIds != null && filterConceptIds.size() == 1
                    && filterConceptIds.contains(conceptId))) {
                TargetConcept targetConcept = new TargetConcept();
                targetConcept.term = document.get("TERM");
                targetConcept.conceptId = conceptId;
                targetConcept.conceptName = document.get("CONCEPT_NAME");
                targetConcept.conceptClass = document.get("CONCEPT_CLASS");
                targetConcept.vocabulary = document.get("VOCABULARY");
                targetConcept.conceptCode = document.get("CONCEPT_CODE");
                targetConcept.validStartDate = document.get("VALID_START_DATE");
                targetConcept.validEndDate = document.get("VALID_END_DATE");
                targetConcept.invalidReason = document.get("INVALID_REASON");
                for (String domain : document.get("DOMAINS").split("\n"))
                    targetConcept.domains.add(domain);
                targetConcept.additionalInformation = document.get("ADDITIONAL_INFORMATION");
                results.add(new ScoredConcept(scoreDoc.score, targetConcept));
            }
        }
        reorderTies(results);
        removeDuplicateConcepts(results);
    } catch (Exception e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
    return results;
}
From source file:uk.gov.nationalarchives.discovery.taxonomy.common.service.impl.TSetBasedCategoriserServiceImpl.java
License:Mozilla Public License
/**
 * run More Like This process on a document by comparing its description to
 * the description of all items of the training set<br/>
 * currently we get a fixed number of the top results
 *
 * @param document
 *            document being tested
 * @return
 * @throws IOException
 */
public List<TSetBasedCategorisationResult> runMlt(Document document) {
    Map<String, TSetBasedCategorisationResult> result = null;
    IndexSearcher searcher = null;
    try {
        trainingSetSearcherManager.maybeRefresh();
        // Boolean wasRefreshed = trainingSetSearcherManager.maybeRefresh();
        // if (wasRefreshed) {
        //     logger.debug(".runMlt: training set searcher had to be refreshed");
        // }
        searcher = trainingSetSearcherManager.acquire();

        // TODO TSETBASED refresh reader/searcher: Use readermanager and refresh it?
        MoreLikeThis moreLikeThis = new MoreLikeThis(this.trainingSetIndexReader);
        moreLikeThis.setMinTermFreq(minTermFreq);
        moreLikeThis.setMinDocFreq(minDocFreq);
        moreLikeThis.setAnalyzer(this.trainingSetAnalyser);
        moreLikeThis.setFieldNames(fieldsToAnalyse.split(","));
        moreLikeThis.setBoost(true);

        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
        for (String fieldName : fieldsToAnalyse.split(",")) {
            String value = document.get(fieldName);
            if (value != null && !"null".equals(value)) {
                switch (InformationAssetViewFields.valueOf(fieldName)) {
                case DESCRIPTION:
                    moreLikeThis.setBoostFactor(descBoostingFactor);
                    break;
                case TITLE:
                    moreLikeThis.setBoostFactor(titleBoostingFactor);
                    break;
                case CONTEXTDESCRIPTION:
                    moreLikeThis.setBoostFactor(contextDescBoostingFactor);
                    break;
                default:
                case SUBJECTS:
                case CORPBODYS:
                case PERSON_FULLNAME:
                case PLACE_NAME:
                    moreLikeThis.setBoostFactor(1);
                    break;
                }
                Query query = moreLikeThis.like(fieldName, new StringReader(value));
                queryBuilder.add(query, Occur.SHOULD);
            }
        }

        BooleanQuery fullQuery = queryBuilder.build();
        TopDocs topDocs = searcher.search(fullQuery, this.maximumSimilarElements);
        logger.debug(".runMlt: found {} total hits, processed at maximum {} hits", topDocs.totalHits,
                this.maximumSimilarElements);

        result = new LinkedHashMap<String, TSetBasedCategorisationResult>();
        int size = 0;
        if (topDocs.totalHits <= this.maximumSimilarElements) {
            size = topDocs.totalHits - 1;
        } else {
            size = this.maximumSimilarElements - 1;
        }

        for (int i = 0; i < size; i++) {
            ScoreDoc scoreDoc = topDocs.scoreDocs[i];
            Float currrentScore = scoreDoc.score;
            if (currrentScore < this.mimimumScoreForMlt) {
                break;
            }
            Document hitDoc = searcher.doc(scoreDoc.doc);
            String category = hitDoc.get(InformationAssetViewFields.TAXONOMY.toString());
            String docReference = hitDoc.get(InformationAssetViewFields.DOCREFERENCE.toString());
            logger.debug(".runMlt: found doc, category: {}, score: {}, docreference: {}", category,
                    currrentScore, docReference);

            TSetBasedCategorisationResult existingCategorisationResult = result.get(category);
            Float scoreToSet = currrentScore;
            Integer numberOfFoundDocuments = 1;
            // k nearest neighbour algorithm
            if (existingCategorisationResult != null) {
                scoreToSet += existingCategorisationResult.getScore();
                numberOfFoundDocuments += existingCategorisationResult.getNumberOfFoundDocuments();
            }
            result.put(category,
                    new TSetBasedCategorisationResult(category, scoreToSet, numberOfFoundDocuments));
        }
    } catch (IOException e) {
        throw new TaxonomyException(TaxonomyErrorType.LUCENE_IO_EXCEPTION, e);
    } finally {
        LuceneHelperTools.releaseSearcherManagerQuietly(trainingSetSearcherManager, searcher);
    }
    List<TSetBasedCategorisationResult> sortedResults = sortCategorisationResultsByScoreDescAndFilterByGlobalScore(
            new ArrayList<TSetBasedCategorisationResult>(result.values()));
    return sortedResults;
}