List of usage examples for the like method of org.apache.lucene.queries.mlt.MoreLikeThis
public Query like(Map<String, Collection<Object>> filteredDocument) throws IOException
From source file:aos.lucene.tools.BooksMoreLikeThis.java
License:Apache License
/**
 * Walks every document id below {@code reader.maxDoc()} and logs the titles of the
 * documents that {@link MoreLikeThis} finds similar on the "title" and "author" fields.
 *
 * <p>The index location is read from the {@code index.dir} system property.
 *
 * @param args unused
 * @throws IllegalArgumentException if the {@code index.dir} system property is not set
 * @throws Throwable any failure while opening or searching the index
 */
public static void main(String[] args) throws Throwable {
    String indexDir = System.getProperty("index.dir");
    if (indexDir == null) {
        // Fail with a readable message instead of the bare NullPointerException from new File(null).
        throw new IllegalArgumentException("System property 'index.dir' must be set to the index directory");
    }

    // try-with-resources closes the reader and then the directory, even when a search fails.
    try (FSDirectory directory = FSDirectory.open(new File(indexDir));
            IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        int numDocs = reader.maxDoc();

        MoreLikeThis mlt = new MoreLikeThis(reader);
        mlt.setFieldNames(new String[] { "title", "author" });
        // Accept terms that occur only once in the source document / in the index.
        mlt.setMinTermFreq(1);
        mlt.setMinDocFreq(1);

        for (int docID = 0; docID < numDocs; docID++) {
            // Blank separator line between documents.
            LOGGER.info("");
            Document doc = reader.document(docID);
            LOGGER.info(doc.get("title"));

            Query query = mlt.like(docID);
            LOGGER.info(" query=" + query);

            TopDocs similarDocs = searcher.search(query, 10);
            if (similarDocs.totalHits == 0) {
                LOGGER.info(" None like this");
            }
            for (int i = 0; i < similarDocs.scoreDocs.length; i++) {
                // The source document matches its own query; do not list it as similar to itself.
                if (similarDocs.scoreDocs[i].doc != docID) {
                    doc = reader.document(similarDocs.scoreDocs[i].doc);
                    LOGGER.info(" -> " + doc.getField("title").stringValue());
                }
            }
        }
    }
}
From source file:com.mathworks.xzheng.tools.BooksMoreLikeThis.java
License:Apache License
public static void main(String[] args) throws Throwable { String indexDir = System.getProperty("index.dir"); FSDirectory directory = FSDirectory.open(new File(indexDir)); IndexReader reader = IndexReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); int numDocs = reader.maxDoc(); MoreLikeThis mlt = new MoreLikeThis(reader); // #A mlt.setFieldNames(new String[] { "title", "author" }); mlt.setMinTermFreq(1); // #B mlt.setMinDocFreq(1);/* ww w . j a va 2 s . c o m*/ for (int docID = 0; docID < numDocs; docID++) { // #C System.out.println(); Document doc = reader.document(docID); System.out.println(doc.get("title")); Query query = mlt.like(docID); // #D System.out.println(" query=" + query); TopDocs similarDocs = searcher.search(query, 10); if (similarDocs.totalHits == 0) System.out.println(" None like this"); for (int i = 0; i < similarDocs.scoreDocs.length; i++) { if (similarDocs.scoreDocs[i].doc != docID) { // #E doc = reader.document(similarDocs.scoreDocs[i].doc); System.out.println(" -> " + doc.getField("title").stringValue()); } } } reader.close(); directory.close(); }
From source file:com.qwazr.search.query.MoreLikeThisQuery.java
License:Apache License
/**
 * Builds a "more like this" query for the document number held in {@code doc_num}.
 *
 * <p>Each optional tuning field is copied onto the {@link MoreLikeThis} instance only when
 * it is non-null; the analyzer of the query context is always applied.
 *
 * @param queryContext supplies the index searcher (for its reader) and the analyzer
 * @return the query produced by {@code MoreLikeThis.like(doc_num)}
 * @throws NullPointerException if {@code doc_num} is null
 * @throws IOException declared by the signature; propagated from the MoreLikeThis calls
 * @throws ParseException declared by the signature
 */
@Override
public final Query getQuery(QueryContext queryContext) throws IOException, ParseException {
    Objects.requireNonNull(doc_num, "The doc_num field is missing");

    final MoreLikeThis moreLikeThis = new MoreLikeThis(queryContext.indexSearcher.getIndexReader());

    // Boosting.
    if (is_boost != null) {
        moreLikeThis.setBoost(is_boost);
    }
    if (boost_factor != null) {
        moreLikeThis.setBoostFactor(boost_factor);
    }

    // Fields to draw terms from.
    if (fieldnames != null) {
        moreLikeThis.setFieldNames(fieldnames);
    }

    // Upper limits.
    if (max_doc_freq != null) {
        moreLikeThis.setMaxDocFreq(max_doc_freq);
    }
    if (max_doc_freq_pct != null) {
        moreLikeThis.setMaxDocFreqPct(max_doc_freq_pct);
    }
    if (max_num_tokens_parsed != null) {
        moreLikeThis.setMaxNumTokensParsed(max_num_tokens_parsed);
    }
    if (max_query_terms != null) {
        moreLikeThis.setMaxQueryTerms(max_query_terms);
    }
    if (max_word_len != null) {
        moreLikeThis.setMaxWordLen(max_word_len);
    }

    // Lower limits.
    if (min_doc_freq != null) {
        moreLikeThis.setMinDocFreq(min_doc_freq);
    }
    if (min_term_freq != null) {
        moreLikeThis.setMinTermFreq(min_term_freq);
    }
    if (min_word_len != null) {
        moreLikeThis.setMinWordLen(min_word_len);
    }

    if (stop_words != null) {
        moreLikeThis.setStopWords(stop_words);
    }

    moreLikeThis.setAnalyzer(queryContext.analyzer);
    return moreLikeThis.like(doc_num);
}
From source file:de.mirkosertic.desktopsearch.LuceneIndexHandler.java
License:Open Source License
public QueryResult performQuery(String aQueryString, String aBacklink, String aBasePath, Configuration aConfiguration, Map<String, String> aDrilldownFields) throws IOException { searcherManager.maybeRefreshBlocking(); IndexSearcher theSearcher = searcherManager.acquire(); SortedSetDocValuesReaderState theSortedSetState = new DefaultSortedSetDocValuesReaderState( theSearcher.getIndexReader()); List<QueryResultDocument> theResultDocuments = new ArrayList<>(); long theStartTime = System.currentTimeMillis(); LOGGER.info("Querying for " + aQueryString); DateFormat theDateFormat = new SimpleDateFormat("dd.MMMM.yyyy", Locale.ENGLISH); try {//from ww w .ja va 2 s. c o m List<FacetDimension> theDimensions = new ArrayList<>(); // Search only if a search query is given if (!StringUtils.isEmpty(aQueryString)) { Query theQuery = computeBooleanQueryFor(aQueryString); LOGGER.info(" query is " + theQuery); theQuery = theQuery.rewrite(theSearcher.getIndexReader()); LOGGER.info(" rewritten query is " + theQuery); DrillDownQuery theDrilldownQuery = new DrillDownQuery(facetsConfig, theQuery); aDrilldownFields.entrySet().stream().forEach(aEntry -> { LOGGER.info(" with Drilldown " + aEntry.getKey() + " for " + aEntry.getValue()); theDrilldownQuery.add(aEntry.getKey(), aEntry.getValue()); }); FacetsCollector theFacetCollector = new FacetsCollector(); TopDocs theDocs = FacetsCollector.search(theSearcher, theDrilldownQuery, null, aConfiguration.getNumberOfSearchResults(), theFacetCollector); SortedSetDocValuesFacetCounts theFacetCounts = new SortedSetDocValuesFacetCounts(theSortedSetState, theFacetCollector); List<Facet> theAuthorFacets = new ArrayList<>(); List<Facet> theFileTypesFacets = new ArrayList<>(); List<Facet> theLastModifiedYearFacet = new ArrayList<>(); List<Facet> theLanguageFacet = new ArrayList<>(); LOGGER.info("Found " + theDocs.scoreDocs.length + " documents"); // We need this cache to detect duplicate documents while searching for similarities Set<Integer> 
theUniqueDocumentsFound = new HashSet<>(); Map<String, QueryResultDocument> theDocumentsByHash = new HashMap<>(); for (int i = 0; i < theDocs.scoreDocs.length; i++) { int theDocumentID = theDocs.scoreDocs[i].doc; theUniqueDocumentsFound.add(theDocumentID); Document theDocument = theSearcher.doc(theDocumentID); String theUniqueID = theDocument.getField(IndexFields.UNIQUEID).stringValue(); String theFoundFileName = theDocument.getField(IndexFields.FILENAME).stringValue(); String theHash = theDocument.getField(IndexFields.CONTENTMD5).stringValue(); QueryResultDocument theExistingDocument = theDocumentsByHash.get(theHash); if (theExistingDocument != null) { theExistingDocument.addFileName(theFoundFileName); } else { Date theLastModified = new Date( theDocument.getField(IndexFields.LASTMODIFIED).numericValue().longValue()); SupportedLanguage theLanguage = SupportedLanguage .valueOf(theDocument.getField(IndexFields.LANGUAGESTORED).stringValue()); String theFieldName; if (analyzerCache.supportsLanguage(theLanguage)) { theFieldName = analyzerCache.getFieldNameFor(theLanguage); } else { theFieldName = IndexFields.CONTENT; } String theOriginalContent = theDocument.getField(theFieldName).stringValue(); final Query theFinalQuery = theQuery; ForkJoinTask<String> theHighligherResult = executorPool.submit(() -> { StringBuilder theResult = new StringBuilder(theDateFormat.format(theLastModified)); theResult.append(" - "); Highlighter theHighlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(theFinalQuery)); for (String theFragment : theHighlighter.getBestFragments(analyzer, theFieldName, theOriginalContent, NUMBER_OF_FRAGMENTS)) { if (theResult.length() > 0) { theResult = theResult.append("..."); } theResult = theResult.append(theFragment); } return theResult.toString(); }); int theNormalizedScore = (int) (theDocs.scoreDocs[i].score / theDocs.getMaxScore() * 5); File theFileOnDisk = new File(theFoundFileName); if (theFileOnDisk.exists()) { boolean 
thePreviewAvailable = previewProcessor.previewAvailableFor(theFileOnDisk); theExistingDocument = new QueryResultDocument(theDocumentID, theFoundFileName, theHighligherResult, Long.parseLong(theDocument.getField(IndexFields.LASTMODIFIED).stringValue()), theNormalizedScore, theUniqueID, thePreviewAvailable); theDocumentsByHash.put(theHash, theExistingDocument); theResultDocuments.add(theExistingDocument); } } } if (aConfiguration.isShowSimilarDocuments()) { MoreLikeThis theMoreLikeThis = new MoreLikeThis(theSearcher.getIndexReader()); theMoreLikeThis.setAnalyzer(analyzer); theMoreLikeThis.setMinTermFreq(1); theMoreLikeThis.setMinDocFreq(1); theMoreLikeThis.setFieldNames(analyzerCache.getAllFieldNames()); for (QueryResultDocument theDocument : theResultDocuments) { Query theMoreLikeThisQuery = theMoreLikeThis.like(theDocument.getDocumentID()); TopDocs theMoreLikeThisTopDocs = theSearcher.search(theMoreLikeThisQuery, 5); for (ScoreDoc theMoreLikeThisScoreDoc : theMoreLikeThisTopDocs.scoreDocs) { int theSimilarDocument = theMoreLikeThisScoreDoc.doc; if (theUniqueDocumentsFound.add(theSimilarDocument)) { Document theMoreLikeThisDocument = theSearcher.doc(theSimilarDocument); String theFilename = theMoreLikeThisDocument.getField(IndexFields.FILENAME) .stringValue(); theDocument.addSimilarFile(theFilename); } } } } LOGGER.info("Got Dimensions"); for (FacetResult theResult : theFacetCounts.getAllDims(20000)) { String theDimension = theResult.dim; if ("author".equals(theDimension)) { for (LabelAndValue theLabelAndValue : theResult.labelValues) { if (!StringUtils.isEmpty(theLabelAndValue.label)) { theAuthorFacets.add(new Facet(theLabelAndValue.label, theLabelAndValue.value.intValue(), aBasePath + "/" + encode( FacetSearchUtils.encode(theDimension, theLabelAndValue.label)))); } } } if ("extension".equals(theDimension)) { for (LabelAndValue theLabelAndValue : theResult.labelValues) { if (!StringUtils.isEmpty(theLabelAndValue.label)) { theFileTypesFacets.add(new 
Facet(theLabelAndValue.label, theLabelAndValue.value.intValue(), aBasePath + "/" + encode( FacetSearchUtils.encode(theDimension, theLabelAndValue.label)))); } } } if ("last-modified-year".equals(theDimension)) { for (LabelAndValue theLabelAndValue : theResult.labelValues) { if (!StringUtils.isEmpty(theLabelAndValue.label)) { theLastModifiedYearFacet.add(new Facet(theLabelAndValue.label, theLabelAndValue.value.intValue(), aBasePath + "/" + encode( FacetSearchUtils.encode(theDimension, theLabelAndValue.label)))); } } } if (IndexFields.LANGUAGEFACET.equals(theDimension)) { for (LabelAndValue theLabelAndValue : theResult.labelValues) { if (!StringUtils.isEmpty(theLabelAndValue.label)) { Locale theLocale = new Locale(theLabelAndValue.label); theLanguageFacet.add(new Facet(theLocale.getDisplayLanguage(Locale.ENGLISH), theLabelAndValue.value.intValue(), aBasePath + "/" + encode( FacetSearchUtils.encode(theDimension, theLabelAndValue.label)))); } } } LOGGER.info(" " + theDimension); } if (!theAuthorFacets.isEmpty()) { theDimensions.add(new FacetDimension("Author", theAuthorFacets)); } if (!theLastModifiedYearFacet.isEmpty()) { theDimensions.add(new FacetDimension("Last modified", theLastModifiedYearFacet)); } if (!theFileTypesFacets.isEmpty()) { theDimensions.add(new FacetDimension("File types", theFileTypesFacets)); } if (!theLanguageFacet.isEmpty()) { theDimensions.add(new FacetDimension("Language", theLanguageFacet)); } // Wait for all Tasks to complete for the search result highlighter ForkJoinTask.helpQuiesce(); } long theDuration = System.currentTimeMillis() - theStartTime; LOGGER.info("Total amount of time : " + theDuration + "ms"); return new QueryResult(System.currentTimeMillis() - theStartTime, theResultDocuments, theDimensions, theSearcher.getIndexReader().numDocs(), aBacklink); } catch (Exception e) { throw new RuntimeException(e); } finally { searcherManager.release(theSearcher); } }
From source file:org.apache.solr.search.mlt.CloudMLTQParser.java
License:Apache License
public Query parse() { String id = localParams.get(QueryParsing.V); // Do a Real Time Get for the document SolrDocument doc = getDocument(id);//from w ww . jav a 2 s .c om MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader()); // TODO: Are the mintf and mindf defaults ok at 1/0 ? mlt.setMinTermFreq(localParams.getInt("mintf", 1)); mlt.setMinDocFreq(localParams.getInt("mindf", 0)); if (localParams.get("minwl") != null) mlt.setMinWordLen(localParams.getInt("minwl")); if (localParams.get("maxwl") != null) mlt.setMaxWordLen(localParams.getInt("maxwl")); mlt.setAnalyzer(req.getSchema().getIndexAnalyzer()); String[] qf = localParams.getParams("qf"); Map<String, Collection<Object>> filteredDocument = new HashMap(); if (qf != null) { mlt.setFieldNames(qf); for (String field : qf) { filteredDocument.put(field, doc.getFieldValues(field)); } } else { Map<String, SchemaField> fields = req.getSchema().getFields(); ArrayList<String> fieldNames = new ArrayList(); for (String field : doc.getFieldNames()) { // Only use fields that are stored and have an explicit analyzer. // This makes sense as the query uses tf/idf/.. for query construction. // We might want to relook and change this in the future though. if (fields.get(field).stored() && fields.get(field).getType().isExplicitAnalyzer()) { fieldNames.add(field); filteredDocument.put(field, doc.getFieldValues(field)); } } mlt.setFieldNames(fieldNames.toArray(new String[fieldNames.size()])); } try { return mlt.like(filteredDocument); } catch (IOException e) { e.printStackTrace(); throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Bad Request"); } }
From source file:org.apache.solr.search.mlt.SimpleMLTQParser.java
License:Apache License
public Query parse() { String defaultField = req.getSchema().getUniqueKeyField().getName(); String uniqueValue = localParams.get(QueryParsing.V); String[] qf = localParams.getParams("qf"); SolrIndexSearcher searcher = req.getSearcher(); Query docIdQuery = createIdQuery(defaultField, uniqueValue); try {/* ww w . ja v a 2 s . c o m*/ TopDocs td = searcher.search(docIdQuery, 1); if (td.totalHits != 1) throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Error completing MLT request. Could not fetch " + "document with id [" + uniqueValue + "]"); ScoreDoc[] scoreDocs = td.scoreDocs; MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader()); // TODO: Are the mintf and mindf defaults ok at '1' ? mlt.setMinTermFreq(localParams.getInt("mintf", 1)); mlt.setMinDocFreq(localParams.getInt("mindf", 1)); if (localParams.get("minwl") != null) mlt.setMinWordLen(localParams.getInt("minwl")); if (localParams.get("maxwl") != null) mlt.setMaxWordLen(localParams.getInt("maxwl")); ArrayList<String> fields = new ArrayList(); if (qf != null) { mlt.setFieldNames(qf); } else { Map<String, SchemaField> fieldNames = req.getSearcher().getSchema().getFields(); for (String fieldName : fieldNames.keySet()) { if (fieldNames.get(fieldName).indexed() && fieldNames.get(fieldName).stored()) if (fieldNames.get(fieldName).getType().getNumericType() == null) fields.add(fieldName); } mlt.setFieldNames(fields.toArray(new String[fields.size()])); } mlt.setAnalyzer(req.getSchema().getIndexAnalyzer()); return mlt.like(scoreDocs[0].doc); } catch (IOException e) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Error completing MLT request" + e.getMessage()); } }
From source file:org.cee.store.lucene.LuceneArticleStore.java
License:Apache License
private Query createRelatedArticlesQuery(List<EntityKey> sites, ArticleKey reference, IndexSearcher searcher, String language) throws IOException { Query articleQuery = createArticleQuery(reference); TopDocs topDocs = searcher.search(articleQuery, 1); if (topDocs.totalHits == 0) { return new BooleanQuery(true); }//ww w .j a v a 2s .c om MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader()); mlt.setFieldNames(LuceneConstants.ARTICLE_RELATED_SEARCH_FIELDS); mlt.setMaxQueryTerms(20); mlt.setBoost(true); mlt.setMinTermFreq(0); mlt.setMinDocFreq(0); Query relatedQuery = boostRelatedQuery(mlt.like(topDocs.scoreDocs[0].doc)); BooleanQuery query = new BooleanQuery(); query.add(new BooleanClause(relatedQuery, Occur.MUST)); query.add(new BooleanClause(createQueryArticlesOfSites(sites), Occur.MUST)); return query; }