Usage examples for `org.apache.lucene.search.IndexSearcher#doc`.
Signature: `public Document doc(int docID) throws IOException`
Equivalent to: `searcher.getIndexReader().document(docID)`
From source file:de.mirkosertic.desktopsearch.LuceneIndexHandler.java
License:Open Source License
public QueryResult performQuery(String aQueryString, String aBacklink, String aBasePath, Configuration aConfiguration, Map<String, String> aDrilldownFields) throws IOException { searcherManager.maybeRefreshBlocking(); IndexSearcher theSearcher = searcherManager.acquire(); SortedSetDocValuesReaderState theSortedSetState = new DefaultSortedSetDocValuesReaderState( theSearcher.getIndexReader()); List<QueryResultDocument> theResultDocuments = new ArrayList<>(); long theStartTime = System.currentTimeMillis(); LOGGER.info("Querying for " + aQueryString); DateFormat theDateFormat = new SimpleDateFormat("dd.MMMM.yyyy", Locale.ENGLISH); try {/*ww w . jav a 2 s .c o m*/ List<FacetDimension> theDimensions = new ArrayList<>(); // Search only if a search query is given if (!StringUtils.isEmpty(aQueryString)) { Query theQuery = computeBooleanQueryFor(aQueryString); LOGGER.info(" query is " + theQuery); theQuery = theQuery.rewrite(theSearcher.getIndexReader()); LOGGER.info(" rewritten query is " + theQuery); DrillDownQuery theDrilldownQuery = new DrillDownQuery(facetsConfig, theQuery); aDrilldownFields.entrySet().stream().forEach(aEntry -> { LOGGER.info(" with Drilldown " + aEntry.getKey() + " for " + aEntry.getValue()); theDrilldownQuery.add(aEntry.getKey(), aEntry.getValue()); }); FacetsCollector theFacetCollector = new FacetsCollector(); TopDocs theDocs = FacetsCollector.search(theSearcher, theDrilldownQuery, null, aConfiguration.getNumberOfSearchResults(), theFacetCollector); SortedSetDocValuesFacetCounts theFacetCounts = new SortedSetDocValuesFacetCounts(theSortedSetState, theFacetCollector); List<Facet> theAuthorFacets = new ArrayList<>(); List<Facet> theFileTypesFacets = new ArrayList<>(); List<Facet> theLastModifiedYearFacet = new ArrayList<>(); List<Facet> theLanguageFacet = new ArrayList<>(); LOGGER.info("Found " + theDocs.scoreDocs.length + " documents"); // We need this cache to detect duplicate documents while searching for similarities Set<Integer> 
theUniqueDocumentsFound = new HashSet<>(); Map<String, QueryResultDocument> theDocumentsByHash = new HashMap<>(); for (int i = 0; i < theDocs.scoreDocs.length; i++) { int theDocumentID = theDocs.scoreDocs[i].doc; theUniqueDocumentsFound.add(theDocumentID); Document theDocument = theSearcher.doc(theDocumentID); String theUniqueID = theDocument.getField(IndexFields.UNIQUEID).stringValue(); String theFoundFileName = theDocument.getField(IndexFields.FILENAME).stringValue(); String theHash = theDocument.getField(IndexFields.CONTENTMD5).stringValue(); QueryResultDocument theExistingDocument = theDocumentsByHash.get(theHash); if (theExistingDocument != null) { theExistingDocument.addFileName(theFoundFileName); } else { Date theLastModified = new Date( theDocument.getField(IndexFields.LASTMODIFIED).numericValue().longValue()); SupportedLanguage theLanguage = SupportedLanguage .valueOf(theDocument.getField(IndexFields.LANGUAGESTORED).stringValue()); String theFieldName; if (analyzerCache.supportsLanguage(theLanguage)) { theFieldName = analyzerCache.getFieldNameFor(theLanguage); } else { theFieldName = IndexFields.CONTENT; } String theOriginalContent = theDocument.getField(theFieldName).stringValue(); final Query theFinalQuery = theQuery; ForkJoinTask<String> theHighligherResult = executorPool.submit(() -> { StringBuilder theResult = new StringBuilder(theDateFormat.format(theLastModified)); theResult.append(" - "); Highlighter theHighlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(theFinalQuery)); for (String theFragment : theHighlighter.getBestFragments(analyzer, theFieldName, theOriginalContent, NUMBER_OF_FRAGMENTS)) { if (theResult.length() > 0) { theResult = theResult.append("..."); } theResult = theResult.append(theFragment); } return theResult.toString(); }); int theNormalizedScore = (int) (theDocs.scoreDocs[i].score / theDocs.getMaxScore() * 5); File theFileOnDisk = new File(theFoundFileName); if (theFileOnDisk.exists()) { boolean 
thePreviewAvailable = previewProcessor.previewAvailableFor(theFileOnDisk); theExistingDocument = new QueryResultDocument(theDocumentID, theFoundFileName, theHighligherResult, Long.parseLong(theDocument.getField(IndexFields.LASTMODIFIED).stringValue()), theNormalizedScore, theUniqueID, thePreviewAvailable); theDocumentsByHash.put(theHash, theExistingDocument); theResultDocuments.add(theExistingDocument); } } } if (aConfiguration.isShowSimilarDocuments()) { MoreLikeThis theMoreLikeThis = new MoreLikeThis(theSearcher.getIndexReader()); theMoreLikeThis.setAnalyzer(analyzer); theMoreLikeThis.setMinTermFreq(1); theMoreLikeThis.setMinDocFreq(1); theMoreLikeThis.setFieldNames(analyzerCache.getAllFieldNames()); for (QueryResultDocument theDocument : theResultDocuments) { Query theMoreLikeThisQuery = theMoreLikeThis.like(theDocument.getDocumentID()); TopDocs theMoreLikeThisTopDocs = theSearcher.search(theMoreLikeThisQuery, 5); for (ScoreDoc theMoreLikeThisScoreDoc : theMoreLikeThisTopDocs.scoreDocs) { int theSimilarDocument = theMoreLikeThisScoreDoc.doc; if (theUniqueDocumentsFound.add(theSimilarDocument)) { Document theMoreLikeThisDocument = theSearcher.doc(theSimilarDocument); String theFilename = theMoreLikeThisDocument.getField(IndexFields.FILENAME) .stringValue(); theDocument.addSimilarFile(theFilename); } } } } LOGGER.info("Got Dimensions"); for (FacetResult theResult : theFacetCounts.getAllDims(20000)) { String theDimension = theResult.dim; if ("author".equals(theDimension)) { for (LabelAndValue theLabelAndValue : theResult.labelValues) { if (!StringUtils.isEmpty(theLabelAndValue.label)) { theAuthorFacets.add(new Facet(theLabelAndValue.label, theLabelAndValue.value.intValue(), aBasePath + "/" + encode( FacetSearchUtils.encode(theDimension, theLabelAndValue.label)))); } } } if ("extension".equals(theDimension)) { for (LabelAndValue theLabelAndValue : theResult.labelValues) { if (!StringUtils.isEmpty(theLabelAndValue.label)) { theFileTypesFacets.add(new 
Facet(theLabelAndValue.label, theLabelAndValue.value.intValue(), aBasePath + "/" + encode( FacetSearchUtils.encode(theDimension, theLabelAndValue.label)))); } } } if ("last-modified-year".equals(theDimension)) { for (LabelAndValue theLabelAndValue : theResult.labelValues) { if (!StringUtils.isEmpty(theLabelAndValue.label)) { theLastModifiedYearFacet.add(new Facet(theLabelAndValue.label, theLabelAndValue.value.intValue(), aBasePath + "/" + encode( FacetSearchUtils.encode(theDimension, theLabelAndValue.label)))); } } } if (IndexFields.LANGUAGEFACET.equals(theDimension)) { for (LabelAndValue theLabelAndValue : theResult.labelValues) { if (!StringUtils.isEmpty(theLabelAndValue.label)) { Locale theLocale = new Locale(theLabelAndValue.label); theLanguageFacet.add(new Facet(theLocale.getDisplayLanguage(Locale.ENGLISH), theLabelAndValue.value.intValue(), aBasePath + "/" + encode( FacetSearchUtils.encode(theDimension, theLabelAndValue.label)))); } } } LOGGER.info(" " + theDimension); } if (!theAuthorFacets.isEmpty()) { theDimensions.add(new FacetDimension("Author", theAuthorFacets)); } if (!theLastModifiedYearFacet.isEmpty()) { theDimensions.add(new FacetDimension("Last modified", theLastModifiedYearFacet)); } if (!theFileTypesFacets.isEmpty()) { theDimensions.add(new FacetDimension("File types", theFileTypesFacets)); } if (!theLanguageFacet.isEmpty()) { theDimensions.add(new FacetDimension("Language", theLanguageFacet)); } // Wait for all Tasks to complete for the search result highlighter ForkJoinTask.helpQuiesce(); } long theDuration = System.currentTimeMillis() - theStartTime; LOGGER.info("Total amount of time : " + theDuration + "ms"); return new QueryResult(System.currentTimeMillis() - theStartTime, theResultDocuments, theDimensions, theSearcher.getIndexReader().numDocs(), aBacklink); } catch (Exception e) { throw new RuntimeException(e); } finally { searcherManager.release(theSearcher); } }
From source file:de.mirkosertic.desktopsearch.LuceneIndexHandler.java
License:Open Source License
public File getFileOnDiskForDocument(String aUniqueID) throws IOException { searcherManager.maybeRefreshBlocking(); IndexSearcher theSearcher = searcherManager.acquire(); try {//from ww w . j a v a 2 s. c o m TermQuery theTermQuery = new TermQuery(new Term(IndexFields.UNIQUEID, aUniqueID)); TopDocs theTopDocs = theSearcher.search(theTermQuery, null, 1); if (theTopDocs.totalHits == 1) { Document theDocument = theSearcher.doc(theTopDocs.scoreDocs[0].doc); if (theDocument != null) { return new File(theDocument.get(IndexFields.FILENAME)); } } return null; } finally { searcherManager.release(theSearcher); } }
From source file:de.tudarmstadt.ukp.dkpro.core.io.fangorn.FangornWriterTest.java
License:Apache License
/**
 * End-to-end test: runs a segment/parse/write pipeline over a small document,
 * then queries the resulting Fangorn index for {@code //NP} and compares the
 * formatted hits against the expected result strings.
 *
 * Fix vs. previous revision: removed the dead {@code String[] results} array,
 * which was filled from the index but never read.
 */
@Test
public void test() throws Exception {
    File outputFile = new File("target/test-output");

    // Build a minimal CAS with known text and metadata.
    JCas jcas = JCasFactory.createJCas();
    jcas.setDocumentLanguage("en");
    jcas.setDocumentText("This is a test. I may work. Or it may not work.");
    DocumentMetaData meta = DocumentMetaData.create(jcas);
    meta.setCollectionId("dummyCollection");
    meta.setDocumentId("dummyId");

    AnalysisEngineDescription segmenter = createEngineDescription(OpenNlpSegmenter.class);
    AnalysisEngineDescription parser = createEngineDescription(OpenNlpParser.class,
            OpenNlpParser.PARAM_WRITE_PENN_TREE, true);
    AnalysisEngineDescription writer = createEngineDescription(FangornWriter.class,
            FangornWriter.PARAM_TARGET_LOCATION, outputFile);
    SimplePipeline.runPipeline(jcas, segmenter, parser, writer);

    // Query the freshly written index for all noun phrases.
    IndexSearcher searcher = new IndexSearcher(FSDirectory.getDirectory(outputFile));
    QueryBuilder builder = new QueryBuilder("//NP");
    TreebankQuery tq = builder.parse(TermJoinType.SIMPLE_WITH_FC, false);
    SimpleHitCollector hitCollector = new SimpleHitCollector(100);
    searcher.search(tq, hitCollector);
    AllResults allResults = new AllResults(hitCollector.hits, hitCollector.totalHits, tq);
    Result[] resultMeta = allResults.collect(searcher);

    // Format each hit as "collection doc begin end matchJson" for comparison.
    List<String> actual = new ArrayList<String>();
    for (int i = 0; i < hitCollector.totalHits; i++) {
        Document doc = searcher.doc(hitCollector.hits[i]);
        actual.add(String.format("%s %s %s %s %s", doc.get(FangornWriter.FIELD_COLLECTION_ID),
                doc.get(FangornWriter.FIELD_DOCUMENT_ID), doc.get(FangornWriter.FIELD_BEGIN),
                doc.get(FangornWriter.FIELD_END), resultMeta[i].asJSONString().replace('"', '\'')));
    }

    List<String> expected = asList(
            "dummyCollection dummyId 0 15 {'num':'2','ms':[{'m':[{'s':'','e':'1_0_2_8','o':'0','t':'0'}]},{'m':[{'s':'','e':'4_2_3_6','o':'0','t':'0'}]}]}",
            "dummyCollection dummyId 16 27 {'num':'1','ms':[{'m':[{'s':'','e':'1_0_2_7','o':'0','t':'0'}]}]}",
            "dummyCollection dummyId 28 47 {'num':'1','ms':[{'m':[{'s':'','e':'2_1_2_9','o':'0','t':'0'}]}]}");

    assertEquals(StringUtils.join(expected, "\n"), StringUtils.join(actual, "\n"));
}
From source file:de.tudarmstadt.ukp.experiments.argumentation.clustering.debatefiltering.LuceneSearcher.java
License:Apache License
public List<String> retrieveTopNDocs(String textQuery, int topN) throws Exception { // Now search the index: Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); Directory directory = FSDirectory.open(luceneIndexDir); IndexReader reader = DirectoryReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(reader); // Parse a simple query QueryParser parser = new QueryParser(Version.LUCENE_44, LuceneIndexer.FIELD_TEXT_CONTENT, analyzer); Query query = parser.parse(textQuery); ScoreDoc[] hits = indexSearcher.search(query, null, topN).scoreDocs; List<String> result = new ArrayList<>(); // Iterate through the results: for (int i = 0; i < hits.length; i++) { Document hitDoc = indexSearcher.doc(hits[i].doc); result.add(hitDoc.getField(LuceneIndexer.FIELD_FILE).stringValue()); // System.out.println(hitDoc.toString()); // assertEquals("This is the text to be indexed.", hitDoc.get("fieldname")); }/*from ww w .j av a 2s .c o m*/ reader.close(); directory.close(); return result; }
From source file:de.twitterlivesearch.analysis.Searcher.java
License:Apache License
/** * This is the same as// w ww .ja va 2s. co m * {@link de.twitterlivesearch.analysis.Searcher#searchForTweets(String) * searchForTweets(String)}, but the search is limited to the tweet with the * given id. This can for example be used to analyze the latest incoming * tweet. * * @param id * @param queryString * @return */ public List<Document> searchForTweets(Integer id, String queryString) { if (queryString.isEmpty()) { return Collections.emptyList(); } AbstractConfiguration config = ConfigurationHolder.getConfiguration(); try { if (!DirectoryReader.indexExists(directory)) { return null; } } catch (IOException e) { log.fatal("Error when trying to check if directory exists!", e); return new ArrayList<>(); } DirectoryReader ireader; try { ireader = DirectoryReader.open(directory); } catch (IOException e) { log.fatal("Error when trying to open directory!", e); return null; } IndexSearcher isearcher = new IndexSearcher(ireader); Query textQuery = null; QueryParser parser = new QueryParser(FieldNames.TEXT.getField(), AnalyzerMapping.getInstance().ANALYZER_FOR_DELIMITER); parser.setDefaultOperator(config.getDefaultOperator()); BooleanQuery query = new BooleanQuery(); try { textQuery = parser.parse(queryString); } catch (ParseException e) { log.fatal("Error while parsing query: " + queryString, e); } // if id does not equal null only the query with the given id will be // searched // this can be used to search the latest element only if (id != null) { Query idQuery = NumericRangeQuery.newIntRange(FieldNames.ID.getField(), id.intValue(), id.intValue(), true, true); query.add(idQuery, Occur.MUST); } query.add(textQuery, Occur.MUST); ScoreDoc[] hits = null; try { hits = isearcher.search(query, 1000).scoreDocs; } catch (IOException e) { log.fatal("Error while trying to search!", e); } List<Document> result = new ArrayList<>(); for (int i = 0; i < hits.length; i++) { try { result.add(isearcher.doc(hits[i].doc)); log.info("Found result for query \"" + queryString + "\"."); } 
catch (IOException e) { log.fatal("Error when getting document!", e); } } return result; }
From source file:de.unidue.inf.is.ezdl.dlservices.search.handlers.ranking.LuceneRanker.java
License:Open Source License
private void calculateRSVs(Directory directory, ResultDocumentList documentList, DocumentQuery documentQuery) throws ParseException, CorruptIndexException, IOException { Query query = new QueryParser(Version.LUCENE_31, de.unidue.inf.is.ezdl.dlcore.data.fields.Field.TEXT.toString(), new SimpleAnalyzer(Version.LUCENE_31)).parse(queryConverter.convert(documentQuery.getQuery())); IndexSearcher searcher = new IndexSearcher(directory, true); TopDocs topDocs = searcher.search(query, 1000); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { String oid = searcher.doc(scoreDoc.doc).get("oid"); float score = scoreDoc.score; ResultDocument document = getDocumentByOid(documentList, oid); if (document != null) { document.setUnnormalizedRsv(score); }/* www. j a va 2 s.c o m*/ } }
From source file:de.uni_koeln.spinfo.maalr.lucene.core.Dictionary.java
License:Apache License
private QueryResult toQueryResult(TopDocs docs, int startIndex, int pageSize) throws NoIndexAvailableException, BrokenIndexException, IOException, InvalidTokenOffsetsException { final ArrayList<LemmaVersion> results = new ArrayList<LemmaVersion>(pageSize); final ScoreDoc[] scoreDocs = docs.scoreDocs; IndexSearcher searcher = indexProvider.getSearcher(); for (int i = startIndex; i < scoreDocs.length && i < startIndex + pageSize; i++) { Document doc = searcher.doc(scoreDocs[i].doc); LemmaVersion e = indexManager.getLemmaVersion(doc); results.add(e);// ww w . j a v a 2 s. c om } return new QueryResult(results, docs.totalHits, pageSize); }
From source file:Demo1.MyServlet.java
private void gotoSearch(PrintWriter out, HttpServletRequest request, HttpServletResponse response) { try {/* w w w. j a v a 2 s. c o m*/ // Text to search String querystr = request.getParameter("keyword"); log.addHistory(querystr); // The \"title\" arg specifies the default field to use when no field is explicitly specified in the query Query q = new QueryParser("Searching", analyzer).parse(querystr); // Searching code int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage); searcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // Code to display the results of search //out.println("Found " + hits.length + " Classes Matching your Requirement"); courseList = new ArrayList(); for (int i = 0; i < hits.length; ++i) { int docId = hits[i].doc; Document d = searcher.doc(docId); Land course = new Land(d.get("name"), d.get("price"), d.get("area"), d.get("purpose")); //out.println((i + 1) + ". " + d.get("Number")+ d.get("Classes") ); courseList.add(course); } request.setAttribute("Lands", courseList); RequestDispatcher de = request.getRequestDispatcher("/table.jsp"); de.forward(request, response); // reader can only be closed when there is no need to access the documents any more reader.close(); } catch (Exception e) { System.out.println(e.getMessage()); } }
From source file:Demo2.MyServlet.java
private void gotoSearch(PrintWriter out, HttpServletRequest request, HttpServletResponse response) { try {// www . j a va 2 s .c o m // Text to search String querystr = request.getParameter("keyword"); log.addHistory(querystr); // The \"title\" arg specifies the default field to use when no field is explicitly specified in the query Query q = new QueryParser("Classes", analyzer).parse(querystr); // Searching code int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage); searcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // Code to display the results of search //out.println("Found " + hits.length + " Classes Matching your Requirement"); courseList = new ArrayList(); for (int i = 0; i < hits.length; ++i) { int docId = hits[i].doc; Document d = searcher.doc(docId); Child course = new Child(d.get("Number"), d.get("Classes"), d.get("Time"), d.get("Department")); //out.println((i + 1) + ". " + d.get("Number")+ d.get("Classes") ); courseList.add(course); } request.setAttribute("course", courseList); RequestDispatcher de = request.getRequestDispatcher("/table.jsp"); de.forward(request, response); // reader can only be closed when there is no need to access the documents any more reader.close(); } catch (Exception e) { System.out.println(e.getMessage()); } }
From source file:di.uniba.it.tri.shell.Command.java
License:Open Source License
private void search(String cmd) throws Exception { String[] split = cmd.split("\\s+"); if (split.length > 2) { if (reader == null) { throw new Exception("no index in memory"); } else {// w ww.j a v a 2 s .co m if (!split[1].matches("[0-9]+")) { throw new Exception("no valid number of results"); } StringBuilder qs = new StringBuilder(); for (int i = 2; i < split.length; i++) { qs.append(split[i]).append(" "); } //String q = QueryParser.escape(qs.toString().trim()); Query query = parser.parse(qs.toString().trim()); IndexSearcher searcher = new IndexSearcher(reader); TopDocs topDocs = searcher.search(query, Integer.parseInt(split[1])); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { TriShell.print(searcher.doc(scoreDoc.doc).get("word")); TriShell.print("\t"); TriShell.println(String.valueOf(scoreDoc.score)); } } } else { throw new Exception("search syntax error"); } }