Usage examples for the org.apache.lucene.search.IndexSearcher.doc(int) method
public Document doc(int docID) throws IOException
Sugar for .getIndexReader().document(docID)
From source file:edu.cmu.lti.oaqa.baseqa.concept.rerank.LuceneInMemoryConceptReranker.java
License:Apache License
@Override public void process(JCas jcas) throws AnalysisEngineProcessException { List<ConceptSearchResult> results = TypeUtil.getRankedConceptSearchResults(jcas); // calculate field scores Map<String, ConceptSearchResult> uri2result = results.stream().collect(toMap(ConceptSearchResult::getUri, Function.identity(), (r1, r2) -> r1.getScore() > r2.getScore() ? r1 : r2)); List<Document> luceneDocs = results.stream().map(LuceneInMemoryConceptReranker::toLuceneDocument) .collect(toList());//from ww w . j a v a 2 s .c o m RAMDirectory index = new RAMDirectory(); try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) { writer.addDocuments(luceneDocs); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } AbstractQuery aquery = TypeUtil.getAbstractQueries(jcas).iterator().next(); String queryString = queryStringConstructor.construct(aquery); LOG.info("Query string: {}", queryString); Map<String, Float> uri2score = new HashMap<>(); try (IndexReader reader = DirectoryReader.open(index)) { IndexSearcher searcher = new IndexSearcher(reader); Query query = parser.parse(queryString); ScoreDoc[] scoreDocs = searcher.search(query, hits).scoreDocs; for (ScoreDoc scoreDoc : scoreDocs) { uri2score.put(searcher.doc(scoreDoc.doc).get("uri"), scoreDoc.score); } } catch (IOException | ParseException e) { throw new AnalysisEngineProcessException(e); } // calculate score for (Map.Entry<String, ConceptSearchResult> entry : uri2result.entrySet()) { String uri = entry.getKey(); ConceptSearchResult result = entry.getValue(); double score = uri2score.getOrDefault(uri, 0F) * weight + result.getScore(); result.setScore(score); } TypeUtil.rankedSearchResultsByScore(results, limit); LOG.info("Reranked {} concepts.", uri2score.size()); if (LOG.isDebugEnabled()) { results.stream().sorted(TypeUtil.SEARCH_RESULT_RANK_COMPARATOR).limit(20).map(TypeUtil::toString) .forEachOrdered(s -> LOG.debug(" - {}", s)); } }
From source file:edu.cmu.lti.oaqa.baseqa.document.rerank.LogRegDocumentReranker.java
License:Apache License
@Override public void process(JCas jcas) throws AnalysisEngineProcessException { /*/*from www. ja v a 2 s .c om*/ * ("arthritis"[MeSH Terms] OR "arthritis"[All Fields]) * AND common[All Fields] AND ("men"[MeSH Terms] OR "men"[All Fields])) OR ("women"[MeSH Terms] OR "women"[All Fields]) */ // calculate field scores List<Document> documents = TypeUtil.getRankedDocuments(jcas); Map<String, Document> id2doc = documents.stream().collect(toMap(Document::getDocId, Function.identity())); List<org.apache.lucene.document.Document> luceneDocs = documents.stream() .map(LogRegDocumentReranker::toLuceneDocument).collect(toList()); RAMDirectory index = new RAMDirectory(); try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) { writer.addDocuments(luceneDocs); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } AbstractQuery aquery = TypeUtil.getAbstractQueries(jcas).iterator().next(); String queryString = queryStringConstructor.construct(aquery); LOG.info("Search for query: {}", queryString); Map<String, Float> id2titleScore = new HashMap<>(); Map<String, Float> id2textScore = new HashMap<>(); try (IndexReader reader = DirectoryReader.open(index)) { IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(new BM25Similarity()); Query titleQuery = parser.createBooleanQuery("title", queryString); ScoreDoc[] titleScoreDocs = searcher.search(titleQuery, hits).scoreDocs; LOG.info(" - Title matches: {}", titleScoreDocs.length); for (ScoreDoc titleScoreDoc : titleScoreDocs) { id2titleScore.put(searcher.doc(titleScoreDoc.doc).get("id"), titleScoreDoc.score); } Query textQuery = parser.createBooleanQuery("text", queryString); ScoreDoc[] textScoreDocs = searcher.search(textQuery, hits).scoreDocs; LOG.info(" - Text matches: {}", textScoreDocs.length); for (ScoreDoc textScoreDoc : textScoreDocs) { id2textScore.put(searcher.doc(textScoreDoc.doc).get("id"), textScoreDoc.score); } } catch (IOException e) { throw new 
AnalysisEngineProcessException(e); } // set score for (Map.Entry<String, Document> entry : id2doc.entrySet()) { String id = entry.getKey(); Document doc = entry.getValue(); doc.setScore(calculateScore(doc.getRank(), id2titleScore.getOrDefault(id, 0f), id2textScore.getOrDefault(id, 0f))); } TypeUtil.rankedSearchResultsByScore(documents, hits); }
From source file:edu.cmu.lti.oaqa.baseqa.passage.retrieval.ImprovedLuceneInMemorySentenceRetrievalExecutor.java
License:Apache License
@Override public void process(JCas jcas) throws AnalysisEngineProcessException { // create lucene documents for all sentences in all sections and delete the duplicate ones Map<Integer, Passage> hash2passage = new HashMap<Integer, Passage>(); for (Passage d : TypeUtil.getRankedPassages(jcas)) { for (Passage s : RetrievalUtil.extractSentences(jcas, d, chunker)) { if (!hash2passage.containsKey(TypeUtil.hash(s))) { hash2passage.put(TypeUtil.hash(s), s); }// w w w.j a v a 2 s. co m } } // remove the documents from pipeline TypeUtil.getRankedPassages(jcas).forEach(Passage::removeFromIndexes); List<Document> luceneDocs = hash2passage.values().stream().map(RetrievalUtil::createLuceneDocument) .collect(toList()); // create lucene index RAMDirectory index = new RAMDirectory(); try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) { writer.addDocuments(luceneDocs); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } // search in the index AbstractQuery aquery = TypeUtil.getAbstractQueries(jcas).stream().findFirst().get(); Map<Integer, Float> hash2score = new HashMap<>(); try (IndexReader reader = DirectoryReader.open(index)) { IndexSearcher searcher = new IndexSearcher(reader); String queryString = queryStringConstructor.construct(aquery).replace("\"", " ").replace("/", " ") .replace("[", " ").replace("]", " "); LOG.info("Search for query: {}", queryString); // construct the query Query query = parser.parse(queryString); LOG.trace(query.toString()); searcher.setSimilarity(new BM25Similarity()); ScoreDoc[] scoreDocs = searcher.search(query, hits).scoreDocs; for (ScoreDoc scoreDoc : scoreDocs) { float score = scoreDoc.score; int hash; hash = Integer.parseInt(searcher.doc(scoreDoc.doc).get("hash")); hash2score.put(hash, score); } } catch (IOException | ParseException e) { throw new AnalysisEngineProcessException(e); } LOG.info("The size of Returned Sentences: {}", hash2score.size()); // add to CAS 
hash2score.entrySet().stream().map(entry -> { Passage passage = hash2passage.get(entry.getKey()); passage.setScore(entry.getValue()); return passage; }).sorted(Comparator.comparing(Passage::getScore).reversed()).forEach(Passage::addToIndexes); Collection<Passage> snippets = TypeUtil.getRankedPassages(jcas); // rank the snippet and add them to pipeline rankSnippets(jcas, calSkip(jcas, hash2passage), calBM25(jcas, hash2passage), calAlignment(jcas, hash2passage), calSentenceLength(hash2passage), hash2passage); }
From source file:edu.cmu.lti.oaqa.baseqa.passage.retrieval.ImprovedLuceneInMemorySentenceRetrievalExecutor.java
License:Apache License
private Map<Integer, Float> calBM25(JCas jcas, Map<Integer, Passage> hash2passage) throws AnalysisEngineProcessException { // index the documents using lucene List<Document> luceneDocs = hash2passage.values().stream().map(RetrievalUtil::createLuceneDocument) .collect(toList());/* w w w . j a va 2s.com*/ // create lucene index RAMDirectory index = new RAMDirectory(); try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) { writer.addDocuments(luceneDocs); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } // search in the index AbstractQuery aquery = TypeUtil.getAbstractQueries(jcas).stream().findFirst().get(); Map<Integer, Float> hash2score = new HashMap<>(); try (IndexReader reader = DirectoryReader.open(index)) { IndexSearcher searcher = new IndexSearcher(reader); String queryString = queryStringConstructor.construct(aquery).replace("\"", " ").replace("/", " ") .replace("[", " ").replace("]", " "); LOG.info("Search for query: {}", queryString); // construct the query Query query = parser.parse(queryString); searcher.setSimilarity(new BM25Similarity()); ScoreDoc[] scoreDocs = searcher.search(query, hits).scoreDocs; for (ScoreDoc scoreDoc : scoreDocs) { float score = scoreDoc.score; int hash; hash = Integer.parseInt(searcher.doc(scoreDoc.doc).get("hash")); hash2score.put(hash, score); } } catch (IOException | ParseException e) { throw new AnalysisEngineProcessException(e); } return hash2score; }
From source file:edu.cmu.lti.oaqa.baseqa.passage.retrieval.LuceneInMemorySentenceRetrievalExecutor.java
License:Apache License
@Override public void process(JCas jcas) throws AnalysisEngineProcessException { // create lucene documents for all sentences in all sections Map<Integer, Passage> hash2passage = TypeUtil.getRankedPassages(jcas).stream() .flatMap(sec -> RetrievalUtil.extractSentences(jcas, sec, chunker).stream()) .collect(toMap(TypeUtil::hash, Function.identity(), (x, y) -> y)); List<Document> luceneDocs = hash2passage.values().stream().map(RetrievalUtil::createLuceneDocument) .collect(toList());/*from w w w .ja v a 2 s. c o m*/ // create lucene index RAMDirectory index = new RAMDirectory(); try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) { writer.addDocuments(luceneDocs); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } // search in the index AbstractQuery aquery = TypeUtil.getAbstractQueries(jcas).stream().findFirst().get(); Map<Integer, Float> hash2score = new HashMap<>(); try (IndexReader reader = DirectoryReader.open(index)) { IndexSearcher searcher = new IndexSearcher(reader); String queryString = queryStringConstructor.construct(aquery); LOG.info("Search for query: {}", queryString); Query query = parser.parse(queryString); ScoreDoc[] scoreDocs = searcher.search(query, hits).scoreDocs; for (ScoreDoc scoreDoc : scoreDocs) { float score = scoreDoc.score; int hash; hash = Integer.parseInt(searcher.doc(scoreDoc.doc).get("hash")); hash2score.put(hash, score); } } catch (IOException | ParseException e) { throw new AnalysisEngineProcessException(e); } // add to CAS hash2score.entrySet().stream().map(entry -> { Passage passage = hash2passage.get(entry.getKey()); passage.setScore(entry.getValue()); return passage; }).sorted(Comparator.comparing(Passage::getScore).reversed()).forEach(Passage::addToIndexes); }
From source file:edu.coeia.tasks.ChatLoadingTask.java
License:Open Source License
private void displayChatSessionFast() throws IOException { try {// w w w. ja v a2 s . c o m Directory directory = FSDirectory .open(new File(this.panel.getCaseFacade().getCaseIndexFolderLocation())); IndexSearcher searcher = new IndexSearcher(directory); QueryParser parser = new QueryParser(Version.LUCENE_30, IndexingConstant.CHAT_AGENT, new StopAnalyzer(Version.LUCENE_30)); parser.setAllowLeadingWildcard(true); Query query = parser.parse(panel.getAgent()); TopDocs topDocs = searcher.search(query, 5000); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { final Document document = searcher.doc(scoreDoc.doc); String chatFile = document.get(IndexingConstant.CHAT_FILE); if (chatFile != null && !chatFile.trim().isEmpty()) { if (chatFile.endsWith(this.fileName)) { EventQueue.invokeLater(new Runnable() { @Override public void run() { ChatItem item = (ChatItem) ItemFactory.newInstance(document, panel.getCaseFacade(), false); Object[] data = new Object[] { item.getFrom(), item.getTo(), item.getMessageText(), item.getDate() }; JTableUtil.addRowToJTable(panel.getTable(), data); } }); } } } searcher.close(); } catch (ParseException ex) { Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:edu.coeia.tasks.ChatRefreshTask.java
License:Open Source License
private Set<String> getChatFilePathFast() throws IOException { Set<String> result = new HashSet<String>(); try {//from w w w . ja v a2 s . c om Directory directory = FSDirectory .open(new File(this.panel.getCaseFacade().getCaseIndexFolderLocation())); IndexSearcher searcher = new IndexSearcher(directory); QueryParser parser = new QueryParser(Version.LUCENE_30, IndexingConstant.CHAT_AGENT, new StopAnalyzer(Version.LUCENE_30)); parser.setAllowLeadingWildcard(true); Query query = parser.parse(panel.getAgent()); TopDocs topDocs = searcher.search(query, 5000); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document document = searcher.doc(scoreDoc.doc); String chatFile = document.get(IndexingConstant.CHAT_FILE); if (chatFile != null && !chatFile.trim().isEmpty()) { chatFile = this.panel.getCaseFacade().getFullPath(chatFile); final File path = new File(chatFile); result.add(path.getName()); } } searcher.close(); } catch (ParseException ex) { Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex); } return result; }
From source file:edu.coeia.tasks.EmailLoadingTask.java
License:Open Source License
private void getAllEmailMessagesFast(final String path, final String constant, final String type) throws IOException { List<Integer> ids = new ArrayList<Integer>(); try {// w w w. j a va 2 s . co m Directory directory = FSDirectory .open(new File(this.panel.getCaseFacade().getCaseIndexFolderLocation())); IndexSearcher searcher = new IndexSearcher(directory); QueryParser parser = new QueryParser(Version.LUCENE_30, IndexingConstant.DOCUMENT_TYPE, new StopAnalyzer(Version.LUCENE_30)); parser.setAllowLeadingWildcard(true); Query query = parser.parse("email"); TopDocs topDocs = searcher.search(query, 100000); for (ScoreDoc scoreDocs : topDocs.scoreDocs) { Document document = searcher.doc(scoreDocs.doc); String emailPath = document.get(constant); if (emailPath != null && !emailPath.trim().isEmpty()) { if (emailPath.endsWith(path)) { final EmailItem item = (EmailItem) ItemFactory.newInstance(document, panel.getCaseFacade(), false); EventQueue.invokeLater(new Runnable() { @Override public void run() { JTableUtil.addRowToJTable(panel.getTable(), item.getFullDisplayData()); } }); ids.add(Integer.valueOf(item.getDocumentId())); } } } searcher.close(); } catch (ParseException ex) { Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex); } this.panel.setResultIds(ids); }
From source file:edu.coeia.tasks.EmailRefreshTask.java
License:Open Source License
private Set<String> getOfflineEmailsPaths() throws IOException { Set<String> result = new HashSet<String>(); Directory directory = FSDirectory.open(new File(this.panel.getCaseFacade().getCaseIndexFolderLocation())); IndexSearcher searcher = new IndexSearcher(directory); Query query = new TermQuery(new Term(IndexingConstant.DOCUMENT_TYPE, "email")); TopDocs topDocs = searcher.search(query, 5000); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document document = searcher.doc(scoreDoc.doc); String offlineEmailPath = document.get(IndexingConstant.OFFLINE_EMAIL_PATH); if (offlineEmailPath != null && !offlineEmailPath.trim().isEmpty()) { result.add(offlineEmailPath); }//from w w w . j a v a 2 s .c o m } searcher.close(); return result; }
From source file:edu.coeia.tasks.ExtensionFrequencyTask.java
License:Open Source License
private Map<String, Double> getExtensionFreqFast() throws IOException { Map<String, Double> map = new HashMap<String, Double>(); try {/*from ww w . j av a2 s . com*/ Directory directory = FSDirectory.open(new File(this.caseFacade.getCaseIndexFolderLocation())); IndexSearcher searcher = new IndexSearcher(directory); QueryParser parser = new QueryParser(Version.LUCENE_30, IndexingConstant.DOCUMENT_TYPE, new StopAnalyzer(Version.LUCENE_30)); parser.setAllowLeadingWildcard(true); Query query = parser.parse("file"); TopDocs topDocs = searcher.search(query, 100000); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document document = searcher.doc(scoreDoc.doc); String filePath = document.get(IndexingConstant.FILE_PATH); if (filePath != null && !filePath.trim().isEmpty()) { final File path = new File(filePath); String ext = FileUtil.getExtension(path); if (ext == null || ext.length() > 6) // no more extension than 5 character! continue; ext = ext.toLowerCase(); if (map.get(ext) == null) { map.put(ext, 1.0); } else map.put(ext, map.get(ext) + 1); } } searcher.close(); } catch (ParseException ex) { Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex); } return map; }