Example usage for org.apache.lucene.search IndexSearcher doc

List of usage examples for org.apache.lucene.search IndexSearcher doc

Introduction

On this page you can find example usages of the org.apache.lucene.search.IndexSearcher doc method.

Prototype

public Document doc(int docID) throws IOException 

Source Link

Document

Sugar for .getIndexReader().document(docID)

Usage

From source file:edu.cmu.lti.oaqa.baseqa.concept.rerank.LuceneInMemoryConceptReranker.java

License:Apache License

/**
 * Reranks the concept search results found in the CAS with the help of a temporary in-memory
 * Lucene index.
 *
 * <p>Each result is converted to a Lucene document and indexed; the query string built from the
 * first abstract query is then searched, and the Lucene score of every hit (times
 * {@code weight}) is added to the existing score of the result with the same URI. Results
 * without a hit keep their original score.
 *
 * @param jcas the CAS providing the concept search results and the abstract queries
 * @throws AnalysisEngineProcessException if indexing, query parsing, or searching fails
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    List<ConceptSearchResult> results = TypeUtil.getRankedConceptSearchResults(jcas);
    // One entry per URI; when two results share a URI, the first is kept only if its score is
    // strictly higher, otherwise the later one wins.
    Map<String, ConceptSearchResult> uri2result = results.stream()
            .collect(toMap(ConceptSearchResult::getUri, Function.identity(),
                    (first, second) -> first.getScore() > second.getScore() ? first : second));
    List<Document> luceneDocs = results.stream()
            .map(LuceneInMemoryConceptReranker::toLuceneDocument)
            .collect(toList());
    // Build the in-memory index over all results.
    RAMDirectory index = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) {
        writer.addDocuments(luceneDocs);
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
    // Only the first abstract query is used to build the query string.
    AbstractQuery aquery = TypeUtil.getAbstractQueries(jcas).iterator().next();
    String queryString = queryStringConstructor.construct(aquery);
    LOG.info("Query string: {}", queryString);
    // Collect the Lucene score of every hit, keyed by the stored "uri" field.
    Map<String, Float> uri2score = new HashMap<>();
    try (IndexReader reader = DirectoryReader.open(index)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = parser.parse(queryString);
        for (ScoreDoc hit : searcher.search(query, hits).scoreDocs) {
            String uri = searcher.doc(hit.doc).get("uri");
            uri2score.put(uri, hit.score);
        }
    } catch (IOException | ParseException e) {
        throw new AnalysisEngineProcessException(e);
    }
    // Combine: weighted Lucene score (0 when the URI was not hit) plus the previous score.
    uri2result.forEach((uri, result) -> {
        double score = uri2score.getOrDefault(uri, 0F) * weight + result.getScore();
        result.setScore(score);
    });
    TypeUtil.rankedSearchResultsByScore(results, limit);
    LOG.info("Reranked {} concepts.", uri2score.size());
    if (LOG.isDebugEnabled()) {
        results.stream()
                .sorted(TypeUtil.SEARCH_RESULT_RANK_COMPARATOR)
                .limit(20)
                .map(TypeUtil::toString)
                .forEachOrdered(line -> LOG.debug(" - {}", line));
    }
}

From source file:edu.cmu.lti.oaqa.baseqa.document.rerank.LogRegDocumentReranker.java

License:Apache License

@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    /*/*from www. ja  v a  2 s  .c  om*/
    * ("arthritis"[MeSH Terms] OR "arthritis"[All Fields])
    *  AND common[All Fields] AND ("men"[MeSH Terms] OR "men"[All Fields])) OR ("women"[MeSH Terms] OR "women"[All Fields])
    */
    // calculate field scores
    List<Document> documents = TypeUtil.getRankedDocuments(jcas);
    Map<String, Document> id2doc = documents.stream().collect(toMap(Document::getDocId, Function.identity()));
    List<org.apache.lucene.document.Document> luceneDocs = documents.stream()
            .map(LogRegDocumentReranker::toLuceneDocument).collect(toList());
    RAMDirectory index = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) {
        writer.addDocuments(luceneDocs);
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
    AbstractQuery aquery = TypeUtil.getAbstractQueries(jcas).iterator().next();
    String queryString = queryStringConstructor.construct(aquery);
    LOG.info("Search for query: {}", queryString);
    Map<String, Float> id2titleScore = new HashMap<>();
    Map<String, Float> id2textScore = new HashMap<>();
    try (IndexReader reader = DirectoryReader.open(index)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(new BM25Similarity());
        Query titleQuery = parser.createBooleanQuery("title", queryString);
        ScoreDoc[] titleScoreDocs = searcher.search(titleQuery, hits).scoreDocs;
        LOG.info(" - Title matches: {}", titleScoreDocs.length);
        for (ScoreDoc titleScoreDoc : titleScoreDocs) {
            id2titleScore.put(searcher.doc(titleScoreDoc.doc).get("id"), titleScoreDoc.score);
        }
        Query textQuery = parser.createBooleanQuery("text", queryString);
        ScoreDoc[] textScoreDocs = searcher.search(textQuery, hits).scoreDocs;
        LOG.info(" - Text matches: {}", textScoreDocs.length);
        for (ScoreDoc textScoreDoc : textScoreDocs) {
            id2textScore.put(searcher.doc(textScoreDoc.doc).get("id"), textScoreDoc.score);
        }
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
    // set score
    for (Map.Entry<String, Document> entry : id2doc.entrySet()) {
        String id = entry.getKey();
        Document doc = entry.getValue();
        doc.setScore(calculateScore(doc.getRank(), id2titleScore.getOrDefault(id, 0f),
                id2textScore.getOrDefault(id, 0f)));
    }
    TypeUtil.rankedSearchResultsByScore(documents, hits);
}

From source file:edu.cmu.lti.oaqa.baseqa.passage.retrieval.ImprovedLuceneInMemorySentenceRetrievalExecutor.java

License:Apache License

/**
 * Replaces the ranked passages in the CAS by their individual sentences, scored with Lucene.
 *
 * <p>All ranked passages are split into sentences (duplicates, as determined by
 * {@code TypeUtil.hash}, are dropped, keeping the first occurrence), the original passages are
 * removed from the CAS indexes, and the sentences are indexed in memory. The query string
 * built from the first abstract query is searched with BM25 similarity; every hit sentence
 * receives its Lucene score and is added to the CAS indexes. Finally {@code rankSnippets} is
 * invoked with the feature maps computed over all sentences.
 *
 * @param jcas the CAS providing the ranked passages and the abstract queries
 * @throws AnalysisEngineProcessException if indexing, query parsing, or searching fails
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    // Collect every sentence of every passage, keeping only the first one seen per hash.
    Map<Integer, Passage> hash2passage = new HashMap<>();
    for (Passage passage : TypeUtil.getRankedPassages(jcas)) {
        for (Passage sentence : RetrievalUtil.extractSentences(jcas, passage, chunker)) {
            Integer key = TypeUtil.hash(sentence);
            if (!hash2passage.containsKey(key)) {
                hash2passage.put(key, sentence);
            }
        }
    }
    // The original passages leave the pipeline; sentences will take their place.
    TypeUtil.getRankedPassages(jcas).forEach(Passage::removeFromIndexes);
    List<Document> luceneDocs = hash2passage.values().stream()
            .map(RetrievalUtil::createLuceneDocument)
            .collect(toList());
    // Build the in-memory index over all sentences.
    RAMDirectory index = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) {
        writer.addDocuments(luceneDocs);
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
    // Search the index with the first abstract query.
    AbstractQuery aquery = TypeUtil.getAbstractQueries(jcas).stream().findFirst().get();
    Map<Integer, Float> hash2score = new HashMap<>();
    try (IndexReader reader = DirectoryReader.open(index)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        // Quotes, slashes and square brackets are blanked out before the string is parsed.
        String queryString = queryStringConstructor.construct(aquery)
                .replace("\"", " ")
                .replace("/", " ")
                .replace("[", " ")
                .replace("]", " ");
        LOG.info("Search for query: {}", queryString);

        Query query = parser.parse(queryString);
        LOG.trace(query.toString());
        searcher.setSimilarity(new BM25Similarity());
        for (ScoreDoc hit : searcher.search(query, hits).scoreDocs) {
            // The stored "hash" field links the Lucene document back to its sentence.
            int hash = Integer.parseInt(searcher.doc(hit.doc).get("hash"));
            hash2score.put(hash, hit.score);
        }
    } catch (IOException | ParseException e) {
        throw new AnalysisEngineProcessException(e);
    }
    LOG.info("The size of Returned Sentences: {}", hash2score.size());
    // Score every hit sentence, then add the hits to the CAS in descending score order.
    hash2score.forEach((hash, score) -> hash2passage.get(hash).setScore(score));
    hash2score.keySet().stream()
            .map(hash2passage::get)
            .sorted(Comparator.comparing(Passage::getScore).reversed())
            .forEach(Passage::addToIndexes);

    // NOTE(review): this collection is never read below — confirm whether the lookup is needed.
    Collection<Passage> snippets = TypeUtil.getRankedPassages(jcas);

    // Rank the snippets from the per-sentence feature maps and add them to the pipeline.
    rankSnippets(jcas, calSkip(jcas, hash2passage), calBM25(jcas, hash2passage),
            calAlignment(jcas, hash2passage), calSentenceLength(hash2passage), hash2passage);

}

From source file:edu.cmu.lti.oaqa.baseqa.passage.retrieval.ImprovedLuceneInMemorySentenceRetrievalExecutor.java

License:Apache License

/**
 * Computes a BM25 score for the given sentences against the first abstract query in the CAS.
 *
 * <p>The passages are indexed in a temporary in-memory Lucene index and searched with
 * {@link BM25Similarity}; each hit is mapped back to its passage through the stored
 * {@code "hash"} field.
 *
 * @param jcas         the CAS providing the abstract queries
 * @param hash2passage the passages to score, keyed by their hash
 * @return the Lucene score of every hit passage, keyed by hash; passages that were not hit
 *         are absent from the map
 * @throws AnalysisEngineProcessException if indexing, query parsing, or searching fails
 */
private Map<Integer, Float> calBM25(JCas jcas, Map<Integer, Passage> hash2passage)
        throws AnalysisEngineProcessException {
    // Convert all passages to Lucene documents and index them in memory.
    List<Document> luceneDocs = hash2passage.values().stream()
            .map(RetrievalUtil::createLuceneDocument)
            .collect(toList());
    RAMDirectory index = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) {
        writer.addDocuments(luceneDocs);
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
    // Search the index with the first abstract query.
    AbstractQuery aquery = TypeUtil.getAbstractQueries(jcas).stream().findFirst().get();
    Map<Integer, Float> hash2score = new HashMap<>();
    try (IndexReader reader = DirectoryReader.open(index)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        // Quotes, slashes and square brackets are blanked out before the string is parsed.
        String queryString = queryStringConstructor.construct(aquery)
                .replace("\"", " ")
                .replace("/", " ")
                .replace("[", " ")
                .replace("]", " ");
        LOG.info("Search for query: {}", queryString);

        Query query = parser.parse(queryString);
        searcher.setSimilarity(new BM25Similarity());
        for (ScoreDoc hit : searcher.search(query, hits).scoreDocs) {
            int hash = Integer.parseInt(searcher.doc(hit.doc).get("hash"));
            hash2score.put(hash, hit.score);
        }
    } catch (IOException | ParseException e) {
        throw new AnalysisEngineProcessException(e);
    }
    return hash2score;
}

From source file:edu.cmu.lti.oaqa.baseqa.passage.retrieval.LuceneInMemorySentenceRetrievalExecutor.java

License:Apache License

/**
 * Scores the sentences of all ranked passages with Lucene and adds the hits to the CAS.
 *
 * <p>Every ranked passage is split into sentences, which are keyed by {@code TypeUtil.hash}
 * (on a collision the later sentence replaces the earlier one) and indexed in memory. The
 * query string built from the first abstract query is searched; each hit sentence receives
 * its Lucene score and is added to the CAS indexes in descending score order.
 *
 * @param jcas the CAS providing the ranked passages and the abstract queries
 * @throws AnalysisEngineProcessException if indexing, query parsing, or searching fails
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    // One Lucene document per distinct sentence of every ranked passage.
    Map<Integer, Passage> hash2passage = TypeUtil.getRankedPassages(jcas).stream()
            .flatMap(section -> RetrievalUtil.extractSentences(jcas, section, chunker).stream())
            .collect(toMap(TypeUtil::hash, Function.identity(), (earlier, later) -> later));
    List<Document> luceneDocs = hash2passage.values().stream()
            .map(RetrievalUtil::createLuceneDocument)
            .collect(toList());
    // Build the in-memory index.
    RAMDirectory index = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) {
        writer.addDocuments(luceneDocs);
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
    // Search the index with the first abstract query.
    AbstractQuery aquery = TypeUtil.getAbstractQueries(jcas).stream().findFirst().get();
    Map<Integer, Float> hash2score = new HashMap<>();
    try (IndexReader reader = DirectoryReader.open(index)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        String queryString = queryStringConstructor.construct(aquery);
        LOG.info("Search for query: {}", queryString);
        Query query = parser.parse(queryString);
        for (ScoreDoc hit : searcher.search(query, hits).scoreDocs) {
            // The stored "hash" field links the Lucene document back to its sentence.
            int hash = Integer.parseInt(searcher.doc(hit.doc).get("hash"));
            hash2score.put(hash, hit.score);
        }
    } catch (IOException | ParseException e) {
        throw new AnalysisEngineProcessException(e);
    }
    // Score every hit sentence, then add the hits to the CAS in descending score order.
    hash2score.forEach((hash, score) -> hash2passage.get(hash).setScore(score));
    hash2score.keySet().stream()
            .map(hash2passage::get)
            .sorted(Comparator.comparing(Passage::getScore).reversed())
            .forEach(Passage::addToIndexes);
}

From source file:edu.coeia.tasks.ChatLoadingTask.java

License:Open Source License

/**
 * Searches the case index for chat documents of the panel's agent and appends every message
 * whose chat file path ends with {@code fileName} as a row to the panel's table.
 *
 * <p>Table rows are added on the AWT event dispatch thread via
 * {@link EventQueue#invokeLater(Runnable)}. A query that cannot be parsed is logged and
 * otherwise ignored. The searcher and the directory are always closed, even when the search
 * fails.
 *
 * @throws IOException if the index cannot be opened, searched, or closed
 */
private void displayChatSessionFast() throws IOException {
    Directory directory = FSDirectory
            .open(new File(this.panel.getCaseFacade().getCaseIndexFolderLocation()));
    try {
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_30, IndexingConstant.CHAT_AGENT,
                    new StopAnalyzer(Version.LUCENE_30));
            parser.setAllowLeadingWildcard(true);
            Query query = parser.parse(panel.getAgent());

            TopDocs topDocs = searcher.search(query, 5000);

            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                // final: captured by the Runnable below
                final Document document = searcher.doc(scoreDoc.doc);
                String chatFile = document.get(IndexingConstant.CHAT_FILE);

                if (chatFile == null || chatFile.trim().isEmpty() || !chatFile.endsWith(this.fileName)) {
                    continue;
                }

                EventQueue.invokeLater(new Runnable() {
                    @Override
                    public void run() {
                        ChatItem item = (ChatItem) ItemFactory.newInstance(document, panel.getCaseFacade(),
                                false);
                        Object[] data = new Object[] { item.getFrom(), item.getTo(), item.getMessageText(),
                                item.getDate() };
                        JTableUtil.addRowToJTable(panel.getTable(), data);
                    }
                });
            }
        } finally {
            // release the searcher even if parsing or searching failed
            searcher.close();
        }
    } catch (ParseException ex) {
        // log under this task's own class rather than a copy-pasted sibling class name
        Logger.getLogger(getClass().getName()).log(Level.SEVERE, null, ex);
    } finally {
        directory.close();
    }
}

From source file:edu.coeia.tasks.ChatRefreshTask.java

License:Open Source License

/**
 * Searches the case index for chat documents of the panel's agent and collects the file names
 * of their chat files.
 *
 * <p>A query that cannot be parsed is logged and yields an empty result. The searcher and the
 * directory are always closed, even when the search fails.
 *
 * @return the distinct chat file names (last path element of the resolved full path); never
 *         {@code null}
 * @throws IOException if the index cannot be opened, searched, or closed
 */
private Set<String> getChatFilePathFast() throws IOException {
    Set<String> result = new HashSet<String>();

    Directory directory = FSDirectory
            .open(new File(this.panel.getCaseFacade().getCaseIndexFolderLocation()));
    try {
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_30, IndexingConstant.CHAT_AGENT,
                    new StopAnalyzer(Version.LUCENE_30));
            parser.setAllowLeadingWildcard(true);
            Query query = parser.parse(panel.getAgent());

            TopDocs topDocs = searcher.search(query, 5000);

            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document document = searcher.doc(scoreDoc.doc);
                String chatFile = document.get(IndexingConstant.CHAT_FILE);

                if (chatFile != null && !chatFile.trim().isEmpty()) {
                    // resolve against the case location, then keep only the file name
                    chatFile = this.panel.getCaseFacade().getFullPath(chatFile);
                    final File path = new File(chatFile);
                    result.add(path.getName());
                }
            }
        } finally {
            // release the searcher even if parsing or searching failed
            searcher.close();
        }
    } catch (ParseException ex) {
        Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        directory.close();
    }

    return result;
}

From source file:edu.coeia.tasks.EmailLoadingTask.java

License:Open Source License

/**
 * Searches the case index for e-mail documents and shows those whose {@code constant} field
 * ends with {@code path} in the panel's table.
 *
 * <p>Table rows are added on the AWT event dispatch thread via
 * {@link EventQueue#invokeLater(Runnable)}; the document ids of the displayed e-mails are
 * handed to the panel afterwards. A query that cannot be parsed is logged and results in an
 * empty id list. The searcher and the directory are always closed, even when the search fails.
 *
 * @param path     suffix the stored e-mail path must end with
 * @param constant name of the index field holding the e-mail path
 * @param type     not used by this method
 * @throws IOException if the index cannot be opened, searched, or closed
 */
private void getAllEmailMessagesFast(final String path, final String constant, final String type)
        throws IOException {
    List<Integer> ids = new ArrayList<Integer>();

    Directory directory = FSDirectory
            .open(new File(this.panel.getCaseFacade().getCaseIndexFolderLocation()));
    try {
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_30, IndexingConstant.DOCUMENT_TYPE,
                    new StopAnalyzer(Version.LUCENE_30));
            parser.setAllowLeadingWildcard(true);
            Query query = parser.parse("email");

            TopDocs topDocs = searcher.search(query, 100000);

            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document document = searcher.doc(scoreDoc.doc);
                String emailPath = document.get(constant);

                if (emailPath == null || emailPath.trim().isEmpty() || !emailPath.endsWith(path)) {
                    continue;
                }

                // final: captured by the Runnable below
                final EmailItem item = (EmailItem) ItemFactory.newInstance(document, panel.getCaseFacade(),
                        false);

                EventQueue.invokeLater(new Runnable() {
                    @Override
                    public void run() {
                        JTableUtil.addRowToJTable(panel.getTable(), item.getFullDisplayData());
                    }
                });

                ids.add(Integer.valueOf(item.getDocumentId()));
            }
        } finally {
            // release the searcher even if parsing or searching failed
            searcher.close();
        }
    } catch (ParseException ex) {
        // log under this task's own class rather than a copy-pasted sibling class name
        Logger.getLogger(getClass().getName()).log(Level.SEVERE, null, ex);
    } finally {
        directory.close();
    }

    this.panel.setResultIds(ids);
}

From source file:edu.coeia.tasks.EmailRefreshTask.java

License:Open Source License

/**
 * Collects the offline e-mail paths of all documents in the case index whose document type
 * is {@code "email"}.
 *
 * <p>The searcher and the directory are always closed, even when the search fails.
 *
 * @return the distinct, non-blank offline e-mail paths; never {@code null}
 * @throws IOException if the index cannot be opened, searched, or closed
 */
private Set<String> getOfflineEmailsPaths() throws IOException {
    Set<String> result = new HashSet<String>();

    Directory directory = FSDirectory.open(new File(this.panel.getCaseFacade().getCaseIndexFolderLocation()));
    try {
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            Query query = new TermQuery(new Term(IndexingConstant.DOCUMENT_TYPE, "email"));
            TopDocs topDocs = searcher.search(query, 5000);

            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document document = searcher.doc(scoreDoc.doc);
                String offlineEmailPath = document.get(IndexingConstant.OFFLINE_EMAIL_PATH);

                if (offlineEmailPath != null && !offlineEmailPath.trim().isEmpty()) {
                    result.add(offlineEmailPath);
                }
            }
        } finally {
            // release the searcher even if searching or document loading failed
            searcher.close();
        }
    } finally {
        directory.close();
    }

    return result;
}

From source file:edu.coeia.tasks.ExtensionFrequencyTask.java

License:Open Source License

/**
 * Counts how often each file extension occurs among the documents of type {@code "file"} in
 * the case index.
 *
 * <p>Extensions are lower-cased independently of the default locale, so that the same
 * extension is counted under one key on every machine. A query that cannot be parsed is
 * logged and yields an empty map. The searcher and the directory are always closed, even when
 * the search fails.
 *
 * @return the number of occurrences per lower-cased extension; never {@code null}
 * @throws IOException if the index cannot be opened, searched, or closed
 */
private Map<String, Double> getExtensionFreqFast() throws IOException {
    Map<String, Double> map = new HashMap<String, Double>();

    Directory directory = FSDirectory.open(new File(this.caseFacade.getCaseIndexFolderLocation()));
    try {
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_30, IndexingConstant.DOCUMENT_TYPE,
                    new StopAnalyzer(Version.LUCENE_30));
            parser.setAllowLeadingWildcard(true);
            Query query = parser.parse("file");

            TopDocs topDocs = searcher.search(query, 100000);

            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document document = searcher.doc(scoreDoc.doc);
                String filePath = document.get(IndexingConstant.FILE_PATH);

                if (filePath == null || filePath.trim().isEmpty()) {
                    continue;
                }

                String ext = FileUtil.getExtension(new File(filePath));

                // Skip files without an extension or with an implausibly long one.
                // NOTE(review): the original comment said "no more than 5 characters", but this
                // check admits 6 — confirm which limit is intended.
                if (ext == null || ext.length() > 6) {
                    continue;
                }

                // Locale.ROOT avoids locale-specific case mappings (e.g. Turkish dotless i).
                ext = ext.toLowerCase(java.util.Locale.ROOT);

                Double count = map.get(ext);
                map.put(ext, count == null ? 1.0 : count + 1);
            }
        } finally {
            // release the searcher even if parsing or searching failed
            searcher.close();
        }
    } catch (ParseException ex) {
        // log under this task's own class rather than a copy-pasted sibling class name
        Logger.getLogger(getClass().getName()).log(Level.SEVERE, null, ex);
    } finally {
        directory.close();
    }

    return map;
}