edu.cmu.lti.oaqa.baseqa.passage.retrieval.LuceneInMemorySentenceRetrievalExecutor.java Source code

Introduction

Here is the source code for edu.cmu.lti.oaqa.baseqa.passage.retrieval.LuceneInMemorySentenceRetrievalExecutor.java
Source

/*
 * Open Advancement Question Answering (OAQA) Project Copyright 2016 Carnegie Mellon University
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations
 * under the License.
 */

package edu.cmu.lti.oaqa.baseqa.passage.retrieval;

import com.aliasi.sentences.IndoEuropeanSentenceModel;
import com.aliasi.sentences.SentenceChunker;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;
import edu.cmu.lti.oaqa.baseqa.providers.query.BooleanBagOfPhraseQueryStringConstructor;
import edu.cmu.lti.oaqa.baseqa.providers.query.QueryStringConstructor;
import edu.cmu.lti.oaqa.baseqa.passage.RetrievalUtil;
import edu.cmu.lti.oaqa.baseqa.util.UimaContextHelper;
import edu.cmu.lti.oaqa.type.retrieval.AbstractQuery;
import edu.cmu.lti.oaqa.type.retrieval.Passage;
import edu.cmu.lti.oaqa.util.TypeUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.RAMDirectory;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

import static java.util.stream.Collectors.toList;
import static java.util.stream.Collectors.toMap;

/**
 * <p>
 *   This is an unsupervised {@link Passage} candidate generator and reranker.
 *   It first splits each ranked {@link Passage} found in the CAS into sentence-level
 *   {@link Passage}s, and then builds an in-memory Lucene index for all the sentences.
 *   Relevant {@link Passage}s are retrieved by a query string, translated by a
 *   {@link QueryStringConstructor} from the {@link AbstractQuery}.
 * </p>
 * <p>
 *   A supervised version that uses the retrieval scores of the sentences as features is implemented
 *   in {@link edu.cmu.lti.oaqa.baseqa.passage.rerank.scorers.LuceneInMemoryPassageScorer}.
 * </p>
 * <p>
 *   Configuration parameters read in {@link #initialize(UimaContext)}:
 * </p>
 * <ul>
 *   <li>{@code tokenizer-factory} / {@code tokenizer-factory-params} &ndash; tokenizer used by the
 *   sentence chunker ({@link IndoEuropeanTokenizerFactory} is passed as the fallback class).</li>
 *   <li>{@code sentence-model} / {@code sentence-model-params} &ndash; sentence model used by the
 *   sentence chunker ({@link IndoEuropeanSentenceModel} is passed as the fallback class).</li>
 *   <li>{@code hits} &ndash; maximum number of sentences requested from the searcher
 *   (200 is passed as the fallback value).</li>
 *   <li>{@code query-analyzer} / {@code query-analyzer-params} &ndash; analyzer used both for
 *   indexing and for query parsing ({@link StandardAnalyzer} is passed as the fallback class).</li>
 *   <li>{@code query-string-constructor} / {@code query-string-constructor-params} &ndash;
 *   translator from {@link AbstractQuery} to a query string
 *   ({@link BooleanBagOfPhraseQueryStringConstructor} is passed as the fallback class).</li>
 * </ul>
 *
 * @see edu.cmu.lti.oaqa.baseqa.passage.rerank.scorers.LuceneInMemoryPassageScorer
 *
 * @author <a href="mailto:ziy@cs.cmu.edu">Zi Yang</a> created on 10/19/14
 */
public class LuceneInMemorySentenceRetrievalExecutor extends JCasAnnotator_ImplBase {

    private static final Logger LOG = LoggerFactory.getLogger(LuceneInMemorySentenceRetrievalExecutor.class);

    /** Default field handed to the {@link QueryParser}. */
    private static final String TEXT_FIELD = "text";

    /** Field read back from every hit to recover the key into the hash-to-passage map. */
    private static final String HASH_FIELD = "hash";

    /** Splits the text of a passage into sentences. */
    private SentenceChunker chunker;

    /** Analyzer shared by the index writer and the query parser. */
    private Analyzer analyzer;

    /** Maximum number of hits requested from the searcher. */
    private int hits;

    /** Parser that turns the constructed query string into a Lucene {@link Query}. */
    private QueryParser parser;

    /** Translates the {@link AbstractQuery} into a query string for {@link #parser}. */
    private QueryStringConstructor queryStringConstructor;

    /**
     * Reads the configuration parameters and creates the sentence chunker, the analyzer, the query
     * parser and the query string constructor.
     *
     * @param context the UIMA context that carries the configuration parameters
     * @throws ResourceInitializationException propagated from the superclass initialization or from
     *         the configuration helper calls
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        // initialize sentence chunker
        TokenizerFactory tokenizerFactory = UimaContextHelper.createObjectFromConfigParameter(context,
                "tokenizer-factory", "tokenizer-factory-params", IndoEuropeanTokenizerFactory.class,
                TokenizerFactory.class);
        SentenceModel sentenceModel = UimaContextHelper.createObjectFromConfigParameter(context, "sentence-model",
                "sentence-model-params", IndoEuropeanSentenceModel.class, SentenceModel.class);
        chunker = new SentenceChunker(tokenizerFactory, sentenceModel);
        // initialize hits
        hits = UimaContextHelper.getConfigParameterIntValue(context, "hits", 200);
        // initialize query analyzer, index writer config, and query parser
        analyzer = UimaContextHelper.createObjectFromConfigParameter(context, "query-analyzer",
                "query-analyzer-params", StandardAnalyzer.class, Analyzer.class);
        parser = new QueryParser(TEXT_FIELD, analyzer);
        // initialize query string constructor
        queryStringConstructor = UimaContextHelper.createObjectFromConfigParameter(context,
                "query-string-constructor", "query-string-constructor-params",
                BooleanBagOfPhraseQueryStringConstructor.class, QueryStringConstructor.class);
    }

    /**
     * Extracts sentences from every ranked passage in the CAS, indexes them in memory, searches the
     * index with the query built from the first {@link AbstractQuery}, and adds the retrieved
     * sentences to the CAS indexes in descending score order.
     *
     * @param jcas the CAS holding the ranked passages and the abstract query
     * @throws AnalysisEngineProcessException if the CAS contains no {@link AbstractQuery}, or if
     *         indexing, query parsing or searching fails
     */
    @Override
    public void process(JCas jcas) throws AnalysisEngineProcessException {
        // Look the query up first: without it nothing can be retrieved, so fail before building the
        // index, and fail with the declared exception type instead of a bare NoSuchElementException.
        AbstractQuery aquery = TypeUtil.getAbstractQueries(jcas).stream().findFirst()
                .orElseThrow(() -> new AnalysisEngineProcessException(
                        new IllegalStateException("No AbstractQuery found in the CAS")));
        // create lucene documents for all sentences in all sections;
        // when two sentences share a hash the later one wins
        Map<Integer, Passage> hash2passage = TypeUtil.getRankedPassages(jcas).stream()
                .flatMap(sec -> RetrievalUtil.extractSentences(jcas, sec, chunker).stream())
                .collect(toMap(TypeUtil::hash, Function.identity(), (x, y) -> y));
        List<Document> luceneDocs = hash2passage.values().stream().map(RetrievalUtil::createLuceneDocument)
                .collect(toList());
        // create the index, search it, and release the in-memory directory once the scores are copied out
        Map<Integer, Float> hash2score;
        try (RAMDirectory index = new RAMDirectory()) {
            indexDocuments(index, luceneDocs);
            hash2score = searchIndex(index, aquery);
        } catch (IOException | ParseException e) {
            throw new AnalysisEngineProcessException(e);
        }
        // add to CAS
        hash2score.entrySet().stream().map(entry -> {
            Passage passage = hash2passage.get(entry.getKey());
            passage.setScore(entry.getValue());
            return passage;
        }).sorted(Comparator.comparing(Passage::getScore).reversed()).forEach(Passage::addToIndexes);
    }

    /**
     * Writes all documents into the given directory; the writer is closed before returning.
     *
     * @param index the directory to write into
     * @param luceneDocs the documents to add
     * @throws IOException if the index writer fails
     */
    private void indexDocuments(RAMDirectory index, List<Document> luceneDocs) throws IOException {
        try (IndexWriter writer = new IndexWriter(index, new IndexWriterConfig(analyzer))) {
            writer.addDocuments(luceneDocs);
        }
    }

    /**
     * Searches the given directory with the query string constructed from {@code aquery} and
     * collects the score of every hit, keyed by the integer parsed from the hit's hash field.
     *
     * @param index the directory to search
     * @param aquery the abstract query to translate into a query string
     * @return a map from parsed hash value to retrieval score, holding at most {@link #hits} entries
     * @throws IOException if reading the index fails
     * @throws ParseException if the constructed query string cannot be parsed
     */
    private Map<Integer, Float> searchIndex(RAMDirectory index, AbstractQuery aquery)
            throws IOException, ParseException {
        Map<Integer, Float> hash2score = new HashMap<>();
        try (IndexReader reader = DirectoryReader.open(index)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            String queryString = queryStringConstructor.construct(aquery);
            LOG.info("Search for query: {}", queryString);
            Query query = parser.parse(queryString);
            for (ScoreDoc scoreDoc : searcher.search(query, hits).scoreDocs) {
                int hash = Integer.parseInt(searcher.doc(scoreDoc.doc).get(HASH_FIELD));
                hash2score.put(hash, scoreDoc.score);
            }
        }
        return hash2score;
    }

}