edu.cmu.lti.oaqa.baseqa.passage.RetrievalUtil.java Source code

Java tutorial

Introduction

Here is the source code for edu.cmu.lti.oaqa.baseqa.passage.RetrievalUtil.java

Source

/*
 * Open Advancement Question Answering (OAQA) Project Copyright 2016 Carnegie Mellon University
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations
 * under the License.
 */

package edu.cmu.lti.oaqa.baseqa.passage;

import com.aliasi.chunk.Chunking;
import com.aliasi.sentences.SentenceChunker;
import edu.cmu.lti.oaqa.type.retrieval.Document;
import edu.cmu.lti.oaqa.type.retrieval.Passage;
import edu.cmu.lti.oaqa.util.TypeFactory;
import edu.cmu.lti.oaqa.util.TypeUtil;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.uima.jcas.JCas;

import java.util.ArrayList;
import java.util.List;
import java.util.stream.IntStream;

import static java.util.stream.Collectors.toList;

/**
 * A utility class for retrieval related operations: converting documents into
 * section, sentence, or title/abstract {@link Passage}s, and converting passages
 * into Lucene documents for indexing.
 *
 * @author <a href="mailto:ziy@cs.cmu.edu">Zi Yang</a> created on 10/19/14
 */
public class RetrievalUtil {

    /**
     * Creates one passage per section of the given document.
     *
     * <p>Each passage carries the section text, the offsets {@code 0} and
     * {@code text.length()}, and the section label as both its begin and end
     * section.
     *
     * @param jcas the CAS in which the passages are created
     * @param doc the document whose sections and section labels are read
     * @return the passages in section order; empty if the document has no sections
     */
    public static List<Passage> extractSections(JCas jcas, Document doc) {
        String uri = doc.getUri();
        String docId = doc.getDocId();
        String[] sections = doc.getSections().toArray();
        String[] sectionLabels = doc.getSectionLabels().toArray();
        assert sections.length == sectionLabels.length;
        return IntStream.range(0, sections.length)
                .mapToObj(i -> createSectionPassage(jcas, uri, docId, sections[i], sectionLabels[i]))
                .collect(toList());
    }

    /**
     * Splits the text of a passage into sentence passages using the given chunker.
     *
     * <p>Each returned passage inherits the URI, document id, and begin/end section
     * of the input passage. Its offsets are the chunk boundaries within the input
     * passage's text (the input passage's own offsets are not added).
     *
     * @param jcas the CAS in which the sentence passages are created
     * @param passage the passage whose text is chunked
     * @param chunker the sentence chunker applied to the passage text
     * @return one passage per chunk, in the iteration order of the chunk set
     */
    public static List<Passage> extractSentences(JCas jcas, Passage passage, SentenceChunker chunker) {
        String text = passage.getText();
        String uri = passage.getUri();
        String docId = passage.getDocId();
        String beginSection = passage.getBeginSection();
        String endSection = passage.getEndSection();
        Chunking chunking = chunker.chunk(text.toCharArray(), 0, text.length());
        return chunking.chunkSet().stream().map(chunk -> {
            int begin = chunk.start();
            int end = chunk.end();
            String sentence = text.substring(begin, end);
            return TypeFactory.createPassage(jcas, uri, sentence, docId, begin, end, beginSection,
                    endSection);
        }).collect(toList());
    }

    /**
     * Creates a passage from the first section of the given document.
     *
     * <p>NOTE(review): the method name suggests the first section is expected to be
     * the abstract; nothing here verifies the section label.
     *
     * @param jcas the CAS in which the passage is created
     * @param doc the document whose first section and section label are read
     * @return a single-element list holding the first section's passage, or an empty
     *         list if the document has no sections
     */
    public static List<Passage> extractAbstractSection(JCas jcas, Document doc) {
        String uri = doc.getUri();
        String docId = doc.getDocId();
        String[] sections = doc.getSections().toArray();
        String[] sectionLabels = doc.getSectionLabels().toArray();
        assert sections.length == sectionLabels.length;
        List<Passage> passages = new ArrayList<>();
        // Guard against documents without sections; indexing element 0 unconditionally
        // would throw ArrayIndexOutOfBoundsException.
        if (sections.length > 0) {
            passages.add(createSectionPassage(jcas, uri, docId, sections[0], sectionLabels[0]));
        }
        return passages;
    }

    /**
     * Creates a Lucene document for a passage, with a stored {@code "hash"} field
     * holding {@code TypeUtil.hash(passage)} and a non-stored {@code "text"} text
     * field holding the passage text.
     *
     * @param passage the passage to convert
     * @return the new Lucene document
     */
    public static org.apache.lucene.document.Document createLuceneDocument(Passage passage) {
        org.apache.lucene.document.Document entry = new org.apache.lucene.document.Document();
        entry.add(new StoredField("hash", TypeUtil.hash(passage)));
        entry.add(new TextField("text", passage.getText(), Field.Store.NO));
        return entry;
    }

    /**
     * Creates a Lucene document for a passage like {@link #createLuceneDocument(Passage)},
     * except that the text field is named after the passage's begin section instead of
     * {@code "text"}.
     *
     * @param passage the passage to convert; its begin section is used as the field name
     * @return the new Lucene document
     */
    public static org.apache.lucene.document.Document createLuceneSectionDocument(Passage passage) {
        org.apache.lucene.document.Document entry = new org.apache.lucene.document.Document();
        entry.add(new StoredField("hash", TypeUtil.hash(passage)));
        entry.add(new TextField(passage.getBeginSection(), passage.getText(), Field.Store.NO));
        return entry;
    }

    /**
     * Creates title and abstract passages from the document's own URI, document id,
     * title, and text.
     *
     * @param jcas the CAS in which the passages are created
     * @param doc the document to read from
     * @return the title passage (if the title is non-null) followed by the abstract
     *         passage (if the text is non-null)
     */
    public static List<Passage> extractTitleAbstract(JCas jcas, Document doc) {
        String uri = doc.getUri();
        String docId = doc.getDocId();
        String title = doc.getTitle();
        String text = doc.getText();

        return extractTitleAbstract(jcas, doc, uri, docId, title, text);
    }

    /**
     * Creates title and abstract passages from explicitly supplied values.
     *
     * <p>The title becomes a passage whose begin and end section are both
     * {@code "title"}; the text becomes a passage whose begin and end section are both
     * {@code "abstract"}. A {@code null} title or text is skipped.
     *
     * @param jcas the CAS in which the passages are created
     * @param doc not used by this method; retained for interface compatibility
     * @param uri the URI assigned to each passage
     * @param docId the document id assigned to each passage
     * @param title the title text, or {@code null} to omit the title passage
     * @param text the abstract text, or {@code null} to omit the abstract passage
     * @return a mutable list with zero, one, or two passages (title first)
     */
    public static List<Passage> extractTitleAbstract(JCas jcas, Document doc, String uri, String docId,
            String title, String text) {

        List<Passage> passages = new ArrayList<>();
        if (title != null) {
            passages.add(TypeFactory.createPassage(jcas, uri, title, docId, 0, title.length(), "title", "title"));
        }
        if (text != null) {
            passages.add(
                    TypeFactory.createPassage(jcas, uri, text, docId, 0, text.length(), "abstract", "abstract"));
        }
        return passages;
    }

    /**
     * Creates a passage that spans an entire section: offsets {@code 0} to
     * {@code sectionText.length()}, with the label used as both begin and end section.
     */
    private static Passage createSectionPassage(JCas jcas, String uri, String docId,
            String sectionText, String label) {
        return TypeFactory.createPassage(jcas, uri, sectionText, docId, 0, sectionText.length(), label,
                label);
    }

}