org.northshore.cbri.sent.SentenceDetector_cTAKES.java Source code

Java tutorial

Introduction

Here is the source code for org.northshore.cbri.sent.SentenceDetector_cTAKES.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.northshore.cbri.sent;

import groovy.lang.GroovyShell;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import opennlp.model.AbstractModel;
import opennlp.tools.sentdetect.DefaultSDContextGenerator;
import opennlp.tools.sentdetect.EndOfSentenceScanner;
import opennlp.tools.sentdetect.SDContextGenerator;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.util.StringUtil;

import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.codehaus.groovy.control.CompilationFailedException;
import org.codehaus.groovy.control.CompilerConfiguration;

import com.google.common.base.Charsets;
import com.google.common.io.Resources;

public class SentenceDetector_cTAKES extends JCasAnnotator_ImplBase {

    /**
     * A candidate sentence: a piece of text together with the begin/end
     * offsets it occupies inside a larger text.
     */
    private static class SentenceSpan {

        /** Supported line-break separators; constants, so they must not be reassignable. */
        public static final String LF = "\n";
        public static final String CR = "\r";
        public static final String CRLF = "\r\n";

        private final int start; // offset of text within larger text
        private final int end; // offset of end of text within larger text
        private final String text;

        /**
         * @param s offset of the first character of the span within the larger text
         * @param e offset just past the last character of the span within the larger text
         * @param t the text covered by the span
         */
        public SentenceSpan(int s, int e, String t) {
            start = s;
            end = e;
            text = t;
        }

        /** @return offset of the start of this span within the larger text */
        public int getStart() {
            return start;
        }

        /** @return offset of the end of this span within the larger text */
        public int getEnd() {
            return end;
        }

        /**
         * Trim any leading or trailing whitespace.
         * If there are any end-of-line characters in what's left, split into multiple smaller sentences,
         * and trim each.
         * If is entirely whitespace, return an empty list.
         * A line that is blank after trimming yields a zero-length span (start == end).
         * If the separator is not one of {@link #LF}, {@link #CR} or {@link #CRLF}, a diagnostic
         * is written to <code>System.err</code> and a list holding only this span is returned.
         *
         * @param separatorPattern CR LF or CRLF
         * @return the trimmed sub-spans, in text order, with offsets relative to the larger text
         */
        public List<SentenceSpan> splitAtLineBreaksAndTrim(String separatorPattern) {

            List<SentenceSpan> subspans = new ArrayList<SentenceSpan>();

            // Validate input parameter
            boolean supportedSeparator = separatorPattern.equals(LF)
                    || separatorPattern.equals(CR)
                    || separatorPattern.equals(CRLF);
            if (!supportedSeparator) {

                int len = separatorPattern.length();
                System.err.println("Invalid line break: " + len + " characters long.");

                System.err.print("        line break character values: ");
                for (int i = 0; i < len; i++) {
                    // print the numeric code of each character, not the character itself
                    System.err.print((int) separatorPattern.charAt(i));
                    System.err.print(" "); // print a space between values
                }
                System.err.println();

                subspans.add(this);
                return subspans;
            }

            // Check first if contains only whitespace, in which case return an empty list
            String coveredText = text.substring(0, end - start);
            if (coveredText.trim().length() == 0) {
                return subspans;
            }

            // Split into multiple sentences if contains end-of-line characters
            // or return just one sentence if no end-of-line characters are within the trimmed string
            String[] lines = coveredText.split(separatorPattern);
            int position = start;
            for (String line : lines) {
                String trimmed = line.trim();

                // Number of whitespace characters trimmed from the front of this line.
                // The first trimmed character cannot occur earlier in the line, because
                // everything before it is whitespace.
                int leadingWhitespace = 0;
                if (trimmed.length() > 0) {
                    leadingWhitespace = line.indexOf(trimmed.charAt(0));
                }

                // sf Bugs artifact 3083903: For _each_ sentence, advance past any spaces at beginning of line
                position += leadingWhitespace;
                subspans.add(new SentenceSpan(position, position + trimmed.length(), trimmed));

                // Move to the start of the next line: rest of this line plus the separator
                position += (line.length() - leadingWhitespace + separatorPattern.length());
            }

            return subspans;
        }
    }

    /**
     * End-of-sentence scanner that reports the index of every character
     * matching one of a fixed set of candidate sentence-ending characters.
     *
     * @author wthompso
     *
     */
    private static class EndOfSentenceScannerImpl implements EndOfSentenceScanner {

        private static final char[] eosCandidates = { '.', '!', ')', ']', '>', '\"', ':', ';' }; // CTAKES-227

        public EndOfSentenceScannerImpl() {
            super();
        }

        /**
         * @return a copy of the candidate end-of-sentence characters; a copy is
         *         returned so callers cannot modify the shared static array
         */
        public char[] getEndOfSentenceCharacters() {
            return eosCandidates.clone();
        }

        /**
         * @param s text to scan
         * @return indexes in <code>s</code> of every candidate character, in ascending order
         */
        public List<Integer> getPositions(String s) {
            return getPositions(s.toCharArray());
        }

        /**
         * @param sb text to scan
         * @return indexes in <code>sb</code> of every candidate character, in ascending order
         */
        public List<Integer> getPositions(StringBuffer sb) {
            return getPositions(sb.toString().toCharArray());
        }

        /**
         * @param cb characters to scan
         * @return indexes in <code>cb</code> of every candidate character, in ascending order
         */
        public List<Integer> getPositions(char[] cb) {
            List<Integer> positions = new ArrayList<Integer>();

            for (int i = 0; i < cb.length; i++) { // for each character in buffer
                for (char candidate : eosCandidates) { // for each eosCandidate
                    if (cb[i] == candidate) {
                        // valueOf reuses cached boxes where available instead of always allocating
                        positions.add(Integer.valueOf(i));
                        break; // can't match others if it matched this candidate
                    }
                }
            }

            return positions;
        }
    }

    //----------------------------------------------------------------------------------------------------------------
    // The main class definition starts here
    //----------------------------------------------------------------------------------------------------------------

    // Names of the configuration parameters accepted by this annotator.
    public static final String SD_MODEL_FILE_PARAM = "sentenceModelFile";
    public static final String SD_SEGMENTS_TO_PARSE = "segmentsToParse";

    // Resource name of the sentence model; opened through Resources.getResource in initialize().
    @ConfigurationParameter(name = SD_MODEL_FILE_PARAM, mandatory = true, description = "File holding sentence model")
    private String sentenceModelFile;

    // Optional resource name of a Groovy script that process() evaluates to obtain the
    // segments to run on; when null, process() selects every Segment in the CAS.
    @ConfigurationParameter(name = SD_SEGMENTS_TO_PARSE, mandatory = false, description = "Script providing input segments")
    private String segmentsToParse;

    // Set in initialize(): the loaded model, the context generator built from the
    // scanner's end-of-sentence characters, and the scanner itself.
    private opennlp.tools.sentdetect.SentenceModel sdmodel;
    private SDContextGenerator cgen;
    private EndOfSentenceScanner scanner;

    /**
     * Loads the sentence model named by <code>sentenceModelFile</code> and
     * creates the end-of-sentence scanner and context generator used by
     * {@link #sentPosDetect(String)}.
     *
     * @param aContext
     *            the UIMA context passed on to the superclass
     * @throws ResourceInitializationException
     *             wrapping any failure to locate, read or close the model
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {

        super.initialize(aContext);

        try {
            InputStream modelIn = Resources.getResource(sentenceModelFile).openStream();
            try {
                this.sdmodel = new SentenceModel(modelIn);
            } finally {
                // Always release the stream, whether or not the model loaded
                modelIn.close();
            }
            this.scanner = new EndOfSentenceScannerImpl();
            this.cgen = new DefaultSDContextGenerator(this.scanner.getEndOfSentenceCharacters());
        } catch (Exception ace) {
            throw new ResourceInitializationException(ace);
        }
    }

    /**
     * Runs sentence detection over each selected segment of the document and
     * adds the resulting Sentence annotations to the CAS. Sentence numbers
     * continue from one segment to the next rather than restarting at zero.
     *
     * @param jcas
     *            view of the CAS holding the document text and segments
     * @throws AnalysisEngineProcessException
     *             if the segment-selection script cannot be read or compiled,
     *             or evaluates to null
     */
    @Override
    public void process(JCas jcas) throws AnalysisEngineProcessException {

        Collection<Segment> segs = selectSegments(jcas);
        String text = jcas.getDocumentText();

        // Carry the running count forward so sentence numbers do not restart per segment
        int sentenceCount = 0;
        for (Segment seg : segs) {
            sentenceCount = annotateRange(jcas, text, seg, sentenceCount);
        }
    }

    /**
     * Determines which segments to run the sentence detector on.
     * TODO: the script path is experimental code on injecting a Groovy script to
     * determine which Segments should be run through the sentence detector annotator.
     *
     * @param jcas
     *            view of the CAS to select segments from when no script is configured
     * @return the result of evaluating the <code>segmentsToParse</code> script when
     *         one is configured, otherwise every Segment in the CAS
     * @throws AnalysisEngineProcessException
     *             if the script cannot be read or compiled, or evaluates to null
     */
    private Collection<Segment> selectSegments(JCas jcas) throws AnalysisEngineProcessException {
        if (segmentsToParse == null) {
            return JCasUtil.select(jcas, Segment.class);
        }

        try {
            CompilerConfiguration config = new CompilerConfiguration();
            config.setScriptBaseClass("org.northshore.cbri.dsl.UIMAUtil");
            GroovyShell shell = new GroovyShell(config);

            System.out.println("GroovyAnnotator loading script file: " + this.segmentsToParse);
            URL url = Resources.getResource(this.segmentsToParse);
            String scriptContents = Resources.toString(url, Charsets.UTF_8);

            // The script is trusted to return a collection of Segment; this cannot be checked here
            @SuppressWarnings("unchecked")
            Collection<Segment> scripted = (Collection<Segment>) shell.evaluate(scriptContents);
            if (scripted == null) {
                throw new AnalysisEngineProcessException(new IllegalStateException(
                        "Segment selection script returned null: " + this.segmentsToParse));
            }
            return scripted;
        } catch (CompilationFailedException e) {
            // Fail with the cause attached instead of continuing with no segments
            throw new AnalysisEngineProcessException(e);
        } catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    // ---------------------------------------------------------------------------------------------------------
    // Private methods
    // ---------------------------------------------------------------------------------------------------------

    /**
     * Detect sentences within a section of the text and add annotations to the
     * CAS. Uses OpenNLP sentence detector, and then additionally forces
     * sentences to end at end-of-line characters (splitting into multiple
     * sentences). Also trims sentences. And if the sentence detector does
     * happen to form a sentence that is just white space, it will be ignored.
     * 
     * @param jcas
     *            view of the CAS containing the text to run sentence detector
     *            against
     * @param text
     *            the document text
     * @param section
     *            the section this sentence is in
     * @param sentenceCount
     *            the number of sentences added already to the CAS (if
     *            processing one section at a time)
     * @return count The sum of <code>sentenceCount</code> and the number of
     *         Sentence annotations added to the CAS for this section
     * @throws AnalysisEngineProcessException
     *             declared by the signature; nothing in this method currently
     *             raises it
     */
    private int annotateRange(JCas jcas, String text, Segment section, int sentenceCount)
            throws AnalysisEngineProcessException {

        int sectBegin = section.getBegin();
        int sectEnd = section.getEnd();

        // Use OpenNLP tools to split text into sentences
        // The sentence detector returns the offsets of the sentence-endings it
        // detects within the string
        int[] sentenceBreaks = sentPosDetect(text.substring(sectBegin, sectEnd));

        // There might be text after the last sentence-ending found by detector,
        // so +1
        List<SentenceSpan> potentialSentSpans = new ArrayList<SentenceSpan>(sentenceBreaks.length + 1);

        // Start by filling in sentence spans from what OpenNLP tools detected
        // Will trim leading or trailing whitespace when check for end-of-line
        // characters
        int sentStart = sectBegin;
        for (int i = 0; i < sentenceBreaks.length; i++) {
            int sentEnd = sentenceBreaks[i] + sectBegin; // breaks are relative to the section
            potentialSentSpans.add(new SentenceSpan(sentStart, sentEnd, text.substring(sentStart, sentEnd)));
            sentStart = sentEnd;
        }

        // If detector didn't find any sentence-endings,
        // or there was text after the last sentence-ending found,
        // create a sentence from what's left, as long as it's not all
        // whitespace.
        if (sentStart < sectEnd) {
            String remainder = text.substring(sentStart, sectEnd);
            // Compare by content: a reference comparison against "" does not test emptiness
            if (remainder.trim().length() > 0) {
                potentialSentSpans.add(new SentenceSpan(sentStart, sectEnd, remainder));
            }
        }

        // Copy potentialSentSpans into sentenceSpans,
        // ignoring any that are entirely whitespace,
        // trimming the rest,
        // and splitting any of those that contain an end-of-line character.
        // Then trim any leading or trailing whitespace of ones that were split.
        List<SentenceSpan> sentenceSpans = new ArrayList<SentenceSpan>();
        for (SentenceSpan candidate : potentialSentSpans) {
            sentenceSpans.addAll(candidate.splitAtLineBreaksAndTrim("\n"));
        }

        // Add sentence annotations to the CAS
        int previousEnd = -1;
        for (SentenceSpan span : sentenceSpans) {
            if (span.getStart() == span.getEnd()) {
                continue; // skip empty lines
            }
            if (previousEnd > span.getStart()) {
                continue; // would start before the previously added sentence ended
            }

            // Create the annotation only once it is known that it will be indexed
            Sentence sa = new Sentence(jcas);
            sa.setBegin(span.getStart());
            sa.setEnd(span.getEnd());
            sa.setSentenceNumber(sentenceCount);
            sa.addToIndexes();
            sentenceCount++;
            previousEnd = span.getEnd();
        }
        return sentenceCount;
    }

    /**
     * Scans forward from <code>pos</code> for the first whitespace character.
     *
     * @return the index of the first whitespace character at or after
     *         <code>pos</code>; <code>pos</code> itself if it is already at
     *         or past the end of <code>s</code>; otherwise the length of
     *         <code>s</code>
     */
    private int getFirstWS(String s, int pos) {
        final int length = s.length();
        int cursor = pos;
        for (; cursor < length; cursor++) {
            if (StringUtil.isWhitespace(s.charAt(cursor))) {
                break;
            }
        }
        return cursor;
    }

    /**
     * Scans forward from <code>pos</code> for the first non-whitespace character.
     *
     * @return the index of the first non-whitespace character at or after
     *         <code>pos</code>; <code>pos</code> itself if it is already at
     *         or past the end of <code>s</code>; otherwise the length of
     *         <code>s</code>
     */
    private int getFirstNonWS(String s, int pos) {
        final int length = s.length();
        int cursor = pos;
        for (; cursor < length; cursor++) {
            if (!StringUtil.isWhitespace(s.charAt(cursor))) {
                break;
            }
        }
        return cursor;
    }

    /**
     * Detect the sentence-ending positions in a String. Every candidate
     * end-of-sentence character reported by the scanner is evaluated with
     * the model, and those whose best outcome is "s" are kept.
     * 
     * @param s
     *            The string to be processed.
     * @return A integer array containing the positions of the end index of
     *         every sentence, each one being the index just past the accepted
     *         end-of-sentence character
     * 
     * @see SentenceDetectorME#sentPosDetect(String)
     */
    private int[] sentPosDetect(String s) {
        List<Integer> enders = this.scanner.getPositions(s);
        if (enders.isEmpty()) {
            return new int[0]; // no candidates, so the model is never consulted
        }

        StringBuffer sb = new StringBuffer(s);
        List<Integer> positions = new ArrayList<Integer>(enders.size());
        // The model does not change while scanning, so look it up once
        AbstractModel model = this.sdmodel.getMaxentModel();

        for (int i = 0, end = enders.size(), index = 0; i < end; i++) {
            int cint = enders.get(i);
            // skip over the leading parts of non-token final delimiters
            int fws = getFirstWS(s, cint + 1);
            if (i + 1 < end && enders.get(i + 1) < fws) {
                continue;
            }

            double[] probs = model.eval(cgen.getContext(sb, cint));
            String bestOutcome = model.getBestOutcome(probs);
            if (bestOutcome.equals("s")) {
                // index is one past the previously accepted candidate; a candidate
                // sitting exactly there is not recorded as a separate break
                if (index != cint) {
                    positions.add(getFirstNonWS(s, cint));
                }
                index = cint + 1;
            }
        }

        // Convert to end offsets: one past each recorded position
        int[] sentenceBreaks = new int[positions.size()];
        for (int i = 0; i < sentenceBreaks.length; i++) {
            sentenceBreaks[i] = positions.get(i) + 1;
        }

        return sentenceBreaks;
    }
}