Example usage for opennlp.tools.util Span getCoveredText

Introduction

This page lists example usages of opennlp.tools.util.Span.getCoveredText, drawn from open source projects.

Prototype

public CharSequence getCoveredText(CharSequence text) 

Document

Retrieves the string covered by the current span of the specified text.
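
A minimal, self-contained sketch of the call (assumes opennlp-tools is on the classpath; the text and span boundaries are illustrative):

import opennlp.tools.util.Span;

public class SpanDemo {
    public static void main(String[] args) {
        String text = "OpenNLP is a machine learning toolkit.";
        Span span = new Span(0, 7); // character offsets: start inclusive, end exclusive
        System.out.println(span.getCoveredText(text)); // prints "OpenNLP"
    }
}

The examples below use the same call twice per sentence: once to pull a sentence out of the full document, and once to pull each token out of the sentence.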

Usage

From source file: edu.stanford.muse.index.NER.java

/**
 * A triple is <entity, start char offset (inclusive), end char offset (exclusive)>.
 * See http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/AbstractSequenceClassifier.html#classifyToCharacterOffsets(java.lang.String)
 */
private synchronized static Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> parseAndGetOffsets(
        String documentText) {
    try {
        NER.initialize();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }

    if (documentText.indexOf("\u00A0") >= 0)
        documentText = documentText.replaceAll("\\xA0", " "); // non-breaking spaces (0xA0) are common and generate a lot of annoying messages
    // replace i18n chars with space, causes annoying NER messages + perhaps slows down NER?
    if (REMOVE_I18N_CHARS)
        documentText = cleanI18NChars(documentText);

    List<Pair<String, String>> namedEntities = new ArrayList<Pair<String, String>>(); // (token, type) pairs
    List<Triple<String, Integer, Integer>> allTriples = new ArrayList<Triple<String, Integer, Integer>>(); // (type, start offset, end offset) triples

    Span[] sentenceSpans = sFinder.sentPosDetect(documentText); // do NER sentence by sentence -- much faster than running the entire documentText at once

    for (Span sentenceSpan : sentenceSpans) {
        int sentenceStartOffset = sentenceSpan.getStart();
        String sentence = sentenceSpan.getCoveredText(documentText).toString();
        if (sentence.length() > 2000)
            continue; // that's not a reasonable sentence, could be a uuencoded-something.

        // convert sentence to tokens cos that's what the name finders need
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokSpans.length; i++)
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();

        // find the actual spans (in terms of tokens) that represent names
        Span[] pSpans = pFinder.find(tokens);
        Span[] lSpans = lFinder.find(tokens);
        Span[] oSpans = oFinder.find(tokens);
        List<Triple<String, Integer, Integer>> sentenceTriples = new ArrayList<Triple<String, Integer, Integer>>(); // (type, start token, end token) triples

        for (Span span : pSpans)
            sentenceTriples.add(new Triple<String, Integer, Integer>("PERSON", span.getStart(), span.getEnd()));
        for (Span span : lSpans)
            sentenceTriples
                    .add(new Triple<String, Integer, Integer>("LOCATION", span.getStart(), span.getEnd()));
        for (Span span : oSpans)
            sentenceTriples
                    .add(new Triple<String, Integer, Integer>("ORGANIZATION", span.getStart(), span.getEnd()));

        for (Triple<String, Integer, Integer> t : sentenceTriples) {
            String type = t.first();
            if (type == null)
                type = "UNKNOWN"; // we see type = null sometimes #!@#$
            allTypes.add(type);
            int startTok = t.second();
            int endTok = t.third();

            String namedEntity = sentence.substring(tokSpans[startTok].getStart(),
                    tokSpans[endTok - 1].getEnd());
            // NER often tags greeting phrases like [Hi Sam] or [Dear Caroline]; strip the greeting word

            for (String greeting : new String[] { "hi ", "hello ", "dear ", "cheers ", "thanks " })
                if (namedEntity.toLowerCase().startsWith(greeting))
                    namedEntity = namedEntity.substring(greeting.length()).trim();

            if (DictUtils.tabooNames.contains(namedEntity.toLowerCase()))
                continue;
            if (!nameFilterPass(namedEntity))
                continue;

            if (namedEntity.length() < MIN_NAME_LENGTH || namedEntity.length() > MAX_NAME_LENGTH) // drop it
                continue;
            namedEntities.add(new Pair<String, String>(namedEntity, type));
            if (log.isDebugEnabled())
                log.debug(t.first() + " : [" + t.second() + ":" + t.third() + "] " + namedEntity);
        }

        // sentence-relative offsets cannot be used directly; convert them to document offsets by adding sentenceStartOffset
        for (Triple<String, Integer, Integer> t : sentenceTriples) {
            int startTok = t.second();
            int endTok = t.third();
            int start = tokSpans[startTok].getStart(), end = tokSpans[endTok - 1].getEnd();

            allTriples.add(new Triple<String, Integer, Integer>(t.getFirst(), sentenceStartOffset + start,
                    sentenceStartOffset + end));
        }
    }

    return new Pair<MyTokenizer, List<Triple<String, Integer, Integer>>>(new NERTokenizer(namedEntities),
            allTriples);
}
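
The pattern above recurs in every example on this page: detect sentence spans, tokenize each sentence into token spans, run a name finder over the tokens, then map the resulting token-index spans back to character offsets in the whole document. A condensed sketch of just that arithmetic (assuming a sentence detector, tokenizer, and name finder have already been loaded; the variable names sentenceDetector, tokenizer, and nameFinder are illustrative):

Span[] sentenceSpans = sentenceDetector.sentPosDetect(documentText);
for (Span sentenceSpan : sentenceSpans) {
    String sentence = sentenceSpan.getCoveredText(documentText).toString();
    Span[] tokSpans = tokenizer.tokenizePos(sentence); // character offsets within the sentence
    String[] tokens = new String[tokSpans.length];
    for (int i = 0; i < tokSpans.length; i++)
        tokens[i] = tokSpans[i].getCoveredText(sentence).toString();

    for (Span name : nameFinder.find(tokens)) { // token-index span, not characters
        // token indexes -> sentence-relative char offsets -> document offsets
        int start = sentenceSpan.getStart() + tokSpans[name.getStart()].getStart();
        int end = sentenceSpan.getStart() + tokSpans[name.getEnd() - 1].getEnd();
        System.out.println(name.getType() + ": " + documentText.substring(start, end));
    }
}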

From source file: org.dbpedia.spotlight.spot.OpenNLPNGramSpotter.java

/** Extracts noun-phrase n-grams from the given piece of input text.
 * @param text  A Text object containing the input from which to extract NP n-grams
 * @return A list of SurfaceFormOccurrence objects.
 */
protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) {
    String intext = text.text();
    List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    TokenizerME tokenizer = new TokenizerME((TokenizerModel) tokenModel);
    POSTaggerME posTagger = new POSTaggerME((POSModel) posModel);
    ChunkerME chunker = new ChunkerME((ChunkerModel) chunkModel);

    Span[] sentSpans = sentenceDetector.sentPosDetect(intext);
    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(intext).toString();
        int start = sentSpan.getStart();
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
        }
        String[] tags = posTagger.tag(tokens);
        Span[] chunks = chunker.chunkAsSpans(tokens, tags);
        for (Span chunk : chunks) {
            if ("NP".equals(chunk.getType())) {
                //Note: getStart()/getEnd() on chunk spans give token indexes, not character positions.
                //The chunk's character offsets come from the token spans (tokSpans) and are relative to
                //the beginning of the sentence; add the sentence's start position to get offsets from
                //the beginning of the input text.
                int begin = tokSpans[chunk.getStart()].getStart();
                int end = tokSpans[chunk.getEnd() - 1].getEnd();
                List<Map<String, Integer>> ngrampos = extractNGramPos(chunk.getStart(), chunk.getEnd() - 1);
                extractNGrams(ngrampos, start, text, tokSpans, npNgramSFLst);
            }
        }
    }
    return npNgramSFLst;
}
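
As the comments in this example stress, chunker spans index tokens, not characters. A small helper, offered as a sketch (the name chunkText is mine, not from the source above), that resolves a chunk span to its covered text:

// hypothetical helper; tokSpans must be the result of tokenizer.tokenizePos(sentence)
private static String chunkText(Span chunk, Span[] tokSpans, String sentence) {
    int begin = tokSpans[chunk.getStart()].getStart(); // first token's start offset
    int end = tokSpans[chunk.getEnd() - 1].getEnd();   // last token's end offset
    return sentence.substring(begin, end);
}

Equivalently, new Span(begin, end).getCoveredText(sentence) yields the same string.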

From source file: org.wso2.uima.collectionProccesingEngine.analysisEngines.LocationIdentifier.java

@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    String text = jcas.getDocumentText();
    Span[] sentSpans = sentenceDetector.sentPosDetect(text);

    for (Span sentSpan : sentSpans) {
        String sentence = sentSpan.getCoveredText(text).toString();
        int start = sentSpan.getStart();
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = new String[tokSpans.length];
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
        }

        logger.debug("Tweet Text: " + jcas.getDocumentText());
        Span locationSpans[] = locationFinder.find(tokens);
        LocationIdentification annotation = new LocationIdentification(jcas);
        for (Span location : locationSpans) {
            annotation.setBegin(start + tokSpans[location.getStart()].getStart());
            annotation.setEnd(start + tokSpans[location.getEnd() - 1].getEnd());
            annotation.addToIndexes(jcas);
            logger.info("Location Detected : " + annotation.getCoveredText());
        }

        if (locationSpans.length == 0) {
            logger.info("Location Unable to be Detected");
        }

    }
}