Example usage for opennlp.tools.namefind NameFinderME probs

List of usage examples for opennlp.tools.namefind NameFinderME probs

Introduction

In this page you can find the example usage for opennlp.tools.namefind NameFinderME probs.

Prototype

public double[] probs() 

Source Link

Document

Returns an array with the probabilities of the last decoded sequence.

Usage

From source file:org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java

/**
 * THis method extracts NamedEntity occurrences by using existing {@link Token}s and 
 * {@link Sentence}s in the parsed {@link AnalysedText}.
 * @param nameFinderModel the model used to find NamedEntities
 * @param at the Analysed Text/*  w  ww. j  av a  2  s . c o  m*/
 * @param language the language of the text
 * @return the found named Entity Occurrences
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
        AnalysedText at, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text

    NameFinderME finder = new NameFinderME(nameFinderModel);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    List<Section> sentences = new ArrayList<Section>();
    //Holds the tokens of the previouse (pos 0) current (pos 1) and next (pos 2) sentence
    AnalysedTextUtils.appandToList(at.getSentences(), sentences);
    if (sentences.isEmpty()) { //no sentence annotations
        sentences.add(at); //process as a single section
    }
    for (int i = 0; i < sentences.size(); i++) {
        String sentence = sentences.get(i).getSpan();

        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        contextElements.add(sentence);
        //three sentences as context
        String context = at.getSpan().substring(sentences.get(Math.max(0, i - 1)).getStart(),
                sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd());

        // get the tokens, words of the current sentence
        List<Token> tokens = new ArrayList<Token>(32);
        List<String> words = new ArrayList<String>(32);
        for (Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext();) {
            Token t = it.next();
            tokens.add(t);
            words.add(t.getSpan());
        }
        Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
        double[] probs = finder.probs();
        //int lastStartPosition = 0;
        for (int j = 0; j < nameSpans.length; j++) {
            String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(),
                    tokens.get(nameSpans[j].getEnd() - 1).getEnd());
            Double confidence = 1.0;
            for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                confidence *= probs[k];
            }
            int start = tokens.get(nameSpans[j].getStart()).getStart();
            int end = start + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            //create the occurrence for writing fise:TextAnnotations
            NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(), context,
                    confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
            //add also the NerAnnotation to the AnalysedText
            Chunk chunk = at.addChunk(start, end);
            //TODO: build AnnotationModel based on the configured Mappings
            chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}

From source file:org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java

protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
        String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    text = removeNonUtf8CompliantCharacters(text);

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));

    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);

    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();

        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }/*from  w w w .  java  2 s  .  co  m*/
        contextElements.add(sentence.trim());
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");

        // extract the names in the current sentence and
        // keep them store them with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        //int lastStartPosition = 0;
        for (int j = 0; j < nameSpans.length; j++) {
            String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(),
                    tokenSpans[nameSpans[j].getEnd() - 1].getEnd());
            //NOTE: With OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpans[j].getProb();
            //prob == 0.0 := unspecified
            Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
            if (confidence == null) { //fall back to the old if it is not set.
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    prob *= probs[k];
                }
                confidence = Double.valueOf(prob);
            } else if (confidence < 0.5d) {
                //It looks like as if preceptron based models do return
                //invalid probabilities. As it is expected the Named Entities
                //with a probability < 50% are not even returned by finder.find(..)
                //we will just ignore confidence values < 0.5 here
                confidence = null;
            }
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(),
                    context, confidence);

            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}

From source file:org.dbpedia.spotlight.spot.NESpotter.java

protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) {
    String intext = text.text();//from w  w  w . j  a v  a2  s . c o m
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(intext);
    Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext);
    int[] sentencePositions = new int[sentences.length + 1];
    for (int k = 0; k < sentenceEndings.length; k++) {
        sentencePositions[k] = sentenceEndings[k].getStart();
    }

    NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel);

    List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>();
    Tokenizer tokenizer = new SimpleTokenizer();
    for (int i = 0; i < sentences.length; i++) {
        String sentence = sentences[i];
        //LOG.debug("Sentence: " + sentence);

        // extract the names in the current sentence
        String[] tokens = tokenizer.tokenize(sentence);
        Span[] tokenspan = tokenizer.tokenizePos(sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();

        if (nameSpans != null && nameSpans.length > 0) {
            //System.out.println("Tokens: " +(new ArrayList(Arrays.asList(tokens))).toString());
            //System.out.println("NameSpans: " +(new ArrayList(Arrays.asList(nameSpans))).toString());
            for (Span span : nameSpans) {
                StringBuilder buf = new StringBuilder();
                //System.out.println("StartSpan: " + span.getStart() + " EndSpan: " + span.getEnd());
                for (int j = span.getStart(); j < span.getEnd(); j++) {
                    //System.out.println(tokens[i] + " appended to " + buf.toString());
                    buf.append(tokens[j]);
                    if (j < span.getEnd() - 1)
                        buf.append(" ");
                }
                String surfaceFormStr = buf.toString().trim();
                if (surfaceFormStr.contains(".")) {
                    surfaceFormStr = correctPhrase(surfaceFormStr, sentence);
                }

                int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart();
                int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd();

                /*
                System.out.println("\n\nRR-NE Found = " + buf.toString());
                System.out.println("Start = " + entStart);
                System.out.println("End = " + entEnd);
                System.out.println("Sentence = " + sentence);
                System.out.println("Text = " + text);
                */

                SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr);
                SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart);
                sfocc.features().put("type", new Feature("type", oType.toString()));
                sfOccurrences.add(sfocc);
            }
        }

    }
    finder.clearAdaptiveData();

    if (LOG.isDebugEnabled()) {
        LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", "));
    }
    return sfOccurrences;
}