Example usage for opennlp.tools.namefind NameFinderME NameFinderME

List of usage examples for opennlp.tools.namefind NameFinderME NameFinderME

Introduction

On this page you can find example usage for opennlp.tools.namefind NameFinderME NameFinderME.

Prototype

public NameFinderME(TokenNameFinderModel model) 

Source Link

Usage

From source file:com.screenslicer.core.nlp.Person.java

/**
 * Extracts a single unambiguous person name from the source text, combining
 * OpenNLP name-finder hits with dictionary-based first/last-name matching.
 *
 * @param src the raw text to scan
 * @param strict when true, apply stricter dictionary matching and skip first-name-only fallbacks
 * @param dictionaryOnly when true, ignore NLP name-finder results entirely
 * @return the single candidate name found, or null when zero or multiple candidates remain
 */
public static String extractName(String src, boolean strict, boolean dictionaryOnly) {
    NameFinderME nameFinder = new NameFinderME(nameModel);
    String[] sentences = NlpUtil.sentences(src);
    Collection<String> nlpNames = new HashSet<String>();
    Collection<String> nlpFallbacks = new HashSet<String>();
    Collection<String> dictionaryNames = new HashSet<String>();
    Collection<String> dictionaryFallbacks = new HashSet<String>();
    for (int i = 0; i < sentences.length; i++) {
        collectNames(NlpUtil.tokensFromSentence(sentences[i]), nameFinder, strict, dictionaryOnly, nlpNames,
                nlpFallbacks, dictionaryNames, dictionaryFallbacks);
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }

    if ((dictionaryOnly || nlpNames.size() != 1) && dictionaryNames.size() != 1) {
        //first pass was ambiguous: retry with a cruder punctuation/whitespace tokenizer
        nlpNames.clear();
        nlpFallbacks.clear();
        dictionaryNames.clear();
        dictionaryFallbacks.clear();
        nameFinder.clearAdaptiveData();
        for (int s = 0; s < sentences.length; s++) {
            collectNames(sentences[s].split("[\\W\\s]|$|^"), nameFinder, strict, dictionaryOnly, nlpNames,
                    nlpFallbacks, dictionaryNames, dictionaryFallbacks);
        }
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }
    //prefer NLP-detected names, then NLP fallbacks, then dictionary results
    if (nlpNames.size() == 1) {
        return nlpNames.iterator().next();
    }
    if (nlpFallbacks.size() == 1) {
        return nlpFallbacks.iterator().next();
    }
    if (dictionaryNames.size() == 1) {
        return dictionaryNames.iterator().next();
    }
    if (dictionaryFallbacks.size() == 1) {
        return dictionaryFallbacks.iterator().next();
    }
    return null;
}

/**
 * Scans one sentence's tokens: adds dictionary-matched "first last" pairs (and,
 * when not strict, lone first names) to the dictionary collections, then (unless
 * dictionaryOnly) adds the NLP finder's detected names to the NLP collections.
 */
private static void collectNames(String[] tokens, NameFinderME nameFinder, boolean strict,
        boolean dictionaryOnly, Collection<String> nlpNames, Collection<String> nlpFallbacks,
        Collection<String> dictionaryNames, Collection<String> dictionaryFallbacks) {
    for (int j = 0; j < tokens.length; j++) {
        String first = tokens[j];
        String last = null;
        if (j + 1 < tokens.length) {
            last = tokens[j + 1];
        }
        if (isFirstName(first, strict) && isLastName(last) && isFullName(first + " " + last, strict)) {
            dictionaryNames.add(first + " " + last);
        } else if (!strict && isFirstName(first, strict)) {
            dictionaryFallbacks.add(first);
        }
    }
    //find() is invoked unconditionally so the finder's adaptive data evolves as before
    Span[] spans = nameFinder.find(tokens);
    if (!dictionaryOnly) {
        //bug fix: the original re-walked the full spansToStrings list once per span,
        //doing O(n^2) duplicate work for n spans; one pass is sufficient
        for (String curName : Span.spansToStrings(spans, tokens)) {
            if (curName.contains(" ") && isFullName(curName, strict)) {
                nlpNames.add(curName);
            } else if (isFirstName(curName, strict)) {
                nlpFallbacks.add(curName);
            }
        }
    }
}

From source file:edu.stanford.muse.index.NER.java

/**
 * Lazily initializes the shared OpenNLP person/location/organization name finders,
 * the sentence detector and the tokenizer. A no-op once pFinder is already set.
 * Model streams are closed after loading (the original leaked all five streams).
 */
public synchronized static void initialize() throws ClassCastException, IOException, ClassNotFoundException {
    if (pFinder != null)
        return; //already initialized
    long startTimeMillis = System.currentTimeMillis();
    log.info("Initializing NER models");

    try {
        InputStream pis = Config.getResourceAsStream("models/en-ner-person.bin");
        TokenNameFinderModel pmodel = new TokenNameFinderModel(pis);
        pis.close(); //the model constructor reads the stream fully; release it
        pFinder = new NameFinderME(pmodel);

        InputStream lis = Config.getResourceAsStream("models/en-ner-location.bin");
        TokenNameFinderModel lmodel = new TokenNameFinderModel(lis);
        lis.close();
        lFinder = new NameFinderME(lmodel);

        InputStream ois = Config.getResourceAsStream("models/en-ner-organization.bin");
        TokenNameFinderModel omodel = new TokenNameFinderModel(ois);
        ois.close();
        oFinder = new NameFinderME(omodel);
    }
    //NER models are best-effort: log the failure and continue without them
    catch (Exception e) {
        Util.print_exception(e, log);
    }
    try {
        InputStream modelIn = Config.getResourceAsStream("models/en-sent.bin");
        SentenceModel model = new SentenceModel(modelIn);
        modelIn.close();
        sFinder = new SentenceDetectorME(model);

        InputStream tokenStream = Config.getResourceAsStream("models/en-token.bin");
        TokenizerModel modelTokenizer = new TokenizerModel(tokenStream);
        tokenStream.close();
        tokenizer = new TokenizerME(modelTokenizer);
    } catch (Exception e) {
        Util.print_exception(e);
    }

    long endTimeMillis = System.currentTimeMillis();
    log.info("Done initializing NER model in " + Util.commatize(endTimeMillis - startTimeMillis) + "ms");
}

From source file:org.sglover.nlp.CoreNLPEntityTagger.java

/**
 * Runs every configured token name finder over the tokens, collecting one
 * TextAnnotation per detected span, then resolves overlaps and converts the
 * surviving annotations into named entities.
 */
private void findEntities(Entities namedEntities, List<TextAnnotation> allTextAnnotations, String[] tokens) {
    for (Map.Entry<String, TokenNameFinderModel> entry : tokenNameFinders.entrySet()) {
        String entityType = entry.getKey();
        NameFinderME nameFinder = new NameFinderME(entry.getValue());
        try {
            Span[] nameSpans = nameFinder.find(tokens);
            double[] spanProbs = nameFinder.probs(nameSpans);
            int idx = 0;
            for (Span nameSpan : nameSpans) {
                allTextAnnotations.add(new TextAnnotation(entityType, nameSpan, spanProbs[idx]));
                idx++;
            }
        } finally {
            //reset the finder's adaptive context regardless of outcome
            nameFinder.clearAdaptiveData();
        }
    }

    if (!allTextAnnotations.isEmpty()) {
        removeConflicts(allTextAnnotations);
    }

    convertTextAnnotationsToNamedEntities(tokens, allTextAnnotations, namedEntities);
}

From source file:edu.stanford.muse.index.NER.java

/**
 * Ad-hoc smoke test: reads /tmp/in, runs the OpenNLP person, location and
 * organization finders over it, and prints what each one found.
 */
public static void testOpenNLP() {

    try {
        String s = Util.readFile("/tmp/in");

        InputStream pis = Config.getResourceAsStream("en-ner-person.bin");
        TokenNameFinderModel pmodel = new TokenNameFinderModel(pis);
        pis.close(); //model constructors read the stream fully; close to avoid the original's leak
        InputStream lis = Config.getResourceAsStream("en-ner-location.bin");
        TokenNameFinderModel lmodel = new TokenNameFinderModel(lis);
        lis.close();
        InputStream ois = Config.getResourceAsStream("en-ner-organization.bin");
        TokenNameFinderModel omodel = new TokenNameFinderModel(ois);
        ois.close();
        InputStream tokenStream = Config.getResourceAsStream("en-token.bin");
        TokenizerModel modelTokenizer = new TokenizerModel(tokenStream);
        tokenStream.close();
        TokenizerME tokenizer = new TokenizerME(modelTokenizer);
        Span[] tokSpans = tokenizer.tokenizePos(s);

        String tokens[] = new String[tokSpans.length];
        for (int i = 0; i < tokSpans.length; i++)
            tokens[i] = s.substring(tokSpans[i].getStart(), tokSpans[i].getEnd());

        NameFinderME pFinder = new NameFinderME(pmodel);
        Span[] pSpans = pFinder.find(tokens);
        NameFinderME lFinder = new NameFinderME(lmodel);
        Span[] lSpans = lFinder.find(tokens);
        NameFinderME oFinder = new NameFinderME(omodel);
        Span[] oSpans = oFinder.find(tokens);
        printSpans("Names found:", pSpans, tokens);
        printSpans("Locations found:", lSpans, tokens);
        printSpans("Orgs found:", oSpans, tokens);
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/** Prints a heading, then the tokens covered by each span, one span per line. */
private static void printSpans(String heading, Span[] spans, String[] tokens) {
    System.out.println(heading);
    for (Span span : spans) {
        for (int i = span.getStart(); i < span.getEnd(); i++)
            System.out.print(tokens[i] + " ");
        System.out.println();
    }
}

From source file:org.apache.stanbol.commons.opennlp.OpenNLP.java

/**
 * Getter for the {@link TokenNameFinder} for the parsed entity type and language.
 * @param type the type of the named entities to find (person, organization)
 * @param language the language
 * @return a new {@link NameFinderME} for the model, or <code>null</code> if no model data are found
 * @throws InvalidFormatException in case the found model data are in the wrong format
 * @throws IOException on any error while reading the model data
 */
public TokenNameFinder getNameFinder(String type, String language) throws IOException {
    TokenNameFinderModel model = getNameModel(type, language);
    if (model != null) {
        return new NameFinderME(model);
    } else {
        //typo fix: "langauge" -> "language" in the debug message
        log.debug("TokenNameFinder model for type {} and language {} not present", type, language);
        return null;
    }
}

From source file:org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java

/**
 * THis method extracts NamedEntity occurrences by using existing {@link Token}s and 
 * {@link Sentence}s in the parsed {@link AnalysedText}.
 * @param nameFinderModel the model used to find NamedEntities
 * @param at the Analysed Text/*from ww w. j a  v  a2s  . com*/
 * @param language the language of the text
 * @return the found named Entity Occurrences
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
        AnalysedText at, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text

    NameFinderME finder = new NameFinderME(nameFinderModel);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    List<Section> sentences = new ArrayList<Section>();
    //Holds the tokens of the previouse (pos 0) current (pos 1) and next (pos 2) sentence
    AnalysedTextUtils.appandToList(at.getSentences(), sentences);
    if (sentences.isEmpty()) { //no sentence annotations
        sentences.add(at); //process as a single section
    }
    for (int i = 0; i < sentences.size(); i++) {
        String sentence = sentences.get(i).getSpan();

        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        contextElements.add(sentence);
        //three sentences as context
        String context = at.getSpan().substring(sentences.get(Math.max(0, i - 1)).getStart(),
                sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd());

        // get the tokens, words of the current sentence
        List<Token> tokens = new ArrayList<Token>(32);
        List<String> words = new ArrayList<String>(32);
        for (Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext();) {
            Token t = it.next();
            tokens.add(t);
            words.add(t.getSpan());
        }
        Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
        double[] probs = finder.probs();
        //int lastStartPosition = 0;
        for (int j = 0; j < nameSpans.length; j++) {
            String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(),
                    tokens.get(nameSpans[j].getEnd() - 1).getEnd());
            Double confidence = 1.0;
            for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                confidence *= probs[k];
            }
            int start = tokens.get(nameSpans[j].getStart()).getStart();
            int end = start + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            //create the occurrence for writing fise:TextAnnotations
            NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(), context,
                    confidence);
            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
            //add also the NerAnnotation to the AnalysedText
            Chunk chunk = at.addChunk(start, end);
            //TODO: build AnnotationModel based on the configured Mappings
            chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}

From source file:org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java

/**
 * Extracts NamedEntity occurrences from plain text by running sentence detection,
 * tokenization and the parsed name finder model.
 * @param nameFinderModel the model used to find NamedEntities
 * @param text the text to analyse
 * @param language the language of the text (used to select the tokenizer)
 * @return the found named Entity Occurrences, keyed by the entity name
 */
protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
        String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    text = removeNonUtf8CompliantCharacters(text);

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));

    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);

    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();

        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }
        contextElements.add(sentence.trim());
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");

        // extract the names in the current sentence and
        // keep them store them with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(),
                    tokenSpans[nameSpans[j].getEnd() - 1].getEnd());
            //NOTE: With OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpans[j].getProb();
            //prob == 0.0 := unspecified
            Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
            if (confidence == null) { //fall back to the old per-token product
                //BUG FIX: the original multiplied into prob, which is known to be
                //0.0 on this path, so the fallback confidence was always 0.0.
                //Start the product at 1.0 instead.
                double tokenProduct = 1.0d;
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    tokenProduct *= probs[k];
                }
                confidence = Double.valueOf(tokenProduct);
            } else if (confidence < 0.5d) {
                //It looks like as if perceptron based models do return
                //invalid probabilities. As it is expected the Named Entities
                //with a probability < 50% are not even returned by finder.find(..)
                //we will just ignore confidence values < 0.5 here
                confidence = null;
            }
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(),
                    context, confidence);

            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}

From source file:org.dbpedia.spotlight.spot.NESpotter.java

/**
 * Extracts surface-form occurrences of named entities of one type from the text.
 * @param nameFinderModel the OpenNLP token name finder model to run
 * @param text the text to spot
 * @param oType the ontology type URI attached to each occurrence as its "type" feature
 * @return the spotted surface-form occurrences with absolute character offsets
 */
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) {
    String intext = text.text();
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(intext);
    Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext);
    //character offset of each sentence's start within the full text
    int[] sentencePositions = new int[sentences.length + 1];
    for (int k = 0; k < sentenceEndings.length; k++) {
        sentencePositions[k] = sentenceEndings[k].getStart();
    }

    NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel);

    List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>();
    //SimpleTokenizer.INSTANCE replaces the deprecated public constructor
    Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
    for (int i = 0; i < sentences.length; i++) {
        String sentence = sentences[i];

        // extract the names in the current sentence
        // (the original also fetched finder.probs() into an unused local; removed)
        String[] tokens = tokenizer.tokenize(sentence);
        Span[] tokenspan = tokenizer.tokenizePos(sentence);
        Span[] nameSpans = finder.find(tokens);

        if (nameSpans != null && nameSpans.length > 0) {
            for (Span span : nameSpans) {
                //rebuild the surface form from the covered tokens, space-separated
                StringBuilder buf = new StringBuilder();
                for (int j = span.getStart(); j < span.getEnd(); j++) {
                    buf.append(tokens[j]);
                    if (j < span.getEnd() - 1)
                        buf.append(" ");
                }
                String surfaceFormStr = buf.toString().trim();
                if (surfaceFormStr.contains(".")) {
                    surfaceFormStr = correctPhrase(surfaceFormStr, sentence);
                }

                //absolute offset of the entity = sentence start + token start within sentence
                int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart();

                SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr);
                SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart);
                sfocc.features().put("type", new Feature("type", oType.toString()));
                sfOccurrences.add(sfocc);
            }
        }

    }
    finder.clearAdaptiveData();

    if (LOG.isDebugEnabled()) {
        LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", "));
    }
    return sfOccurrences;
}

From source file:org.wso2.uima.collectionProccesingEngine.analysisEngines.LocationIdentifier.java

/**
 * Loads the sentence, tokenizer and location name-finder models from UIMA
 * resources. Any failure is wrapped in a ResourceInitializationException;
 * all streams are closed quietly either way.
 */
@Override
public void initialize(UimaContext ctx) throws ResourceInitializationException {
    super.initialize(ctx);
    InputStream sentenceStream = null;
    InputStream tokenizerStream = null;
    InputStream nameFinderStream = null;
    try {
        sentenceStream = getContext().getResourceAsStream("SentenceModel");
        SentenceModel sentenceModel = new SentenceModel(sentenceStream);
        sentenceDetector = new SentenceDetectorME(sentenceModel);
        sentenceStream.close();
        tokenizerStream = getContext().getResourceAsStream("TokenizerModel");
        TokenizerModel tokenModel = new TokenizerModel(tokenizerStream);
        tokenizer = new TokenizerME(tokenModel);
        tokenizerStream.close();
        nameFinderStream = getContext().getResourceAsStream("TokenNameFinderModel");
        TokenNameFinderModel nameFinderModel = new TokenNameFinderModel(nameFinderStream);
        locationFinder = new NameFinderME(nameFinderModel);
        nameFinderStream.close();
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    } finally {
        IOUtils.closeQuietly(nameFinderStream);
        IOUtils.closeQuietly(tokenizerStream);
        IOUtils.closeQuietly(sentenceStream);
    }
    //BUG FIX: this log statement lived in the finally block, so it also ran when
    //initialization failed; it is only reachable here after success.
    logger.info(LocationIdentifier.class.getSimpleName() + " Analysis Engine initialized successfully");
}