Example usage for opennlp.tools.util Span spansToStrings

Introduction

This page lists example usages of opennlp.tools.util.Span.spansToStrings, collected from open source projects.

Prototype

public static String[] spansToStrings(Span[] spans, String[] tokens)
public static String[] spansToStrings(Span[] spans, CharSequence text)
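
A minimal, self-contained sketch of the token-array overload (the tokens and
span offsets below are made up for illustration):

import opennlp.tools.util.Span;

public class SpansToStringsDemo {
    public static void main(String[] args) {
        String[] tokens = { "Pierre", "Vinken", "is", "61", "years", "old" };
        // Token-index spans: [0, 2) covers the first two tokens,
        // [3, 6) covers the last three.
        Span[] spans = { new Span(0, 2, "person"), new Span(3, 6) };
        String[] covered = Span.spansToStrings(spans, tokens);
        for (String s : covered) {
            System.out.println(s); // "Pierre Vinken", then "61 years old"
        }
    }
}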

Usage

From source file:hrpod.tools.nlp.NLPTools.java

public String[] tokenize(String text) {

    String[] chunkStrings = null;

    try {
        TokenizerME wordBreaker = new TokenizerME(getTokenModel());
        POSTaggerME posme = new POSTaggerME(getPosModel());
        ChunkerME chunkerME = new ChunkerME(getChunkerModel());

        //words is the tokenized sentence
        String[] words = wordBreaker.tokenize(text);
        //posTags are the parts of speech of every word in the sentence (The chunker needs this info)
        String[] posTags = posme.tag(words);
        //chunks are the start end "spans" indices to the chunks in the words array
        Span[] chunks = chunkerME.chunkAsSpans(words, posTags);
        //chunkStrings are the actual chunks
        chunkStrings = Span.spansToStrings(chunks, words);
        // Optionally, chunks of a given type can be filtered here, e.g.
        // keeping only spans whose getType() is "NP" and deriving n-grams
        // from the corresponding chunk strings.
    } catch (Exception e) {
        logger.error("Error in tokenize", e);
    }

    return chunkStrings;

}
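
Despite its name, tokenize(..) above returns the chunk strings, not the word
tokens. A hypothetical call, assuming getTokenModel(), getPosModel() and
getChunkerModel() load valid OpenNLP models:

    NLPTools tools = new NLPTools();
    // yields phrase-level chunks such as "the quick brown fox", "jumps", ...
    String[] chunks = tools.tokenize("the quick brown fox jumps over the lazy dog");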

From source file:com.screenslicer.core.nlp.Person.java

public static String extractName(String src, boolean strict, boolean dictionaryOnly) {
    NameFinderME nameFinder = new NameFinderME(nameModel);
    String[] sentences = NlpUtil.sentences(src);
    Collection<String> nlpNames = new HashSet<String>();
    Collection<String> nlpFallbacks = new HashSet<String>();
    Collection<String> dictionaryNames = new HashSet<String>();
    Collection<String> dictionaryFallbacks = new HashSet<String>();
    for (int i = 0; i < sentences.length; i++) {
        String[] tokens = NlpUtil.tokensFromSentence(sentences[i]);
        for (int j = 0; j < tokens.length; j++) {
            String first = tokens[j];
            String last = null;
            if (j + 1 < tokens.length) {
                last = tokens[j + 1];
            }
            if (isFirstName(first, strict) && isLastName(last) && isFullName(first + " " + last, strict)) {
                dictionaryNames.add(first + " " + last);
            } else if (!strict && isFirstName(first, strict)) {
                dictionaryFallbacks.add(first);
            }
        }
        Span[] spans = nameFinder.find(tokens);
        if (!dictionaryOnly && spans.length > 0) {
            // Convert every name span to its covered string once; the whole
            // spans array is consumed in one call, so no per-span loop is needed.
            List<String> curNames = Arrays.asList(Span.spansToStrings(spans, tokens));
            for (String curName : curNames) {
                if (curName.contains(" ") && isFullName(curName, strict)) {
                    nlpNames.add(curName);
                } else if (isFirstName(curName, strict)) {
                    nlpFallbacks.add(curName);
                }
            }
        }
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }

    if ((dictionaryOnly || nlpNames.size() != 1) && dictionaryNames.size() != 1) {
        nlpNames.clear();
        nlpFallbacks.clear();
        dictionaryNames.clear();
        dictionaryFallbacks.clear();
        nameFinder.clearAdaptiveData();
        for (int s = 0; s < sentences.length; s++) {
            String[] tokens = sentences[s].split("[\\W\\s]|$|^");
            for (int i = 0; i < tokens.length; i++) {
                String first = tokens[i];
                String last = null;
                if (i + 1 < tokens.length) {
                    last = tokens[i + 1];
                }
                if (isFirstName(first, strict) && isLastName(last) && isFullName(first + " " + last, strict)) {
                    dictionaryNames.add(first + " " + last);
                } else if (!strict && isFirstName(first, strict)) {
                    dictionaryFallbacks.add(first);
                }
            }
            Span[] spans = nameFinder.find(tokens);
            if (!dictionaryOnly && spans.length > 0) {
                // As above: convert all spans in one call instead of looping.
                List<String> curNames = Arrays.asList(Span.spansToStrings(spans, tokens));
                for (String curName : curNames) {
                    if (curName.contains(" ") && isFullName(curName, strict)) {
                        nlpNames.add(curName);
                    } else if (isFirstName(curName, strict)) {
                        nlpFallbacks.add(curName);
                    }
                }
            }
        }
    }
    if (nlpNames.isEmpty()) {
        nlpNames = nlpFallbacks;
    }
    if (dictionaryNames.isEmpty()) {
        dictionaryNames = dictionaryFallbacks;
    }
    if (nlpNames.size() == 1) {
        return nlpNames.iterator().next();
    }
    if (nlpFallbacks.size() == 1) {
        return nlpFallbacks.iterator().next();
    }
    if (dictionaryNames.size() == 1) {
        return dictionaryNames.iterator().next();
    }
    if (dictionaryFallbacks.size() == 1) {
        return dictionaryFallbacks.iterator().next();
    }
    return null;
}

From source file:org.sglover.nlp.CoreNLPEntityTagger.java

@Override
protected Entities getEntitiesImpl(String content) {
    Entities namedEntities = Entities.empty();

    SentenceModel sentenceModel = sentenceModels.get("en");
    SentenceDetector sentenceDetector = new SentenceDetectorME(sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(content);

    TokenizerModel tm = tokenizerModels.get("en");
    TokenizerME wordBreaker = new TokenizerME(tm);

    for (String sentence : sentences) {
        String[] tokens = wordBreaker.tokenize(sentence);

        List<TextAnnotation> allTextAnnotations = new LinkedList<TextAnnotation>();

        POSModel posModel = posModels.get("en");
        POSTaggerME posme = new POSTaggerME(posModel);
        String[] posTags = posme.tag(tokens);

        List<String> npTokens = new LinkedList<>();

        ChunkerModel chunkerModel = chunkerModels.get("en");
        ChunkerME chunkerME = new ChunkerME(chunkerModel);
        Span[] chunks = chunkerME.chunkAsSpans(tokens, posTags);
        String[] chunkStrings = Span.spansToStrings(chunks, tokens);
        for (int i = 0; i < chunks.length; i++) {
            String chunkString = chunkStrings[i];
            logger.info("Chunk = " + chunkString + ", type = " + chunks[i].getType());
            if (chunks[i].getType().equals("NP")) {
                npTokens.add(chunkString);
            }
        }

        // findEntities(namedEntities, allTextAnnotations,
        // npTokens.toArray(new String[0]));
        findEntities(namedEntities, allTextAnnotations, tokens);
    }

    return namedEntities;
}
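
Note the commented-out call above: an earlier variant apparently passed only
the collected NP chunk strings to findEntities(..), while the current code
runs entity detection over the raw sentence tokens, so that the returned
spans index into the original token array.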

From source file:org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java

protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
        String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    text = removeNonUtf8CompliantCharacters(text);

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));

    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);

    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();

        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }
        contextElements.add(sentence.trim());
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");

        // extract the names in the current sentence and
        // store them together with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        for (int j = 0; j < nameSpans.length; j++) {
            String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(),
                    tokenSpans[nameSpans[j].getEnd() - 1].getEnd());
            //NOTE: With OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpans[j].getProb();
            //prob == 0.0 := unspecified
            Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
            if (confidence == null) { //fall back to the old if it is not set.
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    prob *= probs[k];
                }
                confidence = Double.valueOf(prob);
            } else if (confidence < 0.5d) {
            // It looks as if perceptron-based models can return invalid
            // probabilities. Since named entities with a probability below 50%
            // are not expected to be returned by finder.find(..) at all,
            // confidence values < 0.5 are simply ignored here.
                confidence = null;
            }
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(),
                    context, confidence);

            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}
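
This example uses the other overload: tokenizePos(..) returns character-offset
spans into the sentence, and Span.spansToStrings(Span[], CharSequence) slices
the covered text directly out of that string. A minimal sketch, with offsets
hand-computed for this sample sentence:

    String sentence = "Pierre Vinken is 61 years old";
    // Character-offset spans: [0, 13) and [17, 29) into the sentence string.
    Span[] charSpans = { new Span(0, 13), new Span(17, 29) };
    String[] covered = Span.spansToStrings(charSpans, sentence);
    // covered[0] == "Pierre Vinken", covered[1] == "61 years old"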

From source file:org.apache.tika.parser.geo.topic.NameEntityExtractor.java

public void getAllNameEntitiesfromInput(InputStream stream) throws IOException {
    String[] in = IOUtils.toString(stream, UTF_8).split(" ");
    Span[] nameE;

    //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
    synchronized (nameFinder) {
        nameE = nameFinder.find(in);
        //the same name finder is reused, so clear adaptive data
        nameFinder.clearAdaptiveData();
    }

    String spanNames = Arrays.toString(Span.spansToStrings(nameE, in));
    spanNames = spanNames.substring(1, spanNames.length() - 1);
    String[] tmp = spanNames.split(",");

    for (String name : tmp) {
        name = name.trim();
        this.locationNameEntities.add(name);
    }

}
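
Since Span.spansToStrings(nameE, in) already returns one string per name, the
Arrays.toString(..) / split(",") round-trip above is fragile for values that
themselves contain commas; a more direct variant of the same loop would be:

    for (String name : Span.spansToStrings(nameE, in)) {
        this.locationNameEntities.add(name.trim());
    }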