Example usage for opennlp.tools.sentdetect SentenceDetectorME SentenceDetectorME

Introduction

In this page you can find the example usage for opennlp.tools.sentdetect SentenceDetectorME SentenceDetectorME.

Prototype

public SentenceDetectorME(SentenceModel model)

Source Link

Document

Initializes the current instance.

Usage

From source file:de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter.java

@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);

    sentenceModelProvider = new CasConfigurableProviderBase<SentenceDetectorME>() {
        {/* w  w w  .j  a  va2  s .  c om*/
            setDefault(VERSION, "20120616.0");
            setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core");
            setDefault(ARTIFACT_ID,
                    "de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-${language}-${variant}");

            setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/"
                    + "sentence-${language}-${variant}.bin");
            setDefault(VARIANT, "maxent");

            setOverride(LOCATION, modelLocation);
            setOverride(LANGUAGE, language);
            setOverride(VARIANT, variant);
        }

        @Override
        protected SentenceDetectorME produceResource(URL aUrl) throws IOException {
            InputStream is = null;
            try {
                is = aUrl.openStream();
                SentenceModel model = new SentenceModel(is);
                return new SentenceDetectorME(model);
            } finally {
                closeQuietly(is);
            }
        }
    };

    tokenModelProvider = new CasConfigurableProviderBase<TokenizerME>() {
        {
            setDefault(VERSION, "1.5");
            setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core");
            setDefault(ARTIFACT_ID, "de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-${language}-${variant}");

            setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/"
                    + "token-${language}-${variant}.bin");
            setDefault(VARIANT, "maxent");

            setOverride(LOCATION, modelLocation);
            setOverride(LANGUAGE, language);
            setOverride(VARIANT, variant);
        }

        @Override
        protected TokenizerME produceResource(URL aUrl) throws IOException {
            InputStream is = null;
            try {
                is = aUrl.openStream();
                TokenizerModel model = new TokenizerModel(is);
                return new TokenizerME(model);
            } finally {
                closeQuietly(is);
            }
        }
    };
}

From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java

/**
 * Annotates the document using the Apache OpenNLP tools.
 *
 * @param component the component to annotate.
 *//*from w  w  w .  j a va  2s . c  o m*/
@Override
public void annotate(Blackboard blackboard, DocumentComponent component) {

    // set up the annotator
    setup();

    // Language tag used to retrieve the datasets
    String langTag = component.getLanguage().getLanguage();

    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(langTag + "-sent");

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String sentences[] = sentenceDetector.sentDetect(component.getText());

    // Get the right models
    TokenizerModel tokenModel = getTokenizerModel(langTag + "-token");
    POSModel POSModel = getPOSTaggerModel(langTag + "-pos-maxent");

    // Iterate through sentences and produce the distilled objects, 
    // i.e. a sentence object with pos-tagged and stemmed tokens.
    for (String sentenceString : sentences) {

        // the distilled sentence object
        Sentence sentence = new Sentence(sentenceString, "" + sentenceCounter++);
        sentence.setLanguage(component.getLanguage());

        // Tokenize the sentence
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        String tokens[] = tokenizer.tokenize(sentenceString);

        // POS tag the tokens
        POSTaggerME tagger = new POSTaggerME(POSModel);
        String tags[] = tagger.tag(tokens);

        // put the features detected by OpenNLP in the distiller's
        // sentence
        for (int i = 0; i < tokens.length; i++) {
            Token t = new Token(tokens[i]);
            t.setPoS(tags[i]);
            sentence.addToken(t);

        } // for 
        ((DocumentComposite) component).addComponent(sentence);

    } // for (String sentenceString : sentences)
}

From source file:com.screenslicer.core.nlp.NlpUtil.java

public static String[] sentences(String src) {
    if (CommonUtil.isEmpty(src)) {
        return new String[0];
    }/*from   w  w w.  j a  v  a2  s.  co  m*/
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel);
    return sentenceDetector.sentDetect(src);
}

From source file:edu.stanford.muse.index.NER.java

public synchronized static void initialize() throws ClassCastException, IOException, ClassNotFoundException {
    if (pFinder != null)
        return;//  w  w w  .j av a  2 s  . c o  m
    long startTimeMillis = System.currentTimeMillis();
    log.info("Initializing NER models");

    try {
        InputStream pis = Config.getResourceAsStream("models/en-ner-person.bin");
        TokenNameFinderModel pmodel = new TokenNameFinderModel(pis);
        pFinder = new NameFinderME(pmodel);

        InputStream lis = Config.getResourceAsStream("models/en-ner-location.bin");
        TokenNameFinderModel lmodel = new TokenNameFinderModel(lis);
        lFinder = new NameFinderME(lmodel);

        InputStream ois = Config.getResourceAsStream("models/en-ner-organization.bin");
        TokenNameFinderModel omodel = new TokenNameFinderModel(ois);
        oFinder = new NameFinderME(omodel);
    }
    //dont bother about this, instead try not to use it
    catch (Exception e) {
        Util.print_exception(e, log);
    }
    try {
        InputStream modelIn = Config.getResourceAsStream("models/en-sent.bin");
        SentenceModel model = new SentenceModel(modelIn);
        sFinder = new SentenceDetectorME(model);

        InputStream tokenStream = Config.getResourceAsStream("models/en-token.bin");
        TokenizerModel modelTokenizer = new TokenizerModel(tokenStream);
        tokenizer = new TokenizerME(modelTokenizer);
    } catch (Exception e) {
        Util.print_exception(e);
    }

    long endTimeMillis = System.currentTimeMillis();
    log.info("Done initializing NER model in " + Util.commatize(endTimeMillis - startTimeMillis) + "ms");
}

From source file:it.uniud.ailab.dcore.wrappers.external.OpenNlpBootstrapperAnnotator.java

/**
 * Utility offered to other elements of the pipeline for text tokenizing.
 *
 * @param text the text to tokenize/*from  w w  w .  j ava 2s  . c o  m*/
 * @param language the language of the input text
 * @return an array containing the tokenized text.
 */
public static String[] tokenizeText(String text, String language) {

    setup();

    // Split the text into sentences
    SentenceModel sentModel = getSentenceModel(language + "-sent");

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentModel);
    String sentences[] = sentenceDetector.sentDetect(text);

    // Get the right models
    TokenizerModel tokenModel = getTokenizerModel(language + "-token");

    // Iterate through sentences and produce the distilled objects, 
    // i.e. a sentence object with pos-tagged and stemmed tokens.
    List<String> tokenizedText = new ArrayList<>();

    for (String sentenceString : sentences) {

        // Tokenize the sentence
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        String tokens[] = tokenizer.tokenize(sentenceString);
        for (String token : tokens) {
            tokenizedText.add(token);
        }
    }
    return tokenizedText.toArray(new String[tokenizedText.size()]);
}

From source file:org.sglover.nlp.CoreNLPEntityTagger.java

@Override
protected Entities getEntitiesImpl(String content) {
    Entities namedEntities = Entities.empty();

    SentenceModel sentenceModel = sentenceModels.get("en");
    SentenceDetector sentenceDetector = new SentenceDetectorME(sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(content);

    TokenizerModel tm = tokenizerModels.get("en");
    TokenizerME wordBreaker = new TokenizerME(tm);

    for (String sentence : sentences) {
        String[] tokens = wordBreaker.tokenize(sentence);

        List<TextAnnotation> allTextAnnotations = new LinkedList<TextAnnotation>();

        POSModel posModel = posModels.get("en");
        POSTaggerME posme = new POSTaggerME(posModel);
        String[] posTags = posme.tag(tokens);

        List<String> npTokens = new LinkedList<>();

        ChunkerModel chunkerModel = chunkerModels.get("en");
        ChunkerME chunkerME = new ChunkerME(chunkerModel);
        Span[] chunks = chunkerME.chunkAsSpans(tokens, posTags);
        String[] chunkStrings = Span.spansToStrings(chunks, tokens);
        for (int i = 0; i < chunks.length; i++) {
            String chunkString = chunkStrings[i];
            logger.info("Chunk = " + chunkString + ", type = " + chunks[i].getType());
            if (chunks[i].getType().equals("NP")) {
                npTokens.add(chunkString);
            }//  w  w w . ja v a2s  .  c o  m
        }

        // findEntities(namedEntities, allTextAnnotations,
        // npTokens.toArray(new String[0]));
        findEntities(namedEntities, allTextAnnotations, tokens);
    }

    return namedEntities;
}

From source file:com.civprod.writerstoolbox.OpenNLP.training.SentenceDetectorTrainer.java

private void cmdTrainSentenceDetectorActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_cmdTrainSentenceDetectorActionPerformed
    final SentenceDetectorTrainer tempThis = this;
    new Thread(() -> {
        textTestResults.setText("");
        Charset charset = Charset.forName("UTF-8");
        //read other models
        SentenceDetector stdDetector = null;
        try {/*ww  w .  j  a v  a  2  s. c  o m*/
            stdDetector = OpenNLPUtils.createSentenceDetector();

        } catch (IOException ex) {
        }

        List<FileSplit> FileSplits = FileSplit.generateFileSplitsLOO(mFileCollectionListModel);
        File trainingFile = new File("en-sent.train");
        File testFile = new File("en-sent.test");
        SummaryStatistics curFStats = new SummaryStatistics();
        SummaryStatistics curRecallStats = new SummaryStatistics();
        SummaryStatistics curPrecisionStats = new SummaryStatistics();
        SummaryStatistics stdFStats = new SummaryStatistics();
        SummaryStatistics stdRecallStats = new SummaryStatistics();
        SummaryStatistics stdPrecisionStats = new SummaryStatistics();
        java.io.BufferedOutputStream trainingFileWriter = null;
        for (FileSplit curFileSplit : FileSplits) {
            try {
                //create training file
                trainingFileWriter = new java.io.BufferedOutputStream(
                        new java.io.FileOutputStream(trainingFile));
                for (File curTrainingFile : curFileSplit.getTrainingFiles()) {
                    java.io.BufferedInputStream curTrainingFileReader = null;
                    try {
                        curTrainingFileReader = new java.io.BufferedInputStream(
                                new java.io.FileInputStream(curTrainingFile));
                        while (curTrainingFileReader.available() > 0) {
                            trainingFileWriter.write(curTrainingFileReader.read());
                        }
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    } finally {
                        if (curTrainingFileReader != null) {
                            curTrainingFileReader.close();
                        }
                    }
                }
                trainingFileWriter.write('\n');
            } catch (IOException ex) {
                Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
            } finally {
                if (trainingFileWriter != null) {
                    try {
                        trainingFileWriter.close();
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }
            //create test file
            java.io.BufferedOutputStream testFileWriter = null;
            try {
                //create training file
                testFileWriter = new java.io.BufferedOutputStream(new java.io.FileOutputStream(testFile));
                for (File curTrainingFile : curFileSplit.getTestFiles()) {
                    String testingFileName = curTrainingFile.getCanonicalPath();
                    textTestResults
                            .setText(textTestResults.getText() + "testing with " + testingFileName + "\n");
                    java.io.BufferedInputStream curTrainingFileReader = null;
                    try {
                        curTrainingFileReader = new java.io.BufferedInputStream(
                                new java.io.FileInputStream(curTrainingFile));
                        while (curTrainingFileReader.available() > 0) {
                            int read = curTrainingFileReader.read();
                            testFileWriter.write(read);
                        }
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    } finally {
                        if (curTrainingFileReader != null) {
                            curTrainingFileReader.close();
                        }
                    }
                }
                testFileWriter.write('\n');
            } catch (IOException ex) {
                Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
            } finally {
                if (testFileWriter != null) {
                    try {
                        testFileWriter.close();
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }
            //create SentenceDetectorFactory part of the training context
            SentenceDetectorFactory mySentenceDetectorFactory = new SentenceDetectorFactory("EN",
                    cbUseTokenEnd.isSelected(), mAbbreviationDictionary, txtEosChars.getText().toCharArray());

            ObjectStream<String> trainingLineStream = null;
            SentenceModel train = null;
            try {
                trainingLineStream = new PlainTextByLineStream(new FileInputStream(trainingFile), charset);
                ObjectStream<SentenceSample> sampleStream = null;
                try {
                    sampleStream = new SentenceSampleStream(trainingLineStream);
                    train = SentenceDetectorME.train("EN", sampleStream, mySentenceDetectorFactory,
                            TrainingParameters.defaultParams());
                } catch (IOException ex) {
                    Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                } finally {
                    if (sampleStream != null) {
                        try {
                            sampleStream.close();
                        } catch (IOException ex) {
                            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null,
                                    ex);
                        }
                    }
                }
            } catch (FileNotFoundException ex) {
                Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
            } finally {
                if (trainingLineStream != null) {
                    try {
                        trainingLineStream.close();
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }
            trainingLineStream = null;
            if (train != null) {
                ObjectStream<String> testingLineStream = null;
                try {
                    testingLineStream = new PlainTextByLineStream(new FileInputStream(testFile), charset);
                    ObjectStream<SentenceSample> sampleStream = null;
                    try {
                        sampleStream = new SentenceSampleStream(testingLineStream);
                        SentenceDetectorME testDetector = new SentenceDetectorME(train);
                        SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(testDetector);
                        evaluator.evaluate(sampleStream);
                        FMeasure testFMeasure = evaluator.getFMeasure();
                        curFStats.addValue(testFMeasure.getFMeasure());
                        curRecallStats.addValue(testFMeasure.getRecallScore());
                        curPrecisionStats.addValue(testFMeasure.getPrecisionScore());
                        textTestResults.setText(textTestResults.getText() + testFMeasure.getFMeasure() + " "
                                + testFMeasure.getPrecisionScore() + " " + testFMeasure.getRecallScore()
                                + "\n");
                        if (stdDetector != null) {
                            testingLineStream = new PlainTextByLineStream(new FileInputStream(testFile),
                                    charset);
                            sampleStream = new SentenceSampleStream(testingLineStream);
                            SentenceDetectorEvaluator stdEvaluator = new SentenceDetectorEvaluator(stdDetector);
                            stdEvaluator.evaluate(sampleStream);
                            FMeasure stdFMeasure = stdEvaluator.getFMeasure();
                            stdFStats.addValue(stdFMeasure.getFMeasure());
                            stdRecallStats.addValue(stdFMeasure.getRecallScore());
                            stdPrecisionStats.addValue(stdFMeasure.getPrecisionScore());
                            textTestResults.setText(textTestResults.getText() + " " + stdFMeasure.getFMeasure()
                                    + " " + stdFMeasure.getPrecisionScore() + " " + stdFMeasure.getRecallScore()
                                    + "\n");
                        }
                        textTestResults.setText(textTestResults.getText() + "\n");
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    } finally {
                        if (sampleStream != null) {
                            try {
                                sampleStream.close();
                            } catch (IOException ex) {
                                Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE,
                                        null, ex);
                            }
                        }
                    }
                } catch (FileNotFoundException ex) {
                    Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                } finally {
                    if (testingLineStream != null) {
                        try {
                            testingLineStream.close();
                        } catch (IOException ex) {
                            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null,
                                    ex);
                        }
                    }
                }
            }
        }
        textTestResults.setText(textTestResults.getText() + "\n");
        textTestResults.setText(textTestResults.getText() + "test model\n");
        textTestResults.setText(textTestResults.getText() + "f score mean " + curFStats.getMean() + " stdDev "
                + curFStats.getStandardDeviation() + "\n");
        textTestResults.setText(textTestResults.getText() + "recall mean " + curRecallStats.getMean()
                + " stdDev " + curRecallStats.getStandardDeviation() + "\n");
        textTestResults.setText(textTestResults.getText() + "precision score mean "
                + curPrecisionStats.getMean() + " stdDev " + curPrecisionStats.getStandardDeviation() + "\n");
        textTestResults.setText(textTestResults.getText() + "std model\n");
        textTestResults.setText(textTestResults.getText() + "f score mean " + stdFStats.getMean() + " stdDev "
                + stdFStats.getStandardDeviation() + "\n");
        textTestResults.setText(textTestResults.getText() + "recall mean " + stdRecallStats.getMean()
                + " stdDev " + stdRecallStats.getStandardDeviation() + "\n");
        textTestResults.setText(textTestResults.getText() + "precision score mean "
                + stdPrecisionStats.getMean() + " stdDev " + stdPrecisionStats.getStandardDeviation() + "\n");
        //create combinded training file
        trainingFileWriter = null;
        try {
            trainingFileWriter = new java.io.BufferedOutputStream(new java.io.FileOutputStream(trainingFile));
            for (File curTrainingFile : mFileCollectionListModel) {
                java.io.BufferedInputStream curTrainingFileReader = null;
                try {
                    curTrainingFileReader = new java.io.BufferedInputStream(
                            new java.io.FileInputStream(curTrainingFile));
                    while (curTrainingFileReader.available() > 0) {
                        trainingFileWriter.write(curTrainingFileReader.read());
                    }
                } catch (IOException ex) {
                    Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                } finally {
                    if (curTrainingFileReader != null) {
                        curTrainingFileReader.close();
                    }
                }
            }
            trainingFileWriter.write('\n');
        } catch (IOException ex) {
            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            if (trainingFileWriter != null) {
                try {
                    trainingFileWriter.close();
                } catch (IOException ex) {
                    Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        }
        //create SentenceDetectorFactory part of the training context
        SentenceDetectorFactory mySentenceDetectorFactory = new SentenceDetectorFactory("EN",
                cbUseTokenEnd.isSelected(), mAbbreviationDictionary, txtEosChars.getText().toCharArray());
        //create and train model
        ObjectStream<String> lineStream = null;
        this.createdObject = null;
        try {
            lineStream = new PlainTextByLineStream(new FileInputStream(trainingFile), charset);
            ObjectStream<SentenceSample> sampleStream = null;
            try {
                sampleStream = new SentenceSampleStream(lineStream);
                this.createdObject = SentenceDetectorME.train("EN", sampleStream, mySentenceDetectorFactory,
                        TrainingParameters.defaultParams());
            } catch (IOException ex) {
                Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
            } finally {
                if (sampleStream != null) {
                    try {
                        sampleStream.close();
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }
        } catch (FileNotFoundException ex) {
            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            if (lineStream != null) {
                try {
                    lineStream.close();
                } catch (IOException ex) {
                    Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        }
        if (createdObject != null) {
            OutputStream modelOut = null;
            File modelFile = new File("en-fiction-sent.bin");
            try {
                modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
                createdObject.serialize(modelOut);
            } catch (IOException ex) {
                Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
            } finally {
                if (modelOut != null) {
                    try {
                        modelOut.close();
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }
        }
        textTestResults.setText(textTestResults.getText() + "done");
    }).start();
}

From source file:org.apache.stanbol.commons.opennlp.OpenNLP.java

/**
 * Getter for the sentence detector of the parsed language. 
 * @param language the language//from  www .j  a v a2s.  c  o  m
 * @return the model or <code>null</code> if no model data are found
 * @throws InvalidFormatException in case the found model data are in the wrong format
 * @throws IOException on any error while reading the model data
 */
public SentenceDetector getSentenceDetector(String language) throws IOException {
    SentenceModel sentModel = getSentenceModel(language);
    if (sentModel != null) {
        return new SentenceDetectorME(sentModel);
    } else {
        log.debug("No Sentence Detection Model for language '{}'", language);
        return null;
    }
}

From source file:org.apache.stanbol.enhancer.engines.opennlp.impl.NEREngineCore.java

protected Map<String, List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
        String text, String language) {
    // version with explicit sentence endings to reflect heading / paragraph
    // structure of an HTML or PDF document converted to text
    String textWithDots = text.replaceAll("\\n\\n", ".\n");
    text = removeNonUtf8CompliantCharacters(text);

    SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));

    Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);

    NameFinderME finder = new NameFinderME(nameFinderModel);
    Tokenizer tokenizer = openNLP.getTokenizer(language);
    Map<String, List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String, List<NameOccurrence>>();
    for (int i = 0; i < sentenceSpans.length; i++) {
        String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();

        // build a context by concatenating three sentences to be used for
        // similarity ranking / disambiguation + contextual snippet in the
        // extraction structure
        List<String> contextElements = new ArrayList<String>();
        if (i > 0) {
            CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
            contextElements.add(previousSentence.toString().trim());
        }//from   w w  w  .  j  a  v  a  2  s.  co m
        contextElements.add(sentence.trim());
        if (i + 1 < sentenceSpans.length) {
            CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
            contextElements.add(nextSentence.toString().trim());
        }
        String context = StringUtils.join(contextElements, " ");

        // extract the names in the current sentence and
        // keep them store them with the current context
        Span[] tokenSpans = tokenizer.tokenizePos(sentence);
        String[] tokens = Span.spansToStrings(tokenSpans, sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();
        //int lastStartPosition = 0;
        for (int j = 0; j < nameSpans.length; j++) {
            String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(),
                    tokenSpans[nameSpans[j].getEnd() - 1].getEnd());
            //NOTE: With OpenNLP 1.6 the probability is now stored in the span
            double prob = nameSpans[j].getProb();
            //prob == 0.0 := unspecified
            Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
            if (confidence == null) { //fall back to the old if it is not set.
                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                    prob *= probs[k];
                }
                confidence = Double.valueOf(prob);
            } else if (confidence < 0.5d) {
                //It looks like as if preceptron based models do return
                //invalid probabilities. As it is expected the Named Entities
                //with a probability < 50% are not even returned by finder.find(..)
                //we will just ignore confidence values < 0.5 here
                confidence = null;
            }
            int start = tokenSpans[nameSpans[j].getStart()].getStart();
            int absoluteStart = sentenceSpans[i].getStart() + start;
            int absoluteEnd = absoluteStart + name.length();
            NerTag nerTag = config.getNerTag(nameSpans[j].getType());
            NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, nerTag.getType(),
                    context, confidence);

            List<NameOccurrence> occurrences = nameOccurrences.get(name);
            if (occurrences == null) {
                occurrences = new ArrayList<NameOccurrence>();
            }
            occurrences.add(occurrence);
            nameOccurrences.put(name, occurrences);
        }
    }
    finder.clearAdaptiveData();
    log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
    return nameOccurrences;
}

From source file:org.dbpedia.spotlight.spot.NESpotter.java

protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) {
    String intext = text.text();// w w w . j  a va 2s .  c om
    SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel) sentenceModel);
    String[] sentences = sentenceDetector.sentDetect(intext);
    Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext);
    int[] sentencePositions = new int[sentences.length + 1];
    for (int k = 0; k < sentenceEndings.length; k++) {
        sentencePositions[k] = sentenceEndings[k].getStart();
    }

    NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel);

    List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>();
    Tokenizer tokenizer = new SimpleTokenizer();
    for (int i = 0; i < sentences.length; i++) {
        String sentence = sentences[i];
        //LOG.debug("Sentence: " + sentence);

        // extract the names in the current sentence
        String[] tokens = tokenizer.tokenize(sentence);
        Span[] tokenspan = tokenizer.tokenizePos(sentence);
        Span[] nameSpans = finder.find(tokens);
        double[] probs = finder.probs();

        if (nameSpans != null && nameSpans.length > 0) {
            //System.out.println("Tokens: " +(new ArrayList(Arrays.asList(tokens))).toString());
            //System.out.println("NameSpans: " +(new ArrayList(Arrays.asList(nameSpans))).toString());
            for (Span span : nameSpans) {
                StringBuilder buf = new StringBuilder();
                //System.out.println("StartSpan: " + span.getStart() + " EndSpan: " + span.getEnd());
                for (int j = span.getStart(); j < span.getEnd(); j++) {
                    //System.out.println(tokens[i] + " appended to " + buf.toString());
                    buf.append(tokens[j]);
                    if (j < span.getEnd() - 1)
                        buf.append(" ");
                }
                String surfaceFormStr = buf.toString().trim();
                if (surfaceFormStr.contains(".")) {
                    surfaceFormStr = correctPhrase(surfaceFormStr, sentence);
                }

                int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart();
                int entEnd = sentencePositions[i] + tokenspan[span.getEnd() - 1].getEnd();

                /*
                System.out.println("\n\nRR-NE Found = " + buf.toString());
                System.out.println("Start = " + entStart);
                System.out.println("End = " + entEnd);
                System.out.println("Sentence = " + sentence);
                System.out.println("Text = " + text);
                */

                SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr);
                SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart);
                sfocc.features().put("type", new Feature("type", oType.toString()));
                sfOccurrences.add(sfocc);
            }
        }

    }
    finder.clearAdaptiveData();

    if (LOG.isDebugEnabled()) {
        LOG.debug("Occurrences found: " + StringUtils.join(sfOccurrences, ", "));
    }
    return sfOccurrences;
}