Example usage for edu.stanford.nlp.pipeline Annotation set

Introduction

On this page you can find usage examples for the set method of edu.stanford.nlp.pipeline.Annotation, collected from open-source projects.

Prototype

@Override
@SuppressWarnings("unchecked")
public <VALUE> VALUE set(Class<? extends Key<VALUE>> key, VALUE value) 
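
The signature comes from edu.stanford.nlp.util.TypesafeMap, which Annotation implements: set stores a value under a class key, and get retrieves it with the matching type. A minimal sketch (the document text and date are illustrative values, not taken from the projects below):

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;

// Attach a document date before running a pipeline; annotators such as
// SUTime read DocDateAnnotation to resolve relative time expressions.
Annotation annotation = new Annotation("The meeting is next Tuesday at 3pm.");
annotation.set(CoreAnnotations.DocDateAnnotation.class, "2014-10-28");

// set stores the value in the underlying CoreMap; get reads it back.
String docDate = annotation.get(CoreAnnotations.DocDateAnnotation.class);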

Usage

From source file:StanfordCoreNLPXMLServer.java

License:Open Source License

public String parse(String s, String date) throws java.io.IOException {
    Annotation annotation = new Annotation(s);
    annotation.set(CoreAnnotations.DocDateAnnotation.class, date);
    pipeline.annotate(annotation);

    StringBuilder sb = new StringBuilder();

    List<CoreMap> timexAnnsAll = annotation.get(TimeAnnotations.TimexAnnotations.class);

    sb.append(xmlPrefix);
    for (CoreMap cm : timexAnnsAll) {
        Timex t = cm.get(TimeAnnotations.TimexAnnotation.class);
        sb.append(t);
        sb.append("\n");
    }
    sb.append(xmlPostfix);

    return sb.toString();
}
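
This example assumes a pipeline field wired for SUTime, since TimexAnnotations are only produced when a time annotator runs. A minimal sketch of such a setup, following the standard SUTime pattern (assumed here, not taken from this source file):

import java.util.Properties;

import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.time.TimeAnnotator;

Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
// SUTime is appended as an extra annotator; it reads the DocDateAnnotation
// set in parse() above to resolve relative dates into Timex values.
pipeline.addAnnotator(new TimeAnnotator("sutime", props));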

From source file:com.epictodo.controller.nlp.SentenceAnalysis.java

License:Open Source License

/**
 * This method analyzes the date and time format in a sentence
 * The analyzer will extract the date and time structure from the sentence into tokens
 *
 * @param _sentence the sentence to analyze
 * @return a map from each extracted time token to its resolved date
 */
public Map<String, String> dateTimeAnalyzer(String _sentence) throws ParseException {
    Map<String, String> _results = new TreeMap<>();
    SimpleDateFormat date_format = new SimpleDateFormat("yyyy-MM-dd");
    String _prev;
    String _current = "";
    String _latest = "";

    Annotation _document = new Annotation(_sentence);
    _document.set(CoreAnnotations.DocDateAnnotation.class, date_validator.getTodayDate());
    _pipeline.annotate(_document);
    List<CoreMap> timex_annotations = _document.get(TimeAnnotations.TimexAnnotations.class);

    for (CoreMap _tokens : timex_annotations) {
        _prev = _tokens.get(TimeExpression.Annotation.class).getTemporal().toString();

        /**
         * Check whether the parsed date is the latest date seen so far,
         * so that the latest date ends up in the results.
         *
         * This is a simple check for cases like:
         * 1. "next Tuesday from 1:00pm to 4:00pm"
         *    -> changes the date of 1:00pm & 4:00pm to next Tuesday instead of today's date
         * 2. If today is Tuesday, "on Wednesday at 9am"
         *    -> identifies that today is Tuesday
         *    -> returns tomorrow's date at 9:00am
         *    -> for example, "2014-10-29-WXX-3T09:00"
         * 3. "3 days later at 10am"
         *    -> correctly identifies the date 3 days later
         *    -> for example, "2014-10-31T10:00"
         * 4. "3 days later from 10am to 15:00pm"
         *    -> breaks the sentence into tokens and stores them in a Map
         *    -> for example, "{3pm=2014-10-31, 3 days later=2014-10-31, 10am=2014-10-31}"
         */
        if (_current.isEmpty())
            _latest = _current = _prev;
        else {
            Date date_prev = date_format.parse(_prev);
            Date date_current = date_format.parse(_current);

            if (date_prev.compareTo(date_current) <= 0) {
                _latest = _current;
            }
        }

        _results.put(_tokens.toString(), _latest);
    }

    return _results;
}

From source file:com.epictodo.controller.nlp.SentenceAnalysis.java

License:Open Source License

/**
 * This method analyzes the sentence structure and returns a Map from word token to NER label
 *
 * @param _sentence the sentence to analyze
 * @return a map from each word token to its NER label
 */
public Map<String, String> sentenceAnalyzer(String _sentence) {
    Map<String, String> _results = new TreeMap<>();
    Annotation _document = new Annotation(_sentence);
    _document.set(CoreAnnotations.DocDateAnnotation.class, date_validator.getTodayDate());
    _pipeline.annotate(_document);
    List<CoreMap> _sentences = _document.get(CoreAnnotations.SentencesAnnotation.class);

    for (CoreMap sentence : _sentences) {
        // Traverse the tokens of words in the current sentence
        for (CoreLabel _tokens : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            // Text of the token
            String word = _tokens.get(CoreAnnotations.TextAnnotation.class);
            // POS tag of the token (retrieved here but not used in the result map)
            String pos = _tokens.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            // NER label of the token
            String ner = _tokens.get(CoreAnnotations.NamedEntityTagAnnotation.class);

            _results.put(word, ner);
        }
    }

    return _results;
}

From source file:com.koalephant.nlp.StanfordCoreNLPHTTPServer.java

License:Open Source License

public String parse(String s, MediaType mediaType) throws IOException {
    Annotation annotation = new Annotation(s);

    DateTime now = new DateTime();

    annotation.set(DocDateAnnotation.class, now.toString(dateTimeFormatter));
    pipeline.annotate(annotation);
    StringWriter sb = new StringWriter();

    switch (mediaType) {
    case TEXT_XML:
    case APPLICATION_XML:
        pipeline.xmlPrint(annotation, sb);
        break;

    case APPLICATION_JSON:
    case TEXT_JSON:
        pipeline.jsonPrint(annotation, sb);
        break;
    }

    return sb.toString();
}

From source file:com.panot.JavaCoref.MyMUCMentionExtractor.java

License:Open Source License

@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (!docMatcher.find(currentOffset))
        return null;

    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    //Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString), "invertible")
                .tokenize();

        // FIXING TOKENIZATION PROBLEMS
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            if (i > 0 && w.word().equals("$")) {
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                i--;
            } else if (w.word().equals("\\/")) {
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                w.set(CoreAnnotations.TextAnnotation.class,
                        words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                words.remove(i + 1);
                words.remove(i - 1);
            }
        }
        // END FIXING TOKENIZATION PROBLEMS

        List<CoreLabel> sentence = new ArrayList<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        List<Mention> mentions = new ArrayList<Mention>();

        allWords.add(sentence);
        allGoldMentions.add(mentions);

        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
            // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                ner = m.group(1);
            }
            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            else if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            }
            // found the start SGML tag for a coref mention
            else if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                mention.startIndex = sentence.size();

                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");

                Matcher m = idPattern.matcher(w);
                m.find();
                mention.mentionID = Integer.valueOf(m.group(1));

                m = refPattern.matcher(w);
                if (m.find()) {
                    mention.originalRef = Integer.valueOf(m.group(1));
                }

                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            }
            // found the end SGML tag for a coref mention
            else if (w.equals("</COREF>")) {
                Mention mention = stack.pop();
                mention.endIndex = sentence.size();

                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    m.goldCorefClusterID = m.mentionID;
                else {
                    int ref = m.originalRef;
                    while (true) {
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i)
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class)
                    .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // term things

    List<List<Mention>> termMentions = new ArrayList<List<Mention>>();

    if (use_term) {
        String dataCrf = "";
        System.err.print("FEAT TYPE: ");
        System.err
                .println(props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP));
        if (props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP)
                .equals(MyConstants.TTE_FEATURE_NLTK)) {
            dataCrf = NltkCrfFormatter.annotationToCrfString(docAnno);
        } else {
            dataCrf = CrfFormatter.annotationToCrfString(docAnno);
        }
        List<List<String>> tagResult = new ArrayList<List<String>>();

        try {
            tagResult = CrfsuiteCaller.tag(dataCrf, props.getProperty(MyConstants.TTE_MODEL));

            if (props.containsKey(MyConstants.TTE_SAVE_CRF_DATA)) {
                String crfDataFilename = props.getProperty(MyConstants.TTE_SAVE_CRF_DATA);

                File crfDataFile = new File(crfDataFilename);
                BufferedWriter bw = new BufferedWriter(new FileWriter(crfDataFile));
                bw.write(dataCrf);
                bw.close();
            }

        } catch (Exception e) {
            System.err.println("Crfsuite tag failed");
        }

        termAsMentionFinder.setTags(tagResult);
        termMentions = termAsMentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

        maxID = termAsMentionFinder.getMaxID();
    }

    // extract predicted mentions

    allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    if (use_term && props.containsKey(MyConstants.TTE_KEEP_PRON)) {
        termMentions = injectPronoun(termMentions, allPredictedMentions);
    }

    if (experimentType != null) {
        if (experimentType.equals(MyConstants.EXP_TYPE_03_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (experimentType.equals(MyConstants.EXP_TYPE_03_INTERSECT)) {
            List<List<Mention>> usingMentions = intersectMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_CHECK)) {
            allPredictedMentions = termMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else {
            System.err.println(experimentType);
            System.err.println("Unknown experiment type. Using mention detector.");
        }
    } else if (useGoldMention) {
        allPredictedMentions = allGoldMentions;
    }

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}

From source file:de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp.java

License:Open Source License

public Annotation convert(JCas aSource, Annotation aTarget) {
    // Document annotation
    aTarget.set(CoreAnnotations.TextAnnotation.class, aSource.getDocumentText());

    // Sentences
    List<CoreMap> sentences = new ArrayList<>();
    for (Sentence s : select(aSource, Sentence.class)) {
        if (StringUtils.isBlank(s.getCoveredText())) {
            continue;
        }

        String sentenceText = s.getCoveredText();
        if (encoding != null && !"UTF-8".equals(encoding.name())) {
            sentenceText = new String(sentenceText.getBytes(StandardCharsets.UTF_8), encoding);
        }

        Annotation sentence = new Annotation(sentenceText);
        sentence.set(CharacterOffsetBeginAnnotation.class, s.getBegin());
        sentence.set(CharacterOffsetEndAnnotation.class, s.getEnd());
        sentence.set(SentenceIndexAnnotation.class, sentences.size());

        // Tokens
        Map<Token, IndexedWord> idxTokens = new HashMap<>();
        List<CoreLabel> tokens = new ArrayList<>();
        for (Token t : selectCovered(Token.class, s)) {
            String tokenText = t.getCoveredText();
            if (encoding != null && !"UTF-8".equals(encoding.name())) {
                tokenText = new String(tokenText.getBytes(StandardCharsets.UTF_8), encoding);
            }

            CoreLabel token = tokenFactory.makeToken(tokenText, t.getBegin(), t.getEnd() - t.getBegin());
            // First add token so that tokens.size() returns a 1-based counting as required
            // by IndexAnnotation
            tokens.add(token);
            token.set(SentenceIndexAnnotation.class, sentences.size());
            token.set(IndexAnnotation.class, tokens.size());
            token.set(TokenKey.class, t);
            idxTokens.put(t, new IndexedWord(token));

            // POS tags
            if (readPos && t.getPos() != null) {
                token.set(PartOfSpeechAnnotation.class, t.getPos().getPosValue());
            }

            // Lemma
            if (t.getLemma() != null) {
                token.set(LemmaAnnotation.class, t.getLemma().getValue());
            }

            // Stem
            if (t.getStem() != null) {
                token.set(StemAnnotation.class, t.getStem().getValue());
            }

            // NamedEntity
            // TODO: only token-based NEs are supported, but not multi-token NEs
            // Supporting multi-token NEs via selectCovering would be very slow. To support
            // them, another approach would need to be implemented, e.g. via indexCovering.
            List<NamedEntity> nes = selectCovered(NamedEntity.class, t);
            if (nes.size() > 0) {
                token.set(NamedEntityTagAnnotation.class, nes.get(0).getValue());
            } else {
                token.set(NamedEntityTagAnnotation.class, "O");
            }
        }

        // Constituents
        for (ROOT r : selectCovered(ROOT.class, s)) {
            Tree tree = createStanfordTree(r, idxTokens);
            tree.indexSpans();
            sentence.set(TreeAnnotation.class, tree);
        }

        // Dependencies
        List<TypedDependency> dependencies = new ArrayList<>();
        for (Dependency d : selectCovered(Dependency.class, s)) {
            TypedDependency dep = new TypedDependency(GrammaticalRelation.valueOf(d.getDependencyType()),
                    idxTokens.get(d.getGovernor()), idxTokens.get(d.getDependent()));
            if (DependencyFlavor.ENHANCED.equals(d.getFlavor())) {
                dep.setExtra();
            }
            dependencies.add(dep);
        }
        sentence.set(EnhancedDependenciesAnnotation.class, new SemanticGraph(dependencies));

        if (ptb3Escaping) {
            tokens = applyPtbEscaping(tokens, quoteBegin, quoteEnd);
        }

        sentence.set(TokensAnnotation.class, tokens);
        sentences.add(sentence);
    }
    aTarget.set(SentencesAnnotation.class, sentences);

    return aTarget;
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordCoreferenceResolver.java

License:Open Source License

@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    modelProvider.configure(aJCas.getCas());

    List<Tree> trees = new ArrayList<Tree>();
    List<CoreMap> sentences = new ArrayList<CoreMap>();
    List<List<CoreLabel>> sentenceTokens = new ArrayList<List<CoreLabel>>();
    for (ROOT root : select(aJCas, ROOT.class)) {
        // Copy all relevant information from the tokens
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        for (Token token : selectCovered(Token.class, root)) {
            tokens.add(tokenToWord(token));
        }
        sentenceTokens.add(tokens);

        // SemanticHeadFinder (nonTerminalInfo) does not know about PRN0, so we have to replace
        // it with PRN to avoid NPEs.
        TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory()) {
            @Override
            public Tree newTreeNode(String aParent, List<Tree> aChildren) {
                String parent = aParent;
                if ("PRN0".equals(parent)) {
                    parent = "PRN";
                }
                Tree node = super.newTreeNode(parent, aChildren);
                return node;
            }
        };

        // deep copy of the tree. These are modified inside coref!
        Tree treeCopy = TreeUtils.createStanfordTree(root, tFact).treeSkeletonCopy();
        treeCopy.indexSpans();
        trees.add(treeCopy);

        // Build the sentence
        CoreMap sentence = new CoreLabel();
        sentence.set(TreeAnnotation.class, treeCopy);
        sentence.set(TokensAnnotation.class, tokens);
        sentence.set(RootKey.class, root);
        sentences.add(sentence);

        // https://code.google.com/p/dkpro-core-asl/issues/detail?id=590
        // We currently do not copy over dependencies from the CAS. This is supposed to fill
        // in the dependencies so we do not get NPEs.
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(tlp.punctuationWordRejectFilter(),
                tlp.typedDependencyHeadFinder());
        ParserAnnotatorUtils.fillInParseAnnotations(false, true, gsf, sentence, treeCopy,
                GrammaticalStructure.Extras.NONE);

        // https://code.google.com/p/dkpro-core-asl/issues/detail?id=582
        SemanticGraph deps = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        for (IndexedWord vertex : deps.vertexSet()) {
            vertex.setWord(vertex.value());
        }

        // merge the new CoreLabels with the tree leaves
        MentionExtractor.mergeLabels(treeCopy, tokens);
        MentionExtractor.initializeUtterance(tokens);
    }

    Annotation document = new Annotation(aJCas.getDocumentText());
    document.set(SentencesAnnotation.class, sentences);

    Coreferencer coref = modelProvider.getResource();

    // extract all possible mentions
    // Reparsing only works when the full CoreNLP pipeline system is set up! Passing false here
    // disables reparsing.
    RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(false);
    List<List<Mention>> allUnprocessedMentions = finder.extractPredictedMentions(document, 0,
            coref.corefSystem.dictionaries());

    // add the relevant info to mentions and order them for coref
    Map<Integer, CorefChain> result;
    try {
        Document doc = coref.mentionExtractor.arrange(document, sentenceTokens, trees, allUnprocessedMentions);
        result = coref.corefSystem.coref(doc);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    for (CorefChain chain : result.values()) {
        CoreferenceLink last = null;
        for (CorefMention mention : chain.getMentionsInTextualOrder()) {
            CoreLabel beginLabel = sentences.get(mention.sentNum - 1).get(TokensAnnotation.class)
                    .get(mention.startIndex - 1);
            CoreLabel endLabel = sentences.get(mention.sentNum - 1).get(TokensAnnotation.class)
                    .get(mention.endIndex - 2);
            CoreferenceLink link = new CoreferenceLink(aJCas, beginLabel.get(TokenKey.class).getBegin(),
                    endLabel.get(TokenKey.class).getEnd());

            if (mention.mentionType != null) {
                link.setReferenceType(mention.mentionType.toString());
            }

            if (last == null) {
                // This is the first mention. Here we'll initialize the chain
                CoreferenceChain corefChain = new CoreferenceChain(aJCas);
                corefChain.setFirst(link);
                corefChain.addToIndexes();
            } else {
                // For the other mentions, we'll add them to the chain.
                last.setNext(link);
            }
            last = link;

            link.addToIndexes();
        }
    }
}

From source file:edu.jhu.hlt.concrete.stanford.ConcreteStanfordPreCorefAnalytic.java

License:Open Source License

@Override
public TokenizedCommunication annotate(TokenizedCommunication arg0) throws AnalyticException {
    final Communication root = new Communication(arg0.getRoot());
    if (!root.isSetText())
        throw new AnalyticException("communication.text must be set to run this analytic.");
    AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory(root);
    AnalyticUUIDGenerator g = f.create();
    final List<Section> sectList = root.getSectionList();
    final String commText = root.getText();

    List<CoreMap> allCoreMaps = new ArrayList<>();
    // String noMarkup = MarkupRewriter.removeMarkup(commText);
    String noMarkup = commText;
    sectList.forEach(sect -> {
        List<CoreMap> cmList = ConcreteToStanfordMapper.concreteSectionToCoreMapList(sect, commText);
        allCoreMaps.addAll(cmList);
    });

    allCoreMaps.forEach(cm -> LOGGER.trace("Got CoreMap pre-coref: {}", cm.toShorterString(new String[0])));
    Annotation anno = new Annotation(allCoreMaps);
    anno.set(TextAnnotation.class, noMarkup);

    // TODO: it's possible that fixNullDependencyGraphs needs to be called
    // before the dcoref annotator is called. To be investigated further.
    for (String annotator : this.lang.getPostTokenizationAnnotators()) {
        LOGGER.debug("Running annotator: {}", annotator);
        (StanfordCoreNLP.getExistingAnnotator(annotator)).annotate(anno);
    }

    anno.get(SentencesAnnotation.class)
            .forEach(cm -> LOGGER.trace("Got CoreMaps post-coref: {}", cm.toShorterString(new String[0])));
    // TODO: not sure if this is necessary - found it in the old code.
    anno.get(SentencesAnnotation.class).stream().filter(cm -> cm.containsKey(TreeAnnotation.class))
            .forEach(cm -> {
                Tree tree = cm.get(TreeAnnotation.class);
                List<Tree> treeList = new ArrayList<>();
                treeList.add(tree);
                this.lang.getGrammaticalFactory()
                        .ifPresent(k -> ParserAnnotatorUtils.fillInParseAnnotations(false, true, k, cm,
                                treeList.get(0), GrammaticalStructure.Extras.NONE));
            });

    anno.get(SentencesAnnotation.class)
            .forEach(cm -> LOGGER.trace("Got CoreMap post-fill-in: {}", cm.toShorterString(new String[0])));
    List<Sentence> postSentences = annotationToSentenceList(anno, hf, arg0.getSentences(), g);
    postSentences.forEach(st -> LOGGER.trace("Got pre-coref sentence: {}", st.toString()));
    Map<TextSpan, Sentence> tsToSentenceMap = new HashMap<>();
    postSentences.forEach(st -> tsToSentenceMap.put(st.getTextSpan(), st));
    tsToSentenceMap.keySet().forEach(k -> LOGGER.trace("Got TextSpan key: {}", k.toString()));

    sectList.forEach(sect -> {
        List<Sentence> sentList = sect.getSentenceList();
        sentList.forEach(st -> {
            TextSpan ts = st.getTextSpan();
            LOGGER.debug("Trying to find span: {}", ts.toString());
            if (tsToSentenceMap.containsKey(ts)) {
                Sentence newSent = tsToSentenceMap.get(ts);
                st.setTokenization(newSent.getTokenization());
            } else {
                throw new RuntimeException("Didn't find sentence in the new sentences. Old sentence UUID: "
                        + st.getUuid().getUuidString());
            }
        });
    });

    try {
        // Coref.
        CorefManager coref = new CorefManager(new CachedTokenizationCommunication(root), anno);
        TokenizedCommunication tcWithCoref = coref.addCoreference();
        return tcWithCoref;
    } catch (MiscommunicationException e) {
        throw new AnalyticException(e);
    }
}

From source file:gov.llnl.ontology.util.AnnotationUtil.java

License:Open Source License

/**
 * Sets the part of speech tag for {@code annot}.
 */
public static void setPos(Annotation annot, String pos) {
    annot.set(PartOfSpeechAnnotation.class, pos);
}

From source file:gov.llnl.ontology.util.AnnotationUtil.java

License:Open Source License

/**
 * Sets the token text for {@code annot}.
 */
public static void setWord(Annotation annot, String word) {
    annot.set(TextAnnotation.class, word);
}