Example usage for edu.stanford.nlp.ling CoreLabel equals

List of usage examples for edu.stanford.nlp.ling CoreLabel equals

Introduction

In this page you can find the example usage for edu.stanford.nlp.ling CoreLabel equals.

Prototype

@SuppressWarnings("unchecked")
@Override
public boolean equals(Object obj) 

Source Link

Document

Two CoreMaps are equal iff all keys and values are .equal.

Usage

From source file:com.panot.JavaCoref.MyMUCMentionExtractor.java

License:Open Source License

@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (!docMatcher.find(currentOffset))
        return null;

    currentOffset = docMatcher.end();/*from  w ww .  jav  a 2  s  .c  om*/
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    //Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString), "invertible")
                .tokenize();

        // FIXING TOKENIZATION PROBLEMS
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            if (i > 0 && w.word().equals("$")) {
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                i--;
            } else if (w.word().equals("\\/")) {
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                w.set(CoreAnnotations.TextAnnotation.class,
                        words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                words.remove(i + 1);
                words.remove(i - 1);
            }
        }
        // END FIXING TOKENIZATION PROBLEMS

        List<CoreLabel> sentence = new ArrayList<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        List<Mention> mentions = new ArrayList<Mention>();

        allWords.add(sentence);
        allGoldMentions.add(mentions);

        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
            // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                ner = m.group(1);
            }
            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            else if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            }
            // found the start SGML tag for a coref mention
            else if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                mention.startIndex = sentence.size();

                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");

                Matcher m = idPattern.matcher(w);
                m.find();
                mention.mentionID = Integer.valueOf(m.group(1));

                m = refPattern.matcher(w);
                if (m.find()) {
                    mention.originalRef = Integer.valueOf(m.group(1));
                }

                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            }
            // found the end SGML tag for a coref mention
            else if (w.equals("</COREF>")) {
                Mention mention = stack.pop();
                mention.endIndex = sentence.size();

                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    m.goldCorefClusterID = m.mentionID;
                else {
                    int ref = m.originalRef;
                    while (true) {
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i)
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class)
                    .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // term things

    List<List<Mention>> termMentions = new ArrayList<List<Mention>>();

    if (use_term) {
        String dataCrf = "";
        System.err.print("FEAT TYPE: ");
        System.err
                .println(props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP));
        if (props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP)
                .equals(MyConstants.TTE_FEATURE_NLTK)) {
            dataCrf = NltkCrfFormatter.annotationToCrfString(docAnno);
        } else {
            dataCrf = CrfFormatter.annotationToCrfString(docAnno);
        }
        List<List<String>> tagResult = new ArrayList<List<String>>();

        try {
            tagResult = CrfsuiteCaller.tag(dataCrf, props.getProperty(MyConstants.TTE_MODEL));

            if (props.containsKey(MyConstants.TTE_SAVE_CRF_DATA)) {
                String crfDataFilename = props.getProperty(MyConstants.TTE_SAVE_CRF_DATA);

                File crfDataFile = new File(crfDataFilename);
                BufferedWriter bw = new BufferedWriter(new FileWriter(crfDataFile));
                bw.write(dataCrf);
                bw.close();
            }

        } catch (Exception e) {
            System.err.println("Crfsuite tag failed");
        }

        termAsMentionFinder.setTags(tagResult);
        termMentions = termAsMentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

        maxID = termAsMentionFinder.getMaxID();
    }

    // extract predicted mentions

    allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    if (use_term && props.containsKey(MyConstants.TTE_KEEP_PRON)) {
        termMentions = injectPronoun(termMentions, allPredictedMentions);
    }

    if (experimentType != null) {
        if (experimentType.equals(MyConstants.EXP_TYPE_03_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (experimentType.equals(MyConstants.EXP_TYPE_03_INTERSECT)) {
            List<List<Mention>> usingMentions = intersectMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_CHECK)) {
            allPredictedMentions = termMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else {
            System.err.println(experimentType);
            System.err.println("Unknown experiment type. Using mention detector.");
        }
    } else if (useGoldMention) {
        allPredictedMentions = allGoldMentions;
    }

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}

From source file:knu.univ.lingvo.coref.MUCMentionExtractor.java

License:Open Source License

@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (!docMatcher.find(currentOffset))
        return null;

    currentOffset = docMatcher.end();/*from   w w w .  ja v  a2s  .co m*/
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    //Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();

        // FIXING TOKENIZATION PROBLEMS
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            if (i > 0 && w.word().equals("$")) {
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                i--;
            } else if (w.word().equals("\\/")) {
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                w.set(CoreAnnotations.TextAnnotation.class,
                        words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                words.remove(i + 1);
                words.remove(i - 1);
            }
        }
        // END FIXING TOKENIZATION PROBLEMS

        List<CoreLabel> sentence = new ArrayList<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        List<Mention> mentions = new ArrayList<Mention>();

        allWords.add(sentence);
        allGoldMentions.add(mentions);

        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
            // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                ner = m.group(1);
            }
            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            else if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            }
            // found the start SGML tag for a coref mention
            else if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                mention.startIndex = sentence.size();

                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");

                Matcher m = idPattern.matcher(w);
                m.find();
                mention.mentionID = Integer.parseInt(m.group(1));

                m = refPattern.matcher(w);
                if (m.find()) {
                    mention.originalRef = Integer.parseInt(m.group(1));
                }

                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            }
            // found the end SGML tag for a coref mention
            else if (w.equals("</COREF>")) {
                Mention mention = stack.pop();
                mention.endIndex = sentence.size();

                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    m.goldCorefClusterID = m.mentionID;
                else {
                    int ref = m.originalRef;
                    while (true) {
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i)
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class)
                    .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // extract predicted mentions
    if (Constants.USE_GOLD_MENTIONS)
        allPredictedMentions = allGoldMentions;
    else
        allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}