List of usage examples for edu.stanford.nlp.ling CoreLabel equals
@SuppressWarnings("unchecked") @Override public boolean equals(Object obj)
From source file:com.panot.JavaCoref.MyMUCMentionExtractor.java
License:Open Source License
@Override public Document nextDoc() throws Exception { List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>(); List<Tree> allTrees = new ArrayList<Tree>(); List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>(); List<List<Mention>> allPredictedMentions; List<CoreMap> allSentences = new ArrayList<CoreMap>(); Annotation docAnno = new Annotation(""); Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Matcher docMatcher = docPattern.matcher(fileContents); if (!docMatcher.find(currentOffset)) return null; currentOffset = docMatcher.end();/*from w ww . jav a 2 s .c om*/ String doc = docMatcher.group(1); Matcher sentenceMatcher = sentencePattern.matcher(doc); String ner = null; //Maintain current document ID. Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Matcher docIDMatcher = docIDPattern.matcher(doc); if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1); else currentDocumentID = "documentAfter " + currentDocumentID; while (sentenceMatcher.find()) { String sentenceString = sentenceMatcher.group(2); List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString), "invertible") .tokenize(); // FIXING TOKENIZATION PROBLEMS for (int i = 0; i < words.size(); i++) { CoreLabel w = words.get(i); if (i > 0 && w.word().equals("$")) { if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP")) continue; words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$"); words.remove(i); i--; } else if (w.word().equals("\\/")) { if (words.get(i - 1).word().equals("</COREF>")) continue; w.set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "\\/" + words.get(i + 1).word()); words.remove(i + 1); words.remove(i - 1); } } // END FIXING TOKENIZATION PROBLEMS List<CoreLabel> sentence = new ArrayList<CoreLabel>(); // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open Stack<Mention> stack = new Stack<Mention>(); List<Mention> mentions = new ArrayList<Mention>(); allWords.add(sentence); allGoldMentions.add(mentions); for (CoreLabel word : words) { String w = word.get(CoreAnnotations.TextAnnotation.class); // found regular token: WORD/POS if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) { int i = w.lastIndexOf("\\/"); String w1 = w.substring(0, i); // we do NOT set POS info here. We take the POS tags from the parser! word.set(CoreAnnotations.TextAnnotation.class, w1); word.remove(CoreAnnotations.OriginalTextAnnotation.class); if (Constants.USE_GOLD_NE) { if (ner != null) { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); } else { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O"); } } sentence.add(word); } // found the start SGML tag for a NE, e.g., "<ORGANIZATION>" else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) { Pattern nerPattern = Pattern.compile("<(.*?)>"); Matcher m = nerPattern.matcher(w); m.find(); ner = m.group(1); } // found the end SGML tag for a NE, e.g., "</ORGANIZATION>" else if (w.startsWith("</") && !w.startsWith("</COREF")) { Pattern nerPattern = Pattern.compile("</(.*?)>"); Matcher m = nerPattern.matcher(w); m.find(); String ner1 = m.group(1); if (ner != null && !ner.equals(ner1)) throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1); ner = null; } // found the start SGML tag for a coref mention else if (w.startsWith("<COREF")) { Mention mention = new Mention(); // position of this mention in the sentence mention.startIndex = sentence.size(); // extract GOLD info about this coref chain. needed for eval Pattern idPattern = Pattern.compile("ID=\"(.*?)\""); Pattern refPattern = Pattern.compile("REF=\"(.*?)\""); Matcher m = idPattern.matcher(w); m.find(); mention.mentionID = Integer.valueOf(m.group(1)); m = refPattern.matcher(w); if (m.find()) { mention.originalRef = Integer.valueOf(m.group(1)); } // open mention. keep track of all open mentions using the stack stack.push(mention); } // found the end SGML tag for a coref mention else if (w.equals("</COREF>")) { Mention mention = stack.pop(); mention.endIndex = sentence.size(); // this is a closed mention. add it to the final list of mentions // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef); mentions.add(mention); } else { word.remove(CoreAnnotations.OriginalTextAnnotation.class); if (Constants.USE_GOLD_NE) { if (ner != null) { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); } else { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O"); } } sentence.add(word); } } StringBuilder textContent = new StringBuilder(); for (int i = 0; i < sentence.size(); i++) { CoreLabel w = sentence.get(i); w.set(CoreAnnotations.IndexAnnotation.class, i + 1); w.set(CoreAnnotations.UtteranceAnnotation.class, 0); if (i > 0) textContent.append(" "); textContent.append(w.getString(CoreAnnotations.TextAnnotation.class)); } CoreMap sentCoreMap = new Annotation(textContent.toString()); allSentences.add(sentCoreMap); sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence); } // assign goldCorefClusterID Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use for (List<Mention> goldMentions : allGoldMentions) { for (Mention m : goldMentions) { idMention.put(m.mentionID, m); } } for (List<Mention> goldMentions : allGoldMentions) { for (Mention m : goldMentions) { if (m.goldCorefClusterID == -1) { if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID; else { int ref = m.originalRef; while (true) { Mention m2 = idMention.get(ref); if (m2.goldCorefClusterID != -1) { m.goldCorefClusterID = m2.goldCorefClusterID; break; } else if (m2.originalRef == -1) { m2.goldCorefClusterID = m2.mentionID; m.goldCorefClusterID = m2.goldCorefClusterID; break; } else { ref = m2.originalRef; } } } } } } docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences); stanfordProcessor.annotate(docAnno); if (allSentences.size() != allWords.size()) throw new IllegalStateException("allSentences != allWords"); for (int i = 0; i < allSentences.size(); i++) { List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class); List<CoreLabel> unannotatedSent = allWords.get(i); List<Mention> mentionInSent = allGoldMentions.get(i); for (Mention m : mentionInSent) { m.dependency = allSentences.get(i) .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); } if (annotatedSent.size() != unannotatedSent.size()) { throw new IllegalStateException("annotatedSent != unannotatedSent"); } for (int j = 0, sz = annotatedSent.size(); j < sz; j++) { CoreLabel annotatedWord = annotatedSent.get(j); CoreLabel unannotatedWord = unannotatedSent.get(j); if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class) .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) { throw new IllegalStateException("annotatedWord != unannotatedWord"); } } allWords.set(i, annotatedSent); allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class)); } // term things List<List<Mention>> termMentions = new ArrayList<List<Mention>>(); if (use_term) { String dataCrf = ""; System.err.print("FEAT TYPE: "); System.err .println(props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP)); if (props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP) .equals(MyConstants.TTE_FEATURE_NLTK)) { dataCrf = NltkCrfFormatter.annotationToCrfString(docAnno); } else { dataCrf = CrfFormatter.annotationToCrfString(docAnno); } List<List<String>> tagResult = new ArrayList<List<String>>(); try { tagResult = CrfsuiteCaller.tag(dataCrf, props.getProperty(MyConstants.TTE_MODEL)); if (props.containsKey(MyConstants.TTE_SAVE_CRF_DATA)) { String crfDataFilename = props.getProperty(MyConstants.TTE_SAVE_CRF_DATA); File crfDataFile = new File(crfDataFilename); BufferedWriter bw = new BufferedWriter(new FileWriter(crfDataFile)); bw.write(dataCrf); bw.close(); } } catch (Exception e) { System.err.println("Crfsuite tag failed"); } termAsMentionFinder.setTags(tagResult); termMentions = termAsMentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries); maxID = termAsMentionFinder.getMaxID(); } // extract predicted mentions allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries); if (use_term && props.containsKey(MyConstants.TTE_KEEP_PRON)) { termMentions = injectPronoun(termMentions, allPredictedMentions); } if (experimentType != null) { if (experimentType.equals(MyConstants.EXP_TYPE_03_UNION)) { List<List<Mention>> usingMentions = unionMentions(allPredictedMentions, allGoldMentions); allPredictedMentions = usingMentions; } else if (experimentType.equals(MyConstants.EXP_TYPE_03_INTERSECT)) { List<List<Mention>> usingMentions = intersectMentions(allPredictedMentions, allGoldMentions); allPredictedMentions = usingMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_CHECK)) { allPredictedMentions = termMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_SUPER)) { List<List<Mention>> usingMentions = superstringMentions(termMentions, allPredictedMentions); allPredictedMentions = usingMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_OVERLAP)) { List<List<Mention>> usingMentions = overlapMentions(termMentions, allPredictedMentions); allPredictedMentions = usingMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_UNION)) { List<List<Mention>> usingMentions = unionMentions(termMentions, allPredictedMentions); allPredictedMentions = usingMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_SUPER)) { List<List<Mention>> usingMentions = superstringMentions(termMentions, allGoldMentions); allPredictedMentions = usingMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_OVERLAP)) { List<List<Mention>> usingMentions = overlapMentions(termMentions, allGoldMentions); allPredictedMentions = usingMentions; } else { System.err.println(experimentType); System.err.println("Unknown experiment type. Using mention detector."); } } else if (useGoldMention) { allPredictedMentions = allGoldMentions; } // add the relevant fields to mentions and order them for coref return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true); }
From source file:knu.univ.lingvo.coref.MUCMentionExtractor.java
License:Open Source License
@Override public Document nextDoc() throws Exception { List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>(); List<Tree> allTrees = new ArrayList<Tree>(); List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>(); List<List<Mention>> allPredictedMentions; List<CoreMap> allSentences = new ArrayList<CoreMap>(); Annotation docAnno = new Annotation(""); Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Matcher docMatcher = docPattern.matcher(fileContents); if (!docMatcher.find(currentOffset)) return null; currentOffset = docMatcher.end();/*from w w w . ja v a2s .co m*/ String doc = docMatcher.group(1); Matcher sentenceMatcher = sentencePattern.matcher(doc); String ner = null; //Maintain current document ID. Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Matcher docIDMatcher = docIDPattern.matcher(doc); if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1); else currentDocumentID = "documentAfter " + currentDocumentID; while (sentenceMatcher.find()) { String sentenceString = sentenceMatcher.group(2); List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize(); // FIXING TOKENIZATION PROBLEMS for (int i = 0; i < words.size(); i++) { CoreLabel w = words.get(i); if (i > 0 && w.word().equals("$")) { if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP")) continue; words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$"); words.remove(i); i--; } else if (w.word().equals("\\/")) { if (words.get(i - 1).word().equals("</COREF>")) continue; w.set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "\\/" + words.get(i + 1).word()); words.remove(i + 1); words.remove(i - 1); } } // END FIXING TOKENIZATION PROBLEMS List<CoreLabel> sentence = new ArrayList<CoreLabel>(); // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open Stack<Mention> stack = new Stack<Mention>(); List<Mention> mentions = new ArrayList<Mention>(); allWords.add(sentence); allGoldMentions.add(mentions); for (CoreLabel word : words) { String w = word.get(CoreAnnotations.TextAnnotation.class); // found regular token: WORD/POS if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) { int i = w.lastIndexOf("\\/"); String w1 = w.substring(0, i); // we do NOT set POS info here. We take the POS tags from the parser! word.set(CoreAnnotations.TextAnnotation.class, w1); word.remove(CoreAnnotations.OriginalTextAnnotation.class); if (Constants.USE_GOLD_NE) { if (ner != null) { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); } else { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O"); } } sentence.add(word); } // found the start SGML tag for a NE, e.g., "<ORGANIZATION>" else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) { Pattern nerPattern = Pattern.compile("<(.*?)>"); Matcher m = nerPattern.matcher(w); m.find(); ner = m.group(1); } // found the end SGML tag for a NE, e.g., "</ORGANIZATION>" else if (w.startsWith("</") && !w.startsWith("</COREF")) { Pattern nerPattern = Pattern.compile("</(.*?)>"); Matcher m = nerPattern.matcher(w); m.find(); String ner1 = m.group(1); if (ner != null && !ner.equals(ner1)) throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1); ner = null; } // found the start SGML tag for a coref mention else if (w.startsWith("<COREF")) { Mention mention = new Mention(); // position of this mention in the sentence mention.startIndex = sentence.size(); // extract GOLD info about this coref chain. needed for eval Pattern idPattern = Pattern.compile("ID=\"(.*?)\""); Pattern refPattern = Pattern.compile("REF=\"(.*?)\""); Matcher m = idPattern.matcher(w); m.find(); mention.mentionID = Integer.parseInt(m.group(1)); m = refPattern.matcher(w); if (m.find()) { mention.originalRef = Integer.parseInt(m.group(1)); } // open mention. keep track of all open mentions using the stack stack.push(mention); } // found the end SGML tag for a coref mention else if (w.equals("</COREF>")) { Mention mention = stack.pop(); mention.endIndex = sentence.size(); // this is a closed mention. add it to the final list of mentions // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef); mentions.add(mention); } else { word.remove(CoreAnnotations.OriginalTextAnnotation.class); if (Constants.USE_GOLD_NE) { if (ner != null) { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); } else { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O"); } } sentence.add(word); } } StringBuilder textContent = new StringBuilder(); for (int i = 0; i < sentence.size(); i++) { CoreLabel w = sentence.get(i); w.set(CoreAnnotations.IndexAnnotation.class, i + 1); w.set(CoreAnnotations.UtteranceAnnotation.class, 0); if (i > 0) textContent.append(" "); textContent.append(w.getString(CoreAnnotations.TextAnnotation.class)); } CoreMap sentCoreMap = new Annotation(textContent.toString()); allSentences.add(sentCoreMap); sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence); } // assign goldCorefClusterID Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use for (List<Mention> goldMentions : allGoldMentions) { for (Mention m : goldMentions) { idMention.put(m.mentionID, m); } } for (List<Mention> goldMentions : allGoldMentions) { for (Mention m : goldMentions) { if (m.goldCorefClusterID == -1) { if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID; else { int ref = m.originalRef; while (true) { Mention m2 = idMention.get(ref); if (m2.goldCorefClusterID != -1) { m.goldCorefClusterID = m2.goldCorefClusterID; break; } else if (m2.originalRef == -1) { m2.goldCorefClusterID = m2.mentionID; m.goldCorefClusterID = m2.goldCorefClusterID; break; } else { ref = m2.originalRef; } } } } } } docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences); stanfordProcessor.annotate(docAnno); if (allSentences.size() != allWords.size()) throw new IllegalStateException("allSentences != allWords"); for (int i = 0; i < allSentences.size(); i++) { List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class); List<CoreLabel> unannotatedSent = allWords.get(i); List<Mention> mentionInSent = allGoldMentions.get(i); for (Mention m : mentionInSent) { m.dependency = allSentences.get(i) .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); } if (annotatedSent.size() != unannotatedSent.size()) { throw new IllegalStateException("annotatedSent != unannotatedSent"); } for (int j = 0, sz = annotatedSent.size(); j < sz; j++) { CoreLabel annotatedWord = annotatedSent.get(j); CoreLabel unannotatedWord = unannotatedSent.get(j); if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class) .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) { throw new IllegalStateException("annotatedWord != unannotatedWord"); } } allWords.set(i, annotatedSent); allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class)); } // extract predicted mentions if (Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions; else allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries); // add the relevant fields to mentions and order them for coref return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true); }