List of usage examples for edu.stanford.nlp.util Generics newHashMap
public static <K, V> Map<K, V> newHashMap()
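The method returns an empty java.util.HashMap whose key and value types are inferred from the assignment target, so call sites do not repeat the type arguments. Before the project examples below, here is a minimal, self-contained sketch (the class name GenericsNewHashMapDemo is an illustrative placeholder, not part of CoreNLP); it assumes the Stanford CoreNLP jar is on the classpath:

import java.util.Map;

import edu.stanford.nlp.util.Generics;

public class GenericsNewHashMapDemo {
    public static void main(String[] args) {
        // Key and value types are inferred from the declared variable,
        // so there is no need to spell them out on the right-hand side.
        Map<String, Integer> counts = Generics.newHashMap();
        counts.put("newHashMap", 1);
        System.out.println(counts); // prints {newHashMap=1}
    }
}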
From source file:BuildBinarizedDataset.java
/**
 * Turns a text file into trees for use in a RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input file is one sentence per line, with sentences
 * separated by blank lines. The first line has the main label of the
 * sentence together with the full sentence. Lines after the first
 * sentence line but before the blank line will be treated as labeled
 * sub-phrases. The labels should start with the label and then contain
 * a list of tokens the label applies to. All phrases that do not have
 * their own label will take on the main sentence label!
 * For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * By default the englishPCFG parser is used. This can be changed
 * with the <code>-parserModel</code> flag. Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with -sentimentModel, that model
 * will be used to prelabel the sentences. Any spans with given
 * labels will then be used to adjust those labels.
 */
public static void main(String[] arg) throws IOException {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true);

    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String args[] = { "-input", "D:\\parse.txt", "-sentimentModel",
            "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" };
    String inputPath = "D:\\dataset\\good.txt";
    String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";
    SentimentModel sentimentModel = null;

    /*
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-input")) {
            inputPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
            parserModel = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
            sentimentModelPath = args[argIndex + 1];
            argIndex += 2;
        } else {
            System.err.println("Unknown argument " + args[argIndex]);
            System.exit(2);
        }
    }
    */

    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
            parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.

        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        Integer mainLabel = new Integer(tokens.get(0).word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.subList(1, tokens.size());
        //System.err.println(tokens);

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing
        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
            //collapsedUnary.label().setValue(mainLabel.toString());
            //System.out.println("Root" + collapsedUnary.getNodeNumber(1));
        }

        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();

        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }

        String x = collapsedUnary.toString();
        //x.replaceAll("\\s","");
        x = x.replace("(", "[");
        x = x.replace(")", "]");
        //writer.write(x);
        //writer.write("\r\n");
        System.out.println(x);
        //System.out.println();
    }
    //writer.close();
}
From source file:com.panot.JavaCoref.MyMUCMentionExtractor.java
License:Open Source License
@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (!docMatcher.find(currentOffset))
        return null;
    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    // Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString), "invertible")
                .tokenize();

        // FIXING TOKENIZATION PROBLEMS
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            if (i > 0 && w.word().equals("$")) {
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                i--;
            } else if (w.word().equals("\\/")) {
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                w.set(CoreAnnotations.TextAnnotation.class,
                        words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                words.remove(i + 1);
                words.remove(i - 1);
            }
        }
        // END FIXING TOKENIZATION PROBLEMS

        List<CoreLabel> sentence = new ArrayList<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        List<Mention> mentions = new ArrayList<Mention>();

        allWords.add(sentence);
        allGoldMentions.add(mentions);

        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
            // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                ner = m.group(1);
            }
            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            else if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            }
            // found the start SGML tag for a coref mention
            else if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                mention.startIndex = sentence.size();
                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");
                Matcher m = idPattern.matcher(w);
                m.find();
                mention.mentionID = Integer.valueOf(m.group(1));
                m = refPattern.matcher(w);
                if (m.find()) {
                    mention.originalRef = Integer.valueOf(m.group(1));
                }
                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            }
            // found the end SGML tag for a coref mention
            else if (w.equals("</COREF>")) {
                Mention mention = stack.pop();
                mention.endIndex = sentence.size();
                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }

        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    m.goldCorefClusterID = m.mentionID;
                else {
                    int ref = m.originalRef;
                    while (true) {
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i)
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class)
                    .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // term things
    List<List<Mention>> termMentions = new ArrayList<List<Mention>>();

    if (use_term) {
        String dataCrf = "";
        System.err.print("FEAT TYPE: ");
        System.err.println(props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP));
        if (props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP)
                .equals(MyConstants.TTE_FEATURE_NLTK)) {
            dataCrf = NltkCrfFormatter.annotationToCrfString(docAnno);
        } else {
            dataCrf = CrfFormatter.annotationToCrfString(docAnno);
        }
        List<List<String>> tagResult = new ArrayList<List<String>>();
        try {
            tagResult = CrfsuiteCaller.tag(dataCrf, props.getProperty(MyConstants.TTE_MODEL));
            if (props.containsKey(MyConstants.TTE_SAVE_CRF_DATA)) {
                String crfDataFilename = props.getProperty(MyConstants.TTE_SAVE_CRF_DATA);
                File crfDataFile = new File(crfDataFilename);
                BufferedWriter bw = new BufferedWriter(new FileWriter(crfDataFile));
                bw.write(dataCrf);
                bw.close();
            }
        } catch (Exception e) {
            System.err.println("Crfsuite tag failed");
        }

        termAsMentionFinder.setTags(tagResult);
        termMentions = termAsMentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);
        maxID = termAsMentionFinder.getMaxID();
    }

    // extract predicted mentions
    allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    if (use_term && props.containsKey(MyConstants.TTE_KEEP_PRON)) {
        termMentions = injectPronoun(termMentions, allPredictedMentions);
    }

    if (experimentType != null) {
        if (experimentType.equals(MyConstants.EXP_TYPE_03_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (experimentType.equals(MyConstants.EXP_TYPE_03_INTERSECT)) {
            List<List<Mention>> usingMentions = intersectMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_CHECK)) {
            allPredictedMentions = termMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else {
            System.err.println(experimentType);
            System.err.println("Unknown experiment type. Using mention detector.");
        }
    } else if (useGoldMention) {
        allPredictedMentions = allGoldMentions;
    }

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
private void initLexicon(SeqClassifierFlags flags) {
    if (flags.distSimLexicon == null) {
        return;
    }
    if (lexicon != null) {
        return;
    }
    Timing.startDoing("Loading distsim lexicon from " + flags.distSimLexicon);
    lexicon = Generics.newHashMap();
    boolean terryKoo = "terryKoo".equals(flags.distSimFileFormat);
    for (String line : ObjectBank.getLineIterator(flags.distSimLexicon, flags.inputEncoding)) {
        String word;
        String wordClass;
        if (terryKoo) {
            String[] bits = line.split("\\t");
            word = bits[1];
            wordClass = bits[0];
            if (flags.distSimMaxBits > 0 && wordClass.length() > flags.distSimMaxBits) {
                wordClass = wordClass.substring(0, flags.distSimMaxBits);
            }
        } else {
            // "alexClark"
            String[] bits = line.split("\\s+");
            word = bits[0];
            wordClass = bits[1];
        }
        if (!flags.casedDistSim) {
            word = word.toLowerCase();
        }
        if (flags.numberEquivalenceDistSim) {
            word = WordShapeClassifier.wordShape(word, WordShapeClassifier.WORDSHAPEDIGITS);
        }
        lexicon.put(word, wordClass);
    }
    Timing.endDoing();
}
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
public void clearMemory() {
    wordToSubstrings = Generics.newHashMap();
    lexicon = null;
}
From source file:knu.univ.lingvo.coref.ACEMentionExtractor.java
License:Open Source License
private static void printRawDoc(List<CoreMap> sentences, List<List<Mention>> allMentions, String filename,
        boolean gold) throws FileNotFoundException {
    StringBuilder doc = new StringBuilder();
    int previousOffset = 0;
    Counter<Integer> mentionCount = new ClassicCounter<Integer>();
    for (List<Mention> l : allMentions) {
        for (Mention m : l) {
            mentionCount.incrementCount(m.goldCorefClusterID);
        }
    }

    for (int i = 0; i < sentences.size(); i++) {
        CoreMap sentence = sentences.get(i);
        List<Mention> mentions = allMentions.get(i);

        String[] tokens = sentence.get(CoreAnnotations.TextAnnotation.class).split(" ");
        String sent = "";
        List<CoreLabel> t = sentence.get(CoreAnnotations.TokensAnnotation.class);
        if (previousOffset + 2 < t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class))
            sent += "\n";
        previousOffset = t.get(t.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        Counter<Integer> startCounts = new ClassicCounter<Integer>();
        Counter<Integer> endCounts = new ClassicCounter<Integer>();
        Map<Integer, Set<Integer>> endID = Generics.newHashMap();
        for (Mention m : mentions) {
            startCounts.incrementCount(m.startIndex);
            endCounts.incrementCount(m.endIndex);
            if (!endID.containsKey(m.endIndex))
                endID.put(m.endIndex, Generics.<Integer>newHashSet());
            endID.get(m.endIndex).add(m.goldCorefClusterID);
        }
        for (int j = 0; j < tokens.length; j++) {
            if (endID.containsKey(j)) {
                for (Integer id : endID.get(j)) {
                    if (mentionCount.getCount(id) != 1 && gold)
                        sent += "]_" + id;
                    else
                        sent += "]";
                }
            }
            for (int k = 0; k < startCounts.getCount(j); k++) {
                if (!sent.endsWith("["))
                    sent += " ";
                sent += "[";
            }
            sent += " ";
            sent = sent + tokens[j];
        }
        for (int k = 0; k < endCounts.getCount(tokens.length); k++) {
            sent += "]";
        }
        sent += "\n";
        doc.append(sent);
    }
    if (gold)
        logger.fine("New DOC: (GOLD MENTIONS) ==================================================");
    else
        logger.fine("New DOC: (Predicted Mentions) ==================================================");
    logger.fine(doc.toString());
}
From source file:knu.univ.lingvo.coref.CorefChain.java
License:Open Source License
public CorefChain(CorefCluster c, Map<Mention, IntTuple> positions) {
    chainID = c.clusterID;

    // Collect mentions
    mentions = new ArrayList<CorefMention>();
    mentionMap = Generics.newHashMap();
    CorefMention represents = null;
    for (Mention m : c.getCorefMentions()) {
        CorefMention men = new CorefMention(m, positions.get(m));
        mentions.add(men);
    }
    Collections.sort(mentions, new CorefMentionComparator());

    // Find representative mention
    for (CorefMention men : mentions) {
        IntPair position = new IntPair(men.sentNum, men.headIndex);
        if (!mentionMap.containsKey(position))
            mentionMap.put(position, Generics.<CorefMention>newHashSet());
        mentionMap.get(position).add(men);
        if (men.moreRepresentativeThan(represents)) {
            represents = men;
        }
    }
    representative = represents;
}
From source file:knu.univ.lingvo.coref.Document.java
License:Open Source License
public Document() {
    positions = Generics.newHashMap();
    mentionheadPositions = Generics.newHashMap();
    roleSet = Generics.newHashSet();
    corefClusters = Generics.newHashMap();
    goldCorefClusters = null;
    allPredictedMentions = Generics.newHashMap();
    allGoldMentions = Generics.newHashMap();
    speakers = Generics.newHashMap();
    speakerPairs = Generics.newHashSet();
    incompatibles = TwoDimensionalSet.hashSet();
    incompatibleClusters = TwoDimensionalSet.hashSet();
    acronymCache = TwoDimensionalMap.hashMap();
}
From source file:knu.univ.lingvo.coref.Document.java
License:Open Source License
/** Mark twin mentions: heads of the mentions are matched */
private void findTwinMentionsRelaxed() {
    for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
        List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
        List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);

        Map<IntPair, Mention> goldMentionPositions = Generics.newHashMap();
        Map<Integer, LinkedList<Mention>> goldMentionHeadPositions = Generics.newHashMap();
        for (Mention g : golds) {
            goldMentionPositions.put(new IntPair(g.startIndex, g.endIndex), g);
            if (!goldMentionHeadPositions.containsKey(g.headIndex)) {
                goldMentionHeadPositions.put(g.headIndex, new LinkedList<Mention>());
            }
            goldMentionHeadPositions.get(g.headIndex).add(g);
        }

        List<Mention> remains = new ArrayList<Mention>();
        for (Mention p : predicts) {
            IntPair pos = new IntPair(p.startIndex, p.endIndex);
            if (goldMentionPositions.containsKey(pos)) {
                Mention g = goldMentionPositions.get(pos);
                p.mentionID = g.mentionID;
                p.twinless = false;
                g.twinless = false;
                goldMentionHeadPositions.get(g.headIndex).remove(g);
                if (goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
                    goldMentionHeadPositions.remove(g.headIndex);
                }
            } else
                remains.add(p);
        }
        for (Mention r : remains) {
            if (goldMentionHeadPositions.containsKey(r.headIndex)) {
                Mention g = goldMentionHeadPositions.get(r.headIndex).poll();
                r.mentionID = g.mentionID;
                r.twinless = false;
                g.twinless = false;
                if (goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
                    goldMentionHeadPositions.remove(g.headIndex);
                }
            }
        }
    }
}
From source file:knu.univ.lingvo.coref.Document.java
License:Open Source License
/** Extract gold coref cluster information. */
public void extractGoldCorefClusters() {
    goldCorefClusters = Generics.newHashMap();
    for (List<Mention> mentions : goldOrderedMentionsBySentence) {
        for (Mention m : mentions) {
            int id = m.goldCorefClusterID;
            if (id == -1) {
                throw new RuntimeException("No gold info");
            }
            CorefCluster c = goldCorefClusters.get(id);
            if (c == null) {
                c = new CorefCluster(id);
                goldCorefClusters.put(id, c);
            }
            c.corefMentions.add(m);
        }
    }
}
From source file:knu.univ.lingvo.coref.Document.java
License:Open Source License
/** Extract gold coref link information */
protected void extractGoldLinks() {
    // List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
    List<Pair<IntTuple, IntTuple>> links = new ArrayList<Pair<IntTuple, IntTuple>>();

    // position of each mention in the input matrix, by id
    Map<Integer, IntTuple> positions = Generics.newHashMap();
    // positions of antecedents
    Map<Integer, List<IntTuple>> antecedents = Generics.newHashMap();
    for (int i = 0; i < goldOrderedMentionsBySentence.size(); i++) {
        for (int j = 0; j < goldOrderedMentionsBySentence.get(i).size(); j++) {
            Mention m = goldOrderedMentionsBySentence.get(i).get(j);
            int id = m.mentionID;
            IntTuple pos = new IntTuple(2);
            pos.set(0, i);
            pos.set(1, j);
            positions.put(id, pos);
            antecedents.put(id, new ArrayList<IntTuple>());
        }
    }

    // SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence);
    for (List<Mention> mentions : goldOrderedMentionsBySentence) {
        for (Mention m : mentions) {
            int id = m.mentionID;
            IntTuple src = positions.get(id);
            assert (src != null);
            if (m.originalRef >= 0) {
                IntTuple dst = positions.get(m.originalRef);
                if (dst == null) {
                    throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef);
                }

                // to deal with cataphoric annotation
                while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) {
                    Mention dstMention = goldOrderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
                    m.originalRef = dstMention.originalRef;
                    dstMention.originalRef = id;
                    if (m.originalRef < 0)
                        break;
                    dst = positions.get(m.originalRef);
                }
                if (m.originalRef < 0)
                    continue;

                // A B C: if A<-B, A<-C => make a link B<-C
                for (int k = dst.get(0); k <= src.get(0); k++) {
                    for (int l = 0; l < goldOrderedMentionsBySentence.get(k).size(); l++) {
                        if (k == dst.get(0) && l < dst.get(1))
                            continue;
                        if (k == src.get(0) && l > src.get(1))
                            break;
                        IntTuple missed = new IntTuple(2);
                        missed.set(0, k);
                        missed.set(1, l);
                        if (links.contains(new Pair<IntTuple, IntTuple>(missed, dst))) {
                            antecedents.get(id).add(missed);
                            links.add(new Pair<IntTuple, IntTuple>(src, missed));
                        }
                    }
                }

                links.add(new Pair<IntTuple, IntTuple>(src, dst));

                assert (antecedents.get(id) != null);
                antecedents.get(id).add(dst);

                List<IntTuple> ants = antecedents.get(m.originalRef);
                assert (ants != null);
                for (IntTuple ant : ants) {
                    antecedents.get(id).add(ant);
                    links.add(new Pair<IntTuple, IntTuple>(src, ant));
                }
            }
        }
    }
    goldLinks = links;
}