List of usage examples for the `edu.stanford.nlp.ling.CoreLabel#index()` method.
@Override public int index()
From source file:edu.illinois.cs.cogcomp.pipeline.handlers.StanfordParseHandler.java
License:Open Source License
private static CoreMap buildStanfordSentence(Constituent sentence, String rawText, int sentIndex, List<CoreLabel> stanfordTokens) { CoreMap stanfordSentence = new ArrayCoreMap(); CoreLabel firstTok = stanfordTokens.get(0); CoreLabel lastTok = stanfordTokens.get(stanfordTokens.size() - 1); stanfordSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, sentence.getStartSpan()); stanfordSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, sentence.getEndSpan()); stanfordSentence.set(CoreAnnotations.TokenBeginAnnotation.class, firstTok.index()); stanfordSentence.set(CoreAnnotations.TokenEndAnnotation.class, lastTok.index() + 1); // at-the-end // indexing? stanfordSentence.set(CoreAnnotations.TextAnnotation.class, rawText); stanfordSentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentIndex); stanfordSentence.set(CoreAnnotations.TokensAnnotation.class, stanfordTokens); return stanfordSentence; }
From source file:knu.univ.lingvo.coref.Mention.java
License:Open Source License
/**
 * Builds a generalized string pattern for this mention from the given tokens:
 * the head word and the token immediately preceding it (when it shares the
 * mention's NER tag) keep their lemmas, runs of other named-entity tokens are
 * collapsed into a single "&lt;NER_TAG&gt;" placeholder, and conjunctions or
 * punctuation joining two same-NER tokens are dropped.
 *
 * Fix: the previous version called {@code pTokens.indexOf(token)} three times
 * per iteration, making the loop O(n^2) and returning the wrong neighbor when
 * the list contains duplicate-equal tokens; an indexed loop uses the token's
 * actual position instead.
 *
 * @param pTokens the tokens to generalize (typically the mention span or its sentence)
 * @return the joined pattern string
 */
public String getPattern(List<CoreLabel> pTokens) {
    List<String> patternParts = new ArrayList<String>();
    String currentNer = ""; // NER tag already emitted as a placeholder; "" = none pending
    for (int i = 0; i < pTokens.size(); i++) {
        CoreLabel token = pTokens.get(i);
        if (token.index() == headWord.index()) {
            // The head word is always kept verbatim (as its lemma).
            phraseAdd(patternParts, token.lemma());
            currentNer = "";
        } else if ((token.lemma().equals("and") || StringUtils.isPunct(token.lemma()))
                && i + 1 < pTokens.size() && i > 0
                && pTokens.get(i + 1).ner().equals(pTokens.get(i - 1).ner())) {
            // Deliberately empty: skip connectors ("and"/punctuation) between two
            // tokens carrying the same NER tag, e.g. "John and Mary" -> "<PERSON>".
        } else if (token.index() == headWord.index() - 1 && token.ner().equals(nerString)) {
            // Token immediately before the head sharing the mention's NER is kept.
            phraseAdd(patternParts, token.lemma());
            currentNer = "";
        } else if (!token.ner().equals("O")) {
            // Collapse a run of same-NER tokens into a single "<TAG>" placeholder.
            if (!token.ner().equals(currentNer)) {
                currentNer = token.ner();
                phraseAdd(patternParts, "<" + currentNer + ">");
            }
        } else {
            phraseAdd(patternParts, token.lemma());
            currentNer = "";
        }
    }
    return StringUtils.join(patternParts);
}

/** Appends one pattern element; single point of growth for the parts list. */
private static void phraseAdd(List<String> parts, String element) {
    parts.add(element);
}
From source file:knu.univ.lingvo.coref.SieveCoreferenceSystem.java
License:Open Source License
/**
 * Logs a human-readable rendering of the document with coreference mentions
 * marked inline: each mention is wrapped as {@code [ ... ]_<clusterId>}, where
 * the cluster id is the gold or predicted coreference chain id.
 *
 * @param document the coref document whose sentences and mentions are printed
 * @param gold     if true, render gold mentions and gold cluster ids;
 *                 otherwise the system-predicted ones
 * @throws FileNotFoundException declared by the original signature; kept for
 *                               caller compatibility
 */
public static void printRawDoc(Document document, boolean gold) throws FileNotFoundException {
    List<CoreMap> sentences = document.annotation.get(CoreAnnotations.SentencesAnnotation.class);
    List<List<Mention>> allMentions;
    if (gold) {
        allMentions = document.goldOrderedMentionsBySentence;
    } else {
        allMentions = document.predictedOrderedMentionsBySentence;
    }
    // String filename = document.annotation.get()
    StringBuilder doc = new StringBuilder();
    int previousOffset = 0;
    for (int i = 0; i < sentences.size(); i++) {
        CoreMap sentence = sentences.get(i);
        List<Mention> mentions = allMentions.get(i);
        List<CoreLabel> t = sentence.get(CoreAnnotations.TokensAnnotation.class);
        // Rebuild the token array by 1-based CoreLabel index so positions line
        // up with mention start/end token indices.
        String[] tokens = new String[t.size()];
        for (CoreLabel c : t) {
            tokens[c.index() - 1] = c.word();
        }
        // A character-offset gap of more than 2 between sentences is treated as
        // a paragraph break.
        if (previousOffset + 2 < t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
            doc.append("\n");
        }
        previousOffset = t.get(t.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        // startCounts: how many mentions open at each token index;
        // endMentions: the set of mentions closing at each token index.
        Counter<Integer> startCounts = new ClassicCounter<Integer>();
        Counter<Integer> endCounts = new ClassicCounter<Integer>();
        Map<Integer, Set<Mention>> endMentions = Generics.newHashMap();
        for (Mention m : mentions) {
            startCounts.incrementCount(m.startIndex);
            endCounts.incrementCount(m.endIndex);
            if (!endMentions.containsKey(m.endIndex)) {
                endMentions.put(m.endIndex, Generics.<Mention>newHashSet());
            }
            endMentions.get(m.endIndex).add(m);
        }
        for (int j = 0; j < tokens.length; j++) {
            // Close every mention whose (exclusive) end index is this token.
            if (endMentions.containsKey(j)) {
                for (Mention m : endMentions.get(j)) {
                    int corefChainId = (gold) ? m.goldCorefClusterID : m.corefClusterID;
                    doc.append("]_").append(corefChainId);
                }
            }
            // Open one bracket per mention starting here; avoid a space right
            // after an already-open bracket.
            for (int k = 0; k < startCounts.getCount(j); k++) {
                if (doc.length() > 0 && doc.charAt(doc.length() - 1) != '[') {
                    doc.append(" ");
                }
                doc.append("[");
            }
            if (doc.length() > 0 && doc.charAt(doc.length() - 1) != '[') {
                doc.append(" ");
            }
            doc.append(tokens[j]);
        }
        // Mentions may end exactly at sentence length (exclusive end); close them.
        if (endMentions.containsKey(tokens.length)) {
            for (Mention m : endMentions.get(tokens.length)) {
                int corefChainId = (gold) ? m.goldCorefClusterID : m.corefClusterID;
                doc.append("]_").append(corefChainId); //append("_").append(m.mentionID);
            }
        }
        doc.append("\n");
    }
    logger.fine(document.annotation.get(CoreAnnotations.DocIDAnnotation.class));
    if (gold) {
        logger.fine("New DOC: (GOLD MENTIONS) ==================================================");
    } else {
        logger.fine("New DOC: (Predicted Mentions) ==================================================");
    }
    logger.fine(doc.toString());
}
From source file:main.java.parsers.StanfordParser.java
/** * Parses a given input text document using the Stanford CoreNLP parser. * //from w w w . j a v a 2s. c om * @param document * @throws java.io.UnsupportedEncodingException * @throws java.lang.InterruptedException */ public static void parse(Doc document) throws UnsupportedEncodingException, IOException, InterruptedException { // Initialize an Annotation with some text to be annotated. The text is the argument to the constructor. Annotation annotation = new Annotation(new String(document.text.getBytes("UTF-8"), "UTF-8")); // run all the selected Annotators on this text pipeline.annotate(annotation); // An Annotation is a Map and you can get and use the various analyses individually. List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); //returns if the annotation is empty. if (sentences == null || sentences.isEmpty()) return; //map linking token offsets with their tokens annotation from the Stanford tool. for (CoreMap sentence : sentences) { String sentenceStr = ""; int sentenceNum = sentence.get(CoreAnnotations.SentenceIndexAnnotation.class); Map<Integer, Integer> tokenNumStartOffset = document.sentenceTokenNumStartOffset.get(sentenceNum); if (tokenNumStartOffset == null) document.sentenceTokenNumStartOffset.put(sentenceNum, tokenNumStartOffset = new HashMap<>()); Map<Integer, List<String>> startOffsetSRLRoles = new TreeMap<>(); //extracting tokenized information from the stanford parser output. 
for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { sentenceStr += token.value() + " "; document.startOffsetIndexedWord.put(token.beginPosition(), new IndexedWord(token)); tokenNumStartOffset.put(token.index(), token.beginPosition()); startOffsetSRLRoles.put(token.beginPosition(), null); } //write the tokenized sentence to an output file FileOutputStream output = new FileOutputStream(Main.RESOURCES_DIR + "\\senna\\log.txt"); output.write(sentenceStr.getBytes()); //the semantic roles labels for the sentence are obtained by applying SENNA startOffsetSRLRoles = SENNASrl.getSRLRoles(startOffsetSRLRoles); //set the srl tags document.startOffsetSRLRoles.putAll(startOffsetSRLRoles); //parse tree of the sentence String stanfordParseTree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class).toString(); ParseTree parseTree = new ParseTree(stanfordParseTree); parseTree.convertParseTree(); document.setSentenceParseTree(sentenceNum, parseTree); //dependency graph of the sentence SemanticGraph graph = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); document.setSentenceDependencyGraph(sentenceNum, graph); } }
From source file:org.ets.research.nlp.stanford_thrift.coref.StanfordCorefThrift.java
License:Open Source License
private List<String> MUCStyleOutput(Annotation annotation) { Map<Integer, CorefChain> corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class); Map<Integer, Map<Integer, Pair<CorefChain.CorefMention, CorefChain.CorefMention>>> mentionMap = new HashMap<Integer, Map<Integer, Pair<CorefChain.CorefMention, CorefChain.CorefMention>>>(); List<String> mucOutput = new ArrayList<String>(); for (CorefChain chain : corefChains.values()) { CorefChain.CorefMention ref = chain.getRepresentativeMention(); for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) { if (mention != ref) { // first add the mention itself Pair<CorefChain.CorefMention, CorefChain.CorefMention> mentions = new Pair<CorefChain.CorefMention, CorefChain.CorefMention>( mention, ref);/*from ww w . j ava 2s. c o m*/ if (mentionMap.containsKey(mention.sentNum)) { Map<Integer, Pair<CorefChain.CorefMention, CorefChain.CorefMention>> value = mentionMap .get(mention.sentNum); value.put(mention.startIndex, mentions); mentionMap.put(mention.sentNum, value); } else { Map<Integer, Pair<CorefChain.CorefMention, CorefChain.CorefMention>> startIndexToMentionMap = new HashMap<Integer, Pair<CorefChain.CorefMention, CorefChain.CorefMention>>(); startIndexToMentionMap.put(mention.startIndex, mentions); mentionMap.put(mention.sentNum, startIndexToMentionMap); } // now make sure the representative is there (TODO make this code less redundant) Pair<CorefChain.CorefMention, CorefChain.CorefMention> refMention = new Pair<CorefChain.CorefMention, CorefChain.CorefMention>( ref, ref); if (mentionMap.containsKey(ref.sentNum)) { Map<Integer, Pair<CorefChain.CorefMention, CorefChain.CorefMention>> value = mentionMap .get(ref.sentNum); value.put(ref.startIndex, refMention); mentionMap.put(ref.sentNum, value); } else { Map<Integer, Pair<CorefChain.CorefMention, CorefChain.CorefMention>> startIndexToMentionMap = new HashMap<Integer, Pair<CorefChain.CorefMention, CorefChain.CorefMention>>(); 
startIndexToMentionMap.put(ref.startIndex, refMention); mentionMap.put(ref.sentNum, startIndexToMentionMap); } } } } List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); for (Integer sentenceNum : mentionMap.keySet()) { CoreMap currentSentence = sentences.get(sentenceNum - 1); Map<Integer, Pair<CorefChain.CorefMention, CorefChain.CorefMention>> currentSetOfMentions = mentionMap .get(sentenceNum); CorefChain.CorefMention lastMention = null; String outputString = ""; for (CoreLabel token : currentSentence.get(CoreAnnotations.TokensAnnotation.class)) { if (currentSetOfMentions.containsKey(token.index())) { lastMention = currentSetOfMentions.get(token.index()).first(); CorefChain.CorefMention ref = currentSetOfMentions.get(token.index()).second(); outputString += "<COREF ID=\"" + lastMention.mentionID + "\""; if (lastMention.mentionID != ref.mentionID) { outputString += " REF=\"" + ref.mentionID + "\""; } outputString += ">"; } if (lastMention != null && token.index() == lastMention.endIndex) { outputString += "</COREF> "; } outputString += token.word() + " "; } mucOutput.add(CoreNLPThriftUtil.closeHTMLTags(outputString.replaceAll(" </", "</"))); } return mucOutput; }
From source file:org.sam_agent.csparser.ContinuousParser.java
License:Open Source License
/**
 * Serializes the token-level annotations of a sentence into a hand-built JSON
 * fragment with five sections: a POS map and POS list, a lemma map, a timex
 * attribute map, and timex token groupings.
 *
 * NOTE(review): JSON is assembled by string concatenation; timex attribute
 * values from the XML element are NOT passed through esc(), so a quote in an
 * attribute value would break the output — verify upstream values are safe.
 *
 * @param words a sentence CoreMap whose TokensAnnotation is serialized
 * @return a JSON fragment (no surrounding braces) of the form
 *         {@code "pos":{...},"lemma":{...},"timex":{...},"timexGroups":{...}}
 */
public String stringify(CoreMap words) {
    List<String> posList = new ArrayList<String>();
    List<String> posMap = new ArrayList<String>();
    List<String> lemmaMap = new ArrayList<String>();
    List<String> timexMap = new ArrayList<String>();
    // timex id -> the JSON-quoted word tokens covered by that timex expression
    Map<String, List<String>> timexIdMap = new HashMap<String, List<String>>();
    for (CoreLabel token : words.get(CoreAnnotations.TokensAnnotation.class)) {
        String word = token.get(CoreAnnotations.TextAnnotation.class);
        String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
        // Keys are disambiguated as "<escapedWord>-<1-based token index>".
        String wordToken = esc(word) + "-" + token.index();
        posList.add(String.format("{\"token\":\"%s\",\"pos\":\"%s\"}", esc(word), esc(pos)));
        posMap.add(String.format("\"%s\":\"%s\"", wordToken, esc(pos)));
        lemmaMap.add(String.format("\"%s\":\"%s\"", wordToken, esc(lemma)));
        Timex t = token.get(TimeAnnotations.TimexAnnotation.class);
        if (t != null) {
            String tid = t.tid();
            if (timexIdMap.containsKey(tid)) {
                // Later token of an already-seen timex: just extend its group;
                // attributes were emitted when the timex was first encountered.
                timexIdMap.get(tid).add("\"" + wordToken + "\"");
                continue;
            }
            List<String> tokens = new ArrayList<String>();
            tokens.add("\"" + wordToken + "\"");
            timexIdMap.put(tid, tokens);
            // First token of this timex: serialize all XML attributes of the
            // timex element as a JSON object keyed by the timex id.
            List<String> attributesList = new ArrayList<String>();
            Element xml = t.toXmlElement();
            NamedNodeMap attrs = xml.getAttributes();
            for (int i = 0; i < attrs.getLength(); i++) {
                Node item = attrs.item(i);
                String name = item.getNodeName();
                String value = item.getNodeValue();
                attributesList.add(String.format("\"%s\":\"%s\"", name, value));
            }
            String json = String.format("\"%s\":{%s}", t.tid(), String.join(",", attributesList));
            timexMap.add(json);
        }
    }
    String posListJSON = "[" + String.join(", ", posList) + "]";
    String posMapJSON = "{" + String.join(", ", posMap) + "}";
    String lemmaMapJSON = "{" + String.join(", ", lemmaMap) + "}";
    String timexMapJSON = "{" + String.join(", ", timexMap) + "}";
    // Group section order follows HashMap iteration order (not deterministic
    // across JVM runs) — consumers must not rely on key ordering here.
    List<String> temp = new ArrayList<String>();
    for (String tid : timexIdMap.keySet()) {
        temp.add(String.format("\"%s\":[%s]", tid, String.join(",", timexIdMap.get(tid))));
    }
    String timexIdMapJSON = "{" + String.join(",", temp) + "}";
    return String.format("\"pos\":{\"map\":%s,\"list\":%s},\"lemma\":%s,\"timex\":%s,\"timexGroups\":%s",
            posMapJSON, posListJSON, lemmaMapJSON, timexMapJSON, timexIdMapJSON);
}
From source file:semRewrite.Interpreter.java
License:Open Source License
/** *************************************************************
 * Maps each positioned token (and every token of any substituted clause group
 * it belongs to) to its POS tag.
 *
 * Key format is "originalText-index", e.g. Mary-1 -> NNP, drives-2 -> VBZ.
 *
 * @param tokens      the sentence/input tokens
 * @param substitutor groups tokens that were substituted as a clause
 * @return map from "text-index" key to POS tag
 */
private static Map<String, String> getPartOfSpeechList(List<CoreLabel> tokens,
        ClauseSubstitutor substitutor) {
    Map<String, String> posByToken = Maps.newHashMap();
    for (CoreLabel token : tokens) {
        // A substituted token expands to its whole grouped sequence; an
        // ordinary token becomes a singleton sequence.
        CoreLabelSequence sequence;
        if (substitutor.containsKey(token)) {
            sequence = substitutor.getGrouped(token);
        } else {
            sequence = CoreLabelSequence.from(token);
        }
        for (CoreLabel member : sequence.getLabels()) {
            posByToken.put(member.originalText() + "-" + member.index(), member.tag());
        }
    }
    return posByToken;
}
From source file:semRewrite.substitutor.CoreLabelSequence.java
License:Open Source License
/** ************************************************************* * Change the value() of each CoreLabel to be all caps *///from w w w. ja v a 2 s. c o m public semRewrite.substitutor.CoreLabelSequence toUpperCase() { //System.out.println("CoreLabelSequence.toUpperCase(): labels: " + labels); List<CoreLabel> lcl = new ArrayList<>(); for (CoreLabel cl : labels) { CoreLabel newcl = new CoreLabel(); newcl.setValue(cl.value().toUpperCase()); newcl.setIndex(cl.index()); lcl.add(newcl); } semRewrite.substitutor.CoreLabelSequence cls = new semRewrite.substitutor.CoreLabelSequence(lcl); //System.out.println("CoreLabelSequence.toUpperCase(): cls: " + cls); return cls; }
From source file:semRewrite.substitutor.StanfordCorefSubstitutor.java
License:Open Source License
/** ************************************************************** */// w ww . ja v a 2 s . com private void initialize(Annotation document) { List<CoreLabel> labels = document.get(TokensAnnotation.class); Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class); Map<semRewrite.substitutor.CoreLabelSequence, semRewrite.substitutor.CoreLabelSequence> collectedGroups = Maps .newHashMap(); for (CoreLabel label : labels) { List<CorefMention> mentions = getMentions(label, corefChains); if (mentions.size() > 1) { if (!ignorablePronouns.contains(label.originalText())) { int index = label.index(); int sentenceIdx = 1 + label.sentIndex(); CorefMention firstMention = findRootMention(mentions); if (sentenceIdx != firstMention.sentNum || index < firstMention.startIndex || index >= firstMention.endIndex) { String masterTag = label.tag(); if (isSubstitutablePronoun(label)) { masterTag = ""; } List<CoreLabel> singleSentence = getSentenceTokens(document, firstMention.sentNum - 1); semRewrite.substitutor.CoreLabelSequence key = extractTextWithSameTag(singleSentence, firstMention, masterTag); if (!key.isEmpty()) { collectedGroups.put(new semRewrite.substitutor.CoreLabelSequence(label), key); } } } } } addGroups(collectedGroups); }
From source file:semRewrite.substitutor.StanfordCorefSubstitutor.java
License:Open Source License
/** *************************************************************
 * Finds the coref chain containing this label by scanning cluster ids downward
 * from the label's own CorefClusterIdAnnotation until a chain is found that
 * has a mention starting exactly at this label's position.
 *
 * @param label  the token whose chain is sought
 * @param corefs all coref chains, keyed by cluster id
 * @return the mentions of the matching chain, or an empty list if none matches
 */
private List<CorefMention> getMentions(final CoreLabel label, Map<Integer, CorefChain> corefs) {

    List<CorefMention> found = ImmutableList.of();
    Integer clusterId = label.get(CorefClusterIdAnnotation.class);
    if (clusterId == null) {
        return found; // token carries no cluster annotation
    }
    // Stop as soon as a multi-mention chain has been found (size <= 1 keeps scanning).
    for (int id = clusterId; id > 0 && found.size() <= 1; id--) {
        CorefChain chain = corefs.get(id);
        if (chain == null) {
            continue;
        }
        List<CorefMention> candidates = chain.getMentionsInTextualOrder();
        // The chain matches if any of its mentions starts at this label's
        // exact (sentence, token) position; sentNum is 1-based.
        boolean chainContainsLabel = candidates.stream().anyMatch(
                mention -> mention.sentNum == label.sentIndex() + 1
                        && mention.startIndex == label.index());
        if (chainContainsLabel) {
            found = candidates;
        }
    }
    return found;
}