List of usage examples for the method edu.stanford.nlp.ling.CoreLabel.ner()
@Override
public String ner()
From source file:ca.ualberta.exemplar.core.CleanPrefixAnnotator.java
License:Open Source License
@Override public void annotate(Annotation document) { if (document.has(SentencesAnnotation.class)) { for (CoreMap sentence : document.get(SentencesAnnotation.class)) { List<CoreLabel> tokens = sentence.get(TokensAnnotation.class); int numTokens = 0, numPrefixParts = 0; // Assumption: prefix is at max 10 tokens for (int i = 0; i < Math.min(tokens.size(), 10); i++) { CoreLabel token = tokens.get(i); String tokenText = token.get(TextAnnotation.class); if (tokenText != null && numTokens > 0 && (tokenText.equals("--") || tokenText.equals(":"))) { // Assumption: if more than half the tokens are a date/location/number it's a prefix double fraction = (double) numPrefixParts / (double) numTokens; if (fraction > 0.5) { CoreLabel nextToken = tokens.get(i + 1); String before = document.get(TextAnnotation.class).substring(0, nextToken.beginPosition()); nextToken.set(BeforeAnnotation.class, before); sentence.set(TokensAnnotation.class, tokens.subList(i + 1, tokens.size())); //System.out.println("Removed Prefix: " + before); }/*from ww w. j a va2 s . c o m*/ break; } numTokens++; String neTag = token.ner(); if (neTag != null && (neTag.equals("DATE") || neTag.equals("LOCATION") || neTag.equals("NUMBER") || neTag.equals("ORDINAL"))) { numPrefixParts++; } } } } }
From source file:edu.ucla.cs.scai.qa.questionclassifier.SyntacticTreeNode.java
public SyntacticTreeNode(Tree t, ArrayList<CoreLabel> tokens, SyntacticTreeNode parent) throws Exception { this.parent = parent; value = t.value();/*from ww w. j a v a 2 s. c o m*/ if (t.isLeaf()) { CoreLabel c = tokens.remove(0); begin = c.beginPosition(); end = c.endPosition(); if (c == null) { throw new Exception("Mapping between TreeNode and CoreLabel not found"); } else { lemma = c.lemma(); ner = c.ner(); //System.out.println(value + " -> " + c.value()); if (!value.equals(c.value())) { throw new Exception("Different words have been matched!"); } } } else { boolean hasNPchildren = false; boolean hasWHNPchildren = false; boolean hasQPchildren = false; begin = Integer.MAX_VALUE; end = Integer.MIN_VALUE; for (Tree c : t.children()) { SyntacticTreeNode child = new SyntacticTreeNode(c, tokens, this); children.add(child); if (child.value.equals("NP")) { hasNPchildren = true; } else if (child.value.equals("QP")) { hasQPchildren = true; } else if (child.value.equals("WHNP")) { hasWHNPchildren = true; } begin = Math.min(begin, child.begin); end = Math.max(end, child.end); } if (value.equals("NP")) { if (hasNPchildren) { npCompound = true; } else if (hasQPchildren) { npQp = true; } else { npSimple = true; } } else if (value.equals("WHNP")) { //can a WHNP node have QP children? if (hasNPchildren || hasWHNPchildren) { whnpCompound = true; } else if (!hasQPchildren) { whnpSimple = true; } } } }
From source file:ims.cs.corenlp.TokenAligner.java
License:Open Source License
/**
 * Creates a copy of {@code tok} and fills its predicted ({@code pred*}) fields
 * from a CoreNLP token.
 *
 * @param tok                         the token to copy
 * @param cl                          the CoreNLP label supplying word, lemma, POS tag,
 *                                    NER tag and begin/end positions
 * @param currentCoreNlpSentenceIndex value stored as the predicted sentence position
 * @return the copy of {@code tok} carrying the predicted information
 */
public static Token combineTokensPred(Token tok, CoreLabel cl, int currentCoreNlpSentenceIndex) {
    final Token merged = new Token(tok);

    // Read everything from the CoreNLP label first ...
    final String word = cl.word();
    final String lemma = cl.lemma();
    final String posTag = cl.tag();

    // ... then copy it over field by field.
    merged.predText = word;
    merged.predLemma = lemma;
    merged.predPosTag = posTag;
    merged.predNer = Helper.translateNer(cl.ner());
    merged.predByteCount = new ByteCount(cl.beginPosition(), cl.endPosition());

    merged.predSentencePosition = currentCoreNlpSentenceIndex;
    /* will be determined by document aligner */
    merged.predPosition = -1;

    return merged;
}
From source file:knu.univ.lingvo.coref.Mention.java
License:Open Source License
public List<CoreLabel> nerTokens() { if (nerString == null || "O".equals(nerString)) return null; int start = headIndex - startIndex; int end = headIndex - startIndex + 1; while (start > 0) { CoreLabel prev = originalSpan.get(start - 1); if (nerString.equals(prev.ner())) { start--;//from ww w . j a va2s . co m } else { break; } } while (end < originalSpan.size()) { CoreLabel next = originalSpan.get(end); if (nerString.equals(next.ner())) { end++; } else { break; } } return originalSpan.subList(start, end); }
From source file:knu.univ.lingvo.coref.Mention.java
License:Open Source License
public String getPattern(List<CoreLabel> pTokens) { ArrayList<String> phrase_string = new ArrayList<String>(); String ne = ""; for (CoreLabel token : pTokens) { if (token.index() == headWord.index()) { phrase_string.add(token.lemma()); ne = ""; } else if ((token.lemma().equals("and") || StringUtils.isPunct(token.lemma())) && pTokens.size() > pTokens.indexOf(token) + 1 && pTokens.indexOf(token) > 0 && pTokens.get(pTokens.indexOf(token) + 1).ner() .equals(pTokens.get(pTokens.indexOf(token) - 1).ner())) { } else if (token.index() == headWord.index() - 1 && token.ner().equals(nerString)) { phrase_string.add(token.lemma()); ne = ""; } else if (!token.ner().equals("O")) { if (!token.ner().equals(ne)) { ne = token.ner();//from w w w .j a va 2 s . c o m phrase_string.add("<" + ne + ">"); } } else { phrase_string.add(token.lemma()); ne = ""; } } return StringUtils.join(phrase_string); }
From source file:knu.univ.lingvo.coref.Mention.java
License:Open Source License
/**
 * Collects the distinct named-entity strings occurring in {@code words}.
 *
 * <p>Consecutive words with the same NER tag (other than {@code "O"}) are
 * grouped into one entity; each group is rendered with
 * {@code StringUtils.joinWords(group, " ")} and duplicates are removed through
 * a set.
 *
 * @param words the words to scan; the NER tag of every word is read
 * @return a list holding each distinct entity string once
 */
private static List<String> getContextHelper(List<? extends CoreLabel> words) {
    List<List<CoreLabel>> entityGroups = new ArrayList<List<CoreLabel>>();
    List<CoreLabel> currentGroup = new ArrayList<CoreLabel>();
    String lastType = "";
    int lastPosition = -1;

    for (int pos = 0; pos < words.size(); pos++) {
        CoreLabel word = words.get(pos);
        String type = word.ner();
        if (type.equals("O")) {
            continue;
        }
        // A new group starts on a tag change or after a gap in positions.
        boolean continuesGroup = type.equals(lastType) && lastPosition == pos - 1;
        if (!continuesGroup) {
            currentGroup = new ArrayList<CoreLabel>();
            entityGroups.add(currentGroup);
        }
        currentGroup.add(word);
        lastType = type;
        lastPosition = pos;
    }

    // De-duplicate the rendered entity strings.
    Set<String> distinct = Generics.newHashSet();
    for (List<CoreLabel> group : entityGroups) {
        distinct.add(StringUtils.joinWords(group, " "));
    }
    return new ArrayList<String>(distinct);
}
From source file:main.java.spelementex.Annotator.java
public void writeCrfRolesData(Map<Integer, Map<Integer, CoreLabel>> sentenceStartOffsetParserLabel, Map<Integer, SpatialElement> startOffsetSpatialElement, FileOutputStream output) throws IOException { SpatialElement se = null;/*from ww w. jav a 2 s .c om*/ int[] roleOffsets; String featureStr = ""; for (int sentence : sentenceStartOffsetParserLabel.keySet()) { Map<Integer, CoreLabel> startOffsetParserLabel = sentenceStartOffsetParserLabel.get(sentence); roleOffsets = new int[2]; for (int startOffset : startOffsetParserLabel.keySet()) { CoreLabel token = startOffsetParserLabel.get(startOffset); // this is the text of the token String word = token.word(); //generate crf feature CrfFeature crfFeat = new CrfFeature(word, token.lemma(), token.tag(), token.ner()); crfFeat.setUniFV(); se = startOffsetSpatialElement.containsKey(startOffset) ? startOffsetSpatialElement.get(startOffset) : startOffset >= roleOffsets[1] ? SpatialElement.getSE(word, startOffset, startOffsetSpatialElement) : se; boolean isMotionSignal = se == null ? false : se.isMotionSignal(); boolean isMotionSignalOnly = se == null ? false : se.isMotionSignalOnly(); if (se == null) crfFeat.setType(); else crfFeat.setType(se.getNonMotionSignalType(), isMotionSignal, isMotionSignalOnly); featureStr += crfFeat.toString() + "\n"; } output.write((featureStr + "\n").getBytes()); featureStr = ""; } }
From source file:main.java.spelementex.Annotator.java
/** * Writes a single test data file to use as input for CRF++ toolkit. * /*w w w .java 2 s . c om*/ * @param sentenceStartOffsetParserLabel * @param output * @throws IOException */ public void writeCrfData(Map<Integer, Map<Integer, CoreLabel>> sentenceStartOffsetParserLabel, FileOutputStream output) throws IOException { String featureStr = ""; for (int sentence : sentenceStartOffsetParserLabel.keySet()) { Map<Integer, CoreLabel> startOffsetParserLabel = sentenceStartOffsetParserLabel.get(sentence); for (int startOffset : startOffsetParserLabel.keySet()) { CoreLabel token = startOffsetParserLabel.get(startOffset); //generate crf feature CrfFeature crfFeat = new CrfFeature(token.word(), token.lemma(), token.tag(), token.ner()); crfFeat.setUniFV(); featureStr += crfFeat.toString() + "\n"; } output.write((featureStr + "\n").getBytes()); featureStr = ""; } }
From source file:main.java.spelementex.Trainer.java
/** * Writes labelled training data to use as input for CRF++ toolkit. * Writes to two separate training data files: 1) only with labels for * motion-signal spatial entity tokens; and 2) with labels for all * spatial entity tokens except motion-signals. * /*from w w w.j ava 2s . c o m*/ * @param seOutput * @param msOutput * @throws SAXException * @throws ParserConfigurationException * @throws IOException */ public void writeTypeLabelledCrfData(FileOutputStream seOutput, FileOutputStream msOutput) throws SAXException, ParserConfigurationException, IOException { int[] seOffsets; int[] msOffsets; for (File file : trainFiles) { System.out.println("writing labelled CRF data for " + file.getPath()); //parses the spatial entity annotations from input xml files Doc document = new Doc(); Main.dp.parseXmlAnnotationsFile(file.toString(), document, true, false); //map which stores all spatial elements Map<Integer, SpatialElement> startOffsetSpatialElement = new HashMap<>( document.getStartOffsetSpatialElement()); //map to store the sentence-wise nlp info of the text document //obtained from the Stanford CoreNLP parser Map<Integer, Map<Integer, CoreLabel>> sentenceStartOffsetParserLabel = StanfordParser.parse(document); SpatialElement se = null; String seFeatureStr = ""; String msFeatureStr = ""; for (int sentence : sentenceStartOffsetParserLabel.keySet()) { Map<Integer, CoreLabel> startOffsetParserLabel = sentenceStartOffsetParserLabel.get(sentence); seOffsets = new int[2]; msOffsets = new int[2]; for (int startOffset : startOffsetParserLabel.keySet()) { CoreLabel token = startOffsetParserLabel.get(startOffset); // this is the text of the token String word = token.word(); //generate crf feature CrfFeature crfFeat = new CrfFeature(word, token.lemma(), token.tag(), token.ner()); crfFeat.setUniFV(); String featureStr = crfFeat.toString(); seFeatureStr += featureStr; se = startOffsetSpatialElement.containsKey(startOffset) ? 
startOffsetSpatialElement.get(startOffset) : startOffset >= seOffsets[1] ? SpatialElement.getSE(word, startOffset, startOffsetSpatialElement) : se; boolean isMotionSignal = se == null ? false : se.isMotionSignal(); boolean isMotionSignalOnly = se == null ? false : se.isMotionSignalOnly(); if (se == null || isMotionSignalOnly) seFeatureStr += " O\n"; else if (seOffsets[1] != se.getEnd()) { seOffsets[0] = startOffset; int difference = startOffset - se.getStart(); seOffsets[1] = se.getEnd() + difference; } if (se != null && !isMotionSignalOnly) { if (startOffset == seOffsets[0]) seFeatureStr += " B-" + se.getNonMotionSignalType() + "\n"; else if (startOffset < seOffsets[1]) seFeatureStr += " I-" + se.getNonMotionSignalType() + "\n"; } msFeatureStr += featureStr; if (se == null || !isMotionSignal) msFeatureStr += " O\n"; else if (msOffsets[1] != se.getEnd()) { msOffsets[0] = startOffset; int difference = startOffset - se.getStart(); msOffsets[1] = se.getEnd() + difference; } if (se != null && isMotionSignal) { if (startOffset == msOffsets[0]) msFeatureStr += " B-MOTION_SIGNAL\n"; else if (startOffset < msOffsets[1]) msFeatureStr += " I-MOTION_SIGNAL\n"; } } seOutput.write((seFeatureStr + "\n").getBytes()); seFeatureStr = ""; msOutput.write((msFeatureStr + "\n").getBytes()); msFeatureStr = ""; } } }
From source file:main.java.spelementex.Trainer.java
/** * Writes labeled training data to use as input for CRF++ toolkit. * For each role, writes to two separate training data files: * 1) with labels for the role and other roles it doesn't overlap with; * 2) with labels for the roles and it does and doesn't overlap with except itself. * // w w w. j a v a 2s . c o m * @param dataDir * @param train * @throws SAXException * @throws ParserConfigurationException * @throws IOException */ public void writeRoleLabelledCrfData(String dataDir, boolean train) throws SAXException, ParserConfigurationException, IOException { int[] roleOffsets; int[] otherOffsets; String ext = train ? "Train" : "Test"; Set<String> allRolesSet = new HashSet<>(SpatialRelation.ROLE_OVERLAP_ROLES_MAP.keySet()); for (File file : trainFiles) { System.out.println("writing labelled CRF data for " + file.getPath()); //parses the spatial entity annotations from input xml files Doc document = new Doc(); Main.dp.parseXmlAnnotationsFile(file.toString(), document, true, true); //map which stores all spatial elements Map<Integer, SpatialElement> startOffsetSpatialElement = new HashMap<>( document.getStartOffsetSpatialElement()); //map to store the sentence-wise nlp info of the text document //obtained from the Stanford CoreNLP parser Map<Integer, Map<Integer, CoreLabel>> sentenceStartOffsetParserLabel = StanfordParser.parse(document); for (String role : SpatialRelation.ROLE_OVERLAP_ROLES_MAP.keySet()) { Set<String> overlapRoles = new HashSet<>(SpatialRelation.ROLE_OVERLAP_ROLES_MAP.get(role)); Set<String> nonOverlapRoles = new HashSet<>(allRolesSet); nonOverlapRoles.removeAll(overlapRoles); nonOverlapRoles.remove(role); FileOutputStream output = new FileOutputStream(dataDir + "\\" + role + ext + ".txt", true); FileOutputStream outputOther = overlapRoles.contains("null") ? 
null : new FileOutputStream(dataDir + "\\" + role + ext + "Other.txt", true); SpatialElement se = null; String roleFeatureStr = ""; String otherFeatureStr = ""; List<Integer> nonOverlapRolesStartOffsets = document.getElementRolesStartOffsets(nonOverlapRoles); List<Integer> roleStartOffsets = !document.getElementRoleStartOffsets().containsKey(role) ? new ArrayList<>() : document.getElementRoleStartOffsets().get(role); roleStartOffsets.addAll(nonOverlapRolesStartOffsets); List<Integer> overlapRolesStartOffsets = outputOther != null ? document.getElementRolesStartOffsets(overlapRoles) : null; if (overlapRolesStartOffsets != null) overlapRolesStartOffsets.addAll(nonOverlapRolesStartOffsets); for (int sentence : sentenceStartOffsetParserLabel.keySet()) { Map<Integer, CoreLabel> startOffsetParserLabel = sentenceStartOffsetParserLabel.get(sentence); roleOffsets = new int[2]; otherOffsets = new int[2]; for (int startOffset : startOffsetParserLabel.keySet()) { CoreLabel token = startOffsetParserLabel.get(startOffset); // this is the text of the token String word = token.word(); //generate crf feature CrfFeature crfFeat = new CrfFeature(word, token.lemma(), token.tag(), token.ner()); crfFeat.setUniFV(); se = startOffsetSpatialElement.containsKey(startOffset) ? startOffsetSpatialElement.get(startOffset) : startOffset >= roleOffsets[1] ? SpatialElement.getSE(word, startOffset, startOffsetSpatialElement) : se; boolean isMotionSignal = se == null ? false : se.isMotionSignal(); boolean isMotionSignalOnly = se == null ? false : se.isMotionSignalOnly(); if (se == null) crfFeat.setType(); else crfFeat.setType(se.getNonMotionSignalType(), isMotionSignal, isMotionSignalOnly); String featureStr = crfFeat.toString(); roleFeatureStr += featureStr; String roleLabel = se == null ? 
"O" : se.getRole(role, nonOverlapRoles); if (se == null || roleLabel.equals("O")) roleFeatureStr += " O\n"; else if (roleOffsets[1] != se.getEnd()) { roleOffsets[0] = startOffset; int difference = startOffset - se.getStart(); roleOffsets[1] = se.getEnd() + difference; } if (!roleLabel.equals("O")) { if (startOffset == roleOffsets[0]) roleFeatureStr += " B-" + roleLabel + "\n"; else if (startOffset < roleOffsets[1]) roleFeatureStr += " I-" + roleLabel + "\n"; } if (outputOther == null) continue; otherFeatureStr += featureStr; roleLabel = se == null ? "O" : se.getRole(overlapRoles, nonOverlapRoles); if (se == null || roleLabel.equals("O")) otherFeatureStr += " O\n"; else if (otherOffsets[1] != se.getEnd()) { otherOffsets[0] = startOffset; int difference = startOffset - se.getStart(); otherOffsets[1] = se.getEnd() + difference; } if (!roleLabel.equals("O")) { if (startOffset == otherOffsets[0]) otherFeatureStr += " B-" + roleLabel + "\n"; else if (startOffset < otherOffsets[1]) otherFeatureStr += " I-" + roleLabel + "\n"; } } output.write((roleFeatureStr + "\n").getBytes()); roleFeatureStr = ""; if (outputOther != null) { outputOther.write((otherFeatureStr + "\n").getBytes()); otherFeatureStr = ""; } } } } }