Usage examples for the tag() method of edu.stanford.nlp.ling.CoreLabel
@Override
public String tag()
From source file:SentencePair.java
License:Open Source License
/**
 * Runs {@code text} through the {@code nlp} pipeline and appends one
 * {@link POSTaggedToken} per token to {@code sentence}, sentence by sentence
 * in the order the pipeline reports them.
 *
 * <p>Each appended token is built from the label's {@code toString()} form and
 * the result of passing its part-of-speech tag through {@code translateTag}.
 *
 * @param text     raw text to annotate
 * @param sentence list that receives the tagged tokens; existing entries are kept
 */
private void createSentence(String text, List<POSTaggedToken> sentence) {
    Annotation annotated = new Annotation(text);
    nlp.annotate(annotated);

    List<CoreMap> parsedSentences = annotated.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap parsed : parsedSentences) {
        List<CoreLabel> labels = parsed.get(CoreAnnotations.TokensAnnotation.class);
        for (CoreLabel label : labels) {
            String surface = label.toString();
            sentence.add(new POSTaggedToken(surface, translateTag(label.tag())));
        }
    }
}
From source file:conditionalCFG.ConditionalCFGParser.java
License:Open Source License
/**
 * Rebuilds the tree for state {@code goal} over the span {@code start}..{@code end}
 * by searching for the derivation step that reproduces the stored score
 * {@code iScore[start][end][goal]}.
 *
 * <p>Steps are tried in this order and the first one that succeeds is returned:
 * <ol>
 *   <li>a tagging, when the span is no longer than {@code maxSpanForTags} and the
 *       state name is in {@code tagIndex};</li>
 *   <li>every binary rule with parent {@code goal} at every split point;</li>
 *   <li>every unary rule with parent {@code goal} whose child differs from its parent.</li>
 * </ol>
 * Binary and unary matches recurse into this method for their children. The
 * recursive results are not null-checked, so a child that could not be
 * reconstructed ends up as a {@code null} entry in the parent's child list.
 *
 * @param goal  state index; resolved to a name through {@code stateIndex}
 * @param start first position of the span
 * @param end   end of the span; appears to be exclusive (the word loop runs {@code i < end})
 * @return the reconstructed tree with its score set, or {@code null} after a
 *         warning has been printed to {@code System.err} when no step matches
 * @throws RuntimeException in the multi-word tag case, when both {@code sentence}
 *         and {@code lr} are null or when no lattice edge reproduces the score
 */
private Tree extractBestParse(int goal, int start, int end) {
    // find source of inside score
    // no backtraces so we can speed up the parsing for its primary use
    double bestScore = iScore[start][end][goal];
    // With length normalization on, candidates are compared on score divided by wordsInSpan.
    double normBestScore = op.testOptions.lengthNormalization ? (bestScore / wordsInSpan[start][end][goal])
            : bestScore;
    String goalStr = stateIndex.get(goal);
    // check tags
    if (end - start <= op.testOptions.maxSpanForTags && tagIndex.contains(goalStr)) {
        if (op.testOptions.maxSpanForTags > 1) {
            // Tags may cover more than one word: the leaf is built either from the
            // concatenated words of the span or from a lattice edge.
            Tree wordNode = null;
            if (sentence != null) {
                StringBuilder word = new StringBuilder();
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                wordNode = tf.newLeaf(word.toString());
            } else if (lr != null) {
                // Take the first lattice edge whose tag score plus edge weight matches bestScore.
                List<LatticeEdge> latticeEdges = lr.getEdgesOverSpan(start, end);
                for (LatticeEdge edge : latticeEdges) {
                    IntTaggedWord itw = new IntTaggedWord(edge.word, stateIndex.get(goal), wordIndex, tagIndex);
                    // floodTags replaces the lexicon score with a fixed -1000.
                    float tagScore = (floodTags) ? -1000.0f : lex.score(itw, start, edge.word, null);
                    if (matches(bestScore, tagScore + (float) edge.weight)) {
                        wordNode = tf.newLeaf(edge.word);
                        if (wordNode.label() instanceof CoreLabel) {
                            CoreLabel cl = (CoreLabel) wordNode.label();
                            cl.setBeginPosition(start);
                            cl.setEndPosition(end);
                        }
                        break;
                    }
                }
                if (wordNode == null) {
                    throw new RuntimeException(
                            "could not find matching word from lattice in parse reconstruction");
                }
            } else {
                throw new RuntimeException("attempt to get word when sentence and lattice are null!");
            }
            Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
            tagNode.setScore(bestScore);
            // A non-null entry in originalTags overrides the label value of the tag node.
            if (originalTags[start] != null) {
                tagNode.label().setValue(originalTags[start].tag());
            }
            return tagNode;
        } else { // normal lexicon is single words case
            IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr));
            String contextStr = getCoreLabel(start).originalText();
            float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr);
            // Unlike the lattice branch, this only requires a finite tag score (or floodTags);
            // it is not compared against bestScore. Otherwise control falls through to the
            // binary and unary searches below.
            if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
                // return a pre-terminal tree
                CoreLabel terminalLabel = getCoreLabel(start);
                Tree wordNode = tf.newLeaf(terminalLabel);
                Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
                tagNode.setScore(bestScore);
                // A tag already present on the terminal label replaces the state name.
                if (terminalLabel.tag() != null) {
                    tagNode.label().setValue(terminalLabel.tag());
                }
                // Keep the label's tag in step with its (possibly overridden) value.
                if (tagNode.label() instanceof HasTag) {
                    ((HasTag) tagNode.label()).setTag(tagNode.label().value());
                }
                return tagNode;
            }
        }
    }
    // check binaries first
    for (int split = start + 1; split < end; split++) {
        for (Iterator<BinaryRule> binaryI = bg.ruleIteratorByParent(goal); binaryI.hasNext();) {
            BinaryRule br = binaryI.next();
            // Rule score + both child scores + the lexicon's score for this rule and split.
            double score = br.score + iScore[start][split][br.leftChild] + iScore[split][end][br.rightChild]
                    + lex.score(br, start, end, split);
            boolean matches;
            if (op.testOptions.lengthNormalization) {
                double normScore = score / (wordsInSpan[start][split][br.leftChild]
                        + wordsInSpan[split][end][br.rightChild]);
                matches = matches(normScore, normBestScore);
            } else {
                matches = matches(score, bestScore);
            }
            if (matches) {
                // build binary split
                Tree leftChildTree = extractBestParse(br.leftChild, start, split);
                Tree rightChildTree = extractBestParse(br.rightChild, split, end);
                List<Tree> children = new ArrayList<Tree>();
                children.add(leftChildTree);
                children.add(rightChildTree);
                Tree result = tf.newTreeNode(goalStr, children);
                // The node carries the un-normalized score.
                result.setScore(score);
                // System.err.println(" Found Binary node: "+result);
                return result;
            }
        }
    }
    // check unaries
    // note that even though we parse with the unary-closed grammar, we can
    // extract the best parse with the non-unary-closed grammar, since all
    // the intermediate states in the chain must have been built, and hence
    // we can exploit the sparser space and reconstruct the full tree as we go.
    // for (Iterator<UnaryRule> unaryI = ug.closedRuleIteratorByParent(goal); unaryI.hasNext(); ) {
    for (Iterator<UnaryRule> unaryI = ug.ruleIteratorByParent(goal); unaryI.hasNext();) {
        UnaryRule ur = unaryI.next();
        // System.err.println(" Trying " + ur + " dtr score: " + iScore[start][end][ur.child]);
        double score = ur.score + iScore[start][end][ur.child] + lex.score(ur, start, end);
        boolean matches;
        if (op.testOptions.lengthNormalization) {
            double normScore = score / wordsInSpan[start][end][ur.child];
            matches = matches(normScore, normBestScore);
        } else {
            matches = matches(score, bestScore);
        }
        // Rules whose child equals their parent are skipped; recursing on one would
        // call this method again with the same arguments.
        if (ur.child != ur.parent && matches) {
            // build unary
            Tree childTree = extractBestParse(ur.child, start, end);
            Tree result = tf.newTreeNode(goalStr, Collections.singletonList(childTree));
            // System.err.println(" Matched! Unary node: "+result);
            result.setScore(score);
            return result;
        }
    }
    // NOTE(review): the message names ExhaustivePCFGParser, but the listing gives this
    // file as ConditionalCFGParser.java; presumably left over from copied code. Confirm
    // before relying on the text when searching logs.
    System.err.println("Warning: no parse found in ExhaustivePCFGParser.extractBestParse: failing on: [" + start
            + ", " + end + "] looking for " + goalStr);
    return null;
}
From source file:coreferenceresolver.util.StanfordUtil.java
public void init(boolean simpleInit) throws FileNotFoundException, IOException { String outPosFilePath = "./input.txt.pos"; FileWriter fw = new FileWriter(new File(outPosFilePath)); BufferedWriter bw = new BufferedWriter(fw); props = new Properties(); if (simpleInit) { props.put("annotators", "tokenize, ssplit, pos, parse"); } else {/*from w w w. j a va 2 s .c o m*/ props.put("annotators", "tokenize, ssplit, pos, parse, sentiment"); } pipeline = new StanfordCoreNLP(props); reviews = new ArrayList<>(); FileReader fileReader = new FileReader(documentFile); BufferedReader bufferedReader = new BufferedReader(fileReader); String reviewLine; int reviewId = 0; int sentenceId; //read input file line by line and count the number sentences of each lines while ((reviewLine = bufferedReader.readLine()) != null) { sentenceId = 0; Review newReview = new Review(); //Add to reviews list newReview.setRawContent(reviewLine); // create an empty Annotation just with the given text document = new Annotation(reviewLine); // run all Annotators on this text pipeline.annotate(document); List<CoreMap> sentences = document.get(SentencesAnnotation.class); //Begin extracting from paragraphs for (CoreMap sentence : sentences) { int sentenceOffsetBegin = sentence.get(CharacterOffsetBeginAnnotation.class); int sentenceOffsetEnd = sentence.get(CharacterOffsetEndAnnotation.class); Sentence newSentence = new Sentence(); newSentence.setReviewId(reviewId); newSentence.setRawContent(sentence.toString()); newSentence.setOffsetBegin(sentenceOffsetBegin); newSentence.setOffsetEnd(sentenceOffsetEnd); if (!simpleInit) { int sentimentLevel = RNNCoreAnnotations .getPredictedClass(sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class)); newSentence.setSentimentLevel(sentimentLevel); //Dependency Parsing SemanticGraph collCCDeps = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class); Collection<TypedDependency> typedDeps = collCCDeps.typedDependencies(); 
newSentence.setDependencies(typedDeps); } List<Tree> sentenceTreeLeaves = sentence.get(TreeCoreAnnotations.TreeAnnotation.class).getLeaves(); int i = 0; for (CoreLabel token : sentence.get(TokensAnnotation.class)) { Token newToken = new Token(); Tree tokenTree = sentenceTreeLeaves.get(i); newToken.setTokenTree(tokenTree); String word = token.get(TextAnnotation.class); newToken.setWord(word); String pos = token.get(PartOfSpeechAnnotation.class); newToken.setPOS(pos); int offsetBegin = token.get(CharacterOffsetBeginAnnotation.class); newToken.setOffsetBegin(offsetBegin); int offsetEnd = token.get(CharacterOffsetEndAnnotation.class); newToken.setOffsetEnd(offsetEnd); if (!simpleInit) { //Check NP relative clause Tree twoLevelsAncestor = tokenTree.ancestor(2, sentence.get(TreeCoreAnnotations.TreeAnnotation.class)); if (twoLevelsAncestor.value().equals("WHNP") && !word.toLowerCase().equals("who") && !word.toLowerCase().equals("what")) { newToken.setRelativePronoun(true); } //Calculate sentiment for this token int newTokenSentiment = Util.retrieveOriginalSentiment(newToken.getWord()); newToken.setSentimentOrientation(newTokenSentiment, newSentence.getDependencies()); } newSentence.addToken(newToken); bw.write(token.word() + "/" + token.tag() + " "); ++i; } bw.newLine(); if (!simpleInit) { //Check if this sentence contains a comparative indicator. //If yes, it is a comparative sentence. Identify which NP is superior or inferior in this sentence List<Token> comparativeTokens = FeatureExtractor.findComparativeIndicator(newSentence, null, null); //TODO //Check special comparative samples if (!comparativeTokens.isEmpty()) { newSentence.initComparatives(comparativeTokens); } } newReview.addSentence(newSentence); ++sentenceId; } bw.write("./."); bw.newLine(); reviews.add(newReview); ++reviewId; } bw.close(); }
From source file:count_dep.Count_dep.java
private void Countdependencies() { // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution Properties props = new Properties(); props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); int normal = 0; int[] tag = new int[10]; for (int i = 0; i < events.size(); i++) { Event e = events.get(i);//w ww. ja v a 2 s.c o m // read some text in the text variable String text = e.span;// Add your text here! // create an empty Annotation just with the given text Annotation document = new Annotation(text); // run all Annotators on this text pipeline.annotate(document); // these are all the sentences in this document // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types List<CoreMap> sentences = document.get(SentencesAnnotation.class); assert sentences.size() == 1; CoreMap sentence = sentences.get(0); List<CoreLabel> get = sentence.get(TokensAnnotation.class); for (CoreLabel cl : get) { if (cl.word().equals(e.trigger)) { if (cl.tag().length() >= 2 && "NN".equals(cl.tag().substring(0, 2))) { System.out.println("NN " + cl.word() + " " + cl.tag() + " " + e.span + " " + e.filename); tag[0]++; } else if (cl.tag().length() >= 2 && "VB".equals(cl.tag().substring(0, 2))) { System.out.println("VB " + cl.word() + " " + cl.tag() + " " + e.span + " " + e.filename); tag[1]++; } else if (cl.tag().length() >= 2 && "JJ".equals(cl.tag().substring(0, 2))) { tag[2]++; } else if (cl.tag().length() >= 2 && "PR".equals(cl.tag().substring(0, 2))) { tag[3]++; } else if (cl.tag().length() >= 2 && "DT".equals(cl.tag().substring(0, 2))) { tag[4]++; } else { // System.out.println(cl.word() + " " + cl.tag()+" "+e.span+" "+e.filename); } } } // this is the Stanford dependency graph of the current sentence SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class); LinkedList<Event> Extracted_Events = 
GetEvents(dependencies, sentence); for (Event ee : Extracted_Events) { if (ee.trigger.equals(e.trigger)) { boolean allmatched = true; for (EventArgument s : e.arguments) { boolean ok = false; for (EventArgument s2 : ee.arguments) { if (s.data.contains(s2.data)) { ok = true; break; } } if (!ok) { allmatched = false; break; } } if (allmatched) { normal++; break; } else { System.out.println(e.trigger); System.out.println(e.span + " " + e.filename); System.out.println("gold slots"); for (int goldargu = 0; goldargu < e.arguments.size(); goldargu++) { System.out.print(goldargu + " " + e.arguments.get(goldargu) + " "); } System.out.println(); System.out.println("extracted slots"); for (int argu = 0; argu < ee.arguments.size(); argu++) { System.out.print(ee.arguments.get(argu) + " "); } System.out.println(); System.out.println(); } } } if (i % 50 == 0) { System.out.println(1.0 * i / events.size()); } } System.out.println(1.0 * normal / events.size()); // System.out.println("NN: " + 1.0 * tag[0] / events.size()); // System.out.println("VB: " + 1.0 * tag[1] / events.size()); // System.out.println("JJ: " + 1.0 * tag[2] / events.size()); // System.out.println("PRP: " + 1.0 * tag[3] / events.size()); // System.out.println("DT: " + 1.0 * tag[4] / events.size()); }
From source file:count_dep.Count_dep.java
public LinkedList<Event> GetEvents(SemanticGraph dependencies, CoreMap sentence) { LinkedList<Event> res = new LinkedList<>(); LinkedList<IndexedWord> roots = new LinkedList<>(); List<CoreLabel> words = sentence.get(TokensAnnotation.class); List<GrammaticalRelation> senserel = new LinkedList<>(); senserel.add(GrammaticalRelation.valueOf("nsubj")); senserel.add(GrammaticalRelation.valueOf("dobj")); for (CoreLabel word : words) { if (word.tag().length() >= 2 && ("VB".equals(word.tag().substring(0, 2)) || "NN".equals(word.tag().substring(0, 2)))) { IndexedWord iword = new IndexedWord(word); roots.add(iword);//from w w w .j av a 2s . com } } for (IndexedWord word : roots) { Event e = new Event(); e.trigger = word.word(); try { Set<IndexedWord> children = dependencies.getChildren(word); children.stream().forEach((iw) -> { e.arguments.add(new EventArgument(iw.word(), "")); }); if (dependencies.inDegree(word) > 0) { IndexedWord parent = dependencies.getParent(word); if (parent.tag().length() >= 2 && "VB".equals(parent.tag().substring(0, 2))) { Set<IndexedWord> children1 = dependencies.getChildrenWithRelns(parent, senserel); children1.remove(word); children1.stream().forEach((iw) -> { e.arguments.add(new EventArgument(iw.word(), "")); }); } else { e.arguments.add(new EventArgument(dependencies.getParent(word).word(), "")); } } } catch (java.lang.IllegalArgumentException error) { continue; } res.add(e); } return res; }
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
protected Collection<String> featuresC(PaddedList<IN> cInfo, int loc) { CoreLabel p3 = cInfo.get(loc - 3);/*from w w w. ja va 2 s. com*/ CoreLabel p2 = cInfo.get(loc - 2); CoreLabel p = cInfo.get(loc - 1); CoreLabel c = cInfo.get(loc); CoreLabel n = cInfo.get(loc + 1); CoreLabel n2 = cInfo.get(loc + 2); String cWord = getWord(c); String pWord = getWord(p); String nWord = getWord(n); String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class); String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class); String nShape = n.getString(CoreAnnotations.ShapeAnnotation.class); Collection<String> featuresC = new ArrayList<String>(); if (flags.useDistSim) { distSimAnnotate(cInfo); } if (flags.useBagOfWords) { for (IN word : cInfo) { featuresC.add(getWord(word) + "-BAGOFWORDS"); } } if (flags.useDistSim && flags.useMoreTags) { featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + cWord + "-PDISTSIM-CWORD"); } if (flags.useDistSim) { featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM"); } if (flags.useTitle) { Matcher m = titlePattern.matcher(cWord); if (m.matches()) { featuresC.add("IS_TITLE"); } } if (flags.useInternal && flags.useExternal) { if (flags.useWord) { featuresC.add(cWord + "-WORD"); } if (flags.use2W) { featuresC.add(getWord(p2) + "-P2W"); featuresC.add(getWord(n2) + "-N2W"); } if (flags.useLC) { featuresC.add(cWord.toLowerCase() + "-CL"); featuresC.add(pWord.toLowerCase() + "-PL"); featuresC.add(nWord.toLowerCase() + "-NL"); } if (flags.useUnknown) { // for true casing featuresC.add(c.get(CoreAnnotations.UnknownAnnotation.class) + "-UNKNOWN"); featuresC.add(p.get(CoreAnnotations.UnknownAnnotation.class) + "-PUNKNOWN"); featuresC.add(n.get(CoreAnnotations.UnknownAnnotation.class) + "-NUNKNOWN"); } if (flags.useLemmas) { String lem = c.getString(CoreAnnotations.LemmaAnnotation.class); if (!"".equals(lem)) { featuresC.add(lem + "-LEM"); } } if (flags.usePrevNextLemmas) { String plem = 
p.getString(CoreAnnotations.LemmaAnnotation.class); String nlem = n.getString(CoreAnnotations.LemmaAnnotation.class); if (!"".equals(plem)) { featuresC.add(plem + "-PLEM"); } if (!"".equals(nlem)) { featuresC.add(nlem + "-NLEM"); } } if (flags.checkNameList) { try { if (lastNames == null) { lastNames = Generics.newHashSet(); for (String line : ObjectBank.getLineIterator(flags.lastNameList)) { String[] cols = line.split("\\s+"); lastNames.add(cols[0]); } } if (maleNames == null) { maleNames = Generics.newHashSet(); for (String line : ObjectBank.getLineIterator(flags.maleNameList)) { String[] cols = line.split("\\s+"); maleNames.add(cols[0]); } } if (femaleNames == null) { femaleNames = Generics.newHashSet(); for (String line : ObjectBank.getLineIterator(flags.femaleNameList)) { String[] cols = line.split("\\s+"); femaleNames.add(cols[0]); } } String name = cWord.toUpperCase(); if (lastNames.contains(name)) { featuresC.add("LAST_NAME"); } if (maleNames.contains(name)) { featuresC.add("MALE_NAME"); } if (femaleNames.contains(name)) { featuresC.add("FEMALE_NAME"); } } catch (Exception e) { throw new RuntimeException(e); } } if (flags.binnedLengths != null) { int len = cWord.length(); String featureName = null; for (int i = 0; i <= flags.binnedLengths.length; i++) { if (i == flags.binnedLengths.length) { featureName = "Len-" + flags.binnedLengths[flags.binnedLengths.length - 1] + "-Inf"; } else if (len <= flags.binnedLengths[i]) { featureName = "Len-" + ((i == 0) ? 
1 : flags.binnedLengths[i - 1]) + '-' + flags.binnedLengths[i]; break; } } featuresC.add(featureName); } if (flags.useABGENE) { featuresC.add(c.get(CoreAnnotations.AbgeneAnnotation.class) + "-ABGENE"); featuresC.add(p.get(CoreAnnotations.AbgeneAnnotation.class) + "-PABGENE"); featuresC.add(n.get(CoreAnnotations.AbgeneAnnotation.class) + "-NABGENE"); } if (flags.useABSTRFreqDict) { featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); } if (flags.useABSTR) { featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"); featuresC.add(p.get(CoreAnnotations.AbstrAnnotation.class) + "-PABSTRACT"); featuresC.add(n.get(CoreAnnotations.AbstrAnnotation.class) + "-NABSTRACT"); } if (flags.useGENIA) { featuresC.add(c.get(CoreAnnotations.GeniaAnnotation.class) + "-GENIA"); featuresC.add(p.get(CoreAnnotations.GeniaAnnotation.class) + "-PGENIA"); featuresC.add(n.get(CoreAnnotations.GeniaAnnotation.class) + "-NGENIA"); } if (flags.useWEBFreqDict) { featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); 
featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); } if (flags.useWEB) { featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB"); featuresC.add(p.get(CoreAnnotations.WebAnnotation.class) + "-PWEB"); featuresC.add(n.get(CoreAnnotations.WebAnnotation.class) + "-NWEB"); } if (flags.useIsURL) { featuresC.add(c.get(CoreAnnotations.IsURLAnnotation.class) + "-ISURL"); } if (flags.useEntityRule) { featuresC.add(c.get(CoreAnnotations.EntityRuleAnnotation.class) + "-ENTITYRULE"); } if (flags.useEntityTypes) { featuresC.add(c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ENTITYTYPE"); } if (flags.useIsDateRange) { featuresC.add(c.get(CoreAnnotations.IsDateRangeAnnotation.class) + "-ISDATERANGE"); } if (flags.useABSTRFreq) { featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"); } if (flags.useFREQ) { featuresC.add(c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"); } if (flags.useMoreTags) { featuresC.add( p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + cWord + "-PTAG-CWORD"); } if (flags.usePosition) { featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + "-POSITION"); } if (flags.useBeginSent) { String pos = c.get(CoreAnnotations.PositionAnnotation.class); if ("0".equals(pos)) { featuresC.add("BEGIN-SENT"); featuresC.add(cShape + "-BEGIN-SENT"); } else if (Integer.toString(cInfo.size() - 1).equals(pos)) { featuresC.add("END-SENT"); featuresC.add(cShape + "-END-SENT"); } else { featuresC.add("IN-SENT"); featuresC.add(cShape + "-IN-SENT"); } } if (flags.useTags) { featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); } if (flags.useOrdinal) { if (isOrdinal(cInfo, loc)) { featuresC.add("C_ORDINAL"); if (isOrdinal(cInfo, loc - 1)) { 
//System.err.print(getWord(p) + " "); featuresC.add("PC_ORDINAL"); } //System.err.println(cWord); } if (isOrdinal(cInfo, loc - 1)) { featuresC.add("P_ORDINAL"); } } if (flags.usePrev) { featuresC.add(pWord + "-PW"); if (flags.useTags) { featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PTAG"); } if (flags.useDistSim) { featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + "-PDISTSIM"); } if (flags.useIsURL) { featuresC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + "-PISURL"); } if (flags.useEntityTypes) { featuresC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + "-PENTITYTYPE"); } } if (flags.useNext) { featuresC.add(nWord + "-NW"); if (flags.useTags) { featuresC.add(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-NTAG"); } if (flags.useDistSim) { featuresC.add(n.get(CoreAnnotations.DistSimAnnotation.class) + "-NDISTSIM"); } if (flags.useIsURL) { featuresC.add(n.get(CoreAnnotations.IsURLAnnotation.class) + "-NISURL"); } if (flags.useEntityTypes) { featuresC.add(n.get(CoreAnnotations.EntityTypeAnnotation.class) + "-NENTITYTYPE"); } } /*here, entityTypes refers to the type in the PASCAL IE challenge: * i.e. 
certain words are tagged "Date" or "Location" */ if (flags.useEitherSideWord) { featuresC.add(pWord + "-EW"); featuresC.add(nWord + "-EW"); } if (flags.useWordPairs) { featuresC.add(cWord + '-' + pWord + "-W-PW"); featuresC.add(cWord + '-' + nWord + "-W-NW"); } if (flags.useSymTags) { if (flags.useTags) { featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCNTAGS"); featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-CNTAGS"); featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCTAGS"); } if (flags.useDistSim) { featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-PCNDISTSIM"); featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-CNDISTSIM"); featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-PCDISTSIM"); } } if (flags.useSymWordPairs) { featuresC.add(pWord + '-' + nWord + "-SWORDS"); } String pGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? p.get(CoreAnnotations.GazAnnotation.class) : null; String nGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? n.get(CoreAnnotations.GazAnnotation.class) : null; String cGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? 
c.get(CoreAnnotations.GazAnnotation.class) : null; if (flags.useGazFeatures) { if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) { featuresC.add(cGazAnnotation + "-GAZ"); } // n if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) { featuresC.add(nGazAnnotation + "-NGAZ"); } // p if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) { featuresC.add(pGazAnnotation + "-PGAZ"); } } if (flags.useMoreGazFeatures) { if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) { featuresC.add(cGazAnnotation + '-' + cWord + "-CG-CW-GAZ"); // c-n if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) { featuresC.add(cGazAnnotation + '-' + nGazAnnotation + "-CNGAZ"); } // p-c if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) { featuresC.add(pGazAnnotation + '-' + cGazAnnotation + "-PCGAZ"); } } } if (flags.useAbbr || flags.useMinimalAbbr) { featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR"); } if (flags.useAbbr1 || flags.useMinimalAbbr1) { if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR"); } } if (flags.useAbbr) { featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR"); featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR"); featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR"); } if (flags.useAbbr1) { if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR"); featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR"); 
featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR"); } } if (flags.useChunks) { featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + "-PCCHUNK"); featuresC.add(c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-CNCHUNK"); featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK"); } if (flags.useMinimalAbbr) { featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB"); } if (flags.useMinimalAbbr1) { if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB"); } } String prevVB = "", nextVB = ""; if (flags.usePrevVB) { for (int j = loc - 1;; j--) { CoreLabel wi = cInfo.get(j); if (wi == cInfo.getPad()) { prevVB = "X"; featuresC.add("X-PVB"); break; } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) { featuresC.add(getWord(wi) + "-PVB"); prevVB = getWord(wi); break; } } } if (flags.useNextVB) { for (int j = loc + 1;; j++) { CoreLabel wi = cInfo.get(j); if (wi == cInfo.getPad()) { featuresC.add("X-NVB"); nextVB = "X"; break; } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) { featuresC.add(getWord(wi) + "-NVB"); nextVB = getWord(wi); break; } } } if (flags.useVB) { featuresC.add(prevVB + '-' + nextVB + "-PNVB"); } if (flags.useShapeConjunctions) { featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + cShape + "-POS-SH"); if (flags.useTags) { featuresC.add(c.tag() + cShape + "-TAG-SH"); } if (flags.useDistSim) { featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + cShape + "-DISTSIM-SH"); } } if (flags.useWordTag) 
{ featuresC.add(cWord + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-T"); featuresC.add(cWord + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-PT"); featuresC.add(cWord + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-NT"); } if (flags.useNPHead) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-HW"); if (flags.useTags) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-HW-T"); } if (flags.useDistSim) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(CoreAnnotations.DistSimAnnotation.class) + "-HW-DISTSIM"); } } if (flags.useNPGovernor) { featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + "-GW"); if (flags.useTags) { featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-GW-T"); } if (flags.useDistSim) { featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM-T1"); } } if (flags.useHeadGov) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(CoreAnnotations.GovernorAnnotation.class) + "-HW_GW"); } if (flags.useClassFeature) { featuresC.add("###"); } if (flags.useFirstWord) { String firstWord = getWord(cInfo.get(0)); featuresC.add(firstWord); } if (flags.useNGrams) { Collection<String> subs = null; if (flags.cacheNGrams) { subs = wordToSubstrings.get(cWord); } if (subs == null) { subs = new ArrayList<String>(); String word = '<' + cWord + '>'; if (flags.lowercaseNGrams) { word = word.toLowerCase(); } if (flags.dehyphenateNGrams) { word = dehyphenate(word); } if (flags.greekifyNGrams) { word = greekify(word); } // minimum length substring is 2 letters (hardwired) // hoist flags.noMidNGrams so only linear in word length for that case if (flags.noMidNGrams) { int max = 
flags.maxNGramLeng >= 0 ? Math.min(flags.maxNGramLeng, word.length()) : word.length(); for (int j = 2; j <= max; j++) { subs.add(intern('#' + word.substring(0, j) + '#')); } int start = flags.maxNGramLeng >= 0 ? Math.max(0, word.length() - flags.maxNGramLeng) : 0; int lenM1 = word.length() - 1; for (int i = start; i < lenM1; i++) { subs.add(intern('#' + word.substring(i) + '#')); } } else { for (int i = 0; i < word.length(); i++) { for (int j = i + 2, max = Math.min(word.length(), i + flags.maxNGramLeng); j <= max; j++) { if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) { continue; } subs.add(intern('#' + word.substring(i, j) + '#')); } } } if (flags.cacheNGrams) { wordToSubstrings.put(cWord, subs); } } featuresC.addAll(subs); if (flags.conjoinShapeNGrams) { for (String str : subs) { String feat = str + '-' + cShape + "-CNGram-CS"; featuresC.add(feat); } } } if (flags.useGazettes) { if (flags.sloppyGazette) { Collection<String> entries = wordToGazetteEntries.get(cWord); if (entries != null) { featuresC.addAll(entries); } } if (flags.cleanGazette) { Collection<GazetteInfo> infos = wordToGazetteInfos.get(cWord); if (infos != null) { for (GazetteInfo gInfo : infos) { boolean ok = true; for (int gLoc = 0; gLoc < gInfo.words.length; gLoc++) { ok &= gInfo.words[gLoc].equals(getWord(cInfo.get(loc + gLoc - gInfo.loc))); } if (ok) { featuresC.add(gInfo.feature); } } } } } if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { featuresC.add(cShape + "-TYPE"); if (flags.useTypeSeqs) { featuresC.add(pShape + "-PTYPE"); featuresC.add(nShape + "-NTYPE"); featuresC.add(pWord + "..." + cShape + "-PW_CTYPE"); featuresC.add(cShape + "..." + nWord + "-NW_CTYPE"); featuresC.add(pShape + "..." + cShape + "-PCTYPE"); featuresC.add(cShape + "..." + nShape + "-CNTYPE"); featuresC.add(pShape + "..." + cShape + "..." 
+ nShape + "-PCNTYPE"); } } if (flags.useLastRealWord) { if (pWord.length() <= 3) { // extending this to check for 2 short words doesn't seem to help.... featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE"); } } if (flags.useNextRealWord) { if (nWord.length() <= 3) { // extending this to check for 2 short words doesn't seem to help.... featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE"); } } if (flags.useOccurrencePatterns) { featuresC.addAll(occurrencePatterns(cInfo, loc)); } if (flags.useDisjunctive) { for (int i = 1; i <= flags.disjunctionWidth; i++) { CoreLabel dn = cInfo.get(loc + i); CoreLabel dp = cInfo.get(loc - i); featuresC.add(getWord(dn) + "-DISJN"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS"); } featuresC.add(getWord(dp) + "-DISJP"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS"); } } } if (flags.useWideDisjunctive) { for (int i = 1; i <= flags.wideDisjunctionWidth; i++) { featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN"); featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP"); } } if (flags.useEitherSideDisjunctive) { for (int i = 1; i <= flags.disjunctionWidth; i++) { featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWE"); featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWE"); } } if (flags.useDisjShape) { for (int i = 1; i <= flags.disjunctionWidth; i++) { featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE"); // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE"); featuresC.add(cShape + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE"); // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE"); } } if (flags.useExtraTaggySequences) { if (flags.useTags) { 
featuresC.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS"); featuresC.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTTS"); } if (flags.useDistSim) { featuresC.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1"); featuresC.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTTS1"); } } if (flags.useMUCFeatures) { featuresC.add(c.get(CoreAnnotations.SectionAnnotation.class) + "-SECTION"); featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + "-WORD_POSITION"); featuresC.add(c.get(CoreAnnotations.SentencePositionAnnotation.class) + "-SENT_POSITION"); featuresC.add(c.get(CoreAnnotations.ParaPositionAnnotation.class) + "-PARA_POSITION"); featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-WORD_POSITION_SHAPE"); } } else if (flags.useInternal) { if (flags.useWord) { featuresC.add(cWord + "-WORD"); } if (flags.useNGrams) { Collection<String> subs = wordToSubstrings.get(cWord); if (subs == null) { subs = new ArrayList<String>(); String word = '<' + cWord + '>'; if (flags.lowercaseNGrams) { word = word.toLowerCase(); } if (flags.dehyphenateNGrams) { word = dehyphenate(word); } if (flags.greekifyNGrams) { word = greekify(word); } for (int i = 0; i < word.length(); i++) { for (int j = i + 2; j <= word.length(); j++) { if 
(flags.noMidNGrams && i != 0 && j != word.length()) { continue; } if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) { continue; } //subs.add(intern("#" + word.substring(i, j) + "#")); subs.add(intern('#' + word.substring(i, j) + '#')); } } if (flags.cacheNGrams) { wordToSubstrings.put(cWord, subs); } } featuresC.addAll(subs); if (flags.conjoinShapeNGrams) { String shape = c.get(CoreAnnotations.ShapeAnnotation.class); for (String str : subs) { String feat = str + '-' + shape + "-CNGram-CS"; featuresC.add(feat); } } } if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { featuresC.add(cShape + "-TYPE"); } if (flags.useOccurrencePatterns) { featuresC.addAll(occurrencePatterns(cInfo, loc)); } } else if (flags.useExternal) { if (flags.usePrev) { featuresC.add(pWord + "-PW"); } if (flags.useNext) { featuresC.add(nWord + "-NW"); } if (flags.useWordPairs) { featuresC.add(cWord + '-' + pWord + "-W-PW"); featuresC.add(cWord + '-' + nWord + "-W-NW"); } if (flags.useSymWordPairs) { featuresC.add(pWord + '-' + nWord + "-SWORDS"); } if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { if (flags.useTypeSeqs) { featuresC.add(pShape + "-PTYPE"); featuresC.add(nShape + "-NTYPE"); featuresC.add(pWord + "..." + cShape + "-PW_CTYPE"); featuresC.add(cShape + "..." + nWord + "-NW_CTYPE"); if (flags.maxLeft > 0) featuresC.add(pShape + "..." + cShape + "-PCTYPE"); // this one just isn't useful, at least given c,pc,s,ps. Might be useful 0th-order featuresC.add(cShape + "..." + nShape + "-CNTYPE"); featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE"); } } if (flags.useLastRealWord) { if (pWord.length() <= 3) { featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE"); } } if (flags.useNextRealWord) { if (nWord.length() <= 3) { featuresC.add(getWord(n2) + "..." 
+ cShape + "-NNW_CTYPE"); } } if (flags.useDisjunctive) { for (int i = 1; i <= flags.disjunctionWidth; i++) { CoreLabel dn = cInfo.get(loc + i); CoreLabel dp = cInfo.get(loc - i); featuresC.add(getWord(dn) + "-DISJN"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS"); } featuresC.add(getWord(dp) + "-DISJP"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS"); } } } if (flags.useWideDisjunctive) { for (int i = 1; i <= flags.wideDisjunctionWidth; i++) { featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN"); featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP"); } } if (flags.useDisjShape) { for (int i = 1; i <= flags.disjunctionWidth; i++) { featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE"); // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE"); featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE"); // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE"); } } } // Stuff to add binary features from the additional columns if (flags.twoStage) { featuresC.add(c.get(Bin1Annotation.class) + "-BIN1"); featuresC.add(c.get(Bin2Annotation.class) + "-BIN2"); featuresC.add(c.get(Bin3Annotation.class) + "-BIN3"); featuresC.add(c.get(Bin4Annotation.class) + "-BIN4"); featuresC.add(c.get(Bin5Annotation.class) + "-BIN5"); featuresC.add(c.get(Bin6Annotation.class) + "-BIN6"); } if (flags.useIfInteger) { try { int val = Integer.parseInt(cWord); if (val > 0) featuresC.add("POSITIVE_INTEGER"); else if (val < 0) featuresC.add("NEGATIVE_INTEGER"); // System.err.println("FOUND INTEGER"); } catch (NumberFormatException e) { // not an integer value, nothing to do } } //Stuff to add arbitrary features if 
(flags.useGenericFeatures) { //see if we need to cache the keys if (genericAnnotationKeys == null) { makeGenericKeyCache(c); } //now look through the cached keys for (Class key : genericAnnotationKeys) { //System.err.println("Adding feature: " + CoreLabel.genericValues.get(key) + " with value " + c.get(key)); if (c.get(key) != null && c.get(key) instanceof Collection) { for (Object ob : (Collection) c.get(key)) { featuresC.add(ob + "-" + CoreLabel.genericValues.get(key)); } } else { featuresC.add(c.get(key) + "-" + CoreLabel.genericValues.get(key)); } } } if (flags.useTopics) { //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + cWord + "--CWORD"); featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + "-TopicID"); featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + "-PTopicID"); featuresC.add(n.get(CoreAnnotations.TopicAnnotation.class) + "-NTopicID"); //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-PCNTopicID"); //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-CNTopicID"); //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + "-PCTopicID"); //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + cShape + "-TopicID-SH"); //asdasd } // NER tag annotations from a previous NER system if (c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) != null) { featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CStackedNERTag"); featuresC.add(cWord + "-" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-WCStackedNERTag"); if (flags.useNext) { featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CNStackedNERTag"); featuresC.add(cWord + "-" + 
c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-WCNStackedNERTag"); if (flags.usePrev) { featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCNStackedNERTag"); featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + cWord + " -" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PWCNStackedNERTag"); } } if (flags.usePrev) { featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCStackedNERTag"); } } if (flags.useWordnetFeatures) featuresC.add(c.get(CoreAnnotations.WordnetSynAnnotation.class) + "-WordnetSyn"); if (flags.useProtoFeatures) featuresC.add(c.get(CoreAnnotations.ProtoAnnotation.class) + "-Proto"); if (flags.usePhraseWordTags) featuresC.add(c.get(CoreAnnotations.PhraseWordsTagAnnotation.class) + "-PhraseTag"); if (flags.usePhraseWords) { for (String w : c.get(CoreAnnotations.PhraseWordsAnnotation.class)) featuresC.add(w + "-PhraseWord"); } if (flags.useCommonWordsFeature) featuresC.add(c.get(CoreAnnotations.CommonWordsAnnotation.class)); if (flags.useRadical && cWord.length() > 0) { if (cWord.length() == 1) { featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-SINGLE-CHAR-RADICAL"); } else { featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-START-RADICAL"); featuresC.add(RadicalMap.getRadical(cWord.charAt(cWord.length() - 1)) + "-END-RADICAL"); } for (int i = 0; i < cWord.length(); ++i) { featuresC.add(RadicalMap.getRadical(cWord.charAt(i)) + "-RADICAL"); } } if (flags.splitWordRegex != null && !flags.splitWordRegex.isEmpty()) { String[] ws = c.word().split(flags.splitWordRegex); for (String s : 
ws) { featuresC.add(s + "-SPLITWORD"); } } return featuresC; }
From source file:edu.cuhk.hccl.util.NLPUtil.java
License:Apache License
public static ArrayList<String[]> extractNounPhrases(StanfordCoreNLP pipeline, String text, int searchRange) { ArrayList<String[]> wordPairs = new ArrayList<String[]>(); Annotation document = new Annotation(text); pipeline.annotate(document);// w ww.j a v a2 s . co m List<CoreMap> sentences = document.get(SentencesAnnotation.class); MAX_STEPS = searchRange; for (CoreMap sentence : sentences) { List<CoreLabel> labels = sentence.get(TokensAnnotation.class); // Check negation boolean hasNegation = false; for (CoreLabel label : labels) { if (NEGATIONS.contains(label.lemma().toLowerCase())) { hasNegation = true; } } for (int idx = 0; idx < labels.size(); idx++) { CoreLabel label = labels.get(idx); if (NN_TAGS.contains(label.get(PartOfSpeechAnnotation.class))) { for (int step = 1; step <= MAX_STEPS; step++) { CoreLabel leftLabel = labels.get(Math.max(0, idx - step)); if (JJ_TAGS.contains(leftLabel.tag())) { if (hasNegation) addPair(wordPairs, NOT_PREFIX + leftLabel.get(LemmaAnnotation.class), label.get(LemmaAnnotation.class)); else addPair(wordPairs, leftLabel.get(LemmaAnnotation.class), label.get(LemmaAnnotation.class)); break; } CoreLabel rightLabel = labels.get(Math.min(idx + step, labels.size() - 1)); if (JJ_TAGS.contains(rightLabel.tag())) { if (hasNegation) addPair(wordPairs, NOT_PREFIX + rightLabel.get(LemmaAnnotation.class), label.get(LemmaAnnotation.class)); else addPair(wordPairs, rightLabel.get(LemmaAnnotation.class), label.get(LemmaAnnotation.class)); break; } } } } } return wordPairs; }
From source file:edu.ucla.cs.scai.aztec.ir.tokenization.WordTokenizer.java
License:Apache License
public WordTokenizedDocument tokenize(String text, boolean lemmatize, boolean removeStopWords, boolean toLowerCase) { WordTokenizedDocument res = new WordTokenizedDocument(); Properties propsTokens = new Properties(); propsTokens.put("annotators", "tokenize, ssplit, pos, lemma, ner, regexner"); StanfordCoreNLP pipelineTokens = new StanfordCoreNLP(propsTokens); Annotation qaTokens = new Annotation(text); pipelineTokens.annotate(qaTokens);//from w w w . ja v a2 s . c o m List<CoreMap> sentences = qaTokens.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { WordTokenizedSentence s = new WordTokenizedSentence(); for (CoreLabel cl : (ArrayList<CoreLabel>) sentence.get(CoreAnnotations.TokensAnnotation.class)) { if (!removeStopWords || !stopwords.contains(cl.lemma())) { WordToken t = new WordToken(cl.word(), cl.lemma(), cl.tag()); if (lemmatize) { t.useLemma(); } if (toLowerCase) { t.useLowerCase(); } s.appendToken(t); } } res.appendSentence(s); } return res; }
From source file:ims.cs.corenlp.Helper.java
License:Open Source License
/** * Checks whether a CoreNLP token is a quote * @param token//from w w w. j a v a 2 s.c o m * @return */ public static boolean isQuote(CoreLabel token) { // single quotes are mostly wrong and unhelpful, so ignore!!! return (token.tag().equals("\"") || token.tag().equals("``") || token.tag().equals("''") || token.tag().equals("QUOT")) && !token.tag().equals("'") && !token.tag().equals("`"); }
From source file:ims.cs.corenlp.TokenAligner.java
License:Open Source License
/**
 * Builds a copy of {@code tok} whose predicted ({@code pred*}) fields are filled in from a
 * CoreNLP token.
 *
 * @param tok                        token to copy; passed to the {@link Token} copy constructor
 * @param cl                         CoreNLP token supplying word, lemma, POS tag, NER label and
 *                                   begin/end positions
 * @param currentCoreNlpSentenceIndex value stored as the predicted sentence position
 * @return the new token carrying the predicted information
 */
public static Token combineTokensPred(Token tok, CoreLabel cl, int currentCoreNlpSentenceIndex) {
    Token merged = new Token(tok);

    // lexical information predicted by CoreNLP
    merged.predText = cl.word();
    merged.predLemma = cl.lemma();
    merged.predPosTag = cl.tag();
    merged.predNer = Helper.translateNer(cl.ner());

    // positional information
    merged.predPosition = -1; /* will be determined by document aligner */
    merged.predSentencePosition = currentCoreNlpSentenceIndex;
    merged.predByteCount = new ByteCount(cl.beginPosition(), cl.endPosition());

    return merged;
}