List of usage examples for the edu.stanford.nlp.parser.lexparser.IntTaggedWord constructor
public IntTaggedWord(String wordString, String tagString, Index<String> wordIndex, Index<String> tagIndex)
From source file:conditionalCFG.ConditionalCFGParser.java
License:Open Source License
private void initializeChart(List sentence) { int boundary = wordIndex.indexOf(Lexicon.BOUNDARY); for (int start = 0; start < length; start++) { if (op.testOptions.maxSpanForTags > 1) { // only relevant for parsing single words as multiple input tokens. // todo [cdm 2012]: This case seems buggy in never doing unaries over span 1 items // note we don't look for "words" including the end symbol! for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags) || (start + 1 == end); end++) { StringBuilder word = new StringBuilder(); //wsg: Feb 2010 - Appears to support character-level parsing for (int i = start; i < end; i++) { if (sentence.get(i) instanceof HasWord) { HasWord cl = (HasWord) sentence.get(i); word.append(cl.word()); } else { word.append(sentence.get(i).toString()); }//from w w w . j a va2 s .co m } for (int state = 0; state < numStates; state++) { float iS = iScore[start][end][state]; if (iS == Float.NEGATIVE_INFINITY && isTag[state]) { IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex, tagIndex); iScore[start][end][state] = lex.score(itw, start, word.toString(), null); if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) { narrowRExtent[start][state] = start + 1; narrowLExtent[end][state] = end - 1; wideRExtent[start][state] = start + 1; wideLExtent[end][state] = end - 1; } } } } } else { // "normal" chart initialization of the [start,start+1] cell int word = words[start]; int end = start + 1; Arrays.fill(tags[start], false); float[] iScore_start_end = iScore[start][end]; int[] narrowRExtent_start = narrowRExtent[start]; int[] narrowLExtent_end = narrowLExtent[end]; int[] wideRExtent_start = wideRExtent[start]; int[] wideLExtent_end = wideLExtent[end]; //Force tags String trueTagStr = null; if (sentence.get(start) instanceof HasTag) { trueTagStr = ((HasTag) sentence.get(start)).tag(); if ("".equals(trueTagStr)) { trueTagStr = null; } } // Another option for forcing tags: supply a 
regex String candidateTagRegex = null; /* if (sentence.get(start) instanceof CoreLabel) { candidateTagRegex = ((CoreLabel) sentence.get(start)).get(CandidatePartOfSpeechAnnotation.class); if ("".equals(candidateTagRegex)) { candidateTagRegex = null; } } */ //Word context (e.g., morphosyntactic info) String wordContextStr = null; if (sentence.get(start) instanceof HasContext) { wordContextStr = ((HasContext) sentence.get(start)).originalText(); if ("".equals(wordContextStr)) wordContextStr = null; } boolean assignedSomeTag = false; if (!floodTags || word == boundary) { // in this case we generate the taggings in the lexicon, // which may itself be tagging flexibly or using a strict lexicon. if (dumpTagging) { EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]", "UTF-8"); } for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start, wordContextStr); taggingI.hasNext();) { IntTaggedWord tagging = taggingI.next(); int state = stateIndex.indexOf(tagIndex.get(tagging.tag)); // if word was supplied with a POS tag, skip all taggings // not basicCategory() compatible with supplied tag. 
if (trueTagStr != null) { if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).startsWith(trueTagStr))) { if (dumpTagging) { EncodingPrintWriter.err.println(" Skipping " + tagging + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8"); } continue; } } if (candidateTagRegex != null) { if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex)) || (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).matches(candidateTagRegex))) { if (dumpTagging) { EncodingPrintWriter.err.println(" Skipping " + tagging + " as it doesn't match candidateTagRegex: " + candidateTagRegex, "UTF-8"); } continue; } } // try { float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr); // score the cell according to P(word|tag) in the lexicon if (lexScore > Float.NEGATIVE_INFINITY) { assignedSomeTag = true; iScore_start_end[state] = lexScore; narrowRExtent_start[state] = end; narrowLExtent_end[state] = start; wideRExtent_start[state] = end; wideLExtent_end[state] = start; } // } catch (Exception e) { // e.printStackTrace(); // System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag)); // } int tag = tagging.tag; tags[start][tag] = true; if (dumpTagging) { EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8"); } //if (start == length-2 && tagging.parent == puncTag) // lastIsPunc = true; } } // end if ( ! 
floodTags || word == boundary) if (!assignedSomeTag) { // If you got here, either you were using forceTags (gold tags) // and the gold tag was not seen with that word in the training data // or we are in floodTags=true (recovery parse) mode // Here, we give words all tags for // which the lexicon score is not -Inf, not just seen or // specified taggings if (dumpTagging) { EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8"); } for (int state = 0; state < numStates; state++) { if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) { if (trueTagStr != null) { String tagString = stateIndex.get(state); if (!tlp.basicCategory(tagString).equals(trueTagStr)) { continue; } } float lexScore = lex.score( new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start, wordIndex.get(word), wordContextStr); if (candidateTagRegex != null) { String tagString = stateIndex.get(state); if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) { continue; } } if (lexScore > Float.NEGATIVE_INFINITY) { iScore_start_end[state] = lexScore; narrowRExtent_start[state] = end; narrowLExtent_end[state] = start; wideRExtent_start[state] = end; wideLExtent_end[state] = start; } if (dumpTagging) { EncodingPrintWriter.err.println("Word pos " + start + " tagging " + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state)))) + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8"); } } } } // end if ! assignedSomeTag // tag multi-counting if (op.dcTags) { for (int state = 0; state < numStates; state++) { if (isTag[state]) { iScore_start_end[state] *= (1.0 + op.testOptions.depWeight); } } } if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) { // if parse failed because of tag coverage, we put in all tags with // a score of -1000, by fiat. You get here from the invocation of // parse(ls) inside parse(ls) *after* floodTags has been turned on. 
// Search above for "floodTags = true". if (dumpTagging) { EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8"); } for (int state = 0; state < numStates; state++) { if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) { iScore_start_end[state] = -1000.0f; narrowRExtent_start[state] = end; narrowLExtent_end[state] = start; wideRExtent_start[state] = end; wideLExtent_end[state] = start; } } } // Apply unary rules in diagonal cells of chart if (spillGuts) { tick("Terminal Unary..."); } for (int state = 0; state < numStates; state++) { float iS = iScore_start_end[state]; if (iS == Float.NEGATIVE_INFINITY) { continue; } UnaryRule[] unaries = ug.closedRulesByChild(state); for (UnaryRule ur : unaries) { int parentState = ur.parent; float pS = ur.score + lex.score(ur, start, end); float tot = iS + pS; if (tot > iScore_start_end[parentState]) { iScore_start_end[parentState] = tot; narrowRExtent_start[parentState] = end; narrowLExtent_end[parentState] = start; wideRExtent_start[parentState] = end; wideLExtent_end[parentState] = start; } } } if (spillGuts) { tick("Next word..."); } } } // end for start }
From source file:conditionalCFG.ConditionalCFGParser.java
License:Open Source License
private Tree extractBestParse(int goal, int start, int end) { // find source of inside score // no backtraces so we can speed up the parsing for its primary use double bestScore = iScore[start][end][goal]; double normBestScore = op.testOptions.lengthNormalization ? (bestScore / wordsInSpan[start][end][goal]) : bestScore;//w w w . j av a2s. c om String goalStr = stateIndex.get(goal); // check tags if (end - start <= op.testOptions.maxSpanForTags && tagIndex.contains(goalStr)) { if (op.testOptions.maxSpanForTags > 1) { Tree wordNode = null; if (sentence != null) { StringBuilder word = new StringBuilder(); for (int i = start; i < end; i++) { if (sentence.get(i) instanceof HasWord) { HasWord cl = (HasWord) sentence.get(i); word.append(cl.word()); } else { word.append(sentence.get(i).toString()); } } wordNode = tf.newLeaf(word.toString()); } else if (lr != null) { List<LatticeEdge> latticeEdges = lr.getEdgesOverSpan(start, end); for (LatticeEdge edge : latticeEdges) { IntTaggedWord itw = new IntTaggedWord(edge.word, stateIndex.get(goal), wordIndex, tagIndex); float tagScore = (floodTags) ? 
-1000.0f : lex.score(itw, start, edge.word, null); if (matches(bestScore, tagScore + (float) edge.weight)) { wordNode = tf.newLeaf(edge.word); if (wordNode.label() instanceof CoreLabel) { CoreLabel cl = (CoreLabel) wordNode.label(); cl.setBeginPosition(start); cl.setEndPosition(end); } break; } } if (wordNode == null) { throw new RuntimeException( "could not find matching word from lattice in parse reconstruction"); } } else { throw new RuntimeException("attempt to get word when sentence and lattice are null!"); } Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode)); tagNode.setScore(bestScore); if (originalTags[start] != null) { tagNode.label().setValue(originalTags[start].tag()); } return tagNode; } else { // normal lexicon is single words case IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr)); String contextStr = getCoreLabel(start).originalText(); float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr); if (tagScore > Float.NEGATIVE_INFINITY || floodTags) { // return a pre-terminal tree CoreLabel terminalLabel = getCoreLabel(start); Tree wordNode = tf.newLeaf(terminalLabel); Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode)); tagNode.setScore(bestScore); if (terminalLabel.tag() != null) { tagNode.label().setValue(terminalLabel.tag()); } if (tagNode.label() instanceof HasTag) { ((HasTag) tagNode.label()).setTag(tagNode.label().value()); } return tagNode; } } } // check binaries first for (int split = start + 1; split < end; split++) { for (Iterator<BinaryRule> binaryI = bg.ruleIteratorByParent(goal); binaryI.hasNext();) { BinaryRule br = binaryI.next(); double score = br.score + iScore[start][split][br.leftChild] + iScore[split][end][br.rightChild] + lex.score(br, start, end, split); boolean matches; if (op.testOptions.lengthNormalization) { double normScore = score / (wordsInSpan[start][split][br.leftChild] + wordsInSpan[split][end][br.rightChild]); 
matches = matches(normScore, normBestScore); } else { matches = matches(score, bestScore); } if (matches) { // build binary split Tree leftChildTree = extractBestParse(br.leftChild, start, split); Tree rightChildTree = extractBestParse(br.rightChild, split, end); List<Tree> children = new ArrayList<Tree>(); children.add(leftChildTree); children.add(rightChildTree); Tree result = tf.newTreeNode(goalStr, children); result.setScore(score); // System.err.println(" Found Binary node: "+result); return result; } } } // check unaries // note that even though we parse with the unary-closed grammar, we can // extract the best parse with the non-unary-closed grammar, since all // the intermediate states in the chain must have been built, and hence // we can exploit the sparser space and reconstruct the full tree as we go. // for (Iterator<UnaryRule> unaryI = ug.closedRuleIteratorByParent(goal); unaryI.hasNext(); ) { for (Iterator<UnaryRule> unaryI = ug.ruleIteratorByParent(goal); unaryI.hasNext();) { UnaryRule ur = unaryI.next(); // System.err.println(" Trying " + ur + " dtr score: " + iScore[start][end][ur.child]); double score = ur.score + iScore[start][end][ur.child] + lex.score(ur, start, end); boolean matches; if (op.testOptions.lengthNormalization) { double normScore = score / wordsInSpan[start][end][ur.child]; matches = matches(normScore, normBestScore); } else { matches = matches(score, bestScore); } if (ur.child != ur.parent && matches) { // build unary Tree childTree = extractBestParse(ur.child, start, end); Tree result = tf.newTreeNode(goalStr, Collections.singletonList(childTree)); // System.err.println(" Matched! Unary node: "+result); result.setScore(score); return result; } } System.err.println("Warning: no parse found in ExhaustivePCFGParser.extractBestParse: failing on: [" + start + ", " + end + "] looking for " + goalStr); return null; }