Example usage for edu.stanford.nlp.parser.lexparser IntTaggedWord IntTaggedWord

Introduction

On this page you can find example usages of the edu.stanford.nlp.parser.lexparser IntTaggedWord constructor.

Prototype

public IntTaggedWord(String wordString, String tagString, Index<String> wordIndex, Index<String> tagIndex) 

Document

Creates an IntTaggedWord from the given wordString and tagString, using the supplied word and tag indices.
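
For orientation before the full examples below, here is a minimal, self-contained sketch of calling this constructor directly. It assumes edu.stanford.nlp.util.HashIndex as the Index implementation; the word "dog" and tag "NN" are illustrative values, not taken from the source files on this page.

import edu.stanford.nlp.parser.lexparser.IntTaggedWord;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;

public class IntTaggedWordDemo {

    public static void main(String[] args) {
        // The word and tag indices map strings to integer ids; fresh
        // HashIndex instances are enough for a standalone sketch.
        Index<String> wordIndex = new HashIndex<String>();
        Index<String> tagIndex = new HashIndex<String>();

        // Construct an IntTaggedWord for the illustrative pair ("dog", "NN").
        // The constructor resolves both strings to ids in the given indices
        // (adding them when absent, in typical CoreNLP versions).
        IntTaggedWord itw = new IntTaggedWord("dog", "NN", wordIndex, tagIndex);

        // The integer fields plus the indices recover the original strings,
        // mirroring the wordIndex.get(...) and tagString(tagIndex) calls in
        // the parser code below.
        System.out.println(wordIndex.get(itw.word) + " / " + itw.tagString(tagIndex));
    }
}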

Usage

From source file: conditionalCFG.ConditionalCFGParser.java

License: Open Source License
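The initializeChart method below seeds the chart's terminal cells: for multi-token spans it builds an IntTaggedWord from the assembled word string and the state's tag string using the constructor documented above, and for single words it scores taggings obtained from the lexicon.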

private void initializeChart(List sentence) {
    int boundary = wordIndex.indexOf(Lexicon.BOUNDARY);

    for (int start = 0; start < length; start++) {
        if (op.testOptions.maxSpanForTags > 1) { // only relevant for parsing single words as multiple input tokens.
            // todo [cdm 2012]: This case seems buggy in never doing unaries over span 1 items
            // note we don't look for "words" including the end symbol!
            for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags)
                    || (start + 1 == end); end++) {
                StringBuilder word = new StringBuilder();
                //wsg: Feb 2010 - Appears to support character-level parsing
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                for (int state = 0; state < numStates; state++) {
                    float iS = iScore[start][end][state];
                    if (iS == Float.NEGATIVE_INFINITY && isTag[state]) {
                        IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex,
                                tagIndex);
                        iScore[start][end][state] = lex.score(itw, start, word.toString(), null);
                        if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) {
                            narrowRExtent[start][state] = start + 1;
                            narrowLExtent[end][state] = end - 1;
                            wideRExtent[start][state] = start + 1;
                            wideLExtent[end][state] = end - 1;
                        }
                    }
                }
            }

        } else { // "normal" chart initialization of the [start,start+1] cell

            int word = words[start];
            int end = start + 1;
            Arrays.fill(tags[start], false);

            float[] iScore_start_end = iScore[start][end];
            int[] narrowRExtent_start = narrowRExtent[start];
            int[] narrowLExtent_end = narrowLExtent[end];
            int[] wideRExtent_start = wideRExtent[start];
            int[] wideLExtent_end = wideLExtent[end];

            //Force tags
            String trueTagStr = null;
            if (sentence.get(start) instanceof HasTag) {
                trueTagStr = ((HasTag) sentence.get(start)).tag();
                if ("".equals(trueTagStr)) {
                    trueTagStr = null;
                }
            }

            // Another option for forcing tags: supply a regex
            String candidateTagRegex = null;
            /* if (sentence.get(start) instanceof CoreLabel) {
               candidateTagRegex = ((CoreLabel) sentence.get(start)).get(CandidatePartOfSpeechAnnotation.class);
               if ("".equals(candidateTagRegex)) {
                 candidateTagRegex = null;
               }
             }
            */
            //Word context (e.g., morphosyntactic info)
            String wordContextStr = null;
            if (sentence.get(start) instanceof HasContext) {
                wordContextStr = ((HasContext) sentence.get(start)).originalText();
                if ("".equals(wordContextStr))
                    wordContextStr = null;
            }

            boolean assignedSomeTag = false;

            if (!floodTags || word == boundary) {
                // in this case we generate the taggings in the lexicon,
                // which may itself be tagging flexibly or using a strict lexicon.
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]",
                            "UTF-8");
                }
                for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start,
                        wordContextStr); taggingI.hasNext();) {
                    IntTaggedWord tagging = taggingI.next();
                    int state = stateIndex.indexOf(tagIndex.get(tagging.tag));
                    // if word was supplied with a POS tag, skip all taggings
                    // not basicCategory() compatible with supplied tag.
                    if (trueTagStr != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).startsWith(trueTagStr))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging
                                        + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8");
                            }
                            continue;
                        }
                    }
                    if (candidateTagRegex != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).matches(candidateTagRegex))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging
                                        + " as it doesn't match candidateTagRegex: " + candidateTagRegex,
                                        "UTF-8");
                            }
                            continue;
                        }
                    }
                    // try {
                    float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr); // score the cell according to P(word|tag) in the lexicon
                    if (lexScore > Float.NEGATIVE_INFINITY) {
                        assignedSomeTag = true;
                        iScore_start_end[state] = lexScore;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                    // } catch (Exception e) {
                    // e.printStackTrace();
                    // System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag));
                    // }
                    int tag = tagging.tag;
                    tags[start][tag] = true;
                    if (dumpTagging) {
                        EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score "
                                + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state
                                + "]", "UTF-8");
                    }
                    //if (start == length-2 && tagging.parent == puncTag)
                    //  lastIsPunc = true;
                }
            } // end if ( ! floodTags || word == boundary)

            if (!assignedSomeTag) {
                // If you got here, either you were using forceTags (gold tags)
                // and the gold tag was not seen with that word in the training data
                // or we are in floodTags=true (recovery parse) mode
                // Here, we give words all tags for
                // which the lexicon score is not -Inf, not just seen or
                // specified taggings
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        if (trueTagStr != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).equals(trueTagStr)) {
                                continue;
                            }
                        }

                        float lexScore = lex.score(
                                new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start,
                                wordIndex.get(word), wordContextStr);
                        if (candidateTagRegex != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) {
                                continue;
                            }
                        }

                        if (lexScore > Float.NEGATIVE_INFINITY) {
                            iScore_start_end[state] = lexScore;
                            narrowRExtent_start[state] = end;
                            narrowLExtent_end[state] = start;
                            wideRExtent_start[state] = end;
                            wideLExtent_end[state] = start;
                        }
                        if (dumpTagging) {
                            EncodingPrintWriter.err.println("Word pos " + start + " tagging "
                                    + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))))
                                    + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state)
                                    + " = " + state + "]", "UTF-8");
                        }
                    }
                }
            } // end if ! assignedSomeTag

            // tag multi-counting
            if (op.dcTags) {
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state]) {
                        iScore_start_end[state] *= (1.0 + op.testOptions.depWeight);
                    }
                }
            }

            if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) {
                // if parse failed because of tag coverage, we put in all tags with
                // a score of -1000, by fiat.  You get here from the invocation of
                // parse(ls) inside parse(ls) *after* floodTags has been turned on.
                // Search above for "floodTags = true".
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        iScore_start_end[state] = -1000.0f;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                }
            }

            // Apply unary rules in diagonal cells of chart
            if (spillGuts) {
                tick("Terminal Unary...");
            }
            for (int state = 0; state < numStates; state++) {
                float iS = iScore_start_end[state];
                if (iS == Float.NEGATIVE_INFINITY) {
                    continue;
                }
                UnaryRule[] unaries = ug.closedRulesByChild(state);
                for (UnaryRule ur : unaries) {
                    int parentState = ur.parent;
                    float pS = ur.score + lex.score(ur, start, end);
                    float tot = iS + pS;
                    if (tot > iScore_start_end[parentState]) {
                        iScore_start_end[parentState] = tot;
                        narrowRExtent_start[parentState] = end;
                        narrowLExtent_end[parentState] = start;
                        wideRExtent_start[parentState] = end;
                        wideLExtent_end[parentState] = start;
                    }
                }
            }
            if (spillGuts) {
                tick("Next word...");
            }
        }
    } // end for start
}

From source file: conditionalCFG.ConditionalCFGParser.java

License: Open Source License
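The extractBestParse method below reconstructs the best tree from the inside scores; it uses the constructor documented above to re-score lattice edges against the lexicon when deciding which word a pre-terminal over a multi-token span should cover.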

private Tree extractBestParse(int goal, int start, int end) {
    // find source of inside score
    // no backtraces so we can speed up the parsing for its primary use
    double bestScore = iScore[start][end][goal];
    double normBestScore = op.testOptions.lengthNormalization ? (bestScore / wordsInSpan[start][end][goal])
            : bestScore;
    String goalStr = stateIndex.get(goal);

    // check tags
    if (end - start <= op.testOptions.maxSpanForTags && tagIndex.contains(goalStr)) {
        if (op.testOptions.maxSpanForTags > 1) {
            Tree wordNode = null;
            if (sentence != null) {
                StringBuilder word = new StringBuilder();
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                wordNode = tf.newLeaf(word.toString());

            } else if (lr != null) {
                List<LatticeEdge> latticeEdges = lr.getEdgesOverSpan(start, end);
                for (LatticeEdge edge : latticeEdges) {
                    IntTaggedWord itw = new IntTaggedWord(edge.word, stateIndex.get(goal), wordIndex, tagIndex);

                    float tagScore = (floodTags) ? -1000.0f : lex.score(itw, start, edge.word, null);
                    if (matches(bestScore, tagScore + (float) edge.weight)) {
                        wordNode = tf.newLeaf(edge.word);
                        if (wordNode.label() instanceof CoreLabel) {
                            CoreLabel cl = (CoreLabel) wordNode.label();
                            cl.setBeginPosition(start);
                            cl.setEndPosition(end);
                        }
                        break;
                    }
                }
                if (wordNode == null) {
                    throw new RuntimeException(
                            "could not find matching word from lattice in parse reconstruction");
                }

            } else {
                throw new RuntimeException("attempt to get word when sentence and lattice are null!");
            }
            Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
            tagNode.setScore(bestScore);
            if (originalTags[start] != null) {
                tagNode.label().setValue(originalTags[start].tag());
            }
            return tagNode;
        } else { // normal lexicon is single words case
            IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr));
            String contextStr = getCoreLabel(start).originalText();
            float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr);
            if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
                // return a pre-terminal tree
                CoreLabel terminalLabel = getCoreLabel(start);

                Tree wordNode = tf.newLeaf(terminalLabel);
                Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
                tagNode.setScore(bestScore);
                if (terminalLabel.tag() != null) {
                    tagNode.label().setValue(terminalLabel.tag());
                }
                if (tagNode.label() instanceof HasTag) {
                    ((HasTag) tagNode.label()).setTag(tagNode.label().value());
                }
                return tagNode;
            }
        }
    }
    // check binaries first
    for (int split = start + 1; split < end; split++) {
        for (Iterator<BinaryRule> binaryI = bg.ruleIteratorByParent(goal); binaryI.hasNext();) {
            BinaryRule br = binaryI.next();
            double score = br.score + iScore[start][split][br.leftChild] + iScore[split][end][br.rightChild]
                    + lex.score(br, start, end, split);
            boolean matches;
            if (op.testOptions.lengthNormalization) {
                double normScore = score
                        / (wordsInSpan[start][split][br.leftChild] + wordsInSpan[split][end][br.rightChild]);
                matches = matches(normScore, normBestScore);
            } else {
                matches = matches(score, bestScore);
            }
            if (matches) {
                // build binary split
                Tree leftChildTree = extractBestParse(br.leftChild, start, split);
                Tree rightChildTree = extractBestParse(br.rightChild, split, end);
                List<Tree> children = new ArrayList<Tree>();
                children.add(leftChildTree);
                children.add(rightChildTree);
                Tree result = tf.newTreeNode(goalStr, children);
                result.setScore(score);
                // System.err.println("    Found Binary node: "+result);
                return result;
            }
        }
    }
    // check unaries
    // note that even though we parse with the unary-closed grammar, we can
    // extract the best parse with the non-unary-closed grammar, since all
    // the intermediate states in the chain must have been built, and hence
    // we can exploit the sparser space and reconstruct the full tree as we go.
    // for (Iterator<UnaryRule> unaryI = ug.closedRuleIteratorByParent(goal); unaryI.hasNext(); ) {
    for (Iterator<UnaryRule> unaryI = ug.ruleIteratorByParent(goal); unaryI.hasNext();) {
        UnaryRule ur = unaryI.next();
        // System.err.println("  Trying " + ur + " dtr score: " + iScore[start][end][ur.child]);
        double score = ur.score + iScore[start][end][ur.child] + lex.score(ur, start, end);
        boolean matches;
        if (op.testOptions.lengthNormalization) {
            double normScore = score / wordsInSpan[start][end][ur.child];
            matches = matches(normScore, normBestScore);
        } else {
            matches = matches(score, bestScore);
        }
        if (ur.child != ur.parent && matches) {
            // build unary
            Tree childTree = extractBestParse(ur.child, start, end);
            Tree result = tf.newTreeNode(goalStr, Collections.singletonList(childTree));
            // System.err.println("    Matched!  Unary node: "+result);
            result.setScore(score);
            return result;
        }
    }
    System.err.println("Warning: no parse found in ExhaustivePCFGParser.extractBestParse: failing on: [" + start
            + ", " + end + "] looking for " + goalStr);
    return null;
}