Example usage for edu.stanford.nlp.parser.lexparser IntTaggedWord IntTaggedWord

Introduction

On this page you can find example usages of the edu.stanford.nlp.parser.lexparser IntTaggedWord constructor.

Prototype

public IntTaggedWord(String wordString, String tagString, Index<String> wordIndex, Index<String> tagIndex) 

Document

Creates an IntTaggedWord from the given wordString and tagString, using the supplied word and tag indices.
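
For orientation before the full examples below, here is a minimal, self-contained sketch of calling this constructor directly. It assumes edu.stanford.nlp.util.HashIndex as the Index implementation; the word "dog" and tag "NN" are illustrative values, not taken from the source files on this page.

import edu.stanford.nlp.parser.lexparser.IntTaggedWord;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;

public class IntTaggedWordDemo {

    public static void main(String[] args) {
        // The word and tag indices map strings to integer ids; fresh
        // HashIndex instances are enough for a standalone sketch.
        Index<String> wordIndex = new HashIndex<String>();
        Index<String> tagIndex = new HashIndex<String>();

        // Construct an IntTaggedWord for the illustrative pair ("dog", "NN").
        // The constructor resolves both strings to ids in the given indices
        // (adding them when absent, in typical CoreNLP versions).
        IntTaggedWord itw = new IntTaggedWord("dog", "NN", wordIndex, tagIndex);

        // The integer fields plus the indices recover the original strings,
        // mirroring the wordIndex.get(...) and tagString(tagIndex) calls in
        // the parser code below.
        System.out.println(wordIndex.get(itw.word) + " / " + itw.tagString(tagIndex));
    }
}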

Usage

From source file: conditionalCFG.ConditionalCFGParser.java

License: Open Source License
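The initializeChart method below seeds the chart's terminal cells: for multi-token spans it builds an IntTaggedWord from the assembled word string and the state's tag string using the constructor documented above, and for single words it scores taggings obtained from the lexicon.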

private void initializeChart(List sentence) {
    int boundary = wordIndex.indexOf(Lexicon.BOUNDARY);

    for (int start = 0; start < length; start++) {
        if (op.testOptions.maxSpanForTags > 1) { // only relevant for parsing single words as multiple input tokens.
            // todo [cdm 2012]: This case seems buggy in never doing unaries over span 1 items
            // note we don't look for "words" including the end symbol!
            for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags)
                    || (start + 1 == end); end++) {
                StringBuilder word = new StringBuilder();
                //wsg: Feb 2010 - Appears to support character-level parsing
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                for (int state = 0; state < numStates; state++) {
                    float iS = iScore[start][end][state];
                    if (iS == Float.NEGATIVE_INFINITY && isTag[state]) {
                        IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex,
                                tagIndex);
                        iScore[start][end][state] = lex.score(itw, start, word.toString(), null);
                        if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) {
                            narrowRExtent[start][state] = start + 1;
                            narrowLExtent[end][state] = end - 1;
                            wideRExtent[start][state] = start + 1;
                            wideLExtent[end][state] = end - 1;
                        }
                    }
                }
            }

        } else { // "normal" chart initialization of the [start,start+1] cell

            int word = words[start];
            int end = start + 1;
            Arrays.fill(tags[start], false);

            float[] iScore_start_end = iScore[start][end];
            int[] narrowRExtent_start = narrowRExtent[start];
            int[] narrowLExtent_end = narrowLExtent[end];
            int[] wideRExtent_start = wideRExtent[start];
            int[] wideLExtent_end = wideLExtent[end];

            //Force tags
            String trueTagStr = null;
            if (sentence.get(start) instanceof HasTag) {
                trueTagStr = ((HasTag) sentence.get(start)).tag();
                if ("".equals(trueTagStr)) {
                    trueTagStr = null;
                }
            }

            // Another option for forcing tags: supply a regex
            String candidateTagRegex = null;
            /* if (sentence.get(start) instanceof CoreLabel) {
               candidateTagRegex = ((CoreLabel) sentence.get(start)).get(CandidatePartOfSpeechAnnotation.class);
               if ("".equals(candidateTagRegex)) {
                 candidateTagRegex = null;
               }
             }
            */
            //Word context (e.g., morphosyntactic info)
            String wordContextStr = null;
            if (sentence.get(start) instanceof HasContext) {
                wordContextStr = ((HasContext) sentence.get(start)).originalText();
                if ("".equals(wordContextStr))
                    wordContextStr = null;
            }

            boolean assignedSomeTag = false;

            if (!floodTags || word == boundary) {
                // in this case we generate the taggings in the lexicon,
                // which may itself be tagging flexibly or using a strict lexicon.
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]",
                            "UTF-8");
                }
                for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start,
                        wordContextStr); taggingI.hasNext();) {
                    IntTaggedWord tagging = taggingI.next();
                    int state = stateIndex.indexOf(tagIndex.get(tagging.tag));
                    // if word was supplied with a POS tag, skip all taggings
                    // not basicCategory() compatible with supplied tag.
                    if (trueTagStr != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).startsWith(trueTagStr))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging
                                        + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8");
                            }
                            continue;
                        }
                    }
                    if (candidateTagRegex != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).matches(candidateTagRegex))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging
                                        + " as it doesn't match candidateTagRegex: " + candidateTagRegex,
                                        "UTF-8");
                            }
                            continue;
                        }
                    }
                    // try {
                    float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr); // score the cell according to P(word|tag) in the lexicon
                    if (lexScore > Float.NEGATIVE_INFINITY) {
                        assignedSomeTag = true;
                        iScore_start_end[state] = lexScore;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                    // } catch (Exception e) {
                    // e.printStackTrace();
                    // System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag));
                    // }
                    int tag = tagging.tag;
                    tags[start][tag] = true;
                    if (dumpTagging) {
                        EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score "
                                + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state
                                + "]", "UTF-8");
                    }
                    //if (start == length-2 && tagging.parent == puncTag)
                    //  lastIsPunc = true;
                }
            } // end if ( ! floodTags || word == boundary)

            if (!assignedSomeTag) {
                // If you got here, either you were using forceTags (gold tags)
                // and the gold tag was not seen with that word in the training data
                // or we are in floodTags=true (recovery parse) mode
                // Here, we give words all tags for
                // which the lexicon score is not -Inf, not just seen or
                // specified taggings
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        if (trueTagStr != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).equals(trueTagStr)) {
                                continue;
                            }
                        }

                        float lexScore = lex.score(
                                new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start,
                                wordIndex.get(word), wordContextStr);
                        if (candidateTagRegex != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) {
                                continue;
                            }
                        }

                        if (lexScore > Float.NEGATIVE_INFINITY) {
                            iScore_start_end[state] = lexScore;
                            narrowRExtent_start[state] = end;
                            narrowLExtent_end[state] = start;
                            wideRExtent_start[state] = end;
                            wideLExtent_end[state] = start;
                        }
                        if (dumpTagging) {
                            EncodingPrintWriter.err.println("Word pos " + start + " tagging "
                                    + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))))
                                    + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state)
                                    + " = " + state + "]", "UTF-8");
                        }
                    }
                }
            } // end if ! assignedSomeTag

            // tag multi-counting
            if (op.dcTags) {
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state]) {
                        iScore_start_end[state] *= (1.0 + op.testOptions.depWeight);
                    }
                }
            }

            if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) {
                // if parse failed because of tag coverage, we put in all tags with
                // a score of -1000, by fiat.  You get here from the invocation of
                // parse(ls) inside parse(ls) *after* floodTags has been turned on.
                // Search above for "floodTags = true".
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        iScore_start_end[state] = -1000.0f;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                }
            }

            // Apply unary rules in diagonal cells of chart
            if (spillGuts) {
                tick("Terminal Unary...");
            }
            for (int state = 0; state < numStates; state++) {
                float iS = iScore_start_end[state];
                if (iS == Float.NEGATIVE_INFINITY) {
                    continue;
                }
                UnaryRule[] unaries = ug.closedRulesByChild(state);
                for (UnaryRule ur : unaries) {
                    int parentState = ur.parent;
                    float pS = ur.score + lex.score(ur, start, end);
                    float tot = iS + pS;
                    if (tot > iScore_start_end[parentState]) {
                        iScore_start_end[parentState] = tot;
                        narrowRExtent_start[parentState] = end;
                        narrowLExtent_end[parentState] = start;
                        wideRExtent_start[parentState] = end;
                        wideLExtent_end[parentState] = start;
                    }
                }
            }
            if (spillGuts) {
                tick("Next word...");
            }
        }
    } // end for start
}

From source file: conditionalCFG.ConditionalCFGParser.java

License: Open Source License
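The extractBestParse method below reconstructs the best tree from the inside scores; it uses the constructor documented above to re-score lattice edges against the lexicon when deciding which word a pre-terminal over a multi-token span should cover.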

private Tree extractBestParse(int goal, int start, int end) {
    // find source of inside score
    // no backtraces so we can speed up the parsing for its primary use
    double bestScore = iScore[start][end][goal];
    double normBestScore = op.testOptions.lengthNormalization ? (bestScore / wordsInSpan[start][end][goal])
            : bestScore;
    String goalStr = stateIndex.get(goal);

    // check tags
    if (end - start <= op.testOptions.maxSpanForTags && tagIndex.contains(goalStr)) {
        if (op.testOptions.maxSpanForTags > 1) {
            Tree wordNode = null;
            if (sentence != null) {
                StringBuilder word = new StringBuilder();
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                wordNode = tf.newLeaf(word.toString());

            } else if (lr != null) {
                List<LatticeEdge> latticeEdges = lr.getEdgesOverSpan(start, end);
                for (LatticeEdge edge : latticeEdges) {
                    IntTaggedWord itw = new IntTaggedWord(edge.word, stateIndex.get(goal), wordIndex, tagIndex);

                    float tagScore = (floodTags) ? -1000.0f : lex.score(itw, start, edge.word, null);
                    if (matches(bestScore, tagScore + (float) edge.weight)) {
                        wordNode = tf.newLeaf(edge.word);
                        if (wordNode.label() instanceof CoreLabel) {
                            CoreLabel cl = (CoreLabel) wordNode.label();
                            cl.setBeginPosition(start);
                            cl.setEndPosition(end);
                        }
                        break;
                    }
                }
                if (wordNode == null) {
                    throw new RuntimeException(
                            "could not find matching word from lattice in parse reconstruction");
                }

            } else {
                throw new RuntimeException("attempt to get word when sentence and lattice are null!");
            }
            Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
            tagNode.setScore(bestScore);
            if (originalTags[start] != null) {
                tagNode.label().setValue(originalTags[start].tag());
            }
            return tagNode;
        } else { // normal lexicon is single words case
            IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr));
            String contextStr = getCoreLabel(start).originalText();
            float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr);
            if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
                // return a pre-terminal tree
                CoreLabel terminalLabel = getCoreLabel(start);

                Tree wordNode = tf.newLeaf(terminalLabel);
                Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
                tagNode.setScore(bestScore);
                if (terminalLabel.tag() != null) {
                    tagNode.label().setValue(terminalLabel.tag());
                }
                if (tagNode.label() instanceof HasTag) {
                    ((HasTag) tagNode.label()).setTag(tagNode.label().value());
                }
                return tagNode;
            }
        }
    }
    // check binaries first
    for (int split = start + 1; split < end; split++) {
        for (Iterator<BinaryRule> binaryI = bg.ruleIteratorByParent(goal); binaryI.hasNext();) {
            BinaryRule br = binaryI.next();
            double score = br.score + iScore[start][split][br.leftChild] + iScore[split][end][br.rightChild]
                    + lex.score(br, start, end, split);
            boolean matches;
            if (op.testOptions.lengthNormalization) {
                double normScore = score
                        / (wordsInSpan[start][split][br.leftChild] + wordsInSpan[split][end][br.rightChild]);
                matches = matches(normScore, normBestScore);
            } else {
                matches = matches(score, bestScore);
            }
            if (matches) {
                // build binary split
                Tree leftChildTree = extractBestParse(br.leftChild, start, split);
                Tree rightChildTree = extractBestParse(br.rightChild, split, end);
                List<Tree> children = new ArrayList<Tree>();
                children.add(leftChildTree);
                children.add(rightChildTree);
                Tree result = tf.newTreeNode(goalStr, children);
                result.setScore(score);
                // System.err.println("    Found Binary node: "+result);
                return result;
            }
        }
    }
    // check unaries
    // note that even though we parse with the unary-closed grammar, we can
    // extract the best parse with the non-unary-closed grammar, since all
    // the intermediate states in the chain must have been built, and hence
    // we can exploit the sparser space and reconstruct the full tree as we go.
    // for (Iterator<UnaryRule> unaryI = ug.closedRuleIteratorByParent(goal); unaryI.hasNext(); ) {
    for (Iterator<UnaryRule> unaryI = ug.ruleIteratorByParent(goal); unaryI.hasNext();) {
        UnaryRule ur = unaryI.next();
        // System.err.println("  Trying " + ur + " dtr score: " + iScore[start][end][ur.child]);
        double score = ur.score + iScore[start][end][ur.child] + lex.score(ur, start, end);
        boolean matches;
        if (op.testOptions.lengthNormalization) {
            double normScore = score / wordsInSpan[start][end][ur.child];
            matches = matches(normScore, normBestScore);
        } else {
            matches = matches(score, bestScore);
        }
        if (ur.child != ur.parent && matches) {
            // build unary
            Tree childTree = extractBestParse(ur.child, start, end);
            Tree result = tf.newTreeNode(goalStr, Collections.singletonList(childTree));
            // System.err.println("    Matched!  Unary node: "+result);
            result.setScore(score);
            return result;
        }
    }
    System.err.println("Warning: no parse found in ExhaustivePCFGParser.extractBestParse: failing on: [" + start
            + ", " + end + "] looking for " + goalStr);
    return null;
}