Example usage for edu.stanford.nlp.parser.lexparser IntTaggedWord tagString

Introduction

On this page you can find example usage of the tagString method of edu.stanford.nlp.parser.lexparser.IntTaggedWord.

Prototype

public String tagString(Index<String> tagIndex)

Source Link

Usage

From source file: conditionalCFG.ConditionalCFGParser.java

License: Open Source License

/**
 * Seeds the parse chart with lexical (tag-level) scores for every start position
 * of the sentence, then applies unary rules to the freshly filled single-word cells.
 *
 * <p>For each {@code start} in {@code [0, length)} one of two paths is taken:
 * <ul>
 *   <li>If {@code op.testOptions.maxSpanForTags > 1}: for each admissible {@code end},
 *       the tokens in {@code [start, end)} are concatenated into one string and scored
 *       against every tag state whose cell {@code iScore[start][end][state]} is still
 *       negative infinity. No unary rules are applied on this path (see the existing
 *       todo below).</li>
 *   <li>Otherwise only the cell {@code [start, start + 1]} is filled, in this order:
 *       (1) taggings proposed by {@code lex.ruleIteratorByWord}, filtered by a tag
 *       supplied on the token if any; (2) if nothing scored above negative infinity,
 *       a fallback that tries every tag state; (3) optional rescaling when
 *       {@code op.dcTags} is set; (4) optional flooding of remaining tag states with
 *       {@code -1000} when {@code floodTags} is on; (5) unary rules from
 *       {@code ug.closedRulesByChild}.</li>
 * </ul>
 *
 * <p>The method returns nothing; its effect is the mutation of {@code iScore},
 * {@code narrowRExtent}, {@code narrowLExtent}, {@code wideRExtent},
 * {@code wideLExtent} and {@code tags}. Those, together with {@code length},
 * {@code words}, {@code numStates}, {@code isTag}, the index objects, {@code lex},
 * {@code ug}, {@code tlp}, {@code op} and the debug flags, are not declared in this
 * method (presumably fields of the enclosing parser - not visible from here).
 *
 * @param sentence the input tokens. Elements are probed with {@code instanceof} for
 *                 {@code HasWord} (multi-token path), {@code HasTag} and
 *                 {@code HasContext} (single-token path); on the multi-token path any
 *                 other element contributes its {@code toString()}.
 */
private void initializeChart(List sentence) {
    // Index of the boundary symbol; used below to exempt the boundary word from flooding.
    int boundary = wordIndex.indexOf(Lexicon.BOUNDARY);

    for (int start = 0; start < length; start++) {
        if (op.testOptions.maxSpanForTags > 1) { // only relevant for parsing single words as multiple input tokens.
            // todo [cdm 2012]: This case seems buggy in never doing unaries over span 1 items
            // note we don't look for "words" including the end symbol!
            // The loop condition always admits end == start + 1; longer spans are admitted
            // only while end < length - 1 and the span is at most maxSpanForTags wide.
            for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags)
                    || (start + 1 == end); end++) {
                StringBuilder word = new StringBuilder();
                //wsg: Feb 2010 - Appears to support character-level parsing
                // Concatenate the surface forms of tokens start..end-1 into one candidate word.
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                // Score the concatenated word under every tag state that has no score yet.
                for (int state = 0; state < numStates; state++) {
                    float iS = iScore[start][end][state];
                    if (iS == Float.NEGATIVE_INFINITY && isTag[state]) {
                        IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex,
                                tagIndex);
                        iScore[start][end][state] = lex.score(itw, start, word.toString(), null);
                        if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) {
                            // NOTE(review): extents are written as start + 1 / end - 1 here, whereas the
                            // single-token branch below writes end / start. The two agree only when
                            // end == start + 1; for wider spans they differ - confirm this is intended.
                            narrowRExtent[start][state] = start + 1;
                            narrowLExtent[end][state] = end - 1;
                            wideRExtent[start][state] = start + 1;
                            wideLExtent[end][state] = end - 1;
                        }
                    }
                }
            }

        } else { // "normal" chart initialization of the [start,start+1] cell

            int word = words[start];
            int end = start + 1;
            // Reset the per-position tag flags before recording which tags the lexicon proposes.
            Arrays.fill(tags[start], false);

            // Local aliases for the rows touched repeatedly below; writes through them
            // mutate the underlying chart arrays.
            float[] iScore_start_end = iScore[start][end];
            int[] narrowRExtent_start = narrowRExtent[start];
            int[] narrowLExtent_end = narrowLExtent[end];
            int[] wideRExtent_start = wideRExtent[start];
            int[] wideLExtent_end = wideLExtent[end];

            //Force tags
            // A tag carried by the token itself; an empty string is treated as "no tag".
            String trueTagStr = null;
            if (sentence.get(start) instanceof HasTag) {
                trueTagStr = ((HasTag) sentence.get(start)).tag();
                if ("".equals(trueTagStr)) {
                    trueTagStr = null;
                }
            }

            // Another option for forcing tags: supply a regex
            // NOTE(review): the only assignment to candidateTagRegex is commented out just below,
            // so it is always null in this method and both regex filters further down never fire.
            String candidateTagRegex = null;
            /* if (sentence.get(start) instanceof CoreLabel) {
               candidateTagRegex = ((CoreLabel) sentence.get(start)).get(CandidatePartOfSpeechAnnotation.class);
               if ("".equals(candidateTagRegex)) {
                 candidateTagRegex = null;
               }
             }
            */
            //Word context (e.g., morphosyntactic info)
            // Taken from HasContext.originalText(); an empty string is treated as "no context".
            String wordContextStr = null;
            if (sentence.get(start) instanceof HasContext) {
                wordContextStr = ((HasContext) sentence.get(start)).originalText();
                if ("".equals(wordContextStr))
                    wordContextStr = null;
            }

            // Becomes true once any lexicon-proposed tagging receives a score above -Inf.
            boolean assignedSomeTag = false;

            if (!floodTags || word == boundary) {
                // in this case we generate the taggings in the lexicon,
                // which may itself be tagging flexibly or using a strict lexicon.
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]",
                            "UTF-8");
                }
                for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start,
                        wordContextStr); taggingI.hasNext();) {
                    IntTaggedWord tagging = taggingI.next();
                    // Translate the tag's index in tagIndex into the matching index in stateIndex.
                    int state = stateIndex.indexOf(tagIndex.get(tagging.tag));
                    // if word was supplied with a POS tag, skip all taggings
                    // not basicCategory() compatible with supplied tag.
                    // With forceTagBeginnings set, the test is instead a prefix match on the full tag string.
                    if (trueTagStr != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).startsWith(trueTagStr))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging
                                        + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8");
                            }
                            continue;
                        }
                    }
                    // Same shape of filter, using a regex match instead of equality/prefix
                    // (inactive here because candidateTagRegex is never set).
                    if (candidateTagRegex != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).matches(candidateTagRegex))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging
                                        + " as it doesn't match candidateTagRegex: " + candidateTagRegex,
                                        "UTF-8");
                            }
                            continue;
                        }
                    }
                    // try {
                    float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr); // score the cell according to P(word|tag) in the lexicon
                    if (lexScore > Float.NEGATIVE_INFINITY) {
                        assignedSomeTag = true;
                        iScore_start_end[state] = lexScore;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                    // } catch (Exception e) {
                    // e.printStackTrace();
                    // System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag));
                    // }
                    // The tag is flagged for this position even when its score was -Inf:
                    // this assignment sits outside the score check above.
                    int tag = tagging.tag;
                    tags[start][tag] = true;
                    if (dumpTagging) {
                        EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score "
                                + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state
                                + "]", "UTF-8");
                    }
                    //if (start == length-2 && tagging.parent == puncTag)
                    //  lastIsPunc = true;
                }
            } // end if ( ! floodTags || word == boundary)

            if (!assignedSomeTag) {
                // If you got here, either you were using forceTags (gold tags)
                // and the gold tag was not seen with that word in the training data
                // or we are in floodTags=true (recovery parse) mode
                // Here, we give words all tags for
                // which the lexicon score is not -Inf, not just seen or
                // specified taggings
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        // NOTE(review): unlike the lexicon loop above, this filter always uses
                        // basicCategory equality and does not consult forceTagBeginnings - confirm intended.
                        if (trueTagStr != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).equals(trueTagStr)) {
                                continue;
                            }
                        }

                        // The score is requested before the regex filter below, so a state rejected
                        // by that filter has still been scored (moot while candidateTagRegex is null).
                        float lexScore = lex.score(
                                new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start,
                                wordIndex.get(word), wordContextStr);
                        if (candidateTagRegex != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) {
                                continue;
                            }
                        }

                        if (lexScore > Float.NEGATIVE_INFINITY) {
                            iScore_start_end[state] = lexScore;
                            narrowRExtent_start[state] = end;
                            narrowLExtent_end[state] = start;
                            wideRExtent_start[state] = end;
                            wideLExtent_end[state] = start;
                        }
                        if (dumpTagging) {
                            EncodingPrintWriter.err.println("Word pos " + start + " tagging "
                                    + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))))
                                    + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state)
                                    + " = " + state + "]", "UTF-8");
                        }
                    }
                }
            } // end if ! assignedSomeTag

            // tag multi-counting
            // Scales every tag state's score in this cell by (1 + depWeight).
            if (op.dcTags) {
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state]) {
                        iScore_start_end[state] *= (1.0 + op.testOptions.depWeight);
                    }
                }
            }

            if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) {
                // if parse failed because of tag coverage, we put in all tags with
                // a score of -1000, by fiat.  You get here from the invocation of
                // parse(ls) inside parse(ls) *after* floodTags has been turned on.
                // Search above for "floodTags = true".
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8");
                }
                // Only tag states still at -Inf are flooded; scores assigned above are left alone.
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        iScore_start_end[state] = -1000.0f;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                }
            }

            // Apply unary rules in diagonal cells of chart
            if (spillGuts) {
                tick("Terminal Unary...");
            }
            // For every state with a finite score, try each rule returned by closedRulesByChild
            // and keep the best total per parent. The row is updated in place, so a parent raised
            // here is seen with its new score if its own index comes later in this same pass.
            for (int state = 0; state < numStates; state++) {
                float iS = iScore_start_end[state];
                if (iS == Float.NEGATIVE_INFINITY) {
                    continue;
                }
                UnaryRule[] unaries = ug.closedRulesByChild(state);
                for (UnaryRule ur : unaries) {
                    int parentState = ur.parent;
                    // Rule cost = the rule's own score plus whatever lex.score contributes for this span.
                    float pS = ur.score + lex.score(ur, start, end);
                    float tot = iS + pS;
                    if (tot > iScore_start_end[parentState]) {
                        iScore_start_end[parentState] = tot;
                        narrowRExtent_start[parentState] = end;
                        narrowLExtent_end[parentState] = start;
                        wideRExtent_start[parentState] = end;
                        wideLExtent_end[parentState] = start;
                    }
                }
            }
            if (spillGuts) {
                tick("Next word...");
            }
        }
    } // end for start
}