Example usage for edu.stanford.nlp.parser.lexparser Lexicon BOUNDARY

List of usage examples for edu.stanford.nlp.parser.lexparser Lexicon BOUNDARY

Introduction

On this page you can find example usage for edu.stanford.nlp.parser.lexparser Lexicon BOUNDARY.

Prototype

String BOUNDARY

To view the source code for edu.stanford.nlp.parser.lexparser Lexicon BOUNDARY, click the Source Link below.

Usage

From source file:conditionalCFG.ConditionalCFGParser.java

License:Open Source License

/**
 * Initializes the terminal (diagonal) cells of the CKY chart for {@code sentence}.
 * For each word position it scores every tag state via the lexicon, honoring
 * gold tags supplied through {@code HasTag}, optional per-word context via
 * {@code HasContext}, and the tag-flooding recovery mode; it then applies
 * closed unary rules over the freshly filled cells.
 *
 * NOTE(review): accepts a raw {@code List} for compatibility; elements are
 * expected to implement HasWord (and optionally HasTag/HasContext), otherwise
 * {@code toString()} is used as the word form.
 *
 * @param sentence the tokenized input sentence (see note on element typing above)
 */
private void initializeChart(List sentence) {
    int boundary = wordIndex.indexOf(Lexicon.BOUNDARY);

    for (int start = 0; start < length; start++) {
        if (op.testOptions.maxSpanForTags > 1) { // only relevant for parsing single words as multiple input tokens.
            // todo [cdm 2012]: This case seems buggy in never doing unaries over span 1 items
            // note we don't look for "words" including the end symbol!
            for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags)
                    || (start + 1 == end); end++) {
                StringBuilder word = new StringBuilder();
                //wsg: Feb 2010 - Appears to support character-level parsing
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                // Score the concatenated span against every tag state; only fill
                // cells that are still -Inf so earlier entries are preserved.
                for (int state = 0; state < numStates; state++) {
                    float iS = iScore[start][end][state];
                    if (iS == Float.NEGATIVE_INFINITY && isTag[state]) {
                        IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex,
                                tagIndex);
                        iScore[start][end][state] = lex.score(itw, start, word.toString(), null);
                        if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) {
                            narrowRExtent[start][state] = start + 1;
                            narrowLExtent[end][state] = end - 1;
                            wideRExtent[start][state] = start + 1;
                            wideLExtent[end][state] = end - 1;
                        }
                    }
                }
            }

        } else { // "normal" chart initialization of the [start,start+1] cell

            int word = words[start];
            int end = start + 1;
            Arrays.fill(tags[start], false);

            // Local aliases for the cell's score/extent rows (avoids repeated
            // multidimensional indexing in the loops below).
            float[] iScore_start_end = iScore[start][end];
            int[] narrowRExtent_start = narrowRExtent[start];
            int[] narrowLExtent_end = narrowLExtent[end];
            int[] wideRExtent_start = wideRExtent[start];
            int[] wideLExtent_end = wideLExtent[end];

            //Force tags
            String trueTagStr = null;
            if (sentence.get(start) instanceof HasTag) {
                trueTagStr = ((HasTag) sentence.get(start)).tag();
                if ("".equals(trueTagStr)) {
                    trueTagStr = null;
                }
            }

            // Another option for forcing tags: supply a regex
            String candidateTagRegex = null;
            /* if (sentence.get(start) instanceof CoreLabel) {
               candidateTagRegex = ((CoreLabel) sentence.get(start)).get(CandidatePartOfSpeechAnnotation.class);
               if ("".equals(candidateTagRegex)) {
                 candidateTagRegex = null;
               }
             }
            */
            //Word context (e.g., morphosyntactic info)
            String wordContextStr = null;
            if (sentence.get(start) instanceof HasContext) {
                wordContextStr = ((HasContext) sentence.get(start)).originalText();
                if ("".equals(wordContextStr))
                    wordContextStr = null;
            }

            boolean assignedSomeTag = false;

            if (!floodTags || word == boundary) {
                // in this case we generate the taggings in the lexicon,
                // which may itself be tagging flexibly or using a strict lexicon.
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]",
                            "UTF-8");
                }
                for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start,
                        wordContextStr); taggingI.hasNext();) {
                    IntTaggedWord tagging = taggingI.next();
                    int state = stateIndex.indexOf(tagIndex.get(tagging.tag));
                    // if word was supplied with a POS tag, skip all taggings
                    // not basicCategory() compatible with supplied tag.
                    if (trueTagStr != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).startsWith(trueTagStr))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging
                                        + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8");
                            }
                            continue;
                        }
                    }
                    if (candidateTagRegex != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).matches(candidateTagRegex))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println("  Skipping " + tagging
                                        + " as it doesn't match candidateTagRegex: " + candidateTagRegex,
                                        "UTF-8");
                            }
                            continue;
                        }
                    }
                    // try {
                    float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr); // score the cell according to P(word|tag) in the lexicon
                    if (lexScore > Float.NEGATIVE_INFINITY) {
                        assignedSomeTag = true;
                        iScore_start_end[state] = lexScore;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                    // } catch (Exception e) {
                    // e.printStackTrace();
                    // System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag));
                    // }
                    int tag = tagging.tag;
                    tags[start][tag] = true;
                    if (dumpTagging) {
                        EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score "
                                + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state
                                + "]", "UTF-8");
                    }
                    //if (start == length-2 && tagging.parent == puncTag)
                    //  lastIsPunc = true;
                }
            } // end if ( ! floodTags || word == boundary)

            if (!assignedSomeTag) {
                // If you got here, either you were using forceTags (gold tags)
                // and the gold tag was not seen with that word in the training data
                // or we are in floodTags=true (recovery parse) mode
                // Here, we give words all tags for
                // which the lexicon score is not -Inf, not just seen or
                // specified taggings
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        if (trueTagStr != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).equals(trueTagStr)) {
                                continue;
                            }
                        }
                        // Fix: apply the candidate-tag-regex filter BEFORE the lexicon
                        // lookup, so we don't pay for scoring taggings we then discard.
                        // This mirrors the filter-then-score ordering of the
                        // normal-tagging branch above.
                        if (candidateTagRegex != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) {
                                continue;
                            }
                        }

                        float lexScore = lex.score(
                                new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start,
                                wordIndex.get(word), wordContextStr);

                        if (lexScore > Float.NEGATIVE_INFINITY) {
                            iScore_start_end[state] = lexScore;
                            narrowRExtent_start[state] = end;
                            narrowLExtent_end[state] = start;
                            wideRExtent_start[state] = end;
                            wideLExtent_end[state] = start;
                        }
                        if (dumpTagging) {
                            EncodingPrintWriter.err.println("Word pos " + start + " tagging "
                                    + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))))
                                    + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state)
                                    + " = " + state + "]", "UTF-8");
                        }
                    }
                }
            } // end if ! assignedSomeTag

            // tag multi-counting
            if (op.dcTags) {
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state]) {
                        // scores are log-probs (negative), so multiplying by
                        // (1 + depWeight) scales the tag's contribution
                        iScore_start_end[state] *= (1.0 + op.testOptions.depWeight);
                    }
                }
            }

            if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) {
                // if parse failed because of tag coverage, we put in all tags with
                // a score of -1000, by fiat.  You get here from the invocation of
                // parse(ls) inside parse(ls) *after* floodTags has been turned on.
                // Search above for "floodTags = true".
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        iScore_start_end[state] = -1000.0f;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                }
            }

            // Apply unary rules in diagonal cells of chart
            if (spillGuts) {
                tick("Terminal Unary...");
            }
            for (int state = 0; state < numStates; state++) {
                float iS = iScore_start_end[state];
                if (iS == Float.NEGATIVE_INFINITY) {
                    continue;
                }
                UnaryRule[] unaries = ug.closedRulesByChild(state);
                for (UnaryRule ur : unaries) {
                    int parentState = ur.parent;
                    float pS = ur.score + lex.score(ur, start, end);
                    float tot = iS + pS;
                    if (tot > iScore_start_end[parentState]) {
                        iScore_start_end[parentState] = tot;
                        narrowRExtent_start[parentState] = end;
                        narrowLExtent_end[parentState] = start;
                        wideRExtent_start[parentState] = end;
                        wideLExtent_end[parentState] = start;
                    }
                }
            }
            if (spillGuts) {
                tick("Next word...");
            }
        }
    } // end for start
}

From source file:reck.parser.lexparser.RECKLexicalizedParser.java

License:Open Source License

/**
 * Parse a sentence represented as a List of tokens.
 * The text must already have been tokenized and
 * normalized into tokens that are appropriate to the treebank
 * which was used to train the parser.  The tokens can be of
 * multiple types, and the list items need not be homogeneous as to type
 * (in particular, only some words might be given tags):
 * <ul>
 * <li>If a token implements HasWord, then the word to be parsed is
 * given by its word() value.</li>
 * <li>If a token implements HasTag and the tag() value is not
 * null or the empty String, then the parser is strongly advised to assign
 * a part of speech tag that <i>begins</i> with this String.</li>
 * <li>Otherwise toString() is called on the token, and the returned
 * value is used as the word to be parsed.  In particular, if the
 * token is already a String, this means that the String is used as
 * the word to be parsed.</li>
 * </ul>
 *
 * @param sentence The sentence to parse
 * @return true Iff the sentence was accepted by the grammar
 * @throws UnsupportedOperationException If the Sentence is too long or
 *                                       of zero length or the parse
 *                                       otherwise fails for resource reasons
 */
@Override
public boolean parse(List<? extends HasWord> sentence) {
    int length = sentence.size();
    if (length == 0) {
        throw new UnsupportedOperationException("Can't parse a zero-length sentence!");
    }
    // Work on a copy so the caller's list is not mutated when punctuation
    // and the boundary symbol are appended below.
    List<HasWord> sentenceB = new ArrayList<HasWord>(sentence);
    if (Test.addMissingFinalPunctuation) {
        addSentenceFinalPunctIfNeeded(sentenceB, length);
    }
    if (length > Test.maxLength) {
        throw new UnsupportedOperationException("Sentence too long: length " + length);
    }
    TreePrint treePrint = getTreePrint();
    PrintWriter pwOut = op.tlpParams.pw();
    parseSucceeded = false;
    // Every component parser expects an explicit end-of-sentence marker.
    sentenceB.add(new Word(Lexicon.BOUNDARY));
    if (op.doPCFG) {
        if (!pparser.parse(sentenceB)) {
            return parseSucceeded;
        }
        if (Test.verbose) {
            System.out.println("PParser output");
            // pwOut.println(debinarizer.transformTree(pparser.getBestParse())); // with scores on nodes
            treePrint.printTree(debinarizer.transformTree(pparser.getBestParse()), pwOut);
        }
    }
    if (op.doDep && !Test.useFastFactored) {
        if (!dparser.parse(sentenceB)) {
            return parseSucceeded;
        }
        // cdm nov 2006: should move these printing bits to the main printing section,
        // so don't calculate the best parse twice!
        if (Test.verbose) {
            System.out.println("DParser output");
            treePrint.printTree(dparser.getBestParse(), pwOut);
        }
    }
    if (op.doPCFG && op.doDep) {
        if (!bparser.parse(sentenceB)) {
            return parseSucceeded;
        }
    }
    // Fix: record success for every parser configuration. Previously
    // parseSucceeded was set true only when BOTH doPCFG and doDep were
    // enabled, so a PCFG-only or dep-only run returned true while leaving
    // the success flag false.
    parseSucceeded = true;
    return true;
}