List of usage examples for edu.stanford.nlp.parser.lexparser Lexicon BOUNDARY
String BOUNDARY
To view the source code for edu.stanford.nlp.parser.lexparser Lexicon BOUNDARY, click the Source Link below.
From source file:conditionalCFG.ConditionalCFGParser.java
License:Open Source License
/**
 * Seeds the diagonal cells of the parse chart with lexical (tag) scores for
 * each input position, then applies closed unary rules over those seeded cells.
 *
 * Two modes:
 * - If {@code op.testOptions.maxSpanForTags > 1}, multi-token spans are glued
 *   into a single "word" string and scored against every tag state
 *   (appears to support character-level parsing).
 * - Otherwise the normal [start, start+1] cells are filled from the lexicon's
 *   taggings, with fallbacks: flexi-tagging when no tagging scored above
 *   -infinity, and (in recovery mode) flooding all tags at a fiat score.
 *
 * NOTE(review): assumes {@code length}, {@code words}, the iScore/extent
 * arrays and the various indexes were set up by the caller before this runs —
 * not visible here, confirm against the enclosing class.
 *
 * @param sentence the input tokens; items may be HasWord/HasTag/HasContext
 *                 or anything with a usable toString()
 */
private void initializeChart(List sentence) {
    int boundary = wordIndex.indexOf(Lexicon.BOUNDARY);
    for (int start = 0; start < length; start++) {
        if (op.testOptions.maxSpanForTags > 1) {
            // only relevant for parsing single words as multiple input tokens.
            // todo [cdm 2012]: This case seems buggy in never doing unaries over span 1 items
            // note we don't look for "words" including the end symbol!
            for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags)
                    || (start + 1 == end); end++) {
                StringBuilder word = new StringBuilder();
                // wsg: Feb 2010 - Appears to support character-level parsing
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                // Score the concatenated span against every tag state.
                for (int state = 0; state < numStates; state++) {
                    float iS = iScore[start][end][state];
                    if (iS == Float.NEGATIVE_INFINITY && isTag[state]) {
                        IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state),
                                wordIndex, tagIndex);
                        iScore[start][end][state] = lex.score(itw, start, word.toString(), null);
                        if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) {
                            narrowRExtent[start][state] = start + 1;
                            narrowLExtent[end][state] = end - 1;
                            wideRExtent[start][state] = start + 1;
                            wideLExtent[end][state] = end - 1;
                        }
                    }
                }
            }
        } else {
            // "normal" chart initialization of the [start,start+1] cell
            int word = words[start];
            int end = start + 1;
            Arrays.fill(tags[start], false);
            // Cache row/column views of the chart arrays for this cell.
            float[] iScore_start_end = iScore[start][end];
            int[] narrowRExtent_start = narrowRExtent[start];
            int[] narrowLExtent_end = narrowLExtent[end];
            int[] wideRExtent_start = wideRExtent[start];
            int[] wideLExtent_end = wideLExtent[end];
            // Force tags
            String trueTagStr = null;
            if (sentence.get(start) instanceof HasTag) {
                trueTagStr = ((HasTag) sentence.get(start)).tag();
                if ("".equals(trueTagStr)) {
                    trueTagStr = null;
                }
            }
            // Another option for forcing tags: supply a regex
            String candidateTagRegex = null;
            /* if (sentence.get(start) instanceof CoreLabel) {
                 candidateTagRegex = ((CoreLabel) sentence.get(start)).get(CandidatePartOfSpeechAnnotation.class);
                 if ("".equals(candidateTagRegex)) { candidateTagRegex = null; }
               } */
            // Word context (e.g., morphosyntactic info)
            String wordContextStr = null;
            if (sentence.get(start) instanceof HasContext) {
                wordContextStr = ((HasContext) sentence.get(start)).originalText();
                if ("".equals(wordContextStr))
                    wordContextStr = null;
            }
            boolean assignedSomeTag = false;
            if (!floodTags || word == boundary) {
                // in this case we generate the taggings in the lexicon,
                // which may itself be tagging flexibly or using a strict lexicon.
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]",
                            "UTF-8");
                }
                for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start,
                        wordContextStr); taggingI.hasNext();) {
                    IntTaggedWord tagging = taggingI.next();
                    int state = stateIndex.indexOf(tagIndex.get(tagging.tag));
                    // if word was supplied with a POS tag, skip all taggings
                    // not basicCategory() compatible with supplied tag.
                    if (trueTagStr != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).startsWith(trueTagStr))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println(" Skipping " + tagging
                                        + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8");
                            }
                            continue;
                        }
                    }
                    // same skip logic, but matching the tag against a supplied regex instead
                    if (candidateTagRegex != null) {
                        if ((!op.testOptions.forceTagBeginnings
                                && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex))
                                || (op.testOptions.forceTagBeginnings
                                        && !tagging.tagString(tagIndex).matches(candidateTagRegex))) {
                            if (dumpTagging) {
                                EncodingPrintWriter.err.println(" Skipping " + tagging
                                        + " as it doesn't match candidateTagRegex: " + candidateTagRegex, "UTF-8");
                            }
                            continue;
                        }
                    }
                    // try {
                    float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr);
                    // score the cell according to P(word|tag) in the lexicon
                    if (lexScore > Float.NEGATIVE_INFINITY) {
                        assignedSomeTag = true;
                        iScore_start_end[state] = lexScore;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                    // } catch (Exception e) {
                    //   e.printStackTrace();
                    //   System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag));
                    // }
                    int tag = tagging.tag;
                    tags[start][tag] = true;
                    if (dumpTagging) {
                        EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score "
                                + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state
                                + "]", "UTF-8");
                    }
                    // if (start == length-2 && tagging.parent == puncTag)
                    //   lastIsPunc = true;
                }
            } // end if (!floodTags || word == boundary)
            if (!assignedSomeTag) {
                // If you got here, either you were using forceTags (gold tags)
                // and the gold tag was not seen with that word in the training data
                // or we are in floodTags=true (recovery parse) mode
                // Here, we give words all tags for
                // which the lexicon score is not -Inf, not just seen or
                // specified taggings
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        if (trueTagStr != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).equals(trueTagStr)) {
                                continue;
                            }
                        }
                        float lexScore = lex.score(
                                new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start,
                                wordIndex.get(word), wordContextStr);
                        // NOTE(review): the regex filter runs after the (possibly expensive)
                        // lex.score call above — filtered-out states are skipped anyway.
                        if (candidateTagRegex != null) {
                            String tagString = stateIndex.get(state);
                            if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) {
                                continue;
                            }
                        }
                        if (lexScore > Float.NEGATIVE_INFINITY) {
                            iScore_start_end[state] = lexScore;
                            narrowRExtent_start[state] = end;
                            narrowLExtent_end[state] = start;
                            wideRExtent_start[state] = end;
                            wideLExtent_end[state] = start;
                        }
                        if (dumpTagging) {
                            EncodingPrintWriter.err.println("Word pos " + start + " tagging "
                                    + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))))
                                    + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state)
                                    + " = " + state + "]", "UTF-8");
                        }
                    }
                }
            } // end if ! assignedSomeTag
            // tag multi-counting
            if (op.dcTags) {
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state]) {
                        iScore_start_end[state] *= (1.0 + op.testOptions.depWeight);
                    }
                }
            }
            if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) {
                // if parse failed because of tag coverage, we put in all tags with
                // a score of -1000, by fiat. You get here from the invocation of
                // parse(ls) inside parse(ls) *after* floodTags has been turned on.
                // Search above for "floodTags = true".
                if (dumpTagging) {
                    EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8");
                }
                for (int state = 0; state < numStates; state++) {
                    if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
                        iScore_start_end[state] = -1000.0f;
                        narrowRExtent_start[state] = end;
                        narrowLExtent_end[state] = start;
                        wideRExtent_start[state] = end;
                        wideLExtent_end[state] = start;
                    }
                }
            }
            // Apply unary rules in diagonal cells of chart
            if (spillGuts) {
                tick("Terminal Unary...");
            }
            for (int state = 0; state < numStates; state++) {
                float iS = iScore_start_end[state];
                if (iS == Float.NEGATIVE_INFINITY) {
                    continue;
                }
                UnaryRule[] unaries = ug.closedRulesByChild(state);
                for (UnaryRule ur : unaries) {
                    int parentState = ur.parent;
                    float pS = ur.score + lex.score(ur, start, end);
                    float tot = iS + pS;
                    // Viterbi max over unary expansions into the parent state.
                    if (tot > iScore_start_end[parentState]) {
                        iScore_start_end[parentState] = tot;
                        narrowRExtent_start[parentState] = end;
                        narrowLExtent_end[parentState] = start;
                        wideRExtent_start[parentState] = end;
                        wideLExtent_end[parentState] = start;
                    }
                }
            }
            if (spillGuts) {
                tick("Next word...");
            }
        }
    } // end for start
}
From source file:reck.parser.lexparser.RECKLexicalizedParser.java
License:Open Source License
/** * Parse a sentence represented as a List of tokens. * The text must already have been tokenized and * normalized into tokens that are appropriate to the treebank * which was used to train the parser. The tokens can be of * multiple types, and the list items need not be homogeneous as to type * (in particular, only some words might be given tags): * <ul>//from w w w. j a v a 2 s. c om * <li>If a token implements HasWord, then the word to be parsed is * given by its word() value.</li> * <li>If a token implements HasTag and the tag() value is not * null or the empty String, then the parser is strongly advised to assign * a part of speech tag that <i>begins</i> with this String.</li> * <li>Otherwise toString() is called on the token, and the returned * value is used as the word to be parsed. In particular, if the * token is already a String, this means that the String is used as * the word to be parsed.</li> * </ul> * * @param sentence The sentence to parse * @return true Iff the sentence was accepted by the grammar * @throws UnsupportedOperationException If the Sentence is too long or * of zero length or the parse * otherwise fails for resource reasons */ @Override public boolean parse(List<? 
extends HasWord> sentence) { int length = sentence.size(); if (length == 0) { throw new UnsupportedOperationException("Can't parse a zero-length sentence!"); } List<HasWord> sentenceB = new ArrayList<HasWord>(sentence); if (Test.addMissingFinalPunctuation) { addSentenceFinalPunctIfNeeded(sentenceB, length); } if (length > Test.maxLength) { throw new UnsupportedOperationException("Sentence too long: length " + length); } TreePrint treePrint = getTreePrint(); PrintWriter pwOut = op.tlpParams.pw(); parseSucceeded = false; sentenceB.add(new Word(Lexicon.BOUNDARY)); if (op.doPCFG) { if (!pparser.parse(sentenceB)) { return parseSucceeded; } if (Test.verbose) { System.out.println("PParser output"); // pwOut.println(debinarizer.transformTree(pparser.getBestParse())); // with scores on nodes treePrint.printTree(debinarizer.transformTree(pparser.getBestParse()), pwOut); } } if (op.doDep && !Test.useFastFactored) { if (!dparser.parse(sentenceB)) { return parseSucceeded; } // cdm nov 2006: should move these printing bits to the main printing section, // so don't calculate the best parse twice! if (Test.verbose) { System.out.println("DParser output"); treePrint.printTree(dparser.getBestParse(), pwOut); } } if (op.doPCFG && op.doDep) { if (!bparser.parse(sentenceB)) { return parseSucceeded; } else { parseSucceeded = true; } } return true; }