List of usage examples for edu.stanford.nlp.parser.lexparser Lexicon UNKNOWN_WORD
String UNKNOWN_WORD
To view the source code for edu.stanford.nlp.parser.lexparser.Lexicon.UNKNOWN_WORD, click the Source Link.
From source file: conditionalCFG.ConditionalCFGParser.java
License: Open Source License
public boolean parse(List<? extends HasWord> sentence) { if (tf == null) { tf = new LabeledScoredTreeFactory(); }//from w w w . j ava2s .c o m lr = null; // better nullPointer exception than silent error //System.out.println("is it a taggedword?" + (sentence.get(0) instanceof TaggedWord)); //debugging if (sentence != this.sentence) { this.sentence = sentence; floodTags = false; } if (op.testOptions.verbose) { Timing.tick("Starting pcfg parse."); } if (spillGuts) { tick("Starting PCFG parse..."); } length = sentence.size(); if (length > arraySize) { considerCreatingArrays(length); } int goal = stateIndex.indexOf(goalStr); if (op.testOptions.verbose) { // System.out.println(numStates + " states, " + goal + " is the goal state."); // System.err.println(new ArrayList(ug.coreRules.keySet())); System.err.print("Initializing PCFG..."); } // map input words to words array (wordIndex ints) words = new int[length]; beginOffsets = new int[length]; endOffsets = new int[length]; originalCoreLabels = new CoreLabel[length]; originalTags = new HasTag[length]; int unk = 0; StringBuilder unkWords = new StringBuilder("["); // int unkIndex = wordIndex.size(); for (int i = 0; i < length; i++) { String s = sentence.get(i).word(); if (sentence.get(i) instanceof HasOffset) { HasOffset word = (HasOffset) sentence.get(i); beginOffsets[i] = word.beginPosition(); endOffsets[i] = word.endPosition(); } else { //Storing the positions of the word interstices //Account for single space between words beginOffsets[i] = ((i == 0) ? 
0 : endOffsets[i - 1] + 1); endOffsets[i] = beginOffsets[i] + s.length(); } if (sentence.get(i) instanceof CoreLabel) { originalCoreLabels[i] = (CoreLabel) sentence.get(i); } if (sentence.get(i) instanceof HasTag) { originalTags[i] = (HasTag) sentence.get(i); } if (op.testOptions.verbose && (!wordIndex.contains(s) || !lex.isKnown(wordIndex.indexOf(s)))) { unk++; unkWords.append(' '); unkWords.append(s); unkWords.append(" { "); for (int jj = 0; jj < s.length(); jj++) { char ch = s.charAt(jj); unkWords.append(Character.getType(ch)).append(" "); } unkWords.append("}"); } // TODO: really, add a new word? //words[i] = wordIndex.indexOf(s, unkIndex); //if (words[i] == unkIndex) { // ++unkIndex; //} //words[i] = wordIndex.indexOf(s, true); if (wordIndex.contains(s)) { words[i] = wordIndex.indexOf(s); } else { words[i] = wordIndex.indexOf(Lexicon.UNKNOWN_WORD); } } // initialize inside and outside score arrays if (spillGuts) { tick("Wiping arrays..."); } for (int start = 0; start < length; start++) { for (int end = start + 1; end <= length; end++) { Arrays.fill(iScore[start][end], Float.NEGATIVE_INFINITY); if (op.doDep && !op.testOptions.useFastFactored) { Arrays.fill(oScore[start][end], Float.NEGATIVE_INFINITY); } if (op.testOptions.lengthNormalization) { Arrays.fill(wordsInSpan[start][end], 1); } } } for (int loc = 0; loc <= length; loc++) { Arrays.fill(narrowLExtent[loc], -1); // the rightmost left with state s ending at i that we can get is the beginning Arrays.fill(wideLExtent[loc], length + 1); // the leftmost left with state s ending at i that we can get is the end } for (int loc = 0; loc < length; loc++) { Arrays.fill(narrowRExtent[loc], length + 1); // the leftmost right with state s starting at i that we can get is the end Arrays.fill(wideRExtent[loc], -1); // the rightmost right with state s starting at i that we can get is the beginning } // int puncTag = stateIndex.indexOf("."); // boolean lastIsPunc = false; if (op.testOptions.verbose) { Timing.tick("done."); 
unkWords.append(" ]"); op.tlpParams.pw(System.err).println("Unknown words: " + unk + " " + unkWords); System.err.print("Starting filters..."); } // do tags if (spillGuts) { tick("Tagging..."); } initializeChart(sentence); //if (op.testOptions.outsideFilter) // buildOFilter(); if (op.testOptions.verbose) { Timing.tick("done."); System.err.print("Starting insides..."); } // do the inside probabilities doInsideScores(); if (op.testOptions.verbose) { // insideTime += Timing.tick("done."); Timing.tick("done."); System.out.println( "PCFG parsing " + length + " words (incl. stop): insideScore = " + iScore[0][length][goal]); } bestScore = iScore[0][length][goal]; boolean succeeded = hasParse(); if (op.testOptions.doRecovery && !succeeded && !floodTags) { floodTags = true; // sentence will try to reparse // ms: disabled message. this is annoying and it doesn't really provide much information //System.err.println("Trying recovery parse..."); return parse(sentence); } if (!op.doDep || op.testOptions.useFastFactored) { return succeeded; } if (op.testOptions.verbose) { System.err.print("Starting outsides..."); } // outside scores oScore[0][length][goal] = 0.0f; doOutsideScores(); //System.out.println("State rate: "+((int)(1000*ohits/otries))/10.0); //System.out.println("Traversals: "+ohits); if (op.testOptions.verbose) { // outsideTime += Timing.tick("Done."); Timing.tick("done."); } if (op.doDep) { initializePossibles(); } return succeeded; }