Example usage for edu.stanford.nlp.ling HasOffset beginPosition

List of usage examples for edu.stanford.nlp.ling HasOffset beginPosition

Introduction

On this page you can find example usages of edu.stanford.nlp.ling.HasOffset.beginPosition().

Prototype

int beginPosition();

Source Link

Document

Return the beginning char offset of the label (or -1 if none).

Usage

From source file: conditionalCFG.ConditionalCFGParser.java

License: Open Source License

/**
 * Runs the PCFG chart parse over {@code sentence} and reports whether a parse was found.
 *
 * <p>In order, the method: lazily creates the tree factory; calls
 * {@code considerCreatingArrays} when the sentence is longer than {@code arraySize};
 * maps each word to its {@code wordIndex} id while recording character offsets and the
 * original labels/tags; resets the score and extent arrays; calls {@code initializeChart}
 * and then {@code doInsideScores}. Outside scores are computed only when {@code op.doDep}
 * is set and {@code op.testOptions.useFastFactored} is not.
 *
 * <p>If no parse is found, {@code op.testOptions.doRecovery} is set and {@code floodTags}
 * is still false, the method sets {@code floodTags} and re-invokes itself on the same
 * sentence, returning the result of that second attempt.
 *
 * @param sentence the words to parse; elements that are also {@code HasOffset},
 *                 {@code CoreLabel} or {@code HasTag} contribute offsets, labels and tags
 * @return the value of {@code hasParse()} after the inside pass (taken from the recovery
 *         attempt when one was made)
 */
public boolean parse(List<? extends HasWord> sentence) {
    if (tf == null) {
        tf = new LabeledScoredTreeFactory();
    }
    lr = null; // better nullPointer exception than silent error
    //System.out.println("is it a taggedword?" + (sentence.get(0) instanceof TaggedWord)); //debugging
    // Reference comparison on purpose: a new sentence object resets floodTags, whereas the
    // recovery re-entry further down passes the same object, so its floodTags = true survives.
    if (sentence != this.sentence) {
        this.sentence = sentence;
        floodTags = false;
    }
    if (op.testOptions.verbose) {
        Timing.tick("Starting pcfg parse.");
    }
    // spillGuts enables a second, separate progress trace via the unqualified tick(...).
    if (spillGuts) {
        tick("Starting PCFG parse...");
    }
    length = sentence.size();
    if (length > arraySize) {
        considerCreatingArrays(length);
    }
    // Index of the goal state; used below to read/seed the score cell covering the whole sentence.
    int goal = stateIndex.indexOf(goalStr);
    if (op.testOptions.verbose) {
        // System.out.println(numStates + " states, " + goal + " is the goal state.");
        // System.err.println(new ArrayList(ug.coreRules.keySet()));
        System.err.print("Initializing PCFG...");
    }
    // map input words to words array (wordIndex ints)
    words = new int[length];
    beginOffsets = new int[length];
    endOffsets = new int[length];
    originalCoreLabels = new CoreLabel[length];
    originalTags = new HasTag[length];
    // unk / unkWords are only filled in verbose mode and only used for the diagnostic printout.
    int unk = 0;
    StringBuilder unkWords = new StringBuilder("[");
    // int unkIndex = wordIndex.size();
    for (int i = 0; i < length; i++) {
        String s = sentence.get(i).word();

        // Prefer the token's own character offsets; otherwise synthesize them from word lengths.
        if (sentence.get(i) instanceof HasOffset) {
            HasOffset word = (HasOffset) sentence.get(i);
            beginOffsets[i] = word.beginPosition();
            endOffsets[i] = word.endPosition();
        } else {
            //Storing the positions of the word interstices
            //Account for single space between words
            beginOffsets[i] = ((i == 0) ? 0 : endOffsets[i - 1] + 1);
            endOffsets[i] = beginOffsets[i] + s.length();
        }

        // Slots stay null for tokens that are not CoreLabel / HasTag instances.
        if (sentence.get(i) instanceof CoreLabel) {
            originalCoreLabels[i] = (CoreLabel) sentence.get(i);
        }
        if (sentence.get(i) instanceof HasTag) {
            originalTags[i] = (HasTag) sentence.get(i);
        }

        // Verbose-only report: a word counts as unknown if it is absent from wordIndex or
        // lex.isKnown rejects its index. Each entry lists the word followed by the
        // Character.getType code of every character in it.
        if (op.testOptions.verbose && (!wordIndex.contains(s) || !lex.isKnown(wordIndex.indexOf(s)))) {
            unk++;
            unkWords.append(' ');
            unkWords.append(s);
            unkWords.append(" { ");
            for (int jj = 0; jj < s.length(); jj++) {
                char ch = s.charAt(jj);
                unkWords.append(Character.getType(ch)).append(" ");
            }
            unkWords.append("}");
        }
        // TODO: really, add a new word?
        //words[i] = wordIndex.indexOf(s, unkIndex);
        //if (words[i] == unkIndex) {
        //  ++unkIndex;
        //}
        //words[i] = wordIndex.indexOf(s, true);
        // Words missing from wordIndex are mapped to the index of Lexicon.UNKNOWN_WORD
        // instead of being added to the index.
        if (wordIndex.contains(s)) {
            words[i] = wordIndex.indexOf(s);
        } else {
            words[i] = wordIndex.indexOf(Lexicon.UNKNOWN_WORD);
        }
    }

    // initialize inside and outside score arrays
    if (spillGuts) {
        tick("Wiping arrays...");
    }
    // Only cells with start < end <= length are reset. oScore is wiped under the same
    // condition that later lets the outside pass run (op.doDep && !useFastFactored).
    for (int start = 0; start < length; start++) {
        for (int end = start + 1; end <= length; end++) {
            Arrays.fill(iScore[start][end], Float.NEGATIVE_INFINITY);
            if (op.doDep && !op.testOptions.useFastFactored) {
                Arrays.fill(oScore[start][end], Float.NEGATIVE_INFINITY);
            }
            if (op.testOptions.lengthNormalization) {
                Arrays.fill(wordsInSpan[start][end], 1);
            }
        }
    }
    // Left extents are indexed by end position (0..length inclusive); right extents by
    // start position (0..length-1). Each is reset to its sentinel value.
    for (int loc = 0; loc <= length; loc++) {
        Arrays.fill(narrowLExtent[loc], -1); // the rightmost left with state s ending at i that we can get is the beginning
        Arrays.fill(wideLExtent[loc], length + 1); // the leftmost left with state s ending at i that we can get is the end
    }
    for (int loc = 0; loc < length; loc++) {
        Arrays.fill(narrowRExtent[loc], length + 1); // the leftmost right with state s starting at i that we can get is the end
        Arrays.fill(wideRExtent[loc], -1); // the rightmost right with state s starting at i that we can get is the beginning
    }
    // int puncTag = stateIndex.indexOf(".");
    // boolean lastIsPunc = false;
    if (op.testOptions.verbose) {
        Timing.tick("done.");
        unkWords.append(" ]");
        op.tlpParams.pw(System.err).println("Unknown words: " + unk + " " + unkWords);
        System.err.print("Starting filters...");
    }
    // do tags
    if (spillGuts) {
        tick("Tagging...");
    }
    initializeChart(sentence);
    //if (op.testOptions.outsideFilter)
    // buildOFilter();
    if (op.testOptions.verbose) {
        Timing.tick("done.");
        System.err.print("Starting insides...");
    }
    // do the inside probabilities
    doInsideScores();
    if (op.testOptions.verbose) {
        // insideTime += Timing.tick("done.");
        Timing.tick("done.");
        // NOTE(review): this one message goes to System.out while the other verbose
        // messages in this method go to System.err.
        System.out.println(
                "PCFG parsing " + length + " words (incl. stop): insideScore = " + iScore[0][length][goal]);
    }
    // Inside score of the goal state over the full span [0, length].
    bestScore = iScore[0][length][goal];
    boolean succeeded = hasParse();
    // Recovery: retry the same sentence with floodTags set. The !floodTags guard stops the
    // retry from recursing again.
    if (op.testOptions.doRecovery && !succeeded && !floodTags) {
        floodTags = true; // sentence will try to reparse
        // ms: disabled message. this is annoying and it doesn't really provide much information
        //System.err.println("Trying recovery parse...");
        return parse(sentence);
    }
    // The outside pass is skipped unless dependency parsing is on and the fast factored mode is off.
    if (!op.doDep || op.testOptions.useFastFactored) {
        return succeeded;
    }
    if (op.testOptions.verbose) {
        System.err.print("Starting outsides...");
    }
    // outside scores
    // Seed the goal state over the full span with 0.0f before the outside pass.
    oScore[0][length][goal] = 0.0f;
    doOutsideScores();
    //System.out.println("State rate: "+((int)(1000*ohits/otries))/10.0);
    //System.out.println("Traversals: "+ohits);
    if (op.testOptions.verbose) {
        // outsideTime += Timing.tick("Done.");
        Timing.tick("done.");
    }

    // NOTE(review): the early return above already handles !op.doDep, so this guard looks
    // redundant unless op.doDep is changed in between — confirm before simplifying.
    if (op.doDep) {
        initializePossibles();
    }

    return succeeded;
}

From source file: nlpedit.core.NLPProject.java

License: Open Source License

/**
 * Rebuilds {@code boundaryMap}, {@code boundaryArray} and {@code numSentence} from
 * {@code document}.
 *
 * <p>The document is split into sentences with a {@link DocumentPreprocessor}; empty
 * sentences are skipped. For every remaining sentence its start offset is appended to
 * {@code boundaryArray} and mapped to the sentence's running index in {@code boundaryMap}.
 * The first sentence is always recorded at offset 0; later sentences use the
 * {@code beginPosition()} of their first token. A closing entry for
 * {@code document.length()} is added last, mapped to the total sentence count.
 */
private void setupBoundaries() {
    DocumentPreprocessor sentences = new DocumentPreprocessor(new StringReader(document));

    boundaryMap = new TreeMap<Integer, Integer>();
    boundaryArray = new Vector<Integer>();

    int index = 0;
    for (List<HasWord> tokens : sentences) {
        if (tokens.isEmpty()) {
            continue;
        }
        // The first sentence is pinned to offset 0 regardless of where its first token starts.
        int start = (index == 0) ? 0 : ((HasOffset) tokens.get(0)).beginPosition();
        boundaryMap.put(start, index);
        boundaryArray.add(start);
        index++;
    }

    // Closing boundary: end of document, mapped to the number of sentences seen.
    int end = document.length();
    boundaryMap.put(end, index);
    boundaryArray.add(end);
    numSentence = index;
}