Example usage for edu.stanford.nlp.process PTBEscapingProcessor PTBEscapingProcessor

Introduction

On this page you can find example usage of the edu.stanford.nlp.process PTBEscapingProcessor constructor PTBEscapingProcessor().

Prototype

public PTBEscapingProcessor() 
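
Given the no-argument constructor above and the usage examples further down, a minimal sketch of constructing the processor and escaping a short token list could look like the following. The class name PTBEscapingExample and the sample tokens are illustrative assumptions; the type parameters follow the pattern used in the examples below.

import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBEscapingProcessor;

public class PTBEscapingExample {
    public static void main(String[] args) {
        // Tokens containing characters that the Penn Treebank convention escapes,
        // e.g. round brackets, which the PTB writes as -LRB- and -RRB-.
        List<Word> tokens = Arrays.asList(new Word("("), new Word("example"), new Word(")"));

        // The no-argument constructor shown in the prototype above.
        PTBEscapingProcessor<Word, String, Word> escaper = new PTBEscapingProcessor<Word, String, Word>();

        // apply() escapes the whole list at once so that context-dependent
        // characters such as quotes can be handled consistently.
        List<HasWord> escaped = escaper.apply(tokens);
        for (HasWord w : escaped) {
            System.out.println(w.word());
        }
    }
}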

Usage

From source file: de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp.java

License: Open Source License

@SuppressWarnings("unchecked")
public static <T extends HasWord> List<T> applyPtbEscaping(List<T> words, Collection<String> quoteBegin,
        Collection<String> quoteEnd) {
    PTBEscapingProcessor<T, String, Word> escaper = new PTBEscapingProcessor<T, String, Word>();
    // Apply escaper to the whole sentence, not to each token individually. The
    // escaper takes context into account, e.g. when transforming regular double
    // quotes into PTB opening and closing quotes (`` and '').
    words = (List<T>) escaper.apply(words);

    for (HasWord w : words) {
        if (quoteBegin != null && quoteBegin.contains(w.word())) {
            w.setWord("``");
        } else if (quoteEnd != null && quoteEnd.contains(w.word())) {
            w.setWord("\'\'");
        }
    }

    return words;
}
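
For illustration, the helper above might be invoked as follows. The class ApplyPtbEscapingDemo, the sample tokens, and the quote markers are made-up placeholders; in DKPro Core the quote sets come from component parameters rather than hard-coded values.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp;
import edu.stanford.nlp.ling.Word;

public class ApplyPtbEscapingDemo {
    public static void main(String[] args) {
        // German-style quotation marks around a word.
        List<Word> sentence = Arrays.asList(new Word("\u201E"), new Word("Hallo"), new Word("\u201C"));

        // Rewrite the configured quote tokens to the PTB forms `` and ''
        // in addition to the standard PTB escaping.
        List<Word> escaped = DKPro2CoreNlp.applyPtbEscaping(sentence,
                Collections.singleton("\u201E"),   // treated as quote-begin
                Collections.singleton("\u201C"));  // treated as quote-end

        for (Word w : escaped) {
            System.out.println(w.word());
        }
    }
}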

From source file: de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java

License: Open Source License

@Override
protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException {
    List<Token> casTokens = null;

    // Use value from language parameter, document language or fallback language - whatever
    // is available
    String language = getLanguage(aJCas);

    if (isWriteToken()) {
        casTokens = new ArrayList<Token>();
        final String text = aText;
        final Tokenizer<?> tokenizer = getTokenizer(language, aText);
        int offsetInSentence = 0;

        List<?> tokens = tokenizer.tokenize();
        outer: for (int i = 0; i < tokens.size(); i++) {
            final Object token = tokens.get(i);
            // System.out.println("Token class: "+token.getClass());
            String t = null;
            if (token instanceof String) {
                t = (String) token;
            }
            if (token instanceof CoreLabel) {
                CoreLabel l = (CoreLabel) token;
                t = l.word();
                int begin = l.get(CharacterOffsetBeginAnnotation.class);
                int end = l.get(CharacterOffsetEndAnnotation.class);

                casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i));
                offsetInSentence = end;
                continue;
            }
            if (token instanceof Word) {
                Word w = (Word) token;
                t = w.word();
            }

            if (t == null) {
                throw new AnalysisEngineProcessException(
                        new IllegalStateException("Unknown token type: " + token.getClass()));
            }

            // Skip whitespace
            while (isWhitespace(text.charAt(offsetInSentence))) {
                offsetInSentence++;
                if (offsetInSentence >= text.length()) {
                    break outer;
                }
            }

            // Match
            if (text.startsWith(t, offsetInSentence)) {
                casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence,
                        aZoneBegin + offsetInSentence + t.length(), i));
                offsetInSentence = offsetInSentence + t.length();
            } else {
                //                    System.out.println(aText);
                throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. Tokenizer: ["
                        + t + "] CAS: ["
                        + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length()))));
            }
        }
    }

    if (isWriteSentence()) {
        if (casTokens == null) {
            casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length());
        }

        // Prepare the tokens for processing by WordToSentenceProcessor
        List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>();
        for (Token token : casTokens) {
            CoreLabel l = new CoreLabel();
            l.set(CharacterOffsetBeginAnnotation.class, token.getBegin());
            l.set(CharacterOffsetEndAnnotation.class, token.getEnd());
            l.setWord(token.getCoveredText());
            tokensInDocument.add(l);
        }

        // The sentence splitter (probably) requires the escaped text, so we prepare it here
        PTBEscapingProcessor escaper = new PTBEscapingProcessor();
        escaper.apply(tokensInDocument);

        // Apply the WordToSentenceProcessor to find the sentence boundaries
        WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex,
                boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex,
                newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences);

        List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument);
        for (List<CoreLabel> sentence : sentencesInDocument) {
            int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class);
            int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class);

            createSentence(aJCas, begin, end);
        }
    }
}
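
The method above interleaves UIMA CAS bookkeeping with the actual Stanford calls. As a reduced sketch of just the escaping and sentence-splitting steps it performs, one might write something like the following; the class name, sample text, and token list are illustrative, and it uses WordToSentenceProcessor's no-argument constructor instead of the fully parameterized one above.

import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.process.WordToSentenceProcessor;

public class SentenceSplitSketch {
    public static void main(String[] args) {
        String text = "Hello world. Second sentence.";

        // Build CoreLabels with character offsets, as the component above does
        // when converting CAS tokens.
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        int search = 0;
        for (String w : new String[] { "Hello", "world", ".", "Second", "sentence", "." }) {
            int begin = text.indexOf(w, search);
            CoreLabel l = new CoreLabel();
            l.setWord(w);
            l.set(CharacterOffsetBeginAnnotation.class, begin);
            l.set(CharacterOffsetEndAnnotation.class, begin + w.length());
            tokens.add(l);
            search = begin + w.length();
        }

        // Escape the tokens before sentence splitting, mirroring the code above.
        new PTBEscapingProcessor<CoreLabel, String, Word>().apply(tokens);

        // Find sentence boundaries and report them as character offsets.
        WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>();
        for (List<CoreLabel> sentence : proc.process(tokens)) {
            int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class);
            int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class);
            System.out.println(text.substring(begin, end));
        }
    }
}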

From source file: de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils.java

License: Open Source License

public static List<HasWord> applyPtbEscaping(List<HasWord> words, Collection<String> quoteBegin,
        Collection<String> quoteEnd) {
    PTBEscapingProcessor<HasWord, String, Word> escaper = new PTBEscapingProcessor<HasWord, String, Word>();
    // Apply escaper to the whole sentence, not to each token individually. The
    // escaper takes context into account, e.g. when transforming regular double
    // quotes into PTB opening and closing quotes (`` and '').
    words = escaper.apply(words);

    for (HasWord w : words) {
        if (quoteBegin != null && quoteBegin.contains(w.word())) {
            w.setWord("``");
        } else if (quoteEnd != null && quoteEnd.contains(w.word())) {
            w.setWord("\'\'");
        }
    }

    return words;
}