Example usage for edu.stanford.nlp.process PTBEscapingProcessor PTBEscapingProcessor

Introduction

On this page you can find example usage of the edu.stanford.nlp.process PTBEscapingProcessor constructor PTBEscapingProcessor().

Prototype

public PTBEscapingProcessor() 
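
Given the no-argument constructor above and the usage examples further down, a minimal sketch of constructing the processor and escaping a short token list could look like the following. The class name PTBEscapingExample and the sample tokens are illustrative assumptions; the type parameters follow the pattern used in the examples below.

import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBEscapingProcessor;

public class PTBEscapingExample {
    public static void main(String[] args) {
        // Tokens containing characters that the Penn Treebank convention escapes,
        // e.g. round brackets, which the PTB writes as -LRB- and -RRB-.
        List<Word> tokens = Arrays.asList(new Word("("), new Word("example"), new Word(")"));

        // The no-argument constructor shown in the prototype above.
        PTBEscapingProcessor<Word, String, Word> escaper = new PTBEscapingProcessor<Word, String, Word>();

        // apply() escapes the whole list at once so that context-dependent
        // characters such as quotes can be handled consistently.
        List<HasWord> escaped = escaper.apply(tokens);
        for (HasWord w : escaped) {
            System.out.println(w.word());
        }
    }
}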

Usage

From source file: de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp.java

License: Open Source License

@SuppressWarnings("unchecked")
public static <T extends HasWord> List<T> applyPtbEscaping(List<T> words, Collection<String> quoteBegin,
        Collection<String> quoteEnd) {
    PTBEscapingProcessor<T, String, Word> escaper = new PTBEscapingProcessor<T, String, Word>();
    // Apply escaper to the whole sentence, not to each token individually. The
    // escaper takes context into account, e.g. when transforming regular double
    // quotes into PTB opening and closing quotes (`` and '').
    words = (List<T>) escaper.apply(words);

    for (HasWord w : words) {
        if (quoteBegin != null && quoteBegin.contains(w.word())) {
            w.setWord("``");
        } else if (quoteEnd != null && quoteEnd.contains(w.word())) {
            w.setWord("\'\'");
        }
    }

    return words;
}
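
For illustration, the helper above might be invoked as follows. The class ApplyPtbEscapingDemo, the sample tokens, and the quote markers are made-up placeholders; in DKPro Core the quote sets come from component parameters rather than hard-coded values.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp;
import edu.stanford.nlp.ling.Word;

public class ApplyPtbEscapingDemo {
    public static void main(String[] args) {
        // German-style quotation marks around a word.
        List<Word> sentence = Arrays.asList(new Word("\u201E"), new Word("Hallo"), new Word("\u201C"));

        // Rewrite the configured quote tokens to the PTB forms `` and ''
        // in addition to the standard PTB escaping.
        List<Word> escaped = DKPro2CoreNlp.applyPtbEscaping(sentence,
                Collections.singleton("\u201E"),   // treated as quote-begin
                Collections.singleton("\u201C"));  // treated as quote-end

        for (Word w : escaped) {
            System.out.println(w.word());
        }
    }
}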

From source file: de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java

License: Open Source License

@Override
protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException {
    List<Token> casTokens = null;

    // Use value from language parameter, document language or fallback language - whatever
    // is available
    String language = getLanguage(aJCas);

    if (isWriteToken()) {
        casTokens = new ArrayList<Token>();
        final String text = aText;
        final Tokenizer<?> tokenizer = getTokenizer(language, aText);
        int offsetInSentence = 0;

        List<?> tokens = tokenizer.tokenize();
        outer: for (int i = 0; i < tokens.size(); i++) {
            final Object token = tokens.get(i);
            // System.out.println("Token class: "+token.getClass());
            String t = null;
            if (token instanceof String) {
                t = (String) token;
            }
            if (token instanceof CoreLabel) {
                CoreLabel l = (CoreLabel) token;
                t = l.word();
                int begin = l.get(CharacterOffsetBeginAnnotation.class);
                int end = l.get(CharacterOffsetEndAnnotation.class);

                casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i));
                offsetInSentence = end;
                continue;
            }
            if (token instanceof Word) {
                Word w = (Word) token;
                t = w.word();
            }

            if (t == null) {
                throw new AnalysisEngineProcessException(
                        new IllegalStateException("Unknown token type: " + token.getClass()));
            }

            // Skip whitespace
            while (isWhitespace(text.charAt(offsetInSentence))) {
                offsetInSentence++;
                if (offsetInSentence >= text.length()) {
                    break outer;
                }
            }

            // Match
            if (text.startsWith(t, offsetInSentence)) {
                casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence,
                        aZoneBegin + offsetInSentence + t.length(), i));
                offsetInSentence = offsetInSentence + t.length();
            } else {
                //                    System.out.println(aText);
                throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. Tokenizer: ["
                        + t + "] CAS: ["
                        + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length()))));
            }
        }
    }

    if (isWriteSentence()) {
        if (casTokens == null) {
            casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length());
        }

        // Prepare the tokens for processing by WordToSentenceProcessor
        List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>();
        for (Token token : casTokens) {
            CoreLabel l = new CoreLabel();
            l.set(CharacterOffsetBeginAnnotation.class, token.getBegin());
            l.set(CharacterOffsetEndAnnotation.class, token.getEnd());
            l.setWord(token.getCoveredText());
            tokensInDocument.add(l);
        }

        // The sentence splitter (probably) requires the escaped text, so we prepare it here
        PTBEscapingProcessor escaper = new PTBEscapingProcessor();
        escaper.apply(tokensInDocument);

        // Apply the WordToSentenceProcessor to find the sentence boundaries
        WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex,
                boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex,
                newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences);

        List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument);
        for (List<CoreLabel> sentence : sentencesInDocument) {
            int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class);
            int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class);

            createSentence(aJCas, begin, end);
        }
    }
}
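
The method above interleaves UIMA CAS bookkeeping with the actual Stanford calls. As a reduced sketch of just the escaping and sentence-splitting steps it performs, one might write something like the following; the class name, sample text, and token list are illustrative, and it uses WordToSentenceProcessor's no-argument constructor instead of the fully parameterized one above.

import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.process.WordToSentenceProcessor;

public class SentenceSplitSketch {
    public static void main(String[] args) {
        String text = "Hello world. Second sentence.";

        // Build CoreLabels with character offsets, as the component above does
        // when converting CAS tokens.
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        int search = 0;
        for (String w : new String[] { "Hello", "world", ".", "Second", "sentence", "." }) {
            int begin = text.indexOf(w, search);
            CoreLabel l = new CoreLabel();
            l.setWord(w);
            l.set(CharacterOffsetBeginAnnotation.class, begin);
            l.set(CharacterOffsetEndAnnotation.class, begin + w.length());
            tokens.add(l);
            search = begin + w.length();
        }

        // Escape the tokens before sentence splitting, mirroring the code above.
        new PTBEscapingProcessor<CoreLabel, String, Word>().apply(tokens);

        // Find sentence boundaries and report them as character offsets.
        WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>();
        for (List<CoreLabel> sentence : proc.process(tokens)) {
            int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class);
            int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class);
            System.out.println(text.substring(begin, end));
        }
    }
}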

From source file: de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils.java

License: Open Source License

public static List<HasWord> applyPtbEscaping(List<HasWord> words, Collection<String> quoteBegin,
        Collection<String> quoteEnd) {
    PTBEscapingProcessor<HasWord, String, Word> escaper = new PTBEscapingProcessor<HasWord, String, Word>();
    // Apply escaper to the whole sentence, not to each token individually. The
    // escaper takes context into account, e.g. when transforming regular double
    // quotes into PTB opening and closing quotes (`` and '').
    words = escaper.apply(words);

    for (HasWord w : words) {
        if (quoteBegin != null && quoteBegin.contains(w.word())) {
            w.setWord("``");
        } else if (quoteEnd != null && quoteEnd.contains(w.word())) {
            w.setWord("\'\'");
        }
    }

    return words;
}