Usage examples for the process method of edu.stanford.nlp.process.WordToSentenceProcessor
@Override public List<List<IN>> process(List<? extends IN> words)
From source file: de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java
License: Open Source License
@Override protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException { List<Token> casTokens = null; // Use value from language parameter, document language or fallback language - whatever // is available String language = getLanguage(aJCas); if (isWriteToken()) { casTokens = new ArrayList<Token>(); final String text = aText; final Tokenizer<?> tokenizer = getTokenizer(language, aText); int offsetInSentence = 0; List<?> tokens = tokenizer.tokenize(); outer: for (int i = 0; i < tokens.size(); i++) { final Object token = tokens.get(i); // System.out.println("Token class: "+token.getClass()); String t = null;//from ww w .j a v a 2s .c o m if (token instanceof String) { t = (String) token; } if (token instanceof CoreLabel) { CoreLabel l = (CoreLabel) token; t = l.word(); int begin = l.get(CharacterOffsetBeginAnnotation.class); int end = l.get(CharacterOffsetEndAnnotation.class); casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i)); offsetInSentence = end; continue; } if (token instanceof Word) { Word w = (Word) token; t = w.word(); } if (t == null) { throw new AnalysisEngineProcessException( new IllegalStateException("Unknown token type: " + token.getClass())); } // Skip whitespace while (isWhitespace(text.charAt(offsetInSentence))) { offsetInSentence++; if (offsetInSentence >= text.length()) { break outer; } } // Match if (text.startsWith(t, offsetInSentence)) { casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence, aZoneBegin + offsetInSentence + t.length(), i)); offsetInSentence = offsetInSentence + t.length(); } else { // System.out.println(aText); throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. 
Tokenizer: [" + t + "] CAS: [" + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length())))); } } } if (isWriteSentence()) { if (casTokens == null) { casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length()); } // Prepare the tokens for processing by WordToSentenceProcessor List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>(); for (Token token : casTokens) { CoreLabel l = new CoreLabel(); l.set(CharacterOffsetBeginAnnotation.class, token.getBegin()); l.set(CharacterOffsetEndAnnotation.class, token.getEnd()); l.setWord(token.getCoveredText()); tokensInDocument.add(l); } // The sentence splitter (probably) requires the escaped text, so we prepare it here PTBEscapingProcessor escaper = new PTBEscapingProcessor(); escaper.apply(tokensInDocument); // Apply the WordToSentenceProcessor to find the sentence boundaries WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex, boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex, newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences); List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument); for (List<CoreLabel> sentence : sentencesInDocument) { int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class); int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class); createSentence(aJCas, begin, end); } } }
From source file: edu.iastate.airl.semtus.parser.Parser.java
License: Open Source License
/**
 * Splits a flat list of words into sentences.
 *
 * <p>The words are passed to a default-constructed {@code WordToSentenceProcessor}, and each
 * word group it returns is wrapped in a {@code Sentence}.
 *
 * @param theseWords
 *            words
 * @return list of sentences
 */
static public List<Sentence<Word>> getSentences(List<Word> theseWords) {
    WordToSentenceProcessor<Word, String, Word> splitter =
            new WordToSentenceProcessor<Word, String, Word>();
    List<List<Word>> wordGroups = splitter.process(theseWords);

    List<Sentence<Word>> sentences = new ArrayList<Sentence<Word>>();
    for (List<Word> wordGroup : wordGroups) {
        sentences.add(new Sentence<Word>(wordGroup));
    }
    return sentences;
}