List of usage examples for edu.stanford.nlp.ling CoreLabel set
@Override @SuppressWarnings("unchecked") public <VALUE> VALUE set(Class<? extends Key<VALUE>> key, VALUE value)
From source file:ca.ualberta.exemplar.core.CleanPrefixAnnotator.java
License:Open Source License
@Override public void annotate(Annotation document) { if (document.has(SentencesAnnotation.class)) { for (CoreMap sentence : document.get(SentencesAnnotation.class)) { List<CoreLabel> tokens = sentence.get(TokensAnnotation.class); int numTokens = 0, numPrefixParts = 0; // Assumption: prefix is at max 10 tokens for (int i = 0; i < Math.min(tokens.size(), 10); i++) { CoreLabel token = tokens.get(i); String tokenText = token.get(TextAnnotation.class); if (tokenText != null && numTokens > 0 && (tokenText.equals("--") || tokenText.equals(":"))) { // Assumption: if more than half the tokens are a date/location/number it's a prefix double fraction = (double) numPrefixParts / (double) numTokens; if (fraction > 0.5) { CoreLabel nextToken = tokens.get(i + 1); String before = document.get(TextAnnotation.class).substring(0, nextToken.beginPosition()); nextToken.set(BeforeAnnotation.class, before); sentence.set(TokensAnnotation.class, tokens.subList(i + 1, tokens.size())); //System.out.println("Removed Prefix: " + before); }//from ww w. ja va 2 s . c om break; } numTokens++; String neTag = token.ner(); if (neTag != null && (neTag.equals("DATE") || neTag.equals("LOCATION") || neTag.equals("NUMBER") || neTag.equals("ORDINAL"))) { numPrefixParts++; } } } } }
From source file:ca.ualberta.exemplar.core.LocationJuxtapositionAnnotator.java
License:Open Source License
/**
 * Scans the document's token stream with a three-tag sliding window, counts
 * "named entity , named entity" juxtapositions into the {@code nerPairs} field,
 * and rewrites the comma between two LOCATION entities into the conjunction
 * "and" (text, value, POS=CC, lemma) so the parser reads a coordination.
 */
@Override
public void annotate(Annotation document) {
    if (document.has(TokensAnnotation.class)) {
        // Sliding window of the last three NE tags: ner[0] oldest, ner[2] = current token's tag.
        String[] ner = new String[3];
        List<CoreLabel> tokens = document.get(TokensAnnotation.class);
        // prev always holds the token one position behind the current one.
        CoreLabel prev = null;
        for (CoreLabel token : tokens) {
            ner[0] = ner[1];
            ner[1] = ner[2];
            ner[2] = token.get(NamedEntityTagAnnotation.class);
            if (ner[1] != null && !ner[1].equals("O") && ner[2] != null && !ner[2].equals("O")) {
                // Two named entities in a row — intentionally ignored (no comma between them).
            } else if (ner[0] != null && !ner[0].equals("O") && ner[2] != null && !ner[2].equals("O")
                    && prev.get(TextAnnotation.class).equals(",")) {
                // Named entity, comma, named entity.
                // prev is non-null here: ner[0] != null implies at least two prior iterations.
                String textRep = ner[0] + "," + ner[2];
                //System.out.println(textRep);
                // Tally the tag pair (e.g. "LOCATION,LOCATION") for later inspection.
                if (nerPairs.containsKey(textRep)) {
                    nerPairs.put(textRep, nerPairs.get(textRep) + 1);
                } else {
                    nerPairs.put(textRep, 1);
                }
                //System.out.println(nerPairs);
                if (ner[0].equals("LOCATION") && ner[2].equals("LOCATION")) {
                    // Rewrite the comma token itself into "and".
                    prev.set(TextAnnotation.class, "and");
                    prev.set(ValueAnnotation.class, "and");
                    prev.set(PartOfSpeechAnnotation.class, "CC");
                    prev.set(LemmaAnnotation.class, "and");
                }
            }
            prev = token;
        }
    }
}
From source file:ca.ualberta.exemplar.core.RemoveDashesAnnotator.java
License:Open Source License
/**
 * Rewrites every double-dash token ({@code --}) as a comma in each sentence so
 * downstream components treat it as an ordinary clause separator.
 *
 * @param document annotation that must already carry sentences and tokens
 */
@Override
public void annotate(Annotation document) {
    if (document.has(SentencesAnnotation.class)) {
        for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
            List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
            for (CoreLabel token : tokens) {
                // FIX: constant-first equals avoids a NullPointerException when a token
                // has no TextAnnotation (the original dereferenced the result directly).
                if ("--".equals(token.get(TextAnnotation.class))) {
                    token.set(TextAnnotation.class, ",");
                    token.set(ValueAnnotation.class, ",");
                }
            }
        }
    }
}
From source file:com.asimihsan.handytrowel.nlp.StopwordAnnotator.java
License:Open Source License
/**
 * Marks each token with a {@code Pair<isWordStopword, isLemmaStopword>} under the
 * {@code StopwordAnnotator.class} key. The lemma check only runs when
 * {@code checkLemma} is enabled.
 *
 * @param annotation annotation expected to carry a TokensAnnotation
 */
@Override
public void annotate(Annotation annotation) {
    if (stopwords != null && !stopwords.isEmpty() && annotation.containsKey(TokensAnnotation.class)) {
        List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            String word = token.word();
            String lemma = token.lemma();
            // FIX: guard against null word/lemma — the original threw a
            // NullPointerException when the lemma annotator had not run.
            // "checkLemma && ..." replaces the redundant "checkLemma ? ... : false".
            boolean isWordStopword = word != null && stopwords.contains(word.toLowerCase());
            boolean isLemmaStopword = checkLemma && lemma != null
                    && stopwords.contains(lemma.toLowerCase());
            token.set(StopwordAnnotator.class, Pair.makePair(isWordStopword, isLemmaStopword));
        }
    }
}
From source file:com.panot.JavaCoref.MyMUCMentionExtractor.java
License:Open Source License
/**
 * Reads the next {@code <DOC>} element from the loaded MUC file contents and builds a
 * coref Document: tokenizes each sentence, repairs known tokenization artifacts, pulls
 * gold mentions out of inline {@code <COREF ...>} SGML tags (and gold NE labels from the
 * other SGML tags when {@code Constants.USE_GOLD_NE} is set), runs the Stanford pipeline
 * over the reconstructed text, optionally tags terms with a CRF model, and finally
 * selects the predicted-mention set according to the configured experiment type.
 *
 * @return the arranged Document, or {@code null} when no further {@code <DOC>} is found
 * @throws Exception propagated from the CRF tagger or the annotation pipeline
 */
@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");
    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    // No more documents after the current offset: signal end of input.
    if (!docMatcher.find(currentOffset))
        return null;
    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    // Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString), "invertible")
                .tokenize();

        // FIXING TOKENIZATION PROBLEMS
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            if (i > 0 && w.word().equals("$")) {
                // Re-attach "$" to a preceding PRP/WP token (e.g. "PRP$" split by the tokenizer).
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                i--;
            } else if (w.word().equals("\\/")) {
                // Re-join tokens split around an escaped slash, unless the left
                // neighbour is a closing COREF tag.
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                w.set(CoreAnnotations.TextAnnotation.class,
                        words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                words.remove(i + 1);
                words.remove(i - 1);
            }
        }
        // END FIXING TOKENIZATION PROBLEMS

        List<CoreLabel> sentence = new ArrayList<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        List<Mention> mentions = new ArrayList<Mention>();
        allWords.add(sentence);
        allGoldMentions.add(mentions);

        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    // Inside an NE SGML region the current tag applies; otherwise "O".
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
            // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                ner = m.group(1);
            }
            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            else if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            }
            // found the start SGML tag for a coref mention
            else if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                mention.startIndex = sentence.size();
                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");
                Matcher m = idPattern.matcher(w);
                m.find();
                mention.mentionID = Integer.valueOf(m.group(1));
                m = refPattern.matcher(w);
                if (m.find()) {
                    mention.originalRef = Integer.valueOf(m.group(1));
                }
                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            }
            // found the end SGML tag for a coref mention
            else if (w.equals("</COREF>")) {
                Mention mention = stack.pop();
                mention.endIndex = sentence.size();
                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                // Any other token (no WORD/POS pattern): keep it as plain text.
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }

        // Rebuild the plain sentence text and assign 1-based token indices.
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    m.goldCorefClusterID = m.mentionID;
                else {
                    // Follow the REF chain until a mention with a resolved cluster
                    // (or no antecedent) is found.
                    int ref = m.originalRef;
                    while (true) {
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    // Sanity checks: pipeline output must align 1:1 with the manually built sentences.
    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i)
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class)
                    .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        // Replace the hand-built tokens with the fully annotated ones.
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // term things
    List<List<Mention>> termMentions = new ArrayList<List<Mention>>();
    if (use_term) {
        // Build CRF input features, run the external crfsuite tagger, and turn
        // the tagged terms into candidate mentions.
        String dataCrf = "";
        System.err.print("FEAT TYPE: ");
        System.err
                .println(props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP));
        if (props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP)
                .equals(MyConstants.TTE_FEATURE_NLTK)) {
            dataCrf = NltkCrfFormatter.annotationToCrfString(docAnno);
        } else {
            dataCrf = CrfFormatter.annotationToCrfString(docAnno);
        }
        List<List<String>> tagResult = new ArrayList<List<String>>();
        try {
            tagResult = CrfsuiteCaller.tag(dataCrf, props.getProperty(MyConstants.TTE_MODEL));
            if (props.containsKey(MyConstants.TTE_SAVE_CRF_DATA)) {
                // Optionally dump the CRF feature file for debugging.
                String crfDataFilename = props.getProperty(MyConstants.TTE_SAVE_CRF_DATA);
                File crfDataFile = new File(crfDataFilename);
                BufferedWriter bw = new BufferedWriter(new FileWriter(crfDataFile));
                bw.write(dataCrf);
                bw.close();
            }
        } catch (Exception e) {
            System.err.println("Crfsuite tag failed");
        }
        termAsMentionFinder.setTags(tagResult);
        termMentions = termAsMentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);
        maxID = termAsMentionFinder.getMaxID();
    }

    // extract predicted mentions
    allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    if (use_term && props.containsKey(MyConstants.TTE_KEEP_PRON)) {
        termMentions = injectPronoun(termMentions, allPredictedMentions);
    }

    // Select the final predicted-mention set according to the experiment type.
    if (experimentType != null) {
        if (experimentType.equals(MyConstants.EXP_TYPE_03_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (experimentType.equals(MyConstants.EXP_TYPE_03_INTERSECT)) {
            List<List<Mention>> usingMentions = intersectMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_CHECK)) {
            allPredictedMentions = termMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else {
            System.err.println(experimentType);
            System.err.println("Unknown experiment type. Using mention detector.");
        }
    } else if (useGoldMention) {
        allPredictedMentions = allGoldMentions;
    }

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
/**
 * Assigns a distributional-similarity class to every label in the padded list.
 * Bails out immediately if any label already carries the annotation — the list
 * is assumed to have been processed in one earlier pass.
 */
private void distSimAnnotate(PaddedList<IN> info) {
    for (CoreLabel label : info) {
        // Already annotated => a previous pass handled the whole list; stop here.
        if (label.has(CoreAnnotations.DistSimAnnotation.class)) {
            return;
        }
        String key = getWord(label);
        if (!flags.casedDistSim) {
            key = key.toLowerCase();
        }
        if (flags.numberEquivalenceDistSim) {
            // Collapse digits so numerically different tokens share a lexicon key.
            key = WordShapeClassifier.wordShape(key, WordShapeClassifier.WORDSHAPEDIGITS);
        }
        String cluster = lexicon.get(key);
        // Fall back to the configured unknown-word class for out-of-lexicon keys.
        label.set(CoreAnnotations.DistSimAnnotation.class,
                cluster == null ? flags.unknownWordDistSimClass : cluster);
    }
}
From source file:de.l3s.workive.analysis.ner.GermanNER.java
public List<Entity> extractEntities(CoreMap sentence) { List<Entity> entityList = new ArrayList<Entity>(); CoreLabel prevEntity = null; String tag = ""; for (CoreLabel token : sentence.get(TokensAnnotation.class)) { String entityTag = token.get(NamedEntityTagAnnotation.class); //System.out.println(entityTag); if (entityTag.compareToIgnoreCase("I-ORG") == 0 || entityTag.compareToIgnoreCase("I-PER") == 0 || entityTag.compareToIgnoreCase("I-LOC") == 0 || entityTag.compareToIgnoreCase("MISC") == 0) { if (prevEntity != null) { if (prevEntity.get(NamedEntityTagAnnotation.class).compareToIgnoreCase(entityTag) == 0 && prevEntity.endPosition() == token.beginPosition() - 1) { prevEntity.setEndPosition(token.endPosition()); prevEntity.set(TextAnnotation.class, prevEntity.get(TextAnnotation.class) + " " + token.get(TextAnnotation.class)); } else { Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>( prevEntity.get(TextAnnotation.class), prevEntity.beginPosition(), prevEntity.endPosition()); entityList.add(new Entity(triple, tag)); prevEntity = token;/*from ww w .j a v a2 s . co m*/ tag = entityTag; } } else { prevEntity = token; tag = entityTag; } } } if (prevEntity != null) { Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>( prevEntity.get(TextAnnotation.class), prevEntity.beginPosition(), prevEntity.endPosition()); entityList.add(new Entity(triple, tag)); tag = ""; } return entityList; }
From source file:de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp.java
License:Open Source License
/**
 * Converts a DKPro/UIMA JCas into a CoreNLP Annotation: document text, sentences,
 * tokens (with POS, lemma, stem, token-level NE tags), constituency trees and
 * dependency graphs. Blank sentences are skipped. When a non-UTF-8 {@code encoding}
 * is configured, sentence and token text are transcoded before being handed to CoreNLP.
 *
 * @param aSource the source JCas
 * @param aTarget the CoreNLP annotation to populate
 * @return {@code aTarget}, populated
 */
public Annotation convert(JCas aSource, Annotation aTarget) {
    // Document annotation
    aTarget.set(CoreAnnotations.TextAnnotation.class, aSource.getDocumentText());

    // Sentences
    List<CoreMap> sentences = new ArrayList<>();
    for (Sentence s : select(aSource, Sentence.class)) {
        if (StringUtils.isBlank(s.getCoveredText())) {
            continue;
        }

        String sentenceText = s.getCoveredText();
        if (encoding != null && !"UTF-8".equals(encoding.name())) {
            // Re-encode the UTF-8 text into the configured target charset.
            sentenceText = new String(sentenceText.getBytes(StandardCharsets.UTF_8), encoding);
        }

        Annotation sentence = new Annotation(sentenceText);
        sentence.set(CharacterOffsetBeginAnnotation.class, s.getBegin());
        sentence.set(CharacterOffsetEndAnnotation.class, s.getEnd());
        sentence.set(SentenceIndexAnnotation.class, sentences.size());

        // Tokens
        Map<Token, IndexedWord> idxTokens = new HashMap<>();
        List<CoreLabel> tokens = new ArrayList<>();
        for (Token t : selectCovered(Token.class, s)) {
            String tokenText = t.getCoveredText();
            if (encoding != null && !"UTF-8".equals(encoding.name())) {
                tokenText = new String(tokenText.getBytes(StandardCharsets.UTF_8), encoding);
            }

            CoreLabel token = tokenFactory.makeToken(tokenText, t.getBegin(), t.getEnd() - t.getBegin());
            // First add token so that tokens.size() returns a 1-based counting as required
            // by IndexAnnotation
            tokens.add(token);
            token.set(SentenceIndexAnnotation.class, sentences.size());
            token.set(IndexAnnotation.class, tokens.size());
            token.set(TokenKey.class, t);
            idxTokens.put(t, new IndexedWord(token));

            // POS tags
            if (readPos && t.getPos() != null) {
                token.set(PartOfSpeechAnnotation.class, t.getPos().getPosValue());
            }

            // Lemma
            if (t.getLemma() != null) {
                token.set(LemmaAnnotation.class, t.getLemma().getValue());
            }

            // Stem
            if (t.getStem() != null) {
                token.set(StemAnnotation.class, t.getStem().getValue());
            }

            // NamedEntity
            // TODO: only token-based NEs are supported, but not multi-token NEs
            // Supporting multi-token NEs via selectCovering would be very slow. To support
            // them, another approach would need to be implemented, e.g. via indexCovering.
            List<NamedEntity> nes = selectCovered(NamedEntity.class, t);
            if (nes.size() > 0) {
                token.set(NamedEntityTagAnnotation.class, nes.get(0).getValue());
            } else {
                token.set(NamedEntityTagAnnotation.class, "O");
            }
        }

        // Constituents
        // NOTE(review): when several ROOTs cover one sentence, each iteration
        // overwrites TreeAnnotation — only the last tree survives; presumably
        // a sentence has at most one ROOT. Verify against the type system.
        for (ROOT r : selectCovered(ROOT.class, s)) {
            Tree tree = createStanfordTree(r, idxTokens);
            tree.indexSpans();
            sentence.set(TreeAnnotation.class, tree);
        }

        // Dependencies
        List<TypedDependency> dependencies = new ArrayList<>();
        for (Dependency d : selectCovered(Dependency.class, s)) {
            TypedDependency dep = new TypedDependency(GrammaticalRelation.valueOf(d.getDependencyType()),
                    idxTokens.get(d.getGovernor()), idxTokens.get(d.getDependent()));
            if (DependencyFlavor.ENHANCED.equals(d.getFlavor())) {
                // Enhanced-flavor edges are marked as extra (non-basic) dependencies.
                dep.setExtra();
            }
            dependencies.add(dep);
        }
        sentence.set(EnhancedDependenciesAnnotation.class, new SemanticGraph(dependencies));

        if (ptb3Escaping) {
            // Apply PTB3 escaping (brackets, quotes) after all token-level annotations are set.
            tokens = applyPtbEscaping(tokens, quoteBegin, quoteEnd);
        }

        sentence.set(TokensAnnotation.class, tokens);
        sentences.add(sentence);
    }
    aTarget.set(SentencesAnnotation.class, sentences);

    return aTarget;
}
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordCoreferenceResolver.java
License:Open Source License
/**
 * Converts a DKPro Token into a CoreNLP CoreLabel, remembers the source token
 * under {@code TokenKey}, and sets the NE tag from the first covering
 * NamedEntity — or "O" when the token is not covered by any.
 */
protected CoreLabel tokenToWord(Token aToken) {
    CoreLabel word = CoreNlpUtils.tokenToWord(aToken);
    word.set(TokenKey.class, aToken);
    List<NamedEntity> coveringNes = selectCovered(NamedEntity.class, aToken);
    word.setNER(coveringNes.isEmpty() ? "O" : coveringNes.get(0).getValue());
    return word;
}
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java
License:Open Source License
@Override protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException { List<Token> casTokens = null; // Use value from language parameter, document language or fallback language - whatever // is available String language = getLanguage(aJCas); if (isWriteToken()) { casTokens = new ArrayList<Token>(); final String text = aText; final Tokenizer<?> tokenizer = getTokenizer(language, aText); int offsetInSentence = 0; List<?> tokens = tokenizer.tokenize(); outer: for (int i = 0; i < tokens.size(); i++) { final Object token = tokens.get(i); // System.out.println("Token class: "+token.getClass()); String t = null;// w w w .j av a 2s. com if (token instanceof String) { t = (String) token; } if (token instanceof CoreLabel) { CoreLabel l = (CoreLabel) token; t = l.word(); int begin = l.get(CharacterOffsetBeginAnnotation.class); int end = l.get(CharacterOffsetEndAnnotation.class); casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i)); offsetInSentence = end; continue; } if (token instanceof Word) { Word w = (Word) token; t = w.word(); } if (t == null) { throw new AnalysisEngineProcessException( new IllegalStateException("Unknown token type: " + token.getClass())); } // Skip whitespace while (isWhitespace(text.charAt(offsetInSentence))) { offsetInSentence++; if (offsetInSentence >= text.length()) { break outer; } } // Match if (text.startsWith(t, offsetInSentence)) { casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence, aZoneBegin + offsetInSentence + t.length(), i)); offsetInSentence = offsetInSentence + t.length(); } else { // System.out.println(aText); throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. 
Tokenizer: [" + t + "] CAS: [" + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length())))); } } } if (isWriteSentence()) { if (casTokens == null) { casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length()); } // Prepare the tokens for processing by WordToSentenceProcessor List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>(); for (Token token : casTokens) { CoreLabel l = new CoreLabel(); l.set(CharacterOffsetBeginAnnotation.class, token.getBegin()); l.set(CharacterOffsetEndAnnotation.class, token.getEnd()); l.setWord(token.getCoveredText()); tokensInDocument.add(l); } // The sentence splitter (probably) requires the escaped text, so we prepare it here PTBEscapingProcessor escaper = new PTBEscapingProcessor(); escaper.apply(tokensInDocument); // Apply the WordToSentenceProcessor to find the sentence boundaries WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex, boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex, newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences); List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument); for (List<CoreLabel> sentence : sentencesInDocument) { int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class); int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class); createSentence(aJCas, begin, end); } } }