List of usage examples for edu.stanford.nlp.ling CoreLabel set
@Override @SuppressWarnings("unchecked") public <VALUE> VALUE set(Class<? extends Key<VALUE>> key, VALUE value)
From source file:ca.ualberta.exemplar.core.CleanPrefixAnnotator.java
License:Open Source License
@Override public void annotate(Annotation document) { if (document.has(SentencesAnnotation.class)) { for (CoreMap sentence : document.get(SentencesAnnotation.class)) { List<CoreLabel> tokens = sentence.get(TokensAnnotation.class); int numTokens = 0, numPrefixParts = 0; // Assumption: prefix is at max 10 tokens for (int i = 0; i < Math.min(tokens.size(), 10); i++) { CoreLabel token = tokens.get(i); String tokenText = token.get(TextAnnotation.class); if (tokenText != null && numTokens > 0 && (tokenText.equals("--") || tokenText.equals(":"))) { // Assumption: if more than half the tokens are a date/location/number it's a prefix double fraction = (double) numPrefixParts / (double) numTokens; if (fraction > 0.5) { CoreLabel nextToken = tokens.get(i + 1); String before = document.get(TextAnnotation.class).substring(0, nextToken.beginPosition()); nextToken.set(BeforeAnnotation.class, before); sentence.set(TokensAnnotation.class, tokens.subList(i + 1, tokens.size())); //System.out.println("Removed Prefix: " + before); }//from ww w. ja va 2 s . c om break; } numTokens++; String neTag = token.ner(); if (neTag != null && (neTag.equals("DATE") || neTag.equals("LOCATION") || neTag.equals("NUMBER") || neTag.equals("ORDINAL"))) { numPrefixParts++; } } } } }
From source file:ca.ualberta.exemplar.core.LocationJuxtapositionAnnotator.java
License:Open Source License
/**
 * Scans the document's token stream with a three-tag sliding window, counts
 * "named entity , named entity" juxtapositions into the {@code nerPairs} field,
 * and rewrites the comma between two LOCATION entities into the conjunction
 * "and" (text, value, POS=CC, lemma) so the parser reads a coordination.
 */
@Override
public void annotate(Annotation document) {
    if (document.has(TokensAnnotation.class)) {
        // Sliding window of the last three NE tags: ner[0] oldest, ner[2] = current token's tag.
        String[] ner = new String[3];
        List<CoreLabel> tokens = document.get(TokensAnnotation.class);
        // prev always holds the token one position behind the current one.
        CoreLabel prev = null;
        for (CoreLabel token : tokens) {
            ner[0] = ner[1];
            ner[1] = ner[2];
            ner[2] = token.get(NamedEntityTagAnnotation.class);
            if (ner[1] != null && !ner[1].equals("O") && ner[2] != null && !ner[2].equals("O")) {
                // Two named entities in a row — intentionally ignored (no comma between them).
            } else if (ner[0] != null && !ner[0].equals("O") && ner[2] != null && !ner[2].equals("O")
                    && prev.get(TextAnnotation.class).equals(",")) {
                // Named entity, comma, named entity.
                // prev is non-null here: ner[0] != null implies at least two prior iterations.
                String textRep = ner[0] + "," + ner[2];
                //System.out.println(textRep);
                // Tally the tag pair (e.g. "LOCATION,LOCATION") for later inspection.
                if (nerPairs.containsKey(textRep)) {
                    nerPairs.put(textRep, nerPairs.get(textRep) + 1);
                } else {
                    nerPairs.put(textRep, 1);
                }
                //System.out.println(nerPairs);
                if (ner[0].equals("LOCATION") && ner[2].equals("LOCATION")) {
                    // Rewrite the comma token itself into "and".
                    prev.set(TextAnnotation.class, "and");
                    prev.set(ValueAnnotation.class, "and");
                    prev.set(PartOfSpeechAnnotation.class, "CC");
                    prev.set(LemmaAnnotation.class, "and");
                }
            }
            prev = token;
        }
    }
}
From source file:ca.ualberta.exemplar.core.RemoveDashesAnnotator.java
License:Open Source License
/**
 * Rewrites every double-dash token ({@code --}) as a comma in each sentence so
 * downstream components treat it as an ordinary clause separator.
 *
 * @param document annotation that must already carry sentences and tokens
 */
@Override
public void annotate(Annotation document) {
    if (document.has(SentencesAnnotation.class)) {
        for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
            List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
            for (CoreLabel token : tokens) {
                // FIX: constant-first equals avoids a NullPointerException when a token
                // has no TextAnnotation (the original dereferenced the result directly).
                if ("--".equals(token.get(TextAnnotation.class))) {
                    token.set(TextAnnotation.class, ",");
                    token.set(ValueAnnotation.class, ",");
                }
            }
        }
    }
}
From source file:com.asimihsan.handytrowel.nlp.StopwordAnnotator.java
License:Open Source License
/**
 * Marks each token with a {@code Pair<isWordStopword, isLemmaStopword>} under the
 * {@code StopwordAnnotator.class} key. The lemma check only runs when
 * {@code checkLemma} is enabled.
 *
 * @param annotation annotation expected to carry a TokensAnnotation
 */
@Override
public void annotate(Annotation annotation) {
    if (stopwords != null && !stopwords.isEmpty() && annotation.containsKey(TokensAnnotation.class)) {
        List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            String word = token.word();
            String lemma = token.lemma();
            // FIX: guard against null word/lemma — the original threw a
            // NullPointerException when the lemma annotator had not run.
            // "checkLemma && ..." replaces the redundant "checkLemma ? ... : false".
            boolean isWordStopword = word != null && stopwords.contains(word.toLowerCase());
            boolean isLemmaStopword = checkLemma && lemma != null
                    && stopwords.contains(lemma.toLowerCase());
            token.set(StopwordAnnotator.class, Pair.makePair(isWordStopword, isLemmaStopword));
        }
    }
}
From source file:com.panot.JavaCoref.MyMUCMentionExtractor.java
License:Open Source License
/**
 * Reads the next {@code <DOC>} element from the loaded MUC file contents and builds a
 * coref Document: tokenizes each sentence, repairs known tokenization artifacts, pulls
 * gold mentions out of inline {@code <COREF ...>} SGML tags (and gold NE labels from the
 * other SGML tags when {@code Constants.USE_GOLD_NE} is set), runs the Stanford pipeline
 * over the reconstructed text, optionally tags terms with a CRF model, and finally
 * selects the predicted-mention set according to the configured experiment type.
 *
 * @return the arranged Document, or {@code null} when no further {@code <DOC>} is found
 * @throws Exception propagated from the CRF tagger or the annotation pipeline
 */
@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");
    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    // No more documents after the current offset: signal end of input.
    if (!docMatcher.find(currentOffset))
        return null;
    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    // Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString), "invertible")
                .tokenize();

        // FIXING TOKENIZATION PROBLEMS
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            if (i > 0 && w.word().equals("$")) {
                // Re-attach "$" to a preceding PRP/WP token (e.g. "PRP$" split by the tokenizer).
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                i--;
            } else if (w.word().equals("\\/")) {
                // Re-join tokens split around an escaped slash, unless the left
                // neighbour is a closing COREF tag.
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                w.set(CoreAnnotations.TextAnnotation.class,
                        words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                words.remove(i + 1);
                words.remove(i - 1);
            }
        }
        // END FIXING TOKENIZATION PROBLEMS

        List<CoreLabel> sentence = new ArrayList<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        List<Mention> mentions = new ArrayList<Mention>();
        allWords.add(sentence);
        allGoldMentions.add(mentions);

        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    // Inside an NE SGML region the current tag applies; otherwise "O".
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
            // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                ner = m.group(1);
            }
            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            else if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            }
            // found the start SGML tag for a coref mention
            else if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                mention.startIndex = sentence.size();
                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");
                Matcher m = idPattern.matcher(w);
                m.find();
                mention.mentionID = Integer.valueOf(m.group(1));
                m = refPattern.matcher(w);
                if (m.find()) {
                    mention.originalRef = Integer.valueOf(m.group(1));
                }
                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            }
            // found the end SGML tag for a coref mention
            else if (w.equals("</COREF>")) {
                Mention mention = stack.pop();
                mention.endIndex = sentence.size();
                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                // Any other token (no WORD/POS pattern): keep it as plain text.
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }

        // Rebuild the plain sentence text and assign 1-based token indices.
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    m.goldCorefClusterID = m.mentionID;
                else {
                    // Follow the REF chain until a mention with a resolved cluster
                    // (or no antecedent) is found.
                    int ref = m.originalRef;
                    while (true) {
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    // Sanity checks: pipeline output must align 1:1 with the manually built sentences.
    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i)
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class)
                    .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        // Replace the hand-built tokens with the fully annotated ones.
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // term things
    List<List<Mention>> termMentions = new ArrayList<List<Mention>>();
    if (use_term) {
        // Build CRF input features, run the external crfsuite tagger, and turn
        // the tagged terms into candidate mentions.
        String dataCrf = "";
        System.err.print("FEAT TYPE: ");
        System.err
                .println(props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP));
        if (props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP)
                .equals(MyConstants.TTE_FEATURE_NLTK)) {
            dataCrf = NltkCrfFormatter.annotationToCrfString(docAnno);
        } else {
            dataCrf = CrfFormatter.annotationToCrfString(docAnno);
        }
        List<List<String>> tagResult = new ArrayList<List<String>>();
        try {
            tagResult = CrfsuiteCaller.tag(dataCrf, props.getProperty(MyConstants.TTE_MODEL));
            if (props.containsKey(MyConstants.TTE_SAVE_CRF_DATA)) {
                // Optionally dump the CRF feature file for debugging.
                String crfDataFilename = props.getProperty(MyConstants.TTE_SAVE_CRF_DATA);
                File crfDataFile = new File(crfDataFilename);
                BufferedWriter bw = new BufferedWriter(new FileWriter(crfDataFile));
                bw.write(dataCrf);
                bw.close();
            }
        } catch (Exception e) {
            System.err.println("Crfsuite tag failed");
        }
        termAsMentionFinder.setTags(tagResult);
        termMentions = termAsMentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);
        maxID = termAsMentionFinder.getMaxID();
    }

    // extract predicted mentions
    allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    if (use_term && props.containsKey(MyConstants.TTE_KEEP_PRON)) {
        termMentions = injectPronoun(termMentions, allPredictedMentions);
    }

    // Select the final predicted-mention set according to the experiment type.
    if (experimentType != null) {
        if (experimentType.equals(MyConstants.EXP_TYPE_03_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (experimentType.equals(MyConstants.EXP_TYPE_03_INTERSECT)) {
            List<List<Mention>> usingMentions = intersectMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_CHECK)) {
            allPredictedMentions = termMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else {
            System.err.println(experimentType);
            System.err.println("Unknown experiment type. Using mention detector.");
        }
    } else if (useGoldMention) {
        allPredictedMentions = allGoldMentions;
    }

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
/**
 * Assigns a distributional-similarity class to every label in the padded list.
 * Bails out immediately if any label already carries the annotation — the list
 * is assumed to have been processed in one earlier pass.
 */
private void distSimAnnotate(PaddedList<IN> info) {
    for (CoreLabel label : info) {
        // Already annotated => a previous pass handled the whole list; stop here.
        if (label.has(CoreAnnotations.DistSimAnnotation.class)) {
            return;
        }
        String key = getWord(label);
        if (!flags.casedDistSim) {
            key = key.toLowerCase();
        }
        if (flags.numberEquivalenceDistSim) {
            // Collapse digits so numerically different tokens share a lexicon key.
            key = WordShapeClassifier.wordShape(key, WordShapeClassifier.WORDSHAPEDIGITS);
        }
        String cluster = lexicon.get(key);
        // Fall back to the configured unknown-word class for out-of-lexicon keys.
        label.set(CoreAnnotations.DistSimAnnotation.class,
                cluster == null ? flags.unknownWordDistSimClass : cluster);
    }
}
From source file:de.l3s.workive.analysis.ner.GermanNER.java
public List<Entity> extractEntities(CoreMap sentence) { List<Entity> entityList = new ArrayList<Entity>(); CoreLabel prevEntity = null; String tag = ""; for (CoreLabel token : sentence.get(TokensAnnotation.class)) { String entityTag = token.get(NamedEntityTagAnnotation.class); //System.out.println(entityTag); if (entityTag.compareToIgnoreCase("I-ORG") == 0 || entityTag.compareToIgnoreCase("I-PER") == 0 || entityTag.compareToIgnoreCase("I-LOC") == 0 || entityTag.compareToIgnoreCase("MISC") == 0) { if (prevEntity != null) { if (prevEntity.get(NamedEntityTagAnnotation.class).compareToIgnoreCase(entityTag) == 0 && prevEntity.endPosition() == token.beginPosition() - 1) { prevEntity.setEndPosition(token.endPosition()); prevEntity.set(TextAnnotation.class, prevEntity.get(TextAnnotation.class) + " " + token.get(TextAnnotation.class)); } else { Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>( prevEntity.get(TextAnnotation.class), prevEntity.beginPosition(), prevEntity.endPosition()); entityList.add(new Entity(triple, tag)); prevEntity = token;/*from ww w .j a v a2 s . co m*/ tag = entityTag; } } else { prevEntity = token; tag = entityTag; } } } if (prevEntity != null) { Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>( prevEntity.get(TextAnnotation.class), prevEntity.beginPosition(), prevEntity.endPosition()); entityList.add(new Entity(triple, tag)); tag = ""; } return entityList; }
From source file:de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp.java
License:Open Source License
/**
 * Converts a DKPro/UIMA JCas into a CoreNLP Annotation: document text, sentences,
 * tokens (with POS, lemma, stem, token-level NE tags), constituency trees and
 * dependency graphs. Blank sentences are skipped. When a non-UTF-8 {@code encoding}
 * is configured, sentence and token text are transcoded before being handed to CoreNLP.
 *
 * @param aSource the source JCas
 * @param aTarget the CoreNLP annotation to populate
 * @return {@code aTarget}, populated
 */
public Annotation convert(JCas aSource, Annotation aTarget) {
    // Document annotation
    aTarget.set(CoreAnnotations.TextAnnotation.class, aSource.getDocumentText());

    // Sentences
    List<CoreMap> sentences = new ArrayList<>();
    for (Sentence s : select(aSource, Sentence.class)) {
        if (StringUtils.isBlank(s.getCoveredText())) {
            continue;
        }

        String sentenceText = s.getCoveredText();
        if (encoding != null && !"UTF-8".equals(encoding.name())) {
            // Re-encode the UTF-8 text into the configured target charset.
            sentenceText = new String(sentenceText.getBytes(StandardCharsets.UTF_8), encoding);
        }

        Annotation sentence = new Annotation(sentenceText);
        sentence.set(CharacterOffsetBeginAnnotation.class, s.getBegin());
        sentence.set(CharacterOffsetEndAnnotation.class, s.getEnd());
        sentence.set(SentenceIndexAnnotation.class, sentences.size());

        // Tokens
        Map<Token, IndexedWord> idxTokens = new HashMap<>();
        List<CoreLabel> tokens = new ArrayList<>();
        for (Token t : selectCovered(Token.class, s)) {
            String tokenText = t.getCoveredText();
            if (encoding != null && !"UTF-8".equals(encoding.name())) {
                tokenText = new String(tokenText.getBytes(StandardCharsets.UTF_8), encoding);
            }

            CoreLabel token = tokenFactory.makeToken(tokenText, t.getBegin(), t.getEnd() - t.getBegin());
            // First add token so that tokens.size() returns a 1-based counting as required
            // by IndexAnnotation
            tokens.add(token);
            token.set(SentenceIndexAnnotation.class, sentences.size());
            token.set(IndexAnnotation.class, tokens.size());
            token.set(TokenKey.class, t);
            idxTokens.put(t, new IndexedWord(token));

            // POS tags
            if (readPos && t.getPos() != null) {
                token.set(PartOfSpeechAnnotation.class, t.getPos().getPosValue());
            }

            // Lemma
            if (t.getLemma() != null) {
                token.set(LemmaAnnotation.class, t.getLemma().getValue());
            }

            // Stem
            if (t.getStem() != null) {
                token.set(StemAnnotation.class, t.getStem().getValue());
            }

            // NamedEntity
            // TODO: only token-based NEs are supported, but not multi-token NEs
            // Supporting multi-token NEs via selectCovering would be very slow. To support
            // them, another approach would need to be implemented, e.g. via indexCovering.
            List<NamedEntity> nes = selectCovered(NamedEntity.class, t);
            if (nes.size() > 0) {
                token.set(NamedEntityTagAnnotation.class, nes.get(0).getValue());
            } else {
                token.set(NamedEntityTagAnnotation.class, "O");
            }
        }

        // Constituents
        // NOTE(review): when several ROOTs cover one sentence, each iteration
        // overwrites TreeAnnotation — only the last tree survives; presumably
        // a sentence has at most one ROOT. Verify against the type system.
        for (ROOT r : selectCovered(ROOT.class, s)) {
            Tree tree = createStanfordTree(r, idxTokens);
            tree.indexSpans();
            sentence.set(TreeAnnotation.class, tree);
        }

        // Dependencies
        List<TypedDependency> dependencies = new ArrayList<>();
        for (Dependency d : selectCovered(Dependency.class, s)) {
            TypedDependency dep = new TypedDependency(GrammaticalRelation.valueOf(d.getDependencyType()),
                    idxTokens.get(d.getGovernor()), idxTokens.get(d.getDependent()));
            if (DependencyFlavor.ENHANCED.equals(d.getFlavor())) {
                // Enhanced-flavor edges are marked as extra (non-basic) dependencies.
                dep.setExtra();
            }
            dependencies.add(dep);
        }
        sentence.set(EnhancedDependenciesAnnotation.class, new SemanticGraph(dependencies));

        if (ptb3Escaping) {
            // Apply PTB3 escaping (brackets, quotes) after all token-level annotations are set.
            tokens = applyPtbEscaping(tokens, quoteBegin, quoteEnd);
        }

        sentence.set(TokensAnnotation.class, tokens);
        sentences.add(sentence);
    }
    aTarget.set(SentencesAnnotation.class, sentences);

    return aTarget;
}
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordCoreferenceResolver.java
License:Open Source License
/**
 * Converts a DKPro Token into a CoreNLP CoreLabel, remembers the source token
 * under {@code TokenKey}, and sets the NE tag from the first covering
 * NamedEntity — or "O" when the token is not covered by any.
 */
protected CoreLabel tokenToWord(Token aToken) {
    CoreLabel word = CoreNlpUtils.tokenToWord(aToken);
    word.set(TokenKey.class, aToken);
    List<NamedEntity> coveringNes = selectCovered(NamedEntity.class, aToken);
    word.setNER(coveringNes.isEmpty() ? "O" : coveringNes.get(0).getValue());
    return word;
}
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java
License:Open Source License
@Override protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException { List<Token> casTokens = null; // Use value from language parameter, document language or fallback language - whatever // is available String language = getLanguage(aJCas); if (isWriteToken()) { casTokens = new ArrayList<Token>(); final String text = aText; final Tokenizer<?> tokenizer = getTokenizer(language, aText); int offsetInSentence = 0; List<?> tokens = tokenizer.tokenize(); outer: for (int i = 0; i < tokens.size(); i++) { final Object token = tokens.get(i); // System.out.println("Token class: "+token.getClass()); String t = null;// w w w .j av a 2s. com if (token instanceof String) { t = (String) token; } if (token instanceof CoreLabel) { CoreLabel l = (CoreLabel) token; t = l.word(); int begin = l.get(CharacterOffsetBeginAnnotation.class); int end = l.get(CharacterOffsetEndAnnotation.class); casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i)); offsetInSentence = end; continue; } if (token instanceof Word) { Word w = (Word) token; t = w.word(); } if (t == null) { throw new AnalysisEngineProcessException( new IllegalStateException("Unknown token type: " + token.getClass())); } // Skip whitespace while (isWhitespace(text.charAt(offsetInSentence))) { offsetInSentence++; if (offsetInSentence >= text.length()) { break outer; } } // Match if (text.startsWith(t, offsetInSentence)) { casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence, aZoneBegin + offsetInSentence + t.length(), i)); offsetInSentence = offsetInSentence + t.length(); } else { // System.out.println(aText); throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. 
Tokenizer: [" + t + "] CAS: [" + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length())))); } } } if (isWriteSentence()) { if (casTokens == null) { casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length()); } // Prepare the tokens for processing by WordToSentenceProcessor List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>(); for (Token token : casTokens) { CoreLabel l = new CoreLabel(); l.set(CharacterOffsetBeginAnnotation.class, token.getBegin()); l.set(CharacterOffsetEndAnnotation.class, token.getEnd()); l.setWord(token.getCoveredText()); tokensInDocument.add(l); } // The sentence splitter (probably) requires the escaped text, so we prepare it here PTBEscapingProcessor escaper = new PTBEscapingProcessor(); escaper.apply(tokensInDocument); // Apply the WordToSentenceProcessor to find the sentence boundaries WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex, boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex, newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences); List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument); for (List<CoreLabel> sentence : sentencesInDocument) { int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class); int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class); createSentence(aJCas, begin, end); } } }