Example usage for edu.stanford.nlp.ling CoreLabel endPosition

List of usage examples for edu.stanford.nlp.ling CoreLabel endPosition

Introduction

In this page you can find the example usage for edu.stanford.nlp.ling CoreLabel endPosition.

Prototype

@Override
public int endPosition() 

Source Link

Usage

From source file: ca.ualberta.entitylinking.common.indexing.Tokenizer.java

License: Open Source License

/**
 * Tokenize the given text using a Stanford NLP toolkit.
 * The tokenized results are a mix of non-overlapping tokens and named entities.
 * e.g. University of Alberta is a great university in Canada.
 * tokens: "University of Alberta", "is", "a", "great", "university", "in", "Canada".
 *
 * @param text the raw text to tokenize
 * @return tokens and multi-word named-entity tokens in document order, or
 *         {@code null} if the annotator pipeline failed on this text
 */
public List<Token> tokenizeNER(String text) {
    List<Token> ret = new ArrayList<Token>();

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);

    try {
        // run all Annotators on this text
        pipeline.annotate(document);
    } catch (Exception e) {
        System.out.println(
                "[WARNING] Stanford NER was unable to annotate the following text (more details in the stack trace): ");
        System.out.println("\t\t" + text);
        e.printStackTrace();
        return null;
    }

    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    // 'position' carries the end offset of the previously processed token; a
    // mention that closes on the current token ends at that offset.
    int position = 0;
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // surface form of the entity mention currently being accumulated
        String name = "";

        // NER tag of the previous token; "O" means "outside any entity"
        String lastNe = "O";
        int startEntity = 0;

        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {

            // this is the text of the token
            String word = token.get(TextAnnotation.class);
            // this is the lemma of the token
            String lemma = token.get(LemmaAnnotation.class);
            // this is the POS tag of the token: disabled here since we do not need it now.
            String pos = token.get(PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(NamedEntityTagAnnotation.class);

            // character offsets of the token within the original text
            int bPos = token.beginPosition();
            int ePos = token.endPosition();

            // keep track of mentions: a mention opens on an O -> non-O tag
            // transition and closes when the tag returns to O or switches
            // directly to a different entity type.
            if (lastNe.equals("O")) {
                if (!ne.equals("O")) {
                    // entering an entity: remember where it starts
                    startEntity = bPos;
                    name = word;
                }
            } else {
                if (ne.equals("O")) {
                    // the entity ended on the previous token; 'position' is
                    // that token's end offset
                    int endEntity = position;
                    //create mention.
                    Token tok = new Token(name, Token.TYPE.NE, startEntity, endEntity);
                    ret.add(tok);
                    //   System.out.println(tok);
                } else {
                    if (ne.equals(lastNe)) {
                        // same entity continues: extend its surface form
                        name += " " + word;
                    }
                }

                // tag switched directly between two entity types (e.g.
                // PERSON -> LOCATION): close the old mention, open a new one
                if (!ne.equals(lastNe) && !ne.equals("O")) {
                    int endEntity = position;
                    //create mention.
                    Token tok = new Token(name, Token.TYPE.NE, startEntity, endEntity);
                    ret.add(tok);
                    //   System.out.println(tok);

                    startEntity = bPos;
                    name = word;
                }
            }

            if (ne.equals("O")) {
                //filter out the punctuations and stop words.
                // NOTE(review): word.equals(pos) is presumably the punctuation
                // test (PTB-style taggers label punctuation with the token
                // itself) — confirm against the pipeline's tag set.
                if (!word.equals(pos) && !StringUtils.isStopWord(word)) {
                    // create token.
                    Token tok = new Token(word, Token.TYPE.TOKEN, bPos, ePos);
                    ret.add(tok);
                    // System.out.println(tok);
                }
            }

            lastNe = ne;
            position = ePos;
        }

        // verify mention ending at the last token
        if (!lastNe.equals("O") && !lastNe.equals(".")) {
            int endEntity = position;
            //create mention.
            Token tok = new Token(name, Token.TYPE.NE, startEntity, endEntity);
            ret.add(tok);
        }
    }

    return ret;
}

From source file:ca.ualberta.entitylinking.common.indexing.Tokenizer.java

License:Open Source License

/**
 * This one does not use the NER./*from   ww  w  .j  a  v  a 2 s  .c o  m*/
 * @param text
 * @return
 */
public List<Token> tokenize(String text) {
    List<Token> ret = new ArrayList<Token>();

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);

    try {
        // run all Annotators on this text
        pipeline.annotate(document);
    } catch (Exception e) {
        System.out.println(
                "[WARNING] Stanford NER was unable to annotate the following text (more details in the stack trace): ");
        System.out.println("\t\t" + text);
        e.printStackTrace();
        return null;
    }

    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {

            // this is the text of the token
            String lemma = token.get(LemmaAnnotation.class);

            int bPos = token.beginPosition();
            int ePos = token.endPosition();

            // filter out the punctuations and stop words.
            if (useful(lemma) && !StringUtils.isStopWord(lemma)) {
                // create token.
                Token tok = new Token(lemma, Token.TYPE.TOKEN, bPos, ePos);
                ret.add(tok);
            }
        }
    }

    return ret;
}

From source file:ca.ualberta.entitylinking.common.nlp.StanfordNER.java

License:Open Source License

/**
 * Runs the Stanford pipeline (optionally preceded by an Orlando CRF
 * classifier) over the text and converts each sentence into a Sentence of
 * annotated tokens plus its entity Mentions.
 *
 * @param text the raw text to annotate
 * @return one Sentence per detected sentence; empty list if annotation failed
 */
public List<Sentence> annotateText(String text) {

    // Optionally pre-classify with the Orlando model; on failure the raw
    // text is used unchanged (best effort, errors are only logged).
    if (!(orlandoModel.isEmpty())) {
        try {
            @SuppressWarnings("rawtypes")
            AbstractSequenceClassifier orlandoClassifier = CRFClassifier
                    .getClassifierNoExceptions(orlandoModel);
            text = orlandoClassifier.classifyWithInlineXML(text);
        } catch (Exception e) {
            System.err.println("[WARNING] Stanford NER was unable to classify the following: ");
            System.out.println("\t" + text + "\n");
            e.printStackTrace();
        }
    }

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    List<Sentence> mySentences = new ArrayList<Sentence>();

    try {
        // run all Annotators on this text
        pipeline.annotate(document);
    } catch (Exception e) {
        System.out.println(
                "[WARNING] Stanford NER was unable to annotate the following text (more details in the stack trace): ");
        System.out.println("\t\t" + text);
        e.printStackTrace();
        return mySentences;
    }

    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

    for (CoreMap sentence : sentences) {

        List<Token> tokens = new ArrayList<Token>();
        List<Mention> mentions = new ArrayList<Mention>();
        // 'position' is the 0-based token index within this sentence
        // (token indices, not character offsets).
        int position = 0;
        // surface form of the entity mention currently being accumulated
        String name = "";

        // NER tag of the previous token; "O" means "outside any entity"
        String lastNe = "O";
        int startEntity = 0;

        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {

            // this is the text of the token
            String word = token.get(TextAnnotation.class);
            // this is the lemma of the token
            String lemma = token.get(LemmaAnnotation.class);
            // this is the POS tag of the token
            String pos = token.get(PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(NamedEntityTagAnnotation.class);
            // this is the token offset
            int bPos = token.beginPosition();
            int ePos = token.endPosition();

            // Token stores an inclusive end offset, hence ePos - 1.
            Token mytoken = new Token(word, position, bPos, ePos - 1);
            mytoken.addAnnotation(Token.LEMMA_ANNOTATION, lemma);
            mytoken.addAnnotation(Token.POS_ANNOTATION, pos);
            mytoken.addAnnotation(Token.ENTITY_ANNOTATION, ne);
            tokens.add(mytoken);

            // keep track of mentions: a mention opens on an O -> non-O tag
            // transition and closes when the tag returns to O or switches
            // directly to a different entity type.
            if (lastNe.equals("O")) {
                if (!ne.equals("O")) {
                    startEntity = position;
                    name = word;
                }
            } else {
                if (ne.equals("O")) {
                    // mention ended on the previous token (index position - 1)
                    int endEntity = position - 1;
                    createMention(name, lastNe, startEntity, endEntity, mentions);
                } else {
                    if (ne.equals(lastNe)) {
                        // same entity continues: extend its surface form
                        name += " " + word;
                    }
                }

                // tag switched directly between two entity types: close the
                // old mention and start a new one at this token
                if (!ne.equals(lastNe) && !ne.equals("O")) {
                    int endEntity = position - 1;
                    createMention(name, lastNe, startEntity, endEntity, mentions);

                    startEntity = position;
                    name = word;
                }

            }

            //            System.out.println(word + "\t" + lemma + "\t" + pos + "\t" + ne);
            lastNe = ne;
            position++;

        }

        // verify mention ending at the last token
        if (!lastNe.equals("O") && !lastNe.equals(".")) {
            int endEntity = position - 1;
            createMention(name, lastNe, startEntity, endEntity, mentions);
        }

        Sentence mySentence = new Sentence(tokens);
        for (Mention mention : mentions) {
            mySentence.addMention(mention);

        }
        mySentences.add(mySentence);

    }

    return mySentences;
}

From source file:ca.ualberta.exemplar.core.ArgumentExtraction.java

License:Open Source License

/**
 * Expands an entity head token into the full entity span by absorbing the
 * adjacent tokens that carry the same NER tag, and wraps the result in an
 * Argument.
 *
 * @param head the dependency-graph head of the entity (1-based index)
 * @param sentence the sentence containing the head
 * @param dependencies the sentence dependency graph (currently unused)
 * @param argumentType role label to assign to the resulting argument
 * @return the argument covering the whole entity span
 */
private Argument getEntityFromHead(IndexedWord head, CoreMap sentence, SemanticGraph dependencies,
        String argumentType) {

    // IndexedWord indices are 1-based; the token list is 0-based.
    int headIndex = head.index() - 1;
    int startIndex = headIndex;
    int endIndex = headIndex;

    List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);

    CoreLabel headToken = tokens.get(headIndex);
    String ne = headToken.get(NamedEntityTagAnnotation.class);
    StringBuilder name = new StringBuilder(headToken.get(TextAnnotation.class));
    int startOffset = headToken.beginPosition();
    int endOffset = headToken.endPosition();

    // Absorb preceding tokens that share the head's NER tag.
    while (startIndex > 0) {
        CoreLabel prev = tokens.get(startIndex - 1);
        if (!ne.equals(prev.get(NamedEntityTagAnnotation.class))) {
            break;
        }
        startIndex--;
        name.insert(0, prev.get(TextAnnotation.class) + " ");
        startOffset = prev.beginPosition();
    }

    // Absorb following tokens that share the head's NER tag.
    while (endIndex + 1 < tokens.size()) {
        CoreLabel next = tokens.get(endIndex + 1);
        if (!ne.equals(next.get(NamedEntityTagAnnotation.class))) {
            break;
        }
        endIndex++;
        name.append(" ").append(next.get(TextAnnotation.class));
        endOffset = next.endPosition();
    }

    String entityName = name.toString();
    String entityType = normalizeEntityType(ne);
    String entityId = entityName + "#" + entityType;
    return new Argument(argumentType, entityId, entityName, entityType, startIndex, endIndex,
            startOffset, endOffset);
}

From source file:de.l3s.workive.analysis.ner.GermanNER.java

/**
 * Extracts named entities from a sentence by merging consecutive tokens
 * that carry the same German NER tag (I-ORG / I-PER / I-LOC / MISC).
 *
 * @param sentence the annotated sentence
 * @return entities as (text, beginPosition, endPosition) triples plus tag
 */
public List<Entity> extractEntities(CoreMap sentence) {
    List<Entity> entityList = new ArrayList<Entity>();

    // The entity token currently being grown. Note: its TextAnnotation and
    // end position are mutated in place while adjacent same-tag tokens are
    // merged into it.
    CoreLabel prevEntity = null;
    String tag = "";

    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        String entityTag = token.get(NamedEntityTagAnnotation.class);

        //System.out.println(entityTag);
        // NOTE(review): the last check uses "MISC" while the others use the
        // "I-"-prefixed form — confirm this matches the model's tag set.
        if (entityTag.compareToIgnoreCase("I-ORG") == 0 || entityTag.compareToIgnoreCase("I-PER") == 0
                || entityTag.compareToIgnoreCase("I-LOC") == 0 || entityTag.compareToIgnoreCase("MISC") == 0) {

            if (prevEntity != null) {
                // Merge when the tag matches and the two tokens are exactly
                // one character apart (assumes a single separating space —
                // TODO confirm endPosition is an exclusive offset here).
                if (prevEntity.get(NamedEntityTagAnnotation.class).compareToIgnoreCase(entityTag) == 0
                        && prevEntity.endPosition() == token.beginPosition() - 1) {
                    prevEntity.setEndPosition(token.endPosition());
                    prevEntity.set(TextAnnotation.class,
                            prevEntity.get(TextAnnotation.class) + " " + token.get(TextAnnotation.class));
                } else {
                    // Different/non-adjacent entity: flush the one being
                    // built and start a new one at this token.
                    Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>(
                            prevEntity.get(TextAnnotation.class), prevEntity.beginPosition(),
                            prevEntity.endPosition());
                    entityList.add(new Entity(triple, tag));
                    prevEntity = token;
                    tag = entityTag;

                }
            } else {
                // First entity token seen in this sentence.
                prevEntity = token;
                tag = entityTag;
            }
        }
    }

    // Flush the trailing entity, if any.
    if (prevEntity != null) {

        Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>(
                prevEntity.get(TextAnnotation.class), prevEntity.beginPosition(), prevEntity.endPosition());
        entityList.add(new Entity(triple, tag));
        tag = "";
    }
    return entityList;
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPtbTransformer.java

License:Open Source License

/**
 * Re-tokenizes the input document with Stanford's PTB tokenizer and writes
 * each token's PTB form back over its original character span via
 * {@code replace}.
 */
@Override
public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException {
    // "invertible" keeps character offsets so every token can be mapped
    // back onto the source text.
    StringReader source = new StringReader(aInput.getDocumentText());
    Tokenizer<CoreLabel> ptb = new PTBTokenizer<CoreLabel>(source, new CoreLabelTokenFactory(), "invertible");

    for (CoreLabel token : ptb.tokenize()) {
        replace(token.beginPosition(), token.endPosition(), token.word());
    }
}

From source file:de.uni_leipzig.informatik.pcai042.boa.manager.BoaSentence.java

License:Open Source License

/**
 * Creates a sentence from a CoreMap returned by a {@link Tokenizer}.
 * /*  www . ja v  a2 s  .c o  m*/
 * @param sentence
 *            the original text of the sentence
 * @param coreMap
 *            the CoreMap
 */
public BoaSentence(CoreMap coreMap) {
    sentence = coreMap.get(CoreAnnotations.TextAnnotation.class);
    tokens = new ArrayList<String>(coreMap.get(TokensAnnotation.class).size());
    beginPos = new ArrayList<Integer>(coreMap.get(TokensAnnotation.class).size());
    endPos = new ArrayList<Integer>(coreMap.get(TokensAnnotation.class).size());
    for (CoreLabel token : coreMap.get(TokensAnnotation.class)) {
        String word = token.originalText();
        tokens.add(word);
        beginPos.add(token.beginPosition());
        endPos.add(token.endPosition());
    }
    annotations = new ArrayList<BoaAnnotation>();
}

From source file:de.uni_stuttgart.ims.comparatives.nlp.SentenceSplitterStanford.java

License:Creative Commons License

/**
 * Split the string into sentences with Stanford.
 *
 * @param document the raw text to split
 * @return array of spans with the start/end character positions and the
 *         covered (space-joined) text of each sentence
 */
public TextSpan[] split(String document) {
    StringReader reader = new StringReader(document);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    dp.setTokenizerFactory(ptbTokenizerFactory);

    ArrayList<TextSpan> sentenceSpansList = new ArrayList<TextSpan>();
    for (List<HasWord> sent : dp) {
        // Defensive: an empty sentence would make sent.get(0) throw.
        if (sent.isEmpty()) {
            continue;
        }
        CoreLabel firstword = (CoreLabel) sent.get(0);
        CoreLabel lastword = (CoreLabel) sent.get(sent.size() - 1);
        // StringBuilder instead of String += in a loop (avoids O(n^2)
        // copying). The trailing space is kept for output compatibility.
        StringBuilder coveredText = new StringBuilder();
        for (int i = 0; i < sent.size(); i++) {
            CoreLabel word = (CoreLabel) sent.get(i);
            coveredText.append(word.value()).append(' ');
        }
        sentenceSpansList.add(new TextSpan(firstword.beginPosition(), lastword.endPosition(),
                coveredText.toString()));
    }

    return sentenceSpansList.toArray(new TextSpan[0]);

}

From source file:dfh.grammar.stanfordnlp.CnlpTokenSequenceFactory.java

License:LGPL

/**
 * Converts an annotated document into a token sequence.
 *
 * @param document the annotated document
 * @return token sequence containing one WordToken per token plus one
 *         SentenceToken per non-empty sentence
 */
public TokenSequence<CnlpToken<?>> sequence(Annotation document) {
    String text = document.get(TextAnnotation.class);
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and
    // has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

    // ArrayList over LinkedList: the list is only appended to and iterated.
    // Fully qualified to avoid depending on a java.util.ArrayList import.
    List<CnlpToken<?>> tokens = new java.util.ArrayList<CnlpToken<?>>();
    for (CoreMap sentence : sentences) {
        // Character span of the sentence, derived from its first and last tokens.
        Integer sstart = null, send = null;
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            tokens.add(new WordToken(token));
            if (sstart == null)
                sstart = token.beginPosition();
            send = token.endPosition();
        }
        // A sentence with no tokens produces no SentenceToken.
        if (sstart != null)
            tokens.add(new SentenceToken(sstart, send, sentence));
    }
    return new TokenSequence<CnlpToken<?>>(text, tokens);
}

From source file:dfh.grammar.stanfordnlp.WordToken.java

License:LGPL

/**
 * Wraps a Stanford CoreLabel as a word token spanning the label's
 * character offsets.
 *
 * @param t the CoreLabel to wrap
 */
public WordToken(CoreLabel t) {
    super(t.beginPosition(), t.endPosition(), t);
    // NOTE(review): presumably null when no POS annotator ran — confirm callers cope.
    tag = t.get(PartOfSpeechAnnotation.class);
}