List of usage examples for edu.stanford.nlp.ling CoreLabel beginPosition
@Override public int beginPosition()
From source file:ca.ualberta.entitylinking.common.indexing.Tokenizer.java
License:Open Source License
/**
 * Tokenizes the given text using the Stanford NLP pipeline.
 * The result is a mix of non-overlapping single tokens and multi-token named entities,
 * e.g. for "University of Alberta is a great university in Canada." the output is:
 * "University of Alberta", "is", "a", "great", "university", "in", "Canada".
 *
 * @param text the raw text to tokenize
 * @return the list of tokens/entities with character offsets, or {@code null}
 *         if the pipeline failed to annotate the text
 */
public List<Token> tokenizeNER(String text) {
    List<Token> ret = new ArrayList<Token>();

    // Create an empty Annotation just with the given text.
    Annotation document = new Annotation(text);

    try {
        // Run all Annotators on this text.
        pipeline.annotate(document);
    } catch (Exception e) {
        System.out.println(
                "[WARNING] Stanford NER was unable to annotate the following text (more details in the stack trace): ");
        System.out.println("\t\t" + text);
        e.printStackTrace();
        // NOTE(review): callers must handle a null return here.
        return null;
    }

    // Character offset of the end of the previously processed token; used as the
    // exclusive end offset of a mention that terminates at the previous token.
    int position = 0;

    // A CoreMap is essentially a Map keyed by class objects with typed values.
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        String name = "";       // surface form of the entity currently being built
        String lastNe = "O";    // NE tag of the previous token ("O" = outside any entity)
        int startEntity = 0;    // begin offset of the entity currently being built

        // A CoreLabel is a CoreMap with additional token-specific methods.
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String lemma = token.get(LemmaAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            String ne = token.get(NamedEntityTagAnnotation.class);
            int bPos = token.beginPosition();
            int ePos = token.endPosition();

            // Track mention boundaries by comparing this token's NE tag to the previous one.
            if (lastNe.equals("O")) {
                if (!ne.equals("O")) {
                    // Entering an entity: remember where it starts.
                    startEntity = bPos;
                    name = word;
                }
            } else {
                if (ne.equals("O")) {
                    // Leaving an entity: emit the accumulated mention, ending at the
                    // previous token's end offset.
                    int endEntity = position;
                    Token tok = new Token(name, Token.TYPE.NE, startEntity, endEntity);
                    ret.add(tok);
                } else {
                    if (ne.equals(lastNe)) {
                        // Same entity continues: extend the surface form.
                        name += " " + word;
                    }
                }
                if (!ne.equals(lastNe) && !ne.equals("O")) {
                    // Entity type changed without an intervening "O" token:
                    // close the previous mention and start a new one here.
                    int endEntity = position;
                    Token tok = new Token(name, Token.TYPE.NE, startEntity, endEntity);
                    ret.add(tok);
                    startEntity = bPos;
                    name = word;
                }
            }

            if (ne.equals("O")) {
                // Plain token: skip punctuation and stop words.
                // In the Penn Treebank tag set, punctuation tokens are tagged with
                // themselves (e.g. "." has POS "."), hence the word/pos equality test.
                if (!word.equals(pos) && !StringUtils.isStopWord(word)) {
                    Token tok = new Token(word, Token.TYPE.TOKEN, bPos, ePos);
                    ret.add(tok);
                }
            }

            lastNe = ne;
            position = ePos;
        }

        // Flush a mention that runs up to the last token of the sentence.
        if (!lastNe.equals("O") && !lastNe.equals(".")) {
            int endEntity = position;
            Token tok = new Token(name, Token.TYPE.NE, startEntity, endEntity);
            ret.add(tok);
        }
    }

    return ret;
}
From source file:ca.ualberta.entitylinking.common.indexing.Tokenizer.java
License:Open Source License
/** * This one does not use the NER./*from w ww. ja v a 2 s. c om*/ * @param text * @return */ public List<Token> tokenize(String text) { List<Token> ret = new ArrayList<Token>(); // create an empty Annotation just with the given text Annotation document = new Annotation(text); try { // run all Annotators on this text pipeline.annotate(document); } catch (Exception e) { System.out.println( "[WARNING] Stanford NER was unable to annotate the following text (more details in the stack trace): "); System.out.println("\t\t" + text); e.printStackTrace(); return null; } // these are all the sentences in this document // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types List<CoreMap> sentences = document.get(SentencesAnnotation.class); for (CoreMap sentence : sentences) { // traversing the words in the current sentence // a CoreLabel is a CoreMap with additional token-specific methods for (CoreLabel token : sentence.get(TokensAnnotation.class)) { // this is the text of the token String lemma = token.get(LemmaAnnotation.class); int bPos = token.beginPosition(); int ePos = token.endPosition(); // filter out the punctuations and stop words. if (useful(lemma) && !StringUtils.isStopWord(lemma)) { // create token. Token tok = new Token(lemma, Token.TYPE.TOKEN, bPos, ePos); ret.add(tok); } } } return ret; }
From source file:ca.ualberta.entitylinking.common.nlp.StanfordNER.java
License:Open Source License
/**
 * Annotates the given text with the Stanford pipeline (optionally pre-classifying
 * it with an Orlando CRF model) and converts the result into project-local
 * {@link Sentence}, {@link Token} and mention objects.
 *
 * @param text the raw text to annotate
 * @return the annotated sentences; empty if the pipeline failed
 */
public List<Sentence> annotateText(String text) {
    // Optional pre-pass: if an Orlando model is configured, run it and replace
    // the input text with its inline-XML classified form. Failures here are
    // logged but non-fatal — the original text is annotated instead.
    if (!(orlandoModel.isEmpty())) {
        try {
            @SuppressWarnings("rawtypes")
            AbstractSequenceClassifier orlandoClassifier = CRFClassifier
                    .getClassifierNoExceptions(orlandoModel);
            text = orlandoClassifier.classifyWithInlineXML(text);
        } catch (Exception e) {
            System.err.println("[WARNING] Stanford NER was unable to classify the following: ");
            System.out.println("\t" + text + "\n");
            e.printStackTrace();
        }
    }

    // Create an empty Annotation just with the given text.
    Annotation document = new Annotation(text);
    List<Sentence> mySentences = new ArrayList<Sentence>();

    try {
        // Run all Annotators on this text.
        pipeline.annotate(document);
    } catch (Exception e) {
        System.out.println(
                "[WARNING] Stanford NER was unable to annotate the following text (more details in the stack trace): ");
        System.out.println("\t\t" + text);
        e.printStackTrace();
        return mySentences; // empty list on failure, never null
    }

    // A CoreMap is essentially a Map keyed by class objects with typed values.
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        List<Token> tokens = new ArrayList<Token>();
        List<Mention> mentions = new ArrayList<Mention>();
        int position = 0;     // token index within the sentence (not a char offset)
        String name = "";     // surface form of the entity currently being built
        String lastNe = "O";  // NE tag of the previous token
        int startEntity = 0;  // token index where the current entity started

        // A CoreLabel is a CoreMap with additional token-specific methods.
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String lemma = token.get(LemmaAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            String ne = token.get(NamedEntityTagAnnotation.class);
            // Character offsets of this token in the (possibly pre-classified) text.
            int bPos = token.beginPosition();
            int ePos = token.endPosition();

            // NOTE(review): end offset is stored inclusive (ePos - 1), unlike
            // CoreLabel.endPosition() which is exclusive — confirm Token's contract.
            Token mytoken = new Token(word, position, bPos, ePos - 1);
            mytoken.addAnnotation(Token.LEMMA_ANNOTATION, lemma);
            mytoken.addAnnotation(Token.POS_ANNOTATION, pos);
            mytoken.addAnnotation(Token.ENTITY_ANNOTATION, ne);
            tokens.add(mytoken);

            // Track mention boundaries; endEntity is the index of the previous
            // token because that is where the just-finished entity ended.
            if (lastNe.equals("O")) {
                if (!ne.equals("O")) {
                    // Entering an entity.
                    startEntity = position;
                    name = word;
                }
            } else {
                if (ne.equals("O")) {
                    // Leaving an entity: emit it.
                    int endEntity = position - 1;
                    createMention(name, lastNe, startEntity, endEntity, mentions);
                } else {
                    if (ne.equals(lastNe)) {
                        // Same entity continues: extend the surface form.
                        name += " " + word;
                    }
                }
                if (!ne.equals(lastNe) && !ne.equals("O")) {
                    // Entity type changed with no intervening "O": close the old
                    // mention and start a new one at this token.
                    int endEntity = position - 1;
                    createMention(name, lastNe, startEntity, endEntity, mentions);
                    startEntity = position;
                    name = word;
                }
            }

            lastNe = ne;
            position++;
        }

        // Flush a mention that runs up to the last token of the sentence.
        if (!lastNe.equals("O") && !lastNe.equals(".")) {
            int endEntity = position - 1;
            createMention(name, lastNe, startEntity, endEntity, mentions);
        }

        Sentence mySentence = new Sentence(tokens);
        for (Mention mention : mentions) {
            mySentence.addMention(mention);
        }
        mySentences.add(mySentence);
    }

    return mySentences;
}
From source file:ca.ualberta.exemplar.core.ArgumentExtraction.java
License:Open Source License
private Argument getEntityFromHead(IndexedWord head, CoreMap sentence, SemanticGraph dependencies, String argumentType) {//from ww w .j a va 2s . co m int startIndex = head.index() - 1; //Changing from starting at 1 to starting at 0 int endIndex = head.index() - 1; List<CoreLabel> tokens = sentence.get(TokensAnnotation.class); CoreLabel token = tokens.get(startIndex); String ne = token.get(NamedEntityTagAnnotation.class); StringBuilder builder = new StringBuilder(); builder.append(token.get(TextAnnotation.class)); int startOffset = token.beginPosition(); int endOffset = token.endPosition(); // Look for first token of the entity. for (int index = startIndex - 1; index >= 0; index--) { token = tokens.get(index); String word = token.get(TextAnnotation.class); if (!ne.equals(token.get(NamedEntityTagAnnotation.class))) break; startIndex--; builder.insert(0, word + " "); startOffset = token.beginPosition(); } for (int index = endIndex + 1; index < tokens.size(); index++) { token = tokens.get(index); String word = token.get(TextAnnotation.class); if (!ne.equals(token.get(NamedEntityTagAnnotation.class))) break; endIndex++; builder.append(" " + word); endOffset = token.endPosition(); } String entityName = builder.toString(); String entityType = normalizeEntityType(ne); String entityId = entityName + "#" + entityType; Argument argument = new Argument(argumentType, entityId, entityName, entityType, startIndex, endIndex, startOffset, endOffset); return argument; }
From source file:ca.ualberta.exemplar.core.CleanPrefixAnnotator.java
License:Open Source License
@Override public void annotate(Annotation document) { if (document.has(SentencesAnnotation.class)) { for (CoreMap sentence : document.get(SentencesAnnotation.class)) { List<CoreLabel> tokens = sentence.get(TokensAnnotation.class); int numTokens = 0, numPrefixParts = 0; // Assumption: prefix is at max 10 tokens for (int i = 0; i < Math.min(tokens.size(), 10); i++) { CoreLabel token = tokens.get(i); String tokenText = token.get(TextAnnotation.class); if (tokenText != null && numTokens > 0 && (tokenText.equals("--") || tokenText.equals(":"))) { // Assumption: if more than half the tokens are a date/location/number it's a prefix double fraction = (double) numPrefixParts / (double) numTokens; if (fraction > 0.5) { CoreLabel nextToken = tokens.get(i + 1); String before = document.get(TextAnnotation.class).substring(0, nextToken.beginPosition()); nextToken.set(BeforeAnnotation.class, before); sentence.set(TokensAnnotation.class, tokens.subList(i + 1, tokens.size())); //System.out.println("Removed Prefix: " + before); }// ww w . java 2 s . co m break; } numTokens++; String neTag = token.ner(); if (neTag != null && (neTag.equals("DATE") || neTag.equals("LOCATION") || neTag.equals("NUMBER") || neTag.equals("ORDINAL"))) { numPrefixParts++; } } } } }
From source file:de.l3s.workive.analysis.ner.GermanNER.java
public List<Entity> extractEntities(CoreMap sentence) { List<Entity> entityList = new ArrayList<Entity>(); CoreLabel prevEntity = null;//from www.ja va 2s .c o m String tag = ""; for (CoreLabel token : sentence.get(TokensAnnotation.class)) { String entityTag = token.get(NamedEntityTagAnnotation.class); //System.out.println(entityTag); if (entityTag.compareToIgnoreCase("I-ORG") == 0 || entityTag.compareToIgnoreCase("I-PER") == 0 || entityTag.compareToIgnoreCase("I-LOC") == 0 || entityTag.compareToIgnoreCase("MISC") == 0) { if (prevEntity != null) { if (prevEntity.get(NamedEntityTagAnnotation.class).compareToIgnoreCase(entityTag) == 0 && prevEntity.endPosition() == token.beginPosition() - 1) { prevEntity.setEndPosition(token.endPosition()); prevEntity.set(TextAnnotation.class, prevEntity.get(TextAnnotation.class) + " " + token.get(TextAnnotation.class)); } else { Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>( prevEntity.get(TextAnnotation.class), prevEntity.beginPosition(), prevEntity.endPosition()); entityList.add(new Entity(triple, tag)); prevEntity = token; tag = entityTag; } } else { prevEntity = token; tag = entityTag; } } } if (prevEntity != null) { Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>( prevEntity.get(TextAnnotation.class), prevEntity.beginPosition(), prevEntity.endPosition()); entityList.add(new Entity(triple, tag)); tag = ""; } return entityList; }
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPtbTransformer.java
License:Open Source License
@Override public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException { Tokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(new StringReader(aInput.getDocumentText()), new CoreLabelTokenFactory(), "invertible"); for (CoreLabel label : tokenizer.tokenize()) { replace(label.beginPosition(), label.endPosition(), label.word()); }/* www.j ava2 s. c o m*/ }
From source file:de.uni_leipzig.informatik.pcai042.boa.manager.BoaSentence.java
License:Open Source License
/** * Creates a sentence from a CoreMap returned by a {@link Tokenizer}. * //from w w w .j a v a2 s . c o m * @param sentence * the original text of the sentence * @param coreMap * the CoreMap */ public BoaSentence(CoreMap coreMap) { sentence = coreMap.get(CoreAnnotations.TextAnnotation.class); tokens = new ArrayList<String>(coreMap.get(TokensAnnotation.class).size()); beginPos = new ArrayList<Integer>(coreMap.get(TokensAnnotation.class).size()); endPos = new ArrayList<Integer>(coreMap.get(TokensAnnotation.class).size()); for (CoreLabel token : coreMap.get(TokensAnnotation.class)) { String word = token.originalText(); tokens.add(word); beginPos.add(token.beginPosition()); endPos.add(token.endPosition()); } annotations = new ArrayList<BoaAnnotation>(); }
From source file:de.uni_stuttgart.ims.comparatives.nlp.SentenceSplitterStanford.java
License:Creative Commons License
/** * Split the string into sentences with Stanford. * @return List of spans with the start/end positions of each sentence. *///from www.ja va 2 s . c om public TextSpan[] split(String document) { StringReader reader = new StringReader(document); DocumentPreprocessor dp = new DocumentPreprocessor(reader); dp.setTokenizerFactory(ptbTokenizerFactory); ArrayList<TextSpan> sentenceSpansList = new ArrayList<TextSpan>(); for (List<HasWord> sent : dp) { CoreLabel firstword = (CoreLabel) sent.get(0); CoreLabel lastword = (CoreLabel) sent.get(sent.size() - 1); String coveredText = ""; for (int i = 0; i < sent.size(); i++) { CoreLabel word = (CoreLabel) sent.get(i); coveredText += word.value() + " "; } sentenceSpansList.add(new TextSpan(firstword.beginPosition(), lastword.endPosition(), coveredText)); } return sentenceSpansList.toArray(new TextSpan[0]); }
From source file:dfh.grammar.stanfordnlp.CnlpTokenSequenceFactory.java
License:LGPL
/** * Converts an annotated document into a token sequence. * /* w w w .j a v a 2 s . c o m*/ * @param document * @return token sequence */ public TokenSequence<CnlpToken<?>> sequence(Annotation document) { String text = document.get(TextAnnotation.class); // these are all the sentences in this document // a CoreMap is essentially a Map that uses class objects as keys and // has values with custom types List<CoreMap> sentences = document.get(SentencesAnnotation.class); List<CnlpToken<?>> tokens = new LinkedList<CnlpToken<?>>(); for (CoreMap sentence : sentences) { Integer sstart = null, send = null; // traversing the words in the current sentence // a CoreLabel is a CoreMap with additional token-specific methods for (CoreLabel token : sentence.get(TokensAnnotation.class)) { tokens.add(new WordToken(token)); if (sstart == null) sstart = token.beginPosition(); send = token.endPosition(); } if (sstart != null) tokens.add(new SentenceToken(sstart, send, sentence)); } return new TokenSequence<CnlpToken<?>>(text, tokens); }