List of usage examples for edu.stanford.nlp.ling CoreLabel beginPosition
@Override public int beginPosition()
From source file:ca.ualberta.entitylinking.common.indexing.Tokenizer.java
License:Open Source License
/**
 * Tokenizes the given text using the Stanford NLP pipeline.
 * The result is a mix of non-overlapping single tokens and multi-token named entities,
 * e.g. for "University of Alberta is a great university in Canada." the output is:
 * "University of Alberta", "is", "a", "great", "university", "in", "Canada".
 *
 * @param text the raw text to tokenize
 * @return the list of tokens/entities with character offsets, or {@code null}
 *         if the pipeline failed to annotate the text
 */
public List<Token> tokenizeNER(String text) {
    List<Token> ret = new ArrayList<Token>();

    // Create an empty Annotation just with the given text.
    Annotation document = new Annotation(text);

    try {
        // Run all Annotators on this text.
        pipeline.annotate(document);
    } catch (Exception e) {
        System.out.println(
                "[WARNING] Stanford NER was unable to annotate the following text (more details in the stack trace): ");
        System.out.println("\t\t" + text);
        e.printStackTrace();
        // NOTE(review): callers must handle a null return here.
        return null;
    }

    // Character offset of the end of the previously processed token; used as the
    // exclusive end offset of a mention that terminates at the previous token.
    int position = 0;

    // A CoreMap is essentially a Map keyed by class objects with typed values.
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        String name = "";       // surface form of the entity currently being built
        String lastNe = "O";    // NE tag of the previous token ("O" = outside any entity)
        int startEntity = 0;    // begin offset of the entity currently being built

        // A CoreLabel is a CoreMap with additional token-specific methods.
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String lemma = token.get(LemmaAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            String ne = token.get(NamedEntityTagAnnotation.class);
            int bPos = token.beginPosition();
            int ePos = token.endPosition();

            // Track mention boundaries by comparing this token's NE tag to the previous one.
            if (lastNe.equals("O")) {
                if (!ne.equals("O")) {
                    // Entering an entity: remember where it starts.
                    startEntity = bPos;
                    name = word;
                }
            } else {
                if (ne.equals("O")) {
                    // Leaving an entity: emit the accumulated mention, ending at the
                    // previous token's end offset.
                    int endEntity = position;
                    Token tok = new Token(name, Token.TYPE.NE, startEntity, endEntity);
                    ret.add(tok);
                } else {
                    if (ne.equals(lastNe)) {
                        // Same entity continues: extend the surface form.
                        name += " " + word;
                    }
                }
                if (!ne.equals(lastNe) && !ne.equals("O")) {
                    // Entity type changed without an intervening "O" token:
                    // close the previous mention and start a new one here.
                    int endEntity = position;
                    Token tok = new Token(name, Token.TYPE.NE, startEntity, endEntity);
                    ret.add(tok);
                    startEntity = bPos;
                    name = word;
                }
            }

            if (ne.equals("O")) {
                // Plain token: skip punctuation and stop words.
                // In the Penn Treebank tag set, punctuation tokens are tagged with
                // themselves (e.g. "." has POS "."), hence the word/pos equality test.
                if (!word.equals(pos) && !StringUtils.isStopWord(word)) {
                    Token tok = new Token(word, Token.TYPE.TOKEN, bPos, ePos);
                    ret.add(tok);
                }
            }

            lastNe = ne;
            position = ePos;
        }

        // Flush a mention that runs up to the last token of the sentence.
        if (!lastNe.equals("O") && !lastNe.equals(".")) {
            int endEntity = position;
            Token tok = new Token(name, Token.TYPE.NE, startEntity, endEntity);
            ret.add(tok);
        }
    }

    return ret;
}
From source file:ca.ualberta.entitylinking.common.indexing.Tokenizer.java
License:Open Source License
/** * This one does not use the NER./*from w ww. ja v a 2 s. c om*/ * @param text * @return */ public List<Token> tokenize(String text) { List<Token> ret = new ArrayList<Token>(); // create an empty Annotation just with the given text Annotation document = new Annotation(text); try { // run all Annotators on this text pipeline.annotate(document); } catch (Exception e) { System.out.println( "[WARNING] Stanford NER was unable to annotate the following text (more details in the stack trace): "); System.out.println("\t\t" + text); e.printStackTrace(); return null; } // these are all the sentences in this document // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types List<CoreMap> sentences = document.get(SentencesAnnotation.class); for (CoreMap sentence : sentences) { // traversing the words in the current sentence // a CoreLabel is a CoreMap with additional token-specific methods for (CoreLabel token : sentence.get(TokensAnnotation.class)) { // this is the text of the token String lemma = token.get(LemmaAnnotation.class); int bPos = token.beginPosition(); int ePos = token.endPosition(); // filter out the punctuations and stop words. if (useful(lemma) && !StringUtils.isStopWord(lemma)) { // create token. Token tok = new Token(lemma, Token.TYPE.TOKEN, bPos, ePos); ret.add(tok); } } } return ret; }
From source file:ca.ualberta.entitylinking.common.nlp.StanfordNER.java
License:Open Source License
/**
 * Annotates the given text with the Stanford pipeline (optionally pre-classifying
 * it with an Orlando CRF model) and converts the result into project-local
 * {@link Sentence}, {@link Token} and mention objects.
 *
 * @param text the raw text to annotate
 * @return the annotated sentences; empty if the pipeline failed
 */
public List<Sentence> annotateText(String text) {
    // Optional pre-pass: if an Orlando model is configured, run it and replace
    // the input text with its inline-XML classified form. Failures here are
    // logged but non-fatal — the original text is annotated instead.
    if (!(orlandoModel.isEmpty())) {
        try {
            @SuppressWarnings("rawtypes")
            AbstractSequenceClassifier orlandoClassifier = CRFClassifier
                    .getClassifierNoExceptions(orlandoModel);
            text = orlandoClassifier.classifyWithInlineXML(text);
        } catch (Exception e) {
            System.err.println("[WARNING] Stanford NER was unable to classify the following: ");
            System.out.println("\t" + text + "\n");
            e.printStackTrace();
        }
    }

    // Create an empty Annotation just with the given text.
    Annotation document = new Annotation(text);
    List<Sentence> mySentences = new ArrayList<Sentence>();

    try {
        // Run all Annotators on this text.
        pipeline.annotate(document);
    } catch (Exception e) {
        System.out.println(
                "[WARNING] Stanford NER was unable to annotate the following text (more details in the stack trace): ");
        System.out.println("\t\t" + text);
        e.printStackTrace();
        return mySentences; // empty list on failure, never null
    }

    // A CoreMap is essentially a Map keyed by class objects with typed values.
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        List<Token> tokens = new ArrayList<Token>();
        List<Mention> mentions = new ArrayList<Mention>();
        int position = 0;     // token index within the sentence (not a char offset)
        String name = "";     // surface form of the entity currently being built
        String lastNe = "O";  // NE tag of the previous token
        int startEntity = 0;  // token index where the current entity started

        // A CoreLabel is a CoreMap with additional token-specific methods.
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            String word = token.get(TextAnnotation.class);
            String lemma = token.get(LemmaAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class);
            String ne = token.get(NamedEntityTagAnnotation.class);
            // Character offsets of this token in the (possibly pre-classified) text.
            int bPos = token.beginPosition();
            int ePos = token.endPosition();

            // NOTE(review): end offset is stored inclusive (ePos - 1), unlike
            // CoreLabel.endPosition() which is exclusive — confirm Token's contract.
            Token mytoken = new Token(word, position, bPos, ePos - 1);
            mytoken.addAnnotation(Token.LEMMA_ANNOTATION, lemma);
            mytoken.addAnnotation(Token.POS_ANNOTATION, pos);
            mytoken.addAnnotation(Token.ENTITY_ANNOTATION, ne);
            tokens.add(mytoken);

            // Track mention boundaries; endEntity is the index of the previous
            // token because that is where the just-finished entity ended.
            if (lastNe.equals("O")) {
                if (!ne.equals("O")) {
                    // Entering an entity.
                    startEntity = position;
                    name = word;
                }
            } else {
                if (ne.equals("O")) {
                    // Leaving an entity: emit it.
                    int endEntity = position - 1;
                    createMention(name, lastNe, startEntity, endEntity, mentions);
                } else {
                    if (ne.equals(lastNe)) {
                        // Same entity continues: extend the surface form.
                        name += " " + word;
                    }
                }
                if (!ne.equals(lastNe) && !ne.equals("O")) {
                    // Entity type changed with no intervening "O": close the old
                    // mention and start a new one at this token.
                    int endEntity = position - 1;
                    createMention(name, lastNe, startEntity, endEntity, mentions);
                    startEntity = position;
                    name = word;
                }
            }

            lastNe = ne;
            position++;
        }

        // Flush a mention that runs up to the last token of the sentence.
        if (!lastNe.equals("O") && !lastNe.equals(".")) {
            int endEntity = position - 1;
            createMention(name, lastNe, startEntity, endEntity, mentions);
        }

        Sentence mySentence = new Sentence(tokens);
        for (Mention mention : mentions) {
            mySentence.addMention(mention);
        }
        mySentences.add(mySentence);
    }

    return mySentences;
}
From source file:ca.ualberta.exemplar.core.ArgumentExtraction.java
License:Open Source License
private Argument getEntityFromHead(IndexedWord head, CoreMap sentence, SemanticGraph dependencies, String argumentType) {//from ww w .j a va 2s . co m int startIndex = head.index() - 1; //Changing from starting at 1 to starting at 0 int endIndex = head.index() - 1; List<CoreLabel> tokens = sentence.get(TokensAnnotation.class); CoreLabel token = tokens.get(startIndex); String ne = token.get(NamedEntityTagAnnotation.class); StringBuilder builder = new StringBuilder(); builder.append(token.get(TextAnnotation.class)); int startOffset = token.beginPosition(); int endOffset = token.endPosition(); // Look for first token of the entity. for (int index = startIndex - 1; index >= 0; index--) { token = tokens.get(index); String word = token.get(TextAnnotation.class); if (!ne.equals(token.get(NamedEntityTagAnnotation.class))) break; startIndex--; builder.insert(0, word + " "); startOffset = token.beginPosition(); } for (int index = endIndex + 1; index < tokens.size(); index++) { token = tokens.get(index); String word = token.get(TextAnnotation.class); if (!ne.equals(token.get(NamedEntityTagAnnotation.class))) break; endIndex++; builder.append(" " + word); endOffset = token.endPosition(); } String entityName = builder.toString(); String entityType = normalizeEntityType(ne); String entityId = entityName + "#" + entityType; Argument argument = new Argument(argumentType, entityId, entityName, entityType, startIndex, endIndex, startOffset, endOffset); return argument; }
From source file:ca.ualberta.exemplar.core.CleanPrefixAnnotator.java
License:Open Source License
@Override public void annotate(Annotation document) { if (document.has(SentencesAnnotation.class)) { for (CoreMap sentence : document.get(SentencesAnnotation.class)) { List<CoreLabel> tokens = sentence.get(TokensAnnotation.class); int numTokens = 0, numPrefixParts = 0; // Assumption: prefix is at max 10 tokens for (int i = 0; i < Math.min(tokens.size(), 10); i++) { CoreLabel token = tokens.get(i); String tokenText = token.get(TextAnnotation.class); if (tokenText != null && numTokens > 0 && (tokenText.equals("--") || tokenText.equals(":"))) { // Assumption: if more than half the tokens are a date/location/number it's a prefix double fraction = (double) numPrefixParts / (double) numTokens; if (fraction > 0.5) { CoreLabel nextToken = tokens.get(i + 1); String before = document.get(TextAnnotation.class).substring(0, nextToken.beginPosition()); nextToken.set(BeforeAnnotation.class, before); sentence.set(TokensAnnotation.class, tokens.subList(i + 1, tokens.size())); //System.out.println("Removed Prefix: " + before); }// ww w . java 2 s . co m break; } numTokens++; String neTag = token.ner(); if (neTag != null && (neTag.equals("DATE") || neTag.equals("LOCATION") || neTag.equals("NUMBER") || neTag.equals("ORDINAL"))) { numPrefixParts++; } } } } }
From source file:de.l3s.workive.analysis.ner.GermanNER.java
public List<Entity> extractEntities(CoreMap sentence) { List<Entity> entityList = new ArrayList<Entity>(); CoreLabel prevEntity = null;//from www.ja va 2s .c o m String tag = ""; for (CoreLabel token : sentence.get(TokensAnnotation.class)) { String entityTag = token.get(NamedEntityTagAnnotation.class); //System.out.println(entityTag); if (entityTag.compareToIgnoreCase("I-ORG") == 0 || entityTag.compareToIgnoreCase("I-PER") == 0 || entityTag.compareToIgnoreCase("I-LOC") == 0 || entityTag.compareToIgnoreCase("MISC") == 0) { if (prevEntity != null) { if (prevEntity.get(NamedEntityTagAnnotation.class).compareToIgnoreCase(entityTag) == 0 && prevEntity.endPosition() == token.beginPosition() - 1) { prevEntity.setEndPosition(token.endPosition()); prevEntity.set(TextAnnotation.class, prevEntity.get(TextAnnotation.class) + " " + token.get(TextAnnotation.class)); } else { Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>( prevEntity.get(TextAnnotation.class), prevEntity.beginPosition(), prevEntity.endPosition()); entityList.add(new Entity(triple, tag)); prevEntity = token; tag = entityTag; } } else { prevEntity = token; tag = entityTag; } } } if (prevEntity != null) { Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>( prevEntity.get(TextAnnotation.class), prevEntity.beginPosition(), prevEntity.endPosition()); entityList.add(new Entity(triple, tag)); tag = ""; } return entityList; }
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPtbTransformer.java
License:Open Source License
@Override public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException { Tokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(new StringReader(aInput.getDocumentText()), new CoreLabelTokenFactory(), "invertible"); for (CoreLabel label : tokenizer.tokenize()) { replace(label.beginPosition(), label.endPosition(), label.word()); }/* www.j ava2 s. c o m*/ }
From source file:de.uni_leipzig.informatik.pcai042.boa.manager.BoaSentence.java
License:Open Source License
/** * Creates a sentence from a CoreMap returned by a {@link Tokenizer}. * //from w w w .j a v a2 s . c o m * @param sentence * the original text of the sentence * @param coreMap * the CoreMap */ public BoaSentence(CoreMap coreMap) { sentence = coreMap.get(CoreAnnotations.TextAnnotation.class); tokens = new ArrayList<String>(coreMap.get(TokensAnnotation.class).size()); beginPos = new ArrayList<Integer>(coreMap.get(TokensAnnotation.class).size()); endPos = new ArrayList<Integer>(coreMap.get(TokensAnnotation.class).size()); for (CoreLabel token : coreMap.get(TokensAnnotation.class)) { String word = token.originalText(); tokens.add(word); beginPos.add(token.beginPosition()); endPos.add(token.endPosition()); } annotations = new ArrayList<BoaAnnotation>(); }
From source file:de.uni_stuttgart.ims.comparatives.nlp.SentenceSplitterStanford.java
License:Creative Commons License
/** * Split the string into sentences with Stanford. * @return List of spans with the start/end positions of each sentence. *///from www.ja va 2 s . c om public TextSpan[] split(String document) { StringReader reader = new StringReader(document); DocumentPreprocessor dp = new DocumentPreprocessor(reader); dp.setTokenizerFactory(ptbTokenizerFactory); ArrayList<TextSpan> sentenceSpansList = new ArrayList<TextSpan>(); for (List<HasWord> sent : dp) { CoreLabel firstword = (CoreLabel) sent.get(0); CoreLabel lastword = (CoreLabel) sent.get(sent.size() - 1); String coveredText = ""; for (int i = 0; i < sent.size(); i++) { CoreLabel word = (CoreLabel) sent.get(i); coveredText += word.value() + " "; } sentenceSpansList.add(new TextSpan(firstword.beginPosition(), lastword.endPosition(), coveredText)); } return sentenceSpansList.toArray(new TextSpan[0]); }
From source file:dfh.grammar.stanfordnlp.CnlpTokenSequenceFactory.java
License:LGPL
/** * Converts an annotated document into a token sequence. * /* w w w .j a v a 2 s . c o m*/ * @param document * @return token sequence */ public TokenSequence<CnlpToken<?>> sequence(Annotation document) { String text = document.get(TextAnnotation.class); // these are all the sentences in this document // a CoreMap is essentially a Map that uses class objects as keys and // has values with custom types List<CoreMap> sentences = document.get(SentencesAnnotation.class); List<CnlpToken<?>> tokens = new LinkedList<CnlpToken<?>>(); for (CoreMap sentence : sentences) { Integer sstart = null, send = null; // traversing the words in the current sentence // a CoreLabel is a CoreMap with additional token-specific methods for (CoreLabel token : sentence.get(TokensAnnotation.class)) { tokens.add(new WordToken(token)); if (sstart == null) sstart = token.beginPosition(); send = token.endPosition(); } if (sstart != null) tokens.add(new SentenceToken(sstart, send, sentence)); } return new TokenSequence<CnlpToken<?>>(text, tokens); }