List of usage examples for edu.stanford.nlp.ling CoreLabel originalText
@Override
public String originalText()
From source file:com.project.NLP.Requirement.ParserTreeGenerator.java
/** * nameEntityAnnotation for track the Location and Person name * Return the word if the tokens contains Location, person, organization,misc, time, money, percent, date * // w w w .j a v a 2 s . co m * @return arrayList */ public ArrayList generateNamedEntityTagAnnotation() { sentences = document.get(SentencesAnnotation.class); ArrayList nameEntity = new ArrayList(); String annotations = ""; for (CoreMap sentence : sentences) { for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { // this is the NER label of the token annotations = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); if (annotations.equals("LOCATION") || annotations.equals("PERSON") || annotations.equals("ORGANIZATION") || annotations.equals("MISC") || annotations.equals("TIME") || annotations.equals("MONEY") || annotations.equals("PERCENT") || annotations.equals("DATE")) { nameEntity.add(token.originalText()); } } } return nameEntity; }
From source file:de.uni_leipzig.informatik.pcai042.boa.gui.goldstandard.BoaSentence.java
License:Open Source License
/**
 * Tokenizes the sentence with StanfordCoreNLP and generates an XML representation.
 *
 * @param sentence the raw sentence text to tokenize
 * @throws IllegalArgumentException
 *             thrown when StanfordCoreNLP couldn't tokenize the sentence
 *             properly (i.e. it did not produce exactly one sentence)
 */
public BoaSentence(String sentence) throws IllegalArgumentException {
    this.sentence = sentence;
    annotations = new ArrayList<BoaAnnotation>();
    tokens = new ArrayList<String>();
    // NOTE(review): the original labels this "double checked locking", but it
    // is not: the null check runs outside any lock, there is no second check
    // inside the synchronized block, and the field is presumably not volatile.
    // Two threads may race through initPipeline() concurrently -- confirm
    // initPipeline() is itself thread-safe or move the check under a lock.
    if (pipeline == null)
        initPipeline();
    synchronized (pipeline) {
        // generate tokens with StanfordCoreNLP
        Annotation document = new Annotation(sentence);
        pipeline.annotate(document);
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        // since we always deal with single sentences, there can be only one
        // sentence in the output; otherwise StanfordCoreNLP had problems
        // tokenizing the sentence
        if (sentences.size() != 1) {
            throw new IllegalArgumentException();
        }
        for (CoreLabel token : sentences.get(0).get(TokensAnnotation.class)) {
            // this is the text of the token exactly as it appeared in the input
            String word = token.originalText();
            tokens.add(word);
        }
        xmlDoc = pipeline.annotationToDoc(document);
    }
}
From source file:de.uni_leipzig.informatik.pcai042.boa.manager.BoaSentence.java
License:Open Source License
/** * Creates a sentence from a CoreMap returned by a {@link Tokenizer}. * /*from w ww . j a v a2 s .c o m*/ * @param sentence * the original text of the sentence * @param coreMap * the CoreMap */ public BoaSentence(CoreMap coreMap) { sentence = coreMap.get(CoreAnnotations.TextAnnotation.class); tokens = new ArrayList<String>(coreMap.get(TokensAnnotation.class).size()); beginPos = new ArrayList<Integer>(coreMap.get(TokensAnnotation.class).size()); endPos = new ArrayList<Integer>(coreMap.get(TokensAnnotation.class).size()); for (CoreLabel token : coreMap.get(TokensAnnotation.class)) { String word = token.originalText(); tokens.add(word); beginPos.add(token.beginPosition()); endPos.add(token.endPosition()); } annotations = new ArrayList<BoaAnnotation>(); }
From source file:eu.fbk.dh.tint.tokenizer.ItalianTokenizer.java
License:Apache License
public static void main(String argv[]) throws IOException { ItalianTokenizer tokenizer = new ItalianTokenizer(); // byte[] file = Files.readAllBytes((new File("/Users/alessio/Desktop/milano.txt")).toPath()); // String text = new String(file); String text = "Clinton in testa nei sondaggi dopo lassoluzione dellFbi sulluso di un server di posta privato quando era Segretario di stato."; // text = "``Determinato, pronto a fare tutto il necessario per mantenere la stabilit dei prezzi.''" // + " Ma anche allarmato per come le conseguenze del referendum britannico minacciano leconomia e i mercati europei." // + " Sono nato nel 200 S.p.A." // + " Il mio indirizzo e-mail alessio@apnetwork.it." // + " Il blog http://www.ziorufus.it e mi piace molto."; // text = "Questo un test per una sigla qualsiasi tipo a.B.C. che non ha senso."; // text = "Milano (/milano/ ascolta[?info], in milanese: Milan[4], /mil?/[5]) una citt italiana di 1 346 153 abitanti[2], capoluogo dell'omonima citt metropolitana e della regione Lombardia, secondo comune italiano per numero di abitanti, tredicesimo comune dell'Unione europea e diciannovesimo del continente e, con l'agglomerato urbano, quarta area metropolitana pi popolata d'Europa dopo Londra, Madrid e Parigi[6].\n" // + "\n" // + "Fondata dagli Insubri all'inizio del VI secolo a.C.[7], fu conquistata dai Romani nel 222 a.C."; // System.out.println(text); long time = System.currentTimeMillis(); List<List<CoreLabel>> sentences = tokenizer.parse(text); time = System.currentTimeMillis() - time; for (int i = 0; i < Math.min(10, sentences.size()); i++) { List<CoreLabel> sentence = sentences.get(i); for (CoreLabel token : sentence) { System.out.println(token.word() + " -- " + token.originalText() + " -- " + token.beginPosition()); }//w w w . 
ja v a 2s .c om System.out.println(); } int sentenceSize = sentences.size(); int lastTokenIndex = sentences.get(sentenceSize - 1).get(sentences.get(sentenceSize - 1).size() - 1) .index(); System.out.println("Length: " + text.length()); System.out.println("Time: " + time); System.out.println("Sentences: " + sentenceSize); System.out.println("Tokens: " + lastTokenIndex); }
From source file:eu.modelwriter.semantic.stanford_corenlp.MorphologySimilarityProvider.java
License:Open Source License
/**
 * {@inheritDoc}
 *
 * Joins all label words into one pseudo-sentence, runs the shared pipeline
 * over it, and groups each input label's concept set under the lemma of the
 * corresponding token.
 *
 * @see eu.modelwriter.semantic.ISemanticSimilarityProvider#getSemanticSimilarities(java.util.Map)
 */
public Map<String, Set<Object>> getSemanticSimilarities(Map<String, Set<Object>> labels) {
    final Map<String, Set<Object>> result = new LinkedHashMap<String, Set<Object>>();
    final Set<String> words = labels.keySet();
    if (words.isEmpty()) {
        return result;
    }
    // Build a single space-separated string of all label words.
    final StringBuilder joined = new StringBuilder();
    for (String word : words) {
        if (joined.length() > 0) {
            joined.append(' ');
        }
        joined.append(word);
    }
    final Annotation document = new Annotation(joined.toString());
    PIPELINE.annotate(document);
    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            final String surface = token.originalText();
            final String lemma = token.lemma();
            // Bucket per lemma, created lazily so tokens sharing a lemma merge.
            Set<Object> bucket = result.get(lemma);
            if (bucket == null) {
                bucket = new LinkedHashSet<Object>();
                result.put(lemma, bucket);
            }
            // The token's surface form keys back into the caller's label map;
            // tokens the tokenizer introduced or split have no concepts.
            final Set<Object> concepts = labels.get(surface);
            if (concepts != null) {
                bucket.addAll(concepts);
            }
        }
    }
    return result;
}
From source file:ims.cs.corenlp.TokenAligner.java
License:Open Source License
/**
 * Splits a CoreNLP token based on a position. We split only the word form
 * (and original text) as we don't have sufficient information to split the
 * lemma, which is instead copied unchanged to both halves.
 *
 * @param token       the token to split
 * @param absPosition absolute (document-level) character offset of the split point
 * @return a two-element array: [0] = left half ending at absPosition,
 *         [1] = right half starting at absPosition
 */
private CoreLabel[] splitToken(CoreLabel token, int absPosition) {
    String word = token.word();
    String origText = token.originalText();
    // initialize parts: both halves start as full copies of the token
    CoreLabel[] splitting = new CoreLabel[2];
    splitting[0] = new CoreLabel(token);
    splitting[1] = new CoreLabel(token);
    // calculate split position relative to the token's start offset
    // NOTE(review): a negative relPosition (absPosition before the token)
    // would make the substring calls below throw -- presumably callers
    // guarantee the split point lies inside the token; verify at call sites.
    int relPosition = absPosition - token.beginPosition();
    // cut up original text (only when the split point falls within it)
    if (origText.length() >= relPosition) {
        String origText1 = origText.substring(0, relPosition);
        String origText2 = origText.substring(relPosition);
        splitting[0].setOriginalText(origText1);
        splitting[1].setOriginalText(origText2);
    }
    // cut up predicted text; checked separately because the normalized word
    // form may differ in length from the original text
    if (word.length() >= relPosition) {
        String word1 = word.substring(0, relPosition);
        String word2 = word.substring(relPosition);
        splitting[0].setWord(word1);
        splitting[1].setWord(word2);
    }
    // we could do the same with POS and lemma, but that would be complicated ...
    splitting[0].setEndPosition(absPosition); /* set a new end as we just shortened this token */
    splitting[1].setBeginPosition(absPosition); /* set a new position as we just shortened this token */
    // copy lemmas unchanged to both halves
    splitting[0].setLemma(token.lemma());
    splitting[1].setLemma(token.lemma());
    return splitting;
}
From source file:it.uniroma2.sag.kelp.input.parser.impl.StanfordParserWrapper.java
License:Apache License
@Override public DependencyGraph parse(String sentenceString) { Annotation document = new Annotation(sentenceString); pipeline.annotate(document);// ww w . j a v a 2 s. co m List<CoreMap> sentences = document.get(SentencesAnnotation.class); CoreMap sentence = sentences.get(0); DependencyGraph graph = new DependencyGraph(); graph.setSentence(sentenceString); graph.setParserName("StanfordParser"); graph.setParserVersion("3.6.0"); graph.setNodes(new ArrayList<DGNode>()); int nId = 1; for (CoreLabel token : sentence.get(TokensAnnotation.class)) { DGNode node = new DGNode(); Map<String, Object> nodeProps = new HashMap<String, Object>(); nodeProps.put("surface", token.originalText()); nodeProps.put("lemma", token.lemma()); nodeProps.put("pos", token.tag()); nodeProps.put("start", token.beginPosition()); nodeProps.put("end", token.endPosition()); nodeProps.put("id", nId); nId++; graph.getNodes().add(node); node.setProperties(nodeProps); } SemanticGraph dependencies = null; switch (dependencyType) { case BASIC: dependencies = sentence.get(BasicDependenciesAnnotation.class); break; case COLLAPSED: dependencies = sentence.get(CollapsedDependenciesAnnotation.class); break; case COLLAPSED_CCPROCESSED: dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class); break; default: dependencies = sentence.get(BasicDependenciesAnnotation.class); break; } dependencies.edgeListSorted(); List<DGRelation> relations = new ArrayList<DGRelation>(); for (IndexedWord node : dependencies.vertexListSorted()) { DGRelation relation = new DGRelation(); relation.setProperties(new HashMap<String, Object>()); DGNode child = graph.getDGNodeById(node.index()); relation.setTarget(child); Collection<IndexedWord> parentsTmp = dependencies.getParents(node); ArrayList<IndexedWord> parents = new ArrayList<IndexedWord>(); for (IndexedWord par : parentsTmp) { SemanticGraphEdge edge = dependencies.getEdge(par, node); DGNode parent = graph.getDGNodeById(edge.getGovernor().index()); if 
(parent.getProperties().get("id") != child.getProperties().get("id")) parents.add(par); } if (parents.isEmpty()) { relation.getProperties().put("type", "root"); relation.getProperties().put("fromId", new Integer(0)); relation.setSource(null); graph.setRoot(relation); } else { Iterator<IndexedWord> it = parents.iterator(); while (it.hasNext()) { IndexedWord par = it.next(); SemanticGraphEdge edge = dependencies.getEdge(par, node); DGNode parent = graph.getDGNodeById(edge.getGovernor().index()); relation.setSource(parent); relation.getProperties().put("fromId", parent.getProperties().get("id")); relation.getProperties().put("type", edge.getRelation().toString()); } } relations.add(relation); } graph.setRelations(relations); return graph; }
From source file:it.uniud.ailab.dcore.wrappers.external.StanfordBootstrapperAnnotator.java
License:Open Source License
/**
 * Annotate the document by splitting the document, tokenizing it, performing
 * PoS tagging and Named Entity Recognition using the Stanford Core NLP tools.
 * Additionally runs coreference resolution and adds each non-trivial
 * coreference chain to the blackboard as a Mention gram.
 *
 * @param blackboard the blackboard receiving the extracted mentions
 * @param component  the component to annotate.
 */
@Override
public void annotate(Blackboard blackboard, DocumentComponent component) {
    if (pipeline == null) {
        // creates a StanfordCoreNLP object, with POS tagging, lemmatization,
        // NER, parsing, and coreference resolution (lazily, on first use)
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, parse, lemma, ner, dcoref");
        pipeline = new StanfordCoreNLP(props);
    }
    // read some text in the text variable
    String text = component.getText();
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text
    pipeline.annotate(document);
    // get the graph for coreference resolution
    Map<Integer, CorefChain> graph = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    // prepare the map for coreference graph of document
    // NOTE(review): this map is populated nowhere and never read -- apparent
    // dead code; confirm before removing.
    Map<String, Collection<Set<CorefChain.CorefMention>>> coreferenceGraph = new HashMap<>();
    for (CorefChain corefChain : graph.values()) {
        // get the representative mention, that is the word recalled in other sentences
        CorefChain.CorefMention cm = corefChain.getRepresentativeMention();
        // eliminate auto-references: chains with a single mention carry no links
        if (corefChain.getMentionMap().size() <= 1) {
            continue;
        }
        // get the tokens of the sentence containing the representative mention
        // (sentNum is 1-based, the sentence list 0-based)
        List<CoreLabel> tks = document.get(SentencesAnnotation.class).get(cm.sentNum - 1)
                .get(TokensAnnotation.class);
        // list of tokens which compose the anaphor
        List<Token> anaphorsTokens = new ArrayList<>();
        for (int i = cm.startIndex - 1; i < cm.endIndex - 1; i++) {
            CoreLabel current = tks.get(i);
            Token t = new Token(current.word());
            t.setPoS(current.tag());
            t.setLemma(current.lemma());
            anaphorsTokens.add(t);
        }
        // the mention n-gram which is formed by the anaphor and a list of references
        Mention mention = new Mention(cm.mentionSpan, anaphorsTokens, cm.mentionSpan);
        // get map of the references to the corefchain obj
        Collection<Set<CorefChain.CorefMention>> mentionMap = corefChain.getMentionMap().values();
        for (Set<CorefChain.CorefMention> mentions : mentionMap) {
            for (CorefChain.CorefMention reference : mentions) {
                // eliminate self-references (same span as the representative)
                if (reference.mentionSpan.equalsIgnoreCase(cm.mentionSpan)) {
                    continue;
                }
                List<CoreLabel> tokens = document.get(SentencesAnnotation.class).get(reference.sentNum - 1)
                        .get(TokensAnnotation.class);
                // list of tokens which compose the mention
                List<Token> mentionTokens = new ArrayList<>();
                for (int i = reference.startIndex - 1; i < reference.endIndex - 1; i++) {
                    CoreLabel current = tokens.get(i);
                    // set token features
                    Token t = new Token(current.word());
                    t.setPoS(current.tag());
                    t.setLemma(current.lemma());
                    mentionTokens.add(t);
                }
                // add to mention a new reference
                mention.addReference(reference.mentionSpan, mentionTokens,
                        reference.mentionType.toString());
            }
        }
        // assign to the document a new coreference obj containing the anaphor
        // and its mentions
        blackboard.addGram(mention);
    }
    // these are all the sentences in this document;
    // a CoreMap is essentially a Map that uses class objects as keys and
    // has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    // A counter that keeps track of the number of phrases in a sentence.
    // NOTE(review): it is never reset inside the loop, so each sentence is
    // annotated with the CUMULATIVE phrase count of all sentences so far --
    // confirm whether this running total is intended or the counter should be
    // zeroed per sentence.
    int phraseCounter = 0;
    for (CoreMap stanfordSentence : sentences) {
        Sentence distilledSentence = new Sentence(stanfordSentence.toString(), "" + sentenceCounter++);
        distilledSentence.setLanguage(Locale.ENGLISH);
        // getting the dependency graph of the document so to count the number
        // of phrases: ROOT sentences are the first level children in the parse
        // tree; every ROOT sentence is constituted by a group of clauses which
        // can be principal (main clauses) or not (coordinate and subordinate).
        // We use ROOT sentences as a starting point to find out all the
        // phrases present in the sentences themselves, checking for the tag "S".
        Tree sentenceTree = stanfordSentence.get(TreeCoreAnnotations.TreeAnnotation.class);
        for (Tree sub : sentenceTree.subTreeList()) {
            if (sub.label().value().equals("S")) {
                phraseCounter++;
            }
        }
        // annotate the sentence with a new feature counting all the phrases
        // contained in the sentence
        distilledSentence
                .addAnnotation(new FeatureAnnotation(DefaultAnnotations.PHRASES_COUNT, phraseCounter));
        // traversing the words in the current sentence:
        // for each token in the text, we create a new token and annotate it
        // with the word representing it, its pos tag and its lemma
        for (CoreLabel token : stanfordSentence.get(TokensAnnotation.class)) {
            // this is the text of the token
            Token t = new Token(token.originalText());
            // this is the POS tag of the token
            t.setPoS(token.tag());
            // this is the lemma of the token
            t.setLemma(token.lemma());
            // NER label; "O" marks a token outside any named entity
            String ner = token.get(NamedEntityTagAnnotation.class);
            if (!ner.equalsIgnoreCase("O")) {
                t.addAnnotation(new NERAnnotation(DefaultAnnotations.IS_NER, ner));
            }
            // add the token to the sentence
            distilledSentence.addToken(t);
        }
        // add the sentence to document
        ((DocumentComposite) component).addComponent(distilledSentence);
    }
}
From source file:it.uniud.ailab.dcore.wrappers.external.StanfordFastBootstrapperAnnotator.java
License:Open Source License
/** * Annotate the document by splitting the document, tokenizing it, * performing PoS tagging and Named Entity Recognition using the Stanford * Core NLP tools.//from ww w. j ava 2s . co m * * @param component the component to annotate. */ @Override public void annotate(Blackboard blackboard, DocumentComponent component) { if (pipeline == null) { // creates a StanfordCoreNLP object, with POS tagging, lemmatization, //NER, parsing, and coreference resolution Properties props = new Properties(); props.put("annotators", "tokenize, ssplit, pos, lemma"); pipeline = new StanfordCoreNLP(props); } // read some text in the text variable String text = component.getText(); // create an empty Annotation just with the given text Annotation document = new Annotation(text); // run all Annotators on this text pipeline.annotate(document); // these are all the sentences in this document // a CoreMap is essentially a Map that uses class objects as keys and //has values with custom types List<CoreMap> sentences = document.get(SentencesAnnotation.class); for (CoreMap stanfordSentence : sentences) { Sentence distilledSentence = new Sentence(stanfordSentence.toString(), "" + sentenceCounter++); distilledSentence.setLanguage(Locale.ENGLISH); // traversing the words in the current sentence // for each token in the text, we create a new token annotate it // with the word representing it, its pos tag and its lemma for (CoreLabel token : stanfordSentence.get(TokensAnnotation.class)) { // this is the text of the token Token t = new Token(token.originalText()); // this is the POS tag of the token t.setPoS(token.tag()); // this is the lemma of the ttoken t.setLemma(token.lemma()); //add the token to the sentence distilledSentence.addToken(t); } //add the sentence to document ((DocumentComposite) component).addComponent(distilledSentence); } }
From source file:linguistic.Frame.java
public void createMap(String text, Map<String, WordInfo> map) throws IOException, ClassNotFoundException { String tag = ""; Annotation document = null;// w ww .j a v a 2 s.c om document = new Annotation(text); pipeline.annotate(document); List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { for (CoreLabel word1 : sentence.get(CoreAnnotations.TokensAnnotation.class)) { tag = word1.tag(); textTag += word1.originalText() + "_" + tag + " "; String word = word1.originalText().toLowerCase(); char c = word.charAt(0); if (!tag.equals("CD") && !tag.equals("CC") && !tag.equals("NNS") && !tag.equals("NNP") && !tag.equals(".") && !tag.equals(":") && !tag.equals(";") && !tag.equals(",") && isLetter(c)) { if (map.containsKey(word)) { int fr = map.get(word).getFrequency() + 1; if (map.get(word).getTag().contains("/" + tag + "/")) { wordTagAmountMap.put(word + " - " + tag, wordTagAmountMap.get(word + " - " + tag) + 1); map.put(word, new WordInfo(fr, map.get(word).getTag(), getBasicForm(word))); } else { wordTagAmountMap.put(word + " - " + tag, 1); map.put(word, new WordInfo(fr, map.get(word).getTag() + tag + "/", getBasicForm(word))); } } else { wordTagAmountMap.put(word + " - " + tag, 1); map.put(word, new WordInfo(1, "/" + tag + "/", getBasicForm(word))); } } } } /*StringBuffer word = new StringBuffer(); for (int i = 0; i < text.length(); i++) { char c = text.charAt(i); if ( isLetter(c) ) { word.append(toLower(c)); } else { if ( word.length() != 0 ) { if(c == 150 || c == 151 || c == 45){ if(c > 0 && (text.charAt(i - 1) == 150 || text.charAt(i - 1) == 151 || text.charAt(i - 1) == 45 || isLetter(text.charAt(i + 1)) == false) ){ continue; } word.append(c); } else{ String tag = getTagger(word.toString()); /*if(wordsDictionaryMap.get(word).getTag().contains("/" + tag + "/")) { wordTagAmountMap.put(word + " - " + tag, wordTagAmountMap.get(word + " - " + tag) + 1); wordsDictionaryMap.put(word, new WordInfo(fr, 
wordsDictionaryMap.get(word).getTag(), getBasicForm(word))); } else { wordTagAmountMap.put(word + " - " + tag, 1); wordsDictionaryMap.put(word, new WordInfo(fr, wordsDictionaryMap.get(word).getTag() + tag + "/", getBasicForm(word))); }*/ /*if (wordsDictionaryMap.containsKey(word.toString())) { int fr = wordsDictionaryMap.get(word.toString()).getFrequency() + 1; if(wordsDictionaryMap.get(word.toString()).getTag().contains("/" + tag + "/")) { wordTagAmountMap.put(word.toString() + " - " + tag, wordTagAmountMap.get(word.toString() + " - " + tag) + 1); wordsDictionaryMap.put(word.toString(), new WordInfo(fr, wordsDictionaryMap.get(word.toString()).getTag(), getBasicForm(word.toString()))); } else { wordTagAmountMap.put(word + " - " + tag, 1); wordsDictionaryMap.put(word.toString(), new WordInfo(fr, wordsDictionaryMap.get(word.toString()).getTag() + tag + "/", getBasicForm(word.toString()))); } //wordsMap.put(word.toString(), new WordInfo(fr, getTagger(word.toString()), getBasicForm(word.toString()))); } else { wordTagAmountMap.put(word + " - " + tag, 1); wordsDictionaryMap.put(word.toString(), new WordInfo(1, "/" + getTagger(word.toString() + "/"), getBasicForm(word.toString()))); } word.delete(0, word.length()); } } } }*/ }