List of usage examples for the `word()` method of `edu.stanford.nlp.ling.CoreLabel`
@Override
public String word()
From source file:NERServer.java
License:Open Source License
public static void main(String[] args) throws Exception {
    // Simple stdin/stdout NER loop: read one line at a time, tag it with the
    // CRF model given on the command line, and print each token as word/LABEL.
    if (args.length != 1) {
        System.err.println("usage: java NERServer modelpath");
        System.exit(1);
    }
    // Typed classifier instead of the original raw CRFClassifier.
    CRFClassifier<CoreLabel> crf = CRFClassifier.getClassifier(args[0]);
    // Buffer size 1 keeps the reader from reading ahead of the caller —
    // presumably deliberate for interactive/server use, so it is preserved.
    // NOTE(review): InputStreamReader uses the platform default charset here;
    // consider an explicit charset if input encoding matters.
    try (BufferedReader input = new BufferedReader(new InputStreamReader(System.in), 1)) {
        for (String ln = input.readLine(); ln != null; ln = input.readLine()) {
            List<List<CoreLabel>> out = crf.classify(ln);
            for (List<CoreLabel> sentence : out) {
                for (CoreLabel word : sentence) {
                    String label = word.get(CoreAnnotations.AnswerAnnotation.class);
                    System.out.print(word.word() + '/' + label + ' ');
                }
            }
            System.out.print('\n');
        }
    }
}
From source file:ca.ualberta.exemplar.core.ParserMalt.java
License:Open Source License
private String[] sentenceToCoNLLInput(List<CoreLabel> tokens) { List<String> conllList = new ArrayList<String>(100); int num = 1;//from ww w .ja va2s .com for (CoreLabel token : tokens) { String word = token.word(); String lemmaA = token.lemma(); String lemma = lemmaA != null && lemmaA.length() > 0 ? lemmaA : "_"; String posA = token.get(PartOfSpeechAnnotation.class); String pos = posA != null && posA.length() > 0 ? posA : "_"; conllList.add(num + "\t" + word + "\t" + lemma + "\t" + pos + "\t" + pos + "\t" + "_"); num++; } String[] conll = new String[conllList.size()]; conll = conllList.toArray(conll); return conll; }
From source file:com.asimihsan.handytrowel.nlp.StopwordAnnotator.java
License:Open Source License
/**
 * Marks every token in the annotation with a {@code Pair<Boolean, Boolean>}
 * of (word-is-stopword, lemma-is-stopword), stored under this annotator's key.
 * Does nothing when no stopwords are configured or no tokens are present.
 */
@Override
public void annotate(Annotation annotation) {
    if (stopwords != null && stopwords.size() > 0 && annotation.containsKey(TokensAnnotation.class)) {
        List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            boolean isWordStopword = stopwords.contains(token.word().toLowerCase());
            // Bug fix: the original called token.lemma().toLowerCase()
            // unconditionally when checkLemma was set, which throws an NPE
            // if the lemma annotator has not run. Treat a missing lemma as
            // "not a stopword" instead.
            String lemma = token.lemma();
            boolean isLemmaStopword = checkLemma && lemma != null && stopwords.contains(lemma.toLowerCase());
            Pair<Boolean, Boolean> pair = Pair.makePair(isWordStopword, isLemmaStopword);
            token.set(StopwordAnnotator.class, pair);
        }
    }
}
From source file:com.asimihsan.handytrowel.nlp.TextAnalyzer.java
License:Open Source License
public TextAnalyzer analyze() { // Stanford CoreNLP, avoid lemmatization as it's very slow to use Porter2 stemming // instead. (Porter -> Snowball (Porter2) -> Lancaster is order of stemming // aggressiveness. ///* w ww . j a va 2s.co m*/ // other ideas // - remove top 10k most common english words Properties props = new Properties(); props.put("annotators", "tokenize, ssplit, stopword"); props.setProperty("customAnnotatorClass.stopword", "com.asimihsan.handytrowel.nlp.StopwordAnnotator"); List<String> stopWords = null; try { stopWords = WordReader.wordReaderWithResourcePath("/nlp/top1000words.txt").getWords(); } catch (IOException e) { e.printStackTrace(); return this; } String customStopWordList = Joiner.on(",").join(stopWords); props.setProperty(StopwordAnnotator.STOPWORDS_LIST, customStopWordList); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); Annotation document = new Annotation(body); pipeline.annotate(document); List<CoreLabel> inputTokens = document.get(CoreAnnotations.TokensAnnotation.class); SnowballStemmer stemmer = new englishStemmer(); for (CoreLabel token : inputTokens) { Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class); if (stopword.first()) continue; String word = token.word().toLowerCase(); //!!AI TODO this sucks, should make another annotator and make it optional etc. //also we're matching full stops! so we lose sentence information. if (punctuation.matcher(word).matches()) continue; //!AI TODO again this would be its own annotator and optional word = number.matcher(word).replaceAll("NUMBER"); stemmer.setCurrent(word); stemmer.stem(); word = stemmer.getCurrent(); tokens.add(word); } return this; }
From source file:com.epictodo.controller.nlp.SentenceAnalysis.java
License:Open Source License
/** * This method identify and extract NER entities such as Name, Person, Date, Time, Organization, Location * * @param _sentence// w w w . j a va 2s .c o m * @return _results */ public LinkedHashMap<String, LinkedHashSet<String>> nerEntitiesExtractor(String _sentence) { LinkedHashMap<String, LinkedHashSet<String>> _results = new <String, LinkedHashSet<String>>LinkedHashMap(); CRFClassifier<CoreLabel> _classifier = load_engine.CLASSIFIER; //CRFClassifier.getClassifierNoExceptions(CLASSIFIER_MODEL); List<List<CoreLabel>> _classify = _classifier.classify(_sentence); for (List<CoreLabel> _tokens : _classify) { for (CoreLabel _token : _tokens) { String _word = _token.word(); String _category = _token.get(CoreAnnotations.AnswerAnnotation.class); if (!"O".equals(_category)) { if (_results.containsKey(_category)) { // Key already exists, insert to LinkedHashMap _results.get(_category).add(_word); } else { LinkedHashSet<String> _temp = new LinkedHashSet<>(); _temp.add(_word); _results.put(_category, _temp); } } } } return _results; }
From source file:com.github.kutschkem.Qgen.annotators.OvergeneratorPermutation.java
License:Open Source License
private String joinWithWhiteSpaces(List<CoreLabel> labels) { StringBuilder buf = new StringBuilder(); for (CoreLabel l : labels) { buf.append(' '); buf.append(l.word()); }/*from w w w .ja v a2 s . c o m*/ String result = buf.toString().trim().replaceAll("\\s+", " "); result = result.replaceAll("\\s(?=\\p{Punct})", ""); // remove leading whitespaces of punctuation result = result.replaceFirst("^.", result.substring(0, 1).toUpperCase()); //First letter uppercase return result; }
From source file:com.graphbrain.eco.StanfordLemmatizer.java
License:Open Source License
public List<String> lemmatize(String documentText, int returnType) { List<String> words = new LinkedList<>(); List<String> lemmas = new LinkedList<>(); // create an empty Annotation just with the given text Annotation document = new Annotation(documentText); // run all Annotators on this text this.pipeline.annotate(document); // Iterate over all of the sentences found List<CoreMap> sentences = document.get(SentencesAnnotation.class); for (CoreMap sentence : sentences) { // Iterate over all tokens in a sentence for (CoreLabel token : sentence.get(TokensAnnotation.class)) { // Retrieve and add the lemma for each word into the // list of lemmas words.add(token.word()); lemmas.add(token.lemma());/*ww w . ja v a 2s .co m*/ // lemmas.add(token.get(LemmaAnnotation.class)); } } if (returnType == 0) { return lemmas; } else { return words; } }
From source file:com.panot.JavaCoref.MyMUCMentionExtractor.java
License:Open Source License
/**
 * Reads the next &lt;DOC&gt; from the MUC-format {@code fileContents},
 * recovering gold coref mentions from the inline SGML markup, re-annotating
 * the text with the Stanford pipeline, and optionally merging CRF-detected
 * "term" mentions according to the configured experiment type.
 *
 * @return the assembled coref Document, or null when no documents remain
 * @throws Exception propagated from tokenization/annotation/CRF tagging
 */
@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");
    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    // Scan forward from the previous document's end; null signals EOF.
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (!docMatcher.find(currentOffset))
        return null;
    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null; // currently-open gold NE tag, e.g. "ORGANIZATION"
    //Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        currentDocumentID = "documentAfter " + currentDocumentID;
    while (sentenceMatcher.find()) {
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString), "invertible")
                .tokenize();
        // FIXING TOKENIZATION PROBLEMS
        // Re-glue tokens the tokenizer split apart: a possessive "$" onto the
        // preceding PRP/WP tag, and "\/"-separated WORD/POS fragments.
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            if (i > 0 && w.word().equals("$")) {
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                i--;
            } else if (w.word().equals("\\/")) {
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                w.set(CoreAnnotations.TextAnnotation.class,
                        words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                words.remove(i + 1);
                words.remove(i - 1);
            }
        }
        // END FIXING TOKENIZATION PROBLEMS
        List<CoreLabel> sentence = new ArrayList<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        List<Mention> mentions = new ArrayList<Mention>();
        allWords.add(sentence);
        allGoldMentions.add(mentions);
        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
            // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                ner = m.group(1);
            }
            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            else if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            }
            // found the start SGML tag for a coref mention
            else if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                mention.startIndex = sentence.size();
                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");
                Matcher m = idPattern.matcher(w);
                m.find();
                mention.mentionID = Integer.valueOf(m.group(1));
                m = refPattern.matcher(w);
                if (m.find()) {
                    mention.originalRef = Integer.valueOf(m.group(1));
                }
                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            }
            // found the end SGML tag for a coref mention
            else if (w.equals("</COREF>")) {
                Mention mention = stack.pop();
                mention.endIndex = sentence.size();
                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                // any other token (no POS suffix): keep it, tagging gold NE if enabled
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }
        // Rebuild the sentence text and index tokens 1-based for the pipeline.
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }
    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    // Follow each mention's REF chain until reaching a mention that already
    // has a cluster id or is a chain root (originalRef == -1).
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    m.goldCorefClusterID = m.mentionID;
                else {
                    int ref = m.originalRef;
                    while (true) {
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }
    // Run the full Stanford pipeline over the reconstructed document, then
    // sanity-check that its tokenization matches ours token-for-token.
    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);
    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i)
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class)
                    .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        // Swap in the fully-annotated tokens and collect the parse tree.
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }
    // term things
    // Optional "term" mention extraction via an external CRFsuite tagger.
    List<List<Mention>> termMentions = new ArrayList<List<Mention>>();
    if (use_term) {
        String dataCrf = "";
        System.err.print("FEAT TYPE: ");
        System.err
                .println(props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP));
        if (props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP)
                .equals(MyConstants.TTE_FEATURE_NLTK)) {
            dataCrf = NltkCrfFormatter.annotationToCrfString(docAnno);
        } else {
            dataCrf = CrfFormatter.annotationToCrfString(docAnno);
        }
        List<List<String>> tagResult = new ArrayList<List<String>>();
        try {
            tagResult = CrfsuiteCaller.tag(dataCrf, props.getProperty(MyConstants.TTE_MODEL));
            // Optionally dump the CRF feature file for debugging.
            if (props.containsKey(MyConstants.TTE_SAVE_CRF_DATA)) {
                String crfDataFilename = props.getProperty(MyConstants.TTE_SAVE_CRF_DATA);
                File crfDataFile = new File(crfDataFilename);
                BufferedWriter bw = new BufferedWriter(new FileWriter(crfDataFile));
                bw.write(dataCrf);
                bw.close();
            }
        } catch (Exception e) {
            // Best-effort: tagging failure leaves tagResult empty.
            System.err.println("Crfsuite tag failed");
        }
        termAsMentionFinder.setTags(tagResult);
        termMentions = termAsMentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);
        maxID = termAsMentionFinder.getMaxID();
    }
    // extract predicted mentions
    allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);
    if (use_term && props.containsKey(MyConstants.TTE_KEEP_PRON)) {
        termMentions = injectPronoun(termMentions, allPredictedMentions);
    }
    // Combine predicted/term/gold mentions according to the experiment type.
    if (experimentType != null) {
        if (experimentType.equals(MyConstants.EXP_TYPE_03_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (experimentType.equals(MyConstants.EXP_TYPE_03_INTERSECT)) {
            List<List<Mention>> usingMentions = intersectMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_CHECK)) {
            allPredictedMentions = termMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else {
            System.err.println(experimentType);
            System.err.println("Unknown experiment type. Using mention detector.");
        }
    } else if (useGoldMention) {
        allPredictedMentions = allGoldMentions;
    }
    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
From source file:conditionalCFG.ConditionalCFGParser.java
License:Open Source License
/**
 * Returns the CoreLabel for the given terminal index: the recorded original
 * label when one exists (back-filling its value from the word if missing),
 * otherwise a freshly built label from the indexed word, offsets, and
 * original tag.
 */
private CoreLabel getCoreLabel(int labelIndex) {
    CoreLabel cached = originalCoreLabels[labelIndex];
    if (cached != null) {
        if (cached.value() == null && cached.word() != null) {
            cached.setValue(cached.word());
        }
        return cached;
    }
    // No original label recorded — synthesize one from the word index.
    String text = wordIndex.get(words[labelIndex]);
    CoreLabel label = new CoreLabel();
    label.setValue(text);
    label.setWord(text);
    label.setBeginPosition(beginOffsets[labelIndex]);
    label.setEndPosition(endOffsets[labelIndex]);
    if (originalTags[labelIndex] != null) {
        label.setTag(originalTags[labelIndex].tag());
    }
    return label;
}
From source file:coreferenceresolver.util.StanfordUtil.java
/**
 * Builds the CoreNLP pipeline, reads {@code documentFile} line by line
 * (one review per line), converts each sentence into Review/Sentence/Token
 * objects in {@code reviews}, and writes a word/TAG POS dump to
 * ./input.txt.pos.
 *
 * @param simpleInit when true, only tokenize/ssplit/pos/parse are run;
 *        when false, sentiment, dependency, relative-clause, and
 *        comparative-sentence processing are added
 * @throws FileNotFoundException if documentFile cannot be opened
 * @throws IOException on read/write failures
 */
public void init(boolean simpleInit) throws FileNotFoundException, IOException {
    // POS dump written alongside the input.
    String outPosFilePath = "./input.txt.pos";
    FileWriter fw = new FileWriter(new File(outPosFilePath));
    BufferedWriter bw = new BufferedWriter(fw);
    // NOTE(review): bufferedReader/fileReader below are never closed, and bw
    // is not closed on exception paths — consider try-with-resources.
    props = new Properties();
    if (simpleInit) {
        props.put("annotators", "tokenize, ssplit, pos, parse");
    } else {
        props.put("annotators", "tokenize, ssplit, pos, parse, sentiment");
    }
    pipeline = new StanfordCoreNLP(props);
    reviews = new ArrayList<>();
    FileReader fileReader = new FileReader(documentFile);
    BufferedReader bufferedReader = new BufferedReader(fileReader);
    String reviewLine;
    int reviewId = 0;
    int sentenceId;
    //read input file line by line and count the number sentences of each lines
    while ((reviewLine = bufferedReader.readLine()) != null) {
        sentenceId = 0;
        Review newReview = new Review();
        //Add to reviews list
        newReview.setRawContent(reviewLine);
        // create an empty Annotation just with the given text
        document = new Annotation(reviewLine);
        // run all Annotators on this text
        pipeline.annotate(document);
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        //Begin extracting from paragraphs
        for (CoreMap sentence : sentences) {
            int sentenceOffsetBegin = sentence.get(CharacterOffsetBeginAnnotation.class);
            int sentenceOffsetEnd = sentence.get(CharacterOffsetEndAnnotation.class);
            Sentence newSentence = new Sentence();
            newSentence.setReviewId(reviewId);
            newSentence.setRawContent(sentence.toString());
            newSentence.setOffsetBegin(sentenceOffsetBegin);
            newSentence.setOffsetEnd(sentenceOffsetEnd);
            if (!simpleInit) {
                // Sentence-level sentiment from the RNN model.
                int sentimentLevel = RNNCoreAnnotations
                        .getPredictedClass(sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class));
                newSentence.setSentimentLevel(sentimentLevel);
                //Dependency Parsing
                SemanticGraph collCCDeps = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
                Collection<TypedDependency> typedDeps = collCCDeps.typedDependencies();
                newSentence.setDependencies(typedDeps);
            }
            // i indexes the parse-tree leaves in lockstep with the tokens;
            // presumably leaves and tokens align one-to-one — TODO confirm.
            List<Tree> sentenceTreeLeaves = sentence.get(TreeCoreAnnotations.TreeAnnotation.class).getLeaves();
            int i = 0;
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                Token newToken = new Token();
                Tree tokenTree = sentenceTreeLeaves.get(i);
                newToken.setTokenTree(tokenTree);
                String word = token.get(TextAnnotation.class);
                newToken.setWord(word);
                String pos = token.get(PartOfSpeechAnnotation.class);
                newToken.setPOS(pos);
                int offsetBegin = token.get(CharacterOffsetBeginAnnotation.class);
                newToken.setOffsetBegin(offsetBegin);
                int offsetEnd = token.get(CharacterOffsetEndAnnotation.class);
                newToken.setOffsetEnd(offsetEnd);
                if (!simpleInit) {
                    //Check NP relative clause
                    // A token two levels under a WHNP node (and not "who"/"what")
                    // is flagged as a relative pronoun.
                    Tree twoLevelsAncestor = tokenTree.ancestor(2,
                            sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
                    if (twoLevelsAncestor.value().equals("WHNP") && !word.toLowerCase().equals("who")
                            && !word.toLowerCase().equals("what")) {
                        newToken.setRelativePronoun(true);
                    }
                    //Calculate sentiment for this token
                    int newTokenSentiment = Util.retrieveOriginalSentiment(newToken.getWord());
                    newToken.setSentimentOrientation(newTokenSentiment, newSentence.getDependencies());
                }
                newSentence.addToken(newToken);
                // Emit "word/TAG " for the POS dump file.
                bw.write(token.word() + "/" + token.tag() + " ");
                ++i;
            }
            bw.newLine();
            if (!simpleInit) {
                //Check if this sentence contains a comparative indicator.
                //If yes, it is a comparative sentence. Identify which NP is superior or inferior in this sentence
                List<Token> comparativeTokens = FeatureExtractor.findComparativeIndicator(newSentence, null,
                        null);
                //TODO
                //Check special comparative samples
                if (!comparativeTokens.isEmpty()) {
                    newSentence.initComparatives(comparativeTokens);
                }
            }
            newReview.addSentence(newSentence);
            ++sentenceId;
        }
        // "./." marks the end of a review in the POS dump.
        bw.write("./.");
        bw.newLine();
        reviews.add(newReview);
        ++reviewId;
    }
    bw.close();
}