List of usage examples for edu.stanford.nlp.ling TaggedWord tag
String tag
To view the source code for edu.stanford.nlp.ling.TaggedWord.tag(), click the Source Link below each example.
From source file:asap.textprocessing.TextProcessPOSTagsStanford.java
/**
 * Uses the loaded tagger model to compute POS tags for the given sentence tokens.
 *
 * @param tokens the sentence tokens to tag
 * @return one POS tag per tagged word, or null if the tokenizer produced no sentences
 */
@Override
protected synchronized String[] getTags(String[] tokens) {
    // Re-join the tokens into a single whitespace-separated string for the
    // tokenizer. StringBuilder avoids the O(n^2) string += of the original.
    StringBuilder sentence = new StringBuilder();
    for (int i = 0; i < tokens.length; i++) {
        if (i > 0) {
            sentence.append(" ");
        }
        sentence.append(tokens[i]);
    }
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(sentence.toString()));
    if (sentences.isEmpty()) {
        return null; // preserves the original's behavior for empty input
    }
    // Accumulate tags across ALL sentences: the original overwrote `tags` on
    // each loop pass, so when the tokenizer split the joined tokens into more
    // than one sentence, every sentence's tags except the last were dropped
    // and the result no longer lined up with `tokens`.
    java.util.ArrayList<String> tags = new java.util.ArrayList<String>();
    for (List<HasWord> sentenceL : sentences) {
        for (TaggedWord taggedWord : tagger.tagSentence(sentenceL)) {
            tags.add(taggedWord.tag());
        }
    }
    return tags.toArray(new String[tags.size()]);
}
From source file:at.illecker.storm.commons.dict.SentimentDictionary.java
License:Apache License
public Map<Integer, SentimentResult> getSentenceSentimentFromTaggedWord(List<TaggedWord> sentence) { Map<Integer, SentimentResult> sentenceSentiments = new HashMap<Integer, SentimentResult>(); if (LOGGING) { LOG.info("TaggedSentence: " + sentence.toString()); }// w w w. jav a2 s . co m for (TaggedWord word : sentence) { Map<Integer, Double> wordSentiments = getWordSentiment(word.word(), word.tag(), true); if (wordSentiments != null) { for (Map.Entry<Integer, Double> wordSentiment : wordSentiments.entrySet()) { int key = wordSentiment.getKey(); double sentimentScore = wordSentiment.getValue(); SentimentResult sentimentResult = sentenceSentiments.get(key); if (sentimentResult == null) { sentimentResult = new SentimentResult(); } // add score value sentimentResult.addScore(sentimentScore); // update sentimentResult sentenceSentiments.put(key, sentimentResult); } } } if (LOGGING) { LOG.info("Sentiment: " + sentenceSentiments); } return (sentenceSentiments.size() > 0) ? sentenceSentiments : null; }
From source file:at.illecker.storm.commons.svm.featurevector.POSFeatureVectorGenerator.java
License:Apache License
private double[] countPOSTagsFromTaggedWords(List<TaggedWord> taggedWords, boolean normalize) { // 7 = [NOUN, VERB, ADJECTIVE, ADVERB, INTERJECTION, PUNCTUATION, HASHTAG] double[] posTags = new double[] { 0d, 0d, 0d, 0d, 0d, 0d, 0d }; int wordCount = 0; for (TaggedWord word : taggedWords) { wordCount++;//www . j ava2 s .c o m String pennTag = word.tag(); if (pennTag.startsWith("NN")) { posTags[0]++; } else if (pennTag.startsWith("VB")) { posTags[1]++; } else if (pennTag.startsWith("JJ")) { posTags[2]++; } else if (pennTag.startsWith("RB")) { posTags[3]++; } else if (pennTag.startsWith("UH")) { posTags[4]++; } else if ((pennTag.equals(".")) || (pennTag.equals(":"))) { posTags[5]++; } else if (pennTag.startsWith("HT")) { posTags[6]++; } } if (normalize) { for (int i = 0; i < posTags.length; i++) { posTags[i] /= wordCount; } } return posTags; }
From source file:at.illecker.storm.commons.tfidf.TweetTfIdf.java
License:Apache License
public static Map<String, Double> tfFromTaggedWords(List<TaggedWord> tweet, TfType type, boolean usePOSTags) { Map<String, Double> termFreq = new LinkedHashMap<String, Double>(); WordNet wordNet = WordNet.getInstance(); StopWords stopWords = StopWords.getInstance(); List<String> words = new ArrayList<String>(); for (TaggedWord taggedWord : tweet) { String word = taggedWord.word().toLowerCase(); String pennTag = taggedWord.tag(); if ((!pennTag.equals(".")) && (!pennTag.equals(",")) && (!pennTag.equals(":")) && (!pennTag.equals("''")) && (!pennTag.equals("(")) && (!pennTag.equals(")")) && (!pennTag.equals("URL")) && (!pennTag.equals("USR")) && (!pennTag.equals("CC")) && (!pennTag.equals("CD")) && (!pennTag.equals("SYM")) && (!pennTag.equals("POS")) && (!stopWords.isStopWord(word))) { // Remove hashtag if (pennTag.equals("HT")) { word = word.substring(1); }/*from w w w .j a v a2 s . c o m*/ // Check if word consists of punctuations // if (StringUtils.consitsOfPunctuations(word) // && (!pennTag.equals("POS"))) { // continue; // } // Check if word starts with an alphabet if (!StringUtils.startsWithAlphabeticChar(word)) { continue; } POS posTag = POSTag.convertPTB(pennTag); // LOG.info("word: '" + word + "' pennTag: '" + pennTag + "' tag: '" // + posTag + "'"); // word stemming List<String> stems = wordNet.findStems(word, posTag); if (!stems.isEmpty()) { word = stems.get(0); } // add word to term frequency if (usePOSTags) { words.add(word + ((posTag != null) ? "#" + POSTag.toString(posTag) : "")); } else { words.add(word); } } } termFreq = TfIdf.tf(termFreq, words); termFreq = TfIdf.normalizeTf(termFreq, type); return termFreq; }
From source file:at.illecker.storm.commons.wordnet.WordNet.java
License:Apache License
/**
 * Picks the synset of {@code word} most related to the context sentence
 * (a simplified Lesk-style word-sense disambiguation).
 *
 * For each candidate synset of {@code word}, and each context word, the
 * score is the sum of positive shortest-path similarities between the
 * candidate and every synset of that context word.
 *
 * NOTE(review): the best score is tracked per (candidate, context-word)
 * pair, not summed over the whole sentence per candidate — confirm this
 * scoring is intended.
 *
 * @param sentence POS-tagged context sentence
 * @param word the ambiguous word
 * @param pos the word's part of speech
 * @return the best-scoring synset, or null if nothing scored above zero
 */
public ISynset disambiguateWordSenses(List<TaggedWord> sentence, String word, POS pos) {
    Set<ISynset> candidates = getSynsets(getIndexWord(word, pos));

    ISynset best = null;
    double bestScore = 0;
    for (ISynset candidate : candidates) {
        for (TaggedWord contextWord : sentence) {
            IIndexWord contextIndex = getIndexWord(contextWord.word(), POSTag.convertPTB(contextWord.tag()));
            double score = 0;
            for (ISynset contextSynset : getSynsets(contextIndex)) {
                double sim = shortestPathDistance(contextSynset, candidate);
                if (sim > 0) {
                    score += sim;
                }
            }
            if (score > bestScore) {
                bestScore = score;
                best = candidate;
            }
        }
    }
    return best;
}
From source file:cc.clabs.stratosphere.mlp.types.PactWord.java
License:BEER-WARE LICENSE
/**
 * Constructs a PactWord from a Stanford TaggedWord, undoing the tagger's
 * bracket escaping and quote conversions.
 *
 * @param word a TaggedWord (@see edu.stanford.nlp.ling.TaggedWord)
 */
public PactWord(TaggedWord word) {
    String value = word.value();
    String tag = word.tag();

    // The Stanford tagger escapes brackets as -LRB-/-RRB-/-LCB-/-RCB-.
    if (value.equals("-LRB-")) {
        value = "(";
    } else if (value.equals("-RRB-")) {
        value = ")";
    } else if (value.equals("-LCB-")) {
        value = "{";
    } else if (value.equals("-RCB-")) {
        value = "}";
    }
    // Both directional quote forms become a plain double quote; "--" becomes "-".
    if (value.equals("``") || value.equals("''")) {
        value = "\"";
    } else if (value.equals("--")) {
        value = "-";
    }
    if (tag.equals("``") || tag.equals("''")) {
        tag = "\"";
    }

    this.setWord(value);
    this.setTag(tag);
}
From source file:context.core.task.pos.POSBody.java
License:Open Source License
/**
 * POS-tags every input file and aggregates the per-file tag lists into
 * {@code POStagsWithCount}.
 *
 * For each file: reads its text, (for English only) strips control
 * characters and any character outside a basic ASCII whitelist, runs the
 * annotation pipeline to split sentences/tokens, POS-tags each sentence,
 * and collects {word, tag, "1"} triples. English tokens containing
 * non-alphanumeric characters are dropped.
 *
 * @return true on success, false if reading a file or any later step failed
 */
public boolean tagPOS() {
    List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();
    List<FileData> files = input.getFiles();
    try {
        for (FileData ff : files) {
            File file = ff.getFile();
            String text;
            // One {word, tag, count} triple per kept token of this file.
            List<String[]> POStags = new ArrayList<String[]>();
            try {
                text = JavaIO.readFile(file);
                if (instance.getLanguage().equals("en")) {
                    // Replace control characters, then everything outside the
                    // ASCII whitelist, with spaces before annotating.
                    text = text.replaceAll("\\p{Cc}", " ");
                    text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
                }
                Annotation document = new Annotation(text);
                pipeline.annotate(document);
                List<CoreMap> sentences = document.get(SentencesAnnotation.class);
                for (CoreMap sentence : sentences) {
                    // traversing the words in the current sentence
                    // a CoreLabel is a CoreMap with additional token-specific methods
                    final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
                    final List<TaggedWord> taggedWords = POSTagger.tag(sent, instance.getLanguage());
                    for (TaggedWord token : taggedWords) {
                        // this is the text of the token
                        String word = token.word();
                        // this is the POS tag of the token
                        String pos = token.tag();
                        // count starts at 1; aggregation sums counts later
                        String[] entity = { word, pos, Integer.toString(1) };
                        if (instance.getLanguage().equals("en")) {
                            // English: keep only purely alphanumeric tokens
                            if (!word.matches("^[a-zA-Z0-9]*$")) {
                                continue;
                            }
                        }
                        POStags.add(entity);
                    }
                }
                toAggregate.add(POStags);
            } catch (IOException e) {
                // A single unreadable file aborts the whole run.
                e.printStackTrace();
                return false;
            }
        }
        // Merge per-file tag lists into corpus-wide counts (stored on the instance).
        POStagsWithCount = new CorpusAggregator().CorpusAggregate(toAggregate);
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    }
    return true;
}
From source file:context.core.task.stemming.LemmaTagger.java
License:Open Source License
/** * * @param args/*from www . ja v a 2 s . co m*/ * @throws ClassNotFoundException * @throws IOException */ public static void main(String[] args) throws ClassNotFoundException, IOException { // Initialize the tagger MaxentTagger tagger = getTagger("en"); // The sample string // String sample = " "; // String sample = "This question appears to be off-topic. The users who voted to close gave this specific reason."; // The tagged string // String tagged = tagger.tagString(sample); // Output the result // System.out.println(tagged); List<CoreLabel> sent = Sentence.toCoreLabelList("These", "are", "some", "questions"); final List<TaggedWord> lemmatize = lemmatize(sent, "en"); System.out.println("Lemmatize::"); System.out.println(lemmatize); for (TaggedWord c : lemmatize) { System.out.println(c.word() + "\t" + c.tag()); } }
From source file:context.core.task.stemming.LemmaTagger.java
License:Open Source License
/** * * @param sent//from w w w .j av a 2s. co m * @param language * @return */ public static List<TaggedWord> lemmatize(List<CoreLabel> sent, String language) { MaxentTagger tagger = getTagger(language); // List<HasWord> sent = Sentence.toWordList("This is a sample text"); List<TaggedWord> taggedSent = tagger.tagSentence(sent); for (TaggedWord token : taggedSent) { String word = token.word(); String pos = token.tag(); String lemma = morphology.lemmatize(new WordTag(word, pos)).lemma(); token.setTag(lemma); } // final List<WordLemmaTag> tagged = (List<WordLemmaTag>) tagger.tagCoreLabelsOrHasWords(sent, morphology, true); // for (TaggedWord tw : taggedSent) { // System.out.println(tw.word() + "\t" + tw.tag()); // } return taggedSent; }
From source file:context.core.tokenizer.Tokenizer.java
License:Open Source License
/** * * @param text// ww w . j a v a 2s . com * @param docId * @return */ public static Map<String, CustomToken> tokenize(String text, String docId) { Map<String, CustomToken> customTokens = new LinkedHashMap<>(); Annotation document = new Annotation(text); pipeline.annotate(document); List<CoreMap> sentences = document.get(SentencesAnnotation.class); int sentIndex = 0; for (CoreMap sentence : sentences) { // traversing the words in the current sentence // a CoreLabel is a CoreMap with additional token-specific methods int index = 0; final List<CoreLabel> sent = sentence.get(TokensAnnotation.class); final List<TaggedWord> taggedWords = POSTagger.tag(sent, "en"); for (TaggedWord token : taggedWords) { // this is the text of the token String word = token.word(); // this is the POS tag of the token String pos = token.tag(); // this is the NER label of the token // String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); CustomToken ctoken = new CustomToken(); ctoken.setWord(word); ctoken.setBeginPosition(token.beginPosition()); ctoken.setEndPosition(token.endPosition()); ctoken.setDocId(docId); ctoken.setSentenceIndex(sentIndex); ctoken.setMultiword(false); ctoken.setIndex(index); ctoken.setPos(pos); customTokens.put(word + "/" + docId + "/" + sentIndex + "/" + index, ctoken); index++; } sentIndex++; } return customTokens; }