Example usage for edu.stanford.nlp.ling TaggedWord tag

List of usage examples for edu.stanford.nlp.ling TaggedWord tag

Introduction

In this page you can find the example usage for edu.stanford.nlp.ling TaggedWord tag.

Prototype

String tag

To view the source code for edu.stanford.nlp.ling TaggedWord tag, click the Source Link below.

Click Source Link

Usage

From source file:asap.textprocessing.TextProcessPOSTagsStanford.java

/**
 * Uses the loaded tagger model to calculate POS tags for the given
 * sentence tokens.
 *
 * @param tokens pre-tokenized words of a single sentence
 * @return one POS tag per tagged word (empty array if nothing was tagged)
 */
@Override
protected synchronized String[] getTags(String[] tokens) {
    // Re-join the pre-tokenized input with single spaces; the Stanford
    // tokenizer re-splits it below. StringBuilder avoids O(n^2) string
    // concatenation in the loop.
    StringBuilder sentence = new StringBuilder();
    for (int i = 0; i < tokens.length; i++) {
        if (i > 0) {
            sentence.append(' ');
        }
        sentence.append(tokens[i]);
    }

    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(sentence.toString()));
    // Collect tags across ALL sentences the tokenizer produces. The previous
    // implementation reassigned the result array per sentence, so whenever
    // the tokenizer split the input at a sentence boundary only the last
    // sentence's tags were returned.
    // (Fully qualified ArrayList: top-of-file imports are not visible here.)
    List<String> tags = new java.util.ArrayList<String>();
    for (List<HasWord> sentenceL : sentences) {
        for (TaggedWord taggedWord : tagger.tagSentence(sentenceL)) {
            tags.add(taggedWord.tag());
        }
    }
    // Return an empty array rather than null when no sentence was produced.
    return tags.toArray(new String[tags.size()]);
}

From source file:at.illecker.storm.commons.dict.SentimentDictionary.java

License:Apache License

/**
 * Aggregates per-word sentiment scores over one POS-tagged sentence.
 *
 * @param sentence the tagged words of a single sentence
 * @return map from sentiment-dictionary id to the accumulated
 *         SentimentResult, or null when no word produced a score
 */
public Map<Integer, SentimentResult> getSentenceSentimentFromTaggedWord(List<TaggedWord> sentence) {
    Map<Integer, SentimentResult> sentenceSentiments = new HashMap<Integer, SentimentResult>();
    if (LOGGING) {
        LOG.info("TaggedSentence: " + sentence.toString());
    }
    for (TaggedWord taggedWord : sentence) {
        Map<Integer, Double> wordSentiments = getWordSentiment(taggedWord.word(), taggedWord.tag(), true);
        if (wordSentiments == null) {
            continue; // word not in any sentiment dictionary
        }
        for (Map.Entry<Integer, Double> entry : wordSentiments.entrySet()) {
            // Lazily create the per-dictionary accumulator, then fold in
            // this word's score.
            SentimentResult result = sentenceSentiments.get(entry.getKey());
            if (result == null) {
                result = new SentimentResult();
                sentenceSentiments.put(entry.getKey(), result);
            }
            result.addScore(entry.getValue());
        }
    }
    if (LOGGING) {
        LOG.info("Sentiment: " + sentenceSentiments);
    }
    return (sentenceSentiments.size() > 0) ? sentenceSentiments : null;
}

From source file:at.illecker.storm.commons.svm.featurevector.POSFeatureVectorGenerator.java

License:Apache License

/**
 * Counts coarse POS categories over the tagged words of a tweet.
 *
 * @param taggedWords POS-tagged words
 * @param normalize when true, divide each count by the total word count
 * @return feature array [NOUN, VERB, ADJECTIVE, ADVERB, INTERJECTION,
 *         PUNCTUATION, HASHTAG]
 */
private double[] countPOSTagsFromTaggedWords(List<TaggedWord> taggedWords, boolean normalize) {
    // 7 = [NOUN, VERB, ADJECTIVE, ADVERB, INTERJECTION, PUNCTUATION, HASHTAG]
    double[] posTags = new double[] { 0d, 0d, 0d, 0d, 0d, 0d, 0d };
    for (TaggedWord word : taggedWords) {
        String pennTag = word.tag();
        if (pennTag.startsWith("NN")) {
            posTags[0]++;
        } else if (pennTag.startsWith("VB")) {
            posTags[1]++;
        } else if (pennTag.startsWith("JJ")) {
            posTags[2]++;
        } else if (pennTag.startsWith("RB")) {
            posTags[3]++;
        } else if (pennTag.startsWith("UH")) {
            posTags[4]++;
        } else if ((pennTag.equals(".")) || (pennTag.equals(":"))) {
            posTags[5]++;
        } else if (pennTag.startsWith("HT")) {
            posTags[6]++;
        }
    }
    // The manual wordCount counter was redundant — it always equaled
    // taggedWords.size(). Guard against division by zero on empty input,
    // which previously filled the feature vector with NaN.
    int wordCount = taggedWords.size();
    if (normalize && wordCount > 0) {
        for (int i = 0; i < posTags.length; i++) {
            posTags[i] /= wordCount;
        }
    }
    return posTags;
}

From source file:at.illecker.storm.commons.tfidf.TweetTfIdf.java

License:Apache License

/**
 * Computes term frequencies from a POS-tagged tweet.
 * Filters out punctuation, URL/user tokens, several closed-class Penn/ARK
 * tags and stop words; strips the leading '#' from hashtags; stems each
 * surviving word via WordNet; and optionally suffixes each term with its
 * coarse POS tag before delegating to TfIdf for counting/normalization.
 *
 * @param tweet POS-tagged words of one tweet
 * @param type term-frequency weighting applied by TfIdf.normalizeTf
 * @param usePOSTags when true, terms are emitted as "word#POS"
 * @return map from term to (normalized) frequency
 */
public static Map<String, Double> tfFromTaggedWords(List<TaggedWord> tweet, TfType type, boolean usePOSTags) {
    Map<String, Double> termFreq = new LinkedHashMap<String, Double>();
    WordNet wordNet = WordNet.getInstance();
    StopWords stopWords = StopWords.getInstance();

    List<String> words = new ArrayList<String>();
    for (TaggedWord taggedWord : tweet) {
        String word = taggedWord.word().toLowerCase();
        String pennTag = taggedWord.tag();

        // Keep the word only if its tag is none of: sentence/clause
        // punctuation, quotes, parens, URL, user mention, conjunction,
        // cardinal number, symbol, possessive — and it is not a stop word.
        if ((!pennTag.equals(".")) && (!pennTag.equals(",")) && (!pennTag.equals(":"))
                && (!pennTag.equals("''")) && (!pennTag.equals("(")) && (!pennTag.equals(")"))
                && (!pennTag.equals("URL")) && (!pennTag.equals("USR")) && (!pennTag.equals("CC"))
                && (!pennTag.equals("CD")) && (!pennTag.equals("SYM")) && (!pennTag.equals("POS"))
                && (!stopWords.isStopWord(word))) {

            // Remove the leading '#' character from hashtag tokens
            if (pennTag.equals("HT")) {
                word = word.substring(1);
            }

            // Check if word consists of punctuations
            // if (StringUtils.consitsOfPunctuations(word)
            // && (!pennTag.equals("POS"))) {
            // continue;
            // }

            // Skip tokens that do not start with an alphabetic character
            if (!StringUtils.startsWithAlphabeticChar(word)) {
                continue;
            }

            // Map the Penn Treebank tag to a WordNet POS (may be null for
            // tags with no WordNet counterpart).
            POS posTag = POSTag.convertPTB(pennTag);
            // LOG.info("word: '" + word + "' pennTag: '" + pennTag + "' tag: '"
            // + posTag + "'");

            // Stem via WordNet; keep the original word when no stem is found.
            List<String> stems = wordNet.findStems(word, posTag);
            if (!stems.isEmpty()) {
                word = stems.get(0);
            }

            // Emit the term, optionally qualified with its coarse POS tag.
            if (usePOSTags) {
                words.add(word + ((posTag != null) ? "#" + POSTag.toString(posTag) : ""));
            } else {
                words.add(word);
            }
        }
    }
    // Count raw frequencies, then apply the requested normalization.
    termFreq = TfIdf.tf(termFreq, words);
    termFreq = TfIdf.normalizeTf(termFreq, type);
    return termFreq;
}

From source file:at.illecker.storm.commons.wordnet.WordNet.java

License:Apache License

/**
 * Picks a synset for {@code word} by scoring each candidate synset against
 * the synsets of the sentence's words using shortest-path distance.
 *
 * NOTE(review): {@code score} is reset per context word, so each candidate
 * synset is compared to the best (synset, context-word) pair rather than
 * to a score summed over the whole sentence — confirm this is intended.
 *
 * @param sentence POS-tagged context sentence
 * @param word the ambiguous word to disambiguate
 * @param pos the WordNet part of speech of {@code word}
 * @return the best-scoring synset, or null when no pair scored above zero
 */
public ISynset disambiguateWordSenses(List<TaggedWord> sentence, String word, POS pos) {
    IIndexWord indexWord = getIndexWord(word, pos);
    Set<ISynset> synsets = getSynsets(indexWord);

    ISynset resultSynset = null;
    double bestScore = 0;
    for (ISynset synset : synsets) {
        for (TaggedWord taggedWord : sentence) {
            double score = 0;
            // Look up the context word's synsets using its converted POS tag.
            IIndexWord indexWordLocal = getIndexWord(taggedWord.word(), POSTag.convertPTB(taggedWord.tag()));
            Set<ISynset> synsetsLocal = getSynsets(indexWordLocal);
            for (ISynset synsetLocal : synsetsLocal) {
                // Accumulate only positive similarities; shortestPathDistance
                // presumably signals "no path" with a non-positive value —
                // TODO confirm against its implementation.
                double sim = shortestPathDistance(synsetLocal, synset);
                if (sim > 0) {
                    score += sim;
                }
            }
            if (score > bestScore) {
                bestScore = score;
                resultSynset = synset;
            }
        }
    }
    return resultSynset;
}

From source file:cc.clabs.stratosphere.mlp.types.PactWord.java

License:BEER-WARE LICENSE

/**
 * Constructor for PactWord. Normalizes some odd token/tag conversions
 * produced by the Stanford Tagger (bracket placeholders, directional
 * quotes, double dashes).
 *
 * @param word a TaggedWord (@see edu.stanford.nlp.ling.TaggedWord)
 */
public PactWord(TaggedWord word) {
    String value = word.value();
    String tag = word.tag();
    // Map Penn Treebank bracket/quote placeholders back to literal symbols.
    switch (value) {
    case "-LRB-":
        value = "(";
        break;
    case "-RRB-":
        value = ")";
        break;
    case "-LCB-":
        value = "{";
        break;
    case "-RCB-":
        value = "}";
        break;
    case "``":
    case "''":
        value = "\"";
        break;
    case "--":
        value = "-";
        break;
    default:
        break;
    }
    // Quote TAGS are normalized the same way as quote tokens.
    if (tag.equals("``") || tag.equals("''")) {
        tag = "\"";
    }
    this.setWord(value);
    this.setTag(tag);
}

From source file:context.core.task.pos.POSBody.java

License:Open Source License

/**
 * POS-tags the text of every input file and aggregates the resulting
 * [word, tag, count] entries over the whole corpus into POStagsWithCount.
 *
 * For English input the text is first cleaned (control characters and
 * non-ASCII symbols replaced by spaces) and non-alphanumeric tokens are
 * dropped after tagging.
 *
 * @return true on success, false if reading/tagging any file failed
 */
public boolean tagPOS() {
    // One List<String[]> of [word, pos, "1"] entries per input file.
    List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();

    List<FileData> files = input.getFiles();
    try {
        for (FileData ff : files) {

            File file = ff.getFile();
            String text;
            List<String[]> POStags = new ArrayList<String[]>();
            try {
                text = JavaIO.readFile(file);
                if (instance.getLanguage().equals("en")) {
                    // Replace control characters, then anything outside a
                    // whitelist of letters, digits and basic punctuation.
                    text = text.replaceAll("\\p{Cc}", " ");
                    text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
                }
                Annotation document = new Annotation(text);
                pipeline.annotate(document);

                List<CoreMap> sentences = document.get(SentencesAnnotation.class);

                for (CoreMap sentence : sentences) {
                    // traversing the words in the current sentence
                    // a CoreLabel is a CoreMap with additional token-specific methods
                    final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);

                    final List<TaggedWord> taggedWords = POSTagger.tag(sent, instance.getLanguage());

                    for (TaggedWord token : taggedWords) {
                        // this is the text of the token
                        String word = token.word();
                        // this is the POS tag of the token
                        String pos = token.tag();
                        // Each entry starts with a count of 1; the corpus
                        // aggregator sums the counts afterwards.
                        String[] entity = { word, pos, Integer.toString(1) };
                        if (instance.getLanguage().equals("en")) {
                            // For English, keep only purely alphanumeric tokens.
                            if (!word.matches("^[a-zA-Z0-9]*$")) {
                                continue;
                            }
                        }
                        POStags.add(entity);
                    }
                }
                toAggregate.add(POStags);
            } catch (IOException e) {
                // A single unreadable file aborts the whole run.
                e.printStackTrace();
                return false;
            }
        }
        // Merge the per-file entries into corpus-wide counts.
        POStagsWithCount = new CorpusAggregator().CorpusAggregate(toAggregate);
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    }
    return true;
}

From source file:context.core.task.stemming.LemmaTagger.java

License:Open Source License

/**
 *
 * @param args/*from   www .  ja v a  2  s . co  m*/
 * @throws ClassNotFoundException
 * @throws IOException
 */
public static void main(String[] args) throws ClassNotFoundException, IOException {

    // Initialize the tagger
    MaxentTagger tagger = getTagger("en");

    // The sample string
    //        String sample = "    ";
    //        String sample = "This question appears to be off-topic. The users who voted to close gave this specific reason.";
    // The tagged string
    //        String tagged = tagger.tagString(sample);
    // Output the result
    //        System.out.println(tagged);
    List<CoreLabel> sent = Sentence.toCoreLabelList("These", "are", "some", "questions");
    final List<TaggedWord> lemmatize = lemmatize(sent, "en");
    System.out.println("Lemmatize::");
    System.out.println(lemmatize);
    for (TaggedWord c : lemmatize) {
        System.out.println(c.word() + "\t" + c.tag());
    }
}

From source file:context.core.task.stemming.LemmaTagger.java

License:Open Source License

/**
 * POS-tags the given tokens, then replaces each token's tag with its lemma.
 *
 * NOTE(review): the returned TaggedWords carry the LEMMA in the tag slot,
 * not the POS tag — callers appear to rely on this repurposing.
 *
 * @param sent tokens of a single sentence
 * @param language language code used to select the tagger model
 * @return the tagged sentence with lemmas stored in the tag field
 */
public static List<TaggedWord> lemmatize(List<CoreLabel> sent, String language) {
    MaxentTagger tagger = getTagger(language);
    List<TaggedWord> taggedSent = tagger.tagSentence(sent);
    for (TaggedWord tw : taggedSent) {
        // Lemmatize from the (word, POS) pair, then overwrite the tag
        // with the resulting lemma.
        tw.setTag(morphology.lemmatize(new WordTag(tw.word(), tw.tag())).lemma());
    }
    return taggedSent;
}

From source file:context.core.tokenizer.Tokenizer.java

License:Open Source License

/**
 * Splits the given text into POS-tagged tokens and indexes them by a
 * composite "word/docId/sentenceIndex/tokenIndex" key, preserving
 * insertion order.
 *
 * @param text raw document text
 * @param docId identifier of the source document
 * @return insertion-ordered map of composite key to CustomToken
 */
public static Map<String, CustomToken> tokenize(String text, String docId) {
    Map<String, CustomToken> customTokens = new LinkedHashMap<>();
    Annotation document = new Annotation(text);
    pipeline.annotate(document);

    int sentIndex = 0;
    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
        // A CoreLabel is a CoreMap with additional token-specific methods.
        final List<CoreLabel> coreLabels = sentence.get(TokensAnnotation.class);
        final List<TaggedWord> taggedWords = POSTagger.tag(coreLabels, "en");

        int tokenIndex = 0;
        for (TaggedWord taggedWord : taggedWords) {
            CustomToken ctoken = new CustomToken();
            // Token text and POS tag.
            ctoken.setWord(taggedWord.word());
            ctoken.setPos(taggedWord.tag());
            // Character offsets within the original text.
            ctoken.setBeginPosition(taggedWord.beginPosition());
            ctoken.setEndPosition(taggedWord.endPosition());
            // Position bookkeeping.
            ctoken.setDocId(docId);
            ctoken.setSentenceIndex(sentIndex);
            ctoken.setIndex(tokenIndex);
            ctoken.setMultiword(false);

            customTokens.put(taggedWord.word() + "/" + docId + "/" + sentIndex + "/" + tokenIndex, ctoken);
            tokenIndex++;
        }
        sentIndex++;
    }
    return customTokens;
}