Example usage for edu.stanford.nlp.ling TaggedWord TaggedWord

List of usage examples for edu.stanford.nlp.ling TaggedWord TaggedWord

Introduction

On this page you can find example usages of the edu.stanford.nlp.ling TaggedWord constructor.

Prototype

public TaggedWord(Label word, Label tag) 

Source Link

Document

Create a new TaggedWord.

Usage

From source file:at.illecker.storm.commons.preprocessor.Preprocessor.java

License:Apache License

/**
 * Recursively preprocesses a queue of raw tweet tokens, applying a pipeline of
 * normalization steps — emoticon unification, slang correction, in-word
 * punctuation removal, gerund "g" completion, and character-elongation
 * removal — and appending each result to {@code processedTokens}.
 * Each step that produces output returns via a tail-recursive call, so at most
 * one step rewrites a given token before it is emitted.
 *
 * @param tokens          queue of raw tokens still to process; consumed from the front
 * @param pretag          if {@code true}, emit pre-tagged {@code TaggedWord}s for the
 *                        POS tagger; otherwise emit plain strings
 * @param processedTokens accumulator receiving the processed tokens (cast to {@code T})
 * @return the accumulator once the token queue is empty
 */
@SuppressWarnings("unchecked")
private <T> List<T> preprocessAccumulator(LinkedList<String> tokens, boolean pretag, List<T> processedTokens) {

    if (tokens.isEmpty()) {
        return processedTokens;
    } else {
        // remove token from queue
        String token = tokens.removeFirst();

        // identify token
        boolean tokenContainsPunctuation = StringUtils.consitsOfPunctuations(token);
        boolean tokenIsEmoticon = StringUtils.isEmoticon(token);
        boolean tokenIsURL = StringUtils.isURL(token);
        boolean tokenIsNumeric = StringUtils.isNumeric(token);

        // Step 1) Unify Emoticons remove repeating chars
        if ((tokenIsEmoticon) && (!tokenIsURL) && (!tokenIsNumeric)) {
            Matcher m = RegexUtils.TWO_OR_MORE_REPEATING_CHARS_PATTERN.matcher(token);
            if (m.find()) {
                boolean isSpecialEmoticon = m.group(1).equals("^");
                String reducedToken = m.replaceAll("$1");
                if (isSpecialEmoticon) { // keep ^^
                    reducedToken += "^";
                }
                // else {
                // TODO
                // Preprocess token again if there are recursive patterns in it
                // e.g., :):):) -> :):) -> :) Not possible because of Tokenizer
                // tokens.add(0, reducedToken);
                // }
                if (LOGGING) {
                    LOG.info("Unify Emoticon from '" + token + "' to '" + reducedToken + "'");
                }

                // emoticons are tagged as interjections ("UH") when pretagging
                if (pretag) {
                    processedTokens.add((T) new TaggedWord(reducedToken, "UH"));
                } else {
                    processedTokens.add((T) reducedToken);
                }
                return preprocessAccumulator(tokens, pretag, processedTokens);
            }
        } else if (tokenContainsPunctuation) {
            // If token is no Emoticon then there is no further
            // preprocessing for punctuations
            if (pretag) {
                processedTokens.add((T) new TaggedWord(token));
            } else {
                processedTokens.add((T) token);
            }
            return preprocessAccumulator(tokens, pretag, processedTokens);
        }

        // identify token
        boolean tokenIsUser = StringUtils.isUser(token);
        boolean tokenIsHashTag = StringUtils.isHashTag(token);
        boolean tokenIsSlang = StringUtils.isSlang(token);
        boolean tokenIsEmail = StringUtils.isEmail(token);
        boolean tokenIsPhone = StringUtils.isPhone(token);
        boolean tokenIsSpecialNumeric = StringUtils.isSpecialNumeric(token);
        boolean tokenIsSeparatedNumeric = StringUtils.isSeparatedNumeric(token);

        // Step 2) Slang Correction
        // TODO prevent slang correction if all UPPERCASE
        // 'FC' to [fruit, cake]
        // 'Ajax' to [Asynchronous, Javascript, and, XML]
        // 'TL' to [dr too, long, didn't, read]
        // S.O.L - SOL - [s**t, outta, luck]
        // 'AC/DC' to 'AC' and 'DC' - 'DC' to [don't, care]
        // TODO update dictionary O/U O/A
        if ((!tokenIsEmoticon) && (!tokenIsUser) && (!tokenIsHashTag) && (!tokenIsURL) && (!tokenIsNumeric)
                && (!tokenIsSpecialNumeric) && (!tokenIsSeparatedNumeric) && (!tokenIsEmail)
                && (!tokenIsPhone)) {
            String[] slangCorrection = m_slangCorrection.getCorrection(token.toLowerCase());
            if (slangCorrection != null) {
                // a slang entry may expand into several words
                for (int i = 0; i < slangCorrection.length; i++) {
                    if (pretag) {
                        // PreTagging for POS Tagger
                        TaggedWord preTaggedToken = pretagToken(slangCorrection[i], tokenIsHashTag, tokenIsUser,
                                tokenIsURL);
                        processedTokens.add((T) preTaggedToken);
                    } else {
                        processedTokens.add((T) slangCorrection[i]);
                    }
                }
                if (LOGGING) {
                    LOG.info("Slang Correction from '" + token + "' to " + Arrays.toString(slangCorrection));
                }
                return preprocessAccumulator(tokens, pretag, processedTokens);
            } else if (tokenIsSlang) {
                // special case: "w/xyz" expands to "with xyz"
                if (token.startsWith("w/")) {
                    if (pretag) {
                        processedTokens.add((T) new TaggedWord("with"));
                        // PreTagging for POS Tagger
                        TaggedWord preTaggedToken = pretagToken(token.substring(2), tokenIsHashTag, tokenIsUser,
                                tokenIsURL);
                        processedTokens.add((T) preTaggedToken);
                    } else {
                        processedTokens.add((T) "with");
                        processedTokens.add((T) token.substring(2));
                    }
                    if (LOGGING) {
                        LOG.info("Slang Correction from '" + token + "' to " + "[with, " + token.substring(2)
                                + "]");
                    }
                    return preprocessAccumulator(tokens, pretag, processedTokens);
                } else {
                    if (LOGGING) {
                        LOG.info("Slang Correction might be missing for '" + token + "'");
                    }
                }
            }
        }

        // Step 3) Check if there are punctuations between words
        // e.g., L.O.V.E
        if ((!tokenIsEmoticon) && (!tokenIsUser) && (!tokenIsHashTag) && (!tokenIsURL) && (!tokenIsNumeric)
                && (!tokenIsSpecialNumeric) && (!tokenIsSeparatedNumeric) && (!tokenIsEmail)
                && (!tokenIsPhone)) {
            // remove alternating letter dot pattern e.g., L.O.V.E
            Matcher m = RegexUtils.ALTERNATING_LETTER_DOT_PATTERN.matcher(token);
            if (m.matches()) {
                String newToken = token.replaceAll("\\.", "");
                // only accept the de-dotted form if it is a real dictionary word
                if (m_wordnet.contains(newToken)) {
                    if (LOGGING) {
                        LOG.info("Remove punctuations in word from '" + token + "' to '" + newToken + "'");
                    }
                    token = newToken;
                    if (pretag) {
                        // PreTagging for POS Tagger
                        TaggedWord preTaggedToken = pretagToken(token, tokenIsHashTag, tokenIsUser, tokenIsURL);
                        processedTokens.add((T) preTaggedToken);
                    } else {
                        processedTokens.add((T) token);
                    }
                    return preprocessAccumulator(tokens, pretag, processedTokens);
                }
            }
        }

        // Step 4) Add missing g in gerund forms e.g., goin
        if ((!tokenIsUser) && (!tokenIsHashTag) && (!tokenIsURL) && (token.endsWith("in"))
                && (!m_firstNames.isFirstName(token)) && (!m_wordnet.contains(token.toLowerCase()))) {
            // append "g" if a word ends with "in" and is not in the vocabulary
            if (LOGGING) {
                LOG.info("Add missing \"g\" from '" + token + "' to '" + token + "g'");
            }
            token = token + "g";
            if (pretag) {
                // PreTagging for POS Tagger, because it could be a interjection
                TaggedWord preTaggedToken = pretagToken(token, tokenIsHashTag, tokenIsUser, tokenIsURL);
                processedTokens.add((T) preTaggedToken);
            } else {
                processedTokens.add((T) token);
            }
            return preprocessAccumulator(tokens, pretag, processedTokens);
        }

        // Step 5) Remove elongations of characters (suuuper)
        // 'lollll' to 'loll' because 'loll' is found in dict
        // TODO 'AHHHHH' to 'AH'
        if ((!tokenIsEmoticon) && (!tokenIsUser) && (!tokenIsHashTag) && (!tokenIsURL) && (!tokenIsNumeric)
                && (!tokenIsSpecialNumeric) && (!tokenIsSeparatedNumeric) && (!tokenIsEmail)
                && (!tokenIsPhone)) {

            // remove repeating chars
            token = removeRepeatingChars(token);

            // Step 5b) Try Slang Correction again
            String[] slangCorrection = m_slangCorrection.getCorrection(token.toLowerCase());
            if (slangCorrection != null) {
                for (int i = 0; i < slangCorrection.length; i++) {
                    if (pretag) {
                        // PreTagging for POS Tagger
                        TaggedWord preTaggedToken = pretagToken(slangCorrection[i], tokenIsHashTag, tokenIsUser,
                                tokenIsURL);
                        processedTokens.add((T) preTaggedToken);
                    } else {
                        processedTokens.add((T) slangCorrection[i]);
                    }
                }
                if (LOGGING) {
                    LOG.info("Slang Correction from '" + token + "' to " + Arrays.toString(slangCorrection));
                }
                return preprocessAccumulator(tokens, pretag, processedTokens);
            }
        }

        // add token to processed list
        if (pretag) {
            // PreTagging for POS Tagger
            TaggedWord preTaggedToken = pretagToken(token, tokenIsHashTag, tokenIsUser, tokenIsURL);
            processedTokens.add((T) preTaggedToken);
        } else {
            processedTokens.add((T) token);
        }
        return preprocessAccumulator(tokens, pretag, processedTokens);
    }
}

From source file:cc.clabs.stratosphere.mlp.types.PactWord.java

License:BEER-WARE LICENSE

/**
 * Returns this PactWord as a TaggedWord from the Stanford
 * NLP Project (@see edu.stanford.nlp.ling.TaggedWord).
 * /*from ww w .  ja  v a2  s  . co m*/
 * @return a TaggedWord
 */
public TaggedWord getTaggedWord() {
    return new TaggedWord(word.getValue(), tag.getValue());
}

From source file:edu.umn.biomedicus.gpl.stanford.parser.StanfordConstituencyParserModel.java

License:Open Source License

/**
 * Constituency-parses one sentence: pairs each parse token inside the
 * sentence with its POS tag, runs the shift-reduce parser, and labels the
 * sentence with the resulting Penn Treebank tree text.
 */
public void parseSentence(TextRange sentenceLabel, LabelIndex<ParseToken> parseTokenLabelIndex,
        LabelIndex<PosTag> partOfSpeechLabelIndex, Labeler<ConstituencyParse> constituencyParseLabeler) {
    // Build the tagged-word input for the parser.
    List<TaggedWord> taggedWords = new ArrayList<>();
    for (ParseToken tokenLabel : parseTokenLabelIndex.inside(sentenceLabel)) {
        PartOfSpeech partOfSpeech = partOfSpeechLabelIndex.firstAtLocation(tokenLabel).getPartOfSpeech();
        taggedWords.add(new TaggedWord(tokenLabel.getText(), PartsOfSpeech.tagForPartOfSpeech(partOfSpeech)));
    }
    // Parse and render the tree in Penn Treebank format.
    Tree tree = shiftReduceParser.apply(taggedWords);
    StringWriter treeWriter = new StringWriter();
    tree.pennPrint(new PrintWriter(treeWriter));
    String pennPrint = treeWriter.toString();
    constituencyParseLabeler.add(new ConstituencyParse(sentenceLabel, pennPrint));
}

From source file:edu.umn.biomedicus.gpl.stanford.parser.StanfordDependencyParserModel.java

License:Open Source License

/**
 * Dependency-parses a token sequence. The {@code tokens} and {@code posTags}
 * lists are parallel: element i of each refers to the same word.
 *
 * @return the grammatical structure predicted by the dependency parser
 */
public GrammaticalStructure parseToGrammaticalStructure(List<ParseToken> tokens, List<PosTag> posTags) {
    int count = tokens.size();
    List<TaggedWord> taggedWords = new ArrayList<>(count);
    // Zip each token with its POS tag into the parser's input type.
    for (int idx = 0; idx < count; idx++) {
        String text = tokens.get(idx).getText();
        String tag = PartsOfSpeech.tagForPartOfSpeech(posTags.get(idx).getPartOfSpeech());
        taggedWords.add(new TaggedWord(text, tag));
    }
    return parser.predict(taggedWords);
}

From source file:gate.stanford.StanfordSentence.java

License:Open Source License

/**
 * Builds the internal representation of one GATE sentence for the Stanford
 * parser: position-to-offset maps and the word list, optionally carrying
 * existing POS tags (the parser reuses tags when given TaggedWord elements,
 * see http://nlp.stanford.edu/software/parser-faq.shtml#f).
 */
public StanfordSentence(Annotation sentence, String tokenType, AnnotationSet inputAS, boolean usePosTags) {

    startPosToOffset = new HashMap<Integer, Long>();
    endPosToOffset = new HashMap<Integer, Long>();
    startPosToToken = new HashMap<Integer, Annotation>();
    startPosToString = new HashMap<Integer, String>();

    sentenceStartOffset = sentence.getStartNode().getOffset();
    sentenceEndOffset = sentence.getEndNode().getOffset();

    nbrOfTokens = 0;
    nbrOfMissingPosTags = 0;

    // All tokens covered by the sentence span, in document order.
    tokens = Utils.inDocumentOrder(inputAS.getContained(sentenceStartOffset, sentenceEndOffset).get(tokenType));
    words = new ArrayList<Word>();

    // Register the sentence itself at pseudo-position -1.
    add(-1, sentence, "S");

    int position = 0;

    for (Annotation tokenAnnotation : tokens) {
        String surface = escapeToken(tokenAnnotation.getFeatures().get(STRING_FEATURE).toString());
        add(position, tokenAnnotation, surface);

        // Emit TaggedWord when reusing existing POS tags, plain Word otherwise.
        if (usePosTags) {
            words.add(new TaggedWord(surface, getEscapedPosTag(tokenAnnotation)));
        } else {
            words.add(new Word(surface));
        }

        position++;
    }

    nbrOfTokens = position;
}

From source file:gate.stanford.Tagger.java

License:Open Source License

/**
 * Runs the Stanford POS tagger over the document: groups tokens by sentence,
 * tags each sentence (reusing existing category features when
 * {@code useExistingTags} is set), and writes the resulting tag back onto each
 * token's category feature. Tokens after the last sentence are tagged in a
 * final pass when {@code posTagAllTokens} is set.
 *
 * @throws ExecutionException if the document or required parameters are
 *         missing, or if the tagger returns a different number of results
 *         than tokens supplied
 */
@Override
public void execute() throws ExecutionException {
    // check the parameters
    if (document == null)
        throw new ExecutionException("No document to process!");

    AnnotationSet inputAS = document.getAnnotations(inputASName);

    if (baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Token Annotation Type provided!");
    }

    if (baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Sentence Annotation Type provided!");
    }

    if (outputAnnotationType == null || outputAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No AnnotationType provided to store the new feature!");
    }

    AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
    AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
    if (sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null && tokensAS.size() > 0) {
        long startTime = System.currentTimeMillis();
        fireStatusChanged("POS tagging " + document.getName());
        fireProgressChanged(0);
        // prepare the input for MaxentTagger
        List<Word> sentenceForTagger = new ArrayList<Word>();

        // define a comparator for annotations by start offset
        OffsetComparator offsetComparator = new OffsetComparator();

        // read all the tokens and all the sentences
        List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS);
        Collections.sort(sentencesList, offsetComparator);
        List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS);
        Collections.sort(tokensList, offsetComparator);

        Iterator<Annotation> sentencesIter = sentencesList.iterator();
        ListIterator<Annotation> tokensIter = tokensList.listIterator();

        List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>();
        // currentToken is carried across loop iterations: it always holds the
        // first token not yet assigned to a sentence (null when exhausted)
        Annotation currentToken = tokensIter.next();
        int sentIndex = 0;
        int sentCnt = sentencesAS.size();
        while (sentencesIter.hasNext()) {
            Annotation currentSentence = sentencesIter.next();
            tokensInCurrentSentence.clear();
            sentenceForTagger.clear();
            // collect every token that ends at or before this sentence's end
            while (currentToken != null && currentToken.getEndNode().getOffset()
                    .compareTo(currentSentence.getEndNode().getOffset()) <= 0) {
                // If we're only POS tagging Tokens within baseSentenceAnnotationType,
                // don't add the sentence if the Tokens aren't within the span of
                // baseSentenceAnnotationType
                if (posTagAllTokens || currentToken.withinSpanOf(currentSentence)) {
                    tokensInCurrentSentence.add(currentToken);

                    // use an already-present category tag when configured to do so
                    if (useExistingTags
                            && currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) {
                        sentenceForTagger.add(new TaggedWord(
                                (String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME),
                                (String) currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME)));
                    } else {
                        sentenceForTagger.add(
                                new Word((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME)));
                    }
                }
                currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
            }

            // if the sentence doesn't contain any tokens (which is a bit weird but
            // is possible) then don't try running the POS tagger as you will get an
            // array index out of bounds exception
            if (sentenceForTagger.isEmpty())
                continue;

            // run the POS tagger
            ArrayList<TaggedWord> taggerResults = tagger.tagSentence(sentenceForTagger, useExistingTags);

            // add the results
            // make sure no malfunction occurred
            if (taggerResults.size() != tokensInCurrentSentence.size())
                throw new ExecutionException("POS Tagger malfunction: the output size (" + taggerResults.size()
                        + ") is different from the input size (" + tokensInCurrentSentence.size() + ")!");
            // tagger results and input tokens are parallel lists; walk them together
            Iterator<TaggedWord> resIter = taggerResults.iterator();
            Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();
            while (resIter.hasNext()) {
                Annotation annot = tokIter.next();
                addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String) resIter.next().tag()));
            }
            fireProgressChanged(sentIndex++ * 100 / sentCnt);
        } // while(sentencesIter.hasNext())

        if (currentToken != null && posTagAllTokens) {
            // Tag remaining Tokens if we are not considering those only within
            // baseSentenceAnnotationType

            // we have remaining tokens after the last sentence
            tokensInCurrentSentence.clear();
            sentenceForTagger.clear();
            while (currentToken != null) {
                tokensInCurrentSentence.add(currentToken);
                if (useExistingTags && currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) {
                    sentenceForTagger.add(
                            new TaggedWord((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME),
                                    (String) currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME)));
                } else {
                    sentenceForTagger
                            .add(new Word((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME)));
                }
                currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
            }

            // run the POS tagger on remaining tokens
            List<TaggedWord> taggerResults = tagger.tagSentence(sentenceForTagger, useExistingTags);

            // add the results and make sure no malfunction occurred
            if (taggerResults.size() != tokensInCurrentSentence.size())
                throw new ExecutionException("POS Tagger malfunction: the output size (" + taggerResults.size()
                        + ") is different from the input size (" + tokensInCurrentSentence.size() + ")!");
            Iterator<TaggedWord> resIter = taggerResults.iterator();
            Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();
            while (resIter.hasNext()) {
                Annotation annot = tokIter.next();
                addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String) resIter.next().tag()));
            }
        } // if(currentToken != null)
        fireProcessFinished();
        fireStatusChanged(document.getName() + " tagged in "
                + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000)
                + " seconds!");
    } else {
        if (failOnMissingInputAnnotations) {
            throw new ExecutionException("No sentences or tokens to process in document " + document.getName()
                    + "\n" + "Please run a sentence splitter " + "and tokeniser first!");
        } else {
            Utils.logOnce(logger, Level.INFO,
                    "POS tagger: no sentence or token annotations in input document - see debug log for details.");
            logger.debug("No input annotations in document " + document.getName());
        }
    }

}

From source file:gov.llnl.ontology.text.parse.StanfordParser.java

License:Open Source License

/**
 * {@inheritDoc}/* w  w  w .  j  av a2 s .  c  o  m*/
 */
public DependencyTreeNode[] parseText(String header, StringPair[] sentence) {
    List<HasWord> tokens = Lists.newArrayList();
    for (StringPair word : sentence)
        if (word.x != null && word.y != null)
            tokens.add(new TaggedWord(word.x, word.y));
    return parseTokens(header, tokens).toArray(new DependencyTreeNode[0]);
}

From source file:it.cnr.jatecs.nlp.patterns.Parser.java

License:Open Source License

/**
 * Splits a sentence into its main clause and any appositive clauses.
 * The idea is that an appositive is a clause that starts with a pronoun
 * (or wh-word) and sits between two commas.
 */
private Vector<ArrayList<TaggedWord>> findAppositives(ArrayList<TaggedWord> sentence) {
    boolean insideAppositive = false;
    ArrayList<TaggedWord> mainClause = new ArrayList<TaggedWord>();
    ArrayList<TaggedWord> appositive = new ArrayList<TaggedWord>();
    Vector<ArrayList<TaggedWord>> clauses = new Vector<ArrayList<TaggedWord>>();
    Iterator<TaggedWord> it = sentence.iterator();
    while (it.hasNext()) {
        TaggedWord current = it.next();
        // Special handling of foreign words: treat them as common nouns.
        if (current.tag().startsWith("FW"))
            current.setTag("NN");
        if (insideAppositive) {
            if (current.tag().equals(",")) {
                // Closing comma: the appositive clause is complete.
                insideAppositive = false;
                clauses.add(appositive);
                appositive = new ArrayList<TaggedWord>();
            } else
                appositive.add(current);
        } else if (current.tag().equals(",") && it.hasNext()) {
            // Look ahead at the word after the comma.
            current = it.next();
            if (current.tag().startsWith("W") || current.tag().startsWith("PRP")) {
                // Comma followed by a wh-word/pronoun opens an appositive.
                insideAppositive = true;
                appositive.add(current);
            } else {
                // Ordinary comma: keep it and the next word in the main clause.
                mainClause.add(new TaggedWord(",", ","));
                mainClause.add(current);
            }
        } else
            mainClause.add(current);
    }
    if (insideAppositive) {
        // Appositive never closed: fold its words back into the main clause.
        mainClause.add(new TaggedWord(",", ","));
        mainClause.addAll(appositive);
    }
    clauses.add(mainClause);
    return clauses;
}

From source file:org.ets.research.nlp.stanford_thrift.general.CoreNLPThriftUtil.java

License:Open Source License

/**
 * Converts a list of Thrift TaggedTokens into Stanford NLP TaggedWords,
 * preserving order.
 *
 * @param taggedSentence the tagged tokens of one sentence
 * @return a new list with one TaggedWord per input token
 */
public static List<TaggedWord> convertTaggedTokensToTaggedWords(List<TaggedToken> taggedSentence) {
    List<TaggedWord> converted = new ArrayList<TaggedWord>(taggedSentence.size());
    for (TaggedToken taggedToken : taggedSentence) {
        converted.add(new TaggedWord(taggedToken.token, taggedToken.tag));
    }
    return converted;
}

From source file:qmul.corpus.BNCCorpus.java

License:Open Source License

/**
 * Reads the C5 POS tag from a BNC XML word node and appends the tagged
 * surface form to {@code taggedWords}; when a head-word ("hw") attribute is
 * also present, appends the tagged lemma to {@code taggedLemmas}.
 * Nodes without a "c5" attribute are skipped entirely.
 */
private void getTagAndHeadWord(Node node, String w, ArrayList<TaggedWord> taggedWords,
        ArrayList<TaggedWord> taggedLemmas) {
    Node c5Attribute = node.getAttributes().getNamedItem("c5");
    if (c5Attribute == null) {
        return;
    }
    String tag = c5Attribute.getNodeValue();
    taggedWords.add(new TaggedWord(w.trim(), tag));
    Node headWordAttribute = node.getAttributes().getNamedItem("hw");
    if (headWordAttribute != null) {
        String hw = headWordAttribute.getNodeValue();
        taggedLemmas.add(new TaggedWord(hw.trim(), tag));
    }
}