List of usage examples for edu.stanford.nlp.ling TaggedWord TaggedWord
public TaggedWord(Label word, Label tag)
TaggedWord. From source file: at.illecker.storm.commons.preprocessor.Preprocessor.java
License:Apache License
/**
 * Recursively drains the token queue, normalising one token per call and
 * appending the result(s) to {@code processedTokens}.
 *
 * Each token passes through up to five ordered steps: (1) emoticon
 * unification, (2) slang correction, (3) removal of letter-dot patterns
 * (L.O.V.E), (4) restoring a trailing "g" on gerunds (goin -> going),
 * (5) collapsing elongated characters (suuuper) followed by a second
 * slang-correction attempt. The first step that handles the token recurses
 * immediately; otherwise the (possibly rewritten) token falls through to the
 * final add-and-recurse at the bottom.
 *
 * NOTE(review): T is either String (pretag == false) or TaggedWord
 * (pretag == true); the unchecked casts rely on the caller passing a list of
 * the matching element type — confirm at call sites.
 *
 * @param tokens          queue of raw tokens; consumed destructively
 * @param pretag          if true, wrap results as TaggedWord via pretagToken
 * @param processedTokens accumulator, also the return value
 * @return the accumulator once the queue is empty
 */
@SuppressWarnings("unchecked") private <T> List<T> preprocessAccumulator(LinkedList<String> tokens, boolean pretag, List<T> processedTokens) {
    if (tokens.isEmpty()) {
        return processedTokens;
    } else {
        // remove token from queue
        String token = tokens.removeFirst();

        // identify token (sic: "consitsOfPunctuations" is the helper's actual name)
        boolean tokenContainsPunctuation = StringUtils.consitsOfPunctuations(token);
        boolean tokenIsEmoticon = StringUtils.isEmoticon(token);
        boolean tokenIsURL = StringUtils.isURL(token);
        boolean tokenIsNumeric = StringUtils.isNumeric(token);

        // Step 1) Unify Emoticons: remove repeating chars, e.g., ":))))" -> ":)"
        if ((tokenIsEmoticon) && (!tokenIsURL) && (!tokenIsNumeric)) {
            Matcher m = RegexUtils.TWO_OR_MORE_REPEATING_CHARS_PATTERN.matcher(token);
            if (m.find()) {
                boolean isSpecialEmoticon = m.group(1).equals("^");
                String reducedToken = m.replaceAll("$1");
                if (isSpecialEmoticon) { // keep ^^ (would otherwise collapse to a single ^)
                    reducedToken += "^";
                }
                // else {
                // TODO
                // Preprocess token again if there are recursive patterns in it
                // e.g., :):):) -> :):) -> :) Not possible because of Tokenizer
                // tokens.add(0, reducedToken);
                // }
                if (LOGGING) {
                    LOG.info("Unify Emoticon from '" + token + "' to '" + reducedToken + "'");
                }
                if (pretag) {
                    // emoticons are tagged as interjections (UH) directly
                    processedTokens.add((T) new TaggedWord(reducedToken, "UH"));
                } else {
                    processedTokens.add((T) reducedToken);
                }
                return preprocessAccumulator(tokens, pretag, processedTokens);
            }
        } else if (tokenContainsPunctuation) {
            // If token is no Emoticon then there is no further
            // preprocessing for punctuations
            if (pretag) {
                // single-arg TaggedWord: word without a tag (tagger fills it in later)
                processedTokens.add((T) new TaggedWord(token));
            } else {
                processedTokens.add((T) token);
            }
            return preprocessAccumulator(tokens, pretag, processedTokens);
        }

        // identify token further (only needed past Step 1)
        boolean tokenIsUser = StringUtils.isUser(token);
        boolean tokenIsHashTag = StringUtils.isHashTag(token);
        boolean tokenIsSlang = StringUtils.isSlang(token);
        boolean tokenIsEmail = StringUtils.isEmail(token);
        boolean tokenIsPhone = StringUtils.isPhone(token);
        boolean tokenIsSpecialNumeric = StringUtils.isSpecialNumeric(token);
        boolean tokenIsSeparatedNumeric = StringUtils.isSeparatedNumeric(token);

        // Step 2) Slang Correction
        // TODO prevent slang correction if all UPPERCASE
        // 'FC' to [fruit, cake]
        // 'Ajax' to [Asynchronous, Javascript, and, XML]
        // 'TL' to [dr too, long, didn't, read]
        // S.O.L - SOL - [s**t, outta, luck]
        // 'AC/DC' to 'AC' and 'DC' - 'DC' to [don't, care]
        // TODO update dictionary O/U O/A
        if ((!tokenIsEmoticon) && (!tokenIsUser) && (!tokenIsHashTag) && (!tokenIsURL) && (!tokenIsNumeric)
                && (!tokenIsSpecialNumeric) && (!tokenIsSeparatedNumeric) && (!tokenIsEmail) && (!tokenIsPhone)) {
            String[] slangCorrection = m_slangCorrection.getCorrection(token.toLowerCase());
            if (slangCorrection != null) {
                // a slang entry may expand to several words; add each one
                for (int i = 0; i < slangCorrection.length; i++) {
                    if (pretag) {
                        // PreTagging for POS Tagger
                        TaggedWord preTaggedToken = pretagToken(slangCorrection[i], tokenIsHashTag, tokenIsUser,
                                tokenIsURL);
                        processedTokens.add((T) preTaggedToken);
                    } else {
                        processedTokens.add((T) slangCorrection[i]);
                    }
                }
                if (LOGGING) {
                    LOG.info("Slang Correction from '" + token + "' to " + Arrays.toString(slangCorrection));
                }
                return preprocessAccumulator(tokens, pretag, processedTokens);
            } else if (tokenIsSlang) {
                // special case: "w/xyz" -> "with" + "xyz"
                if (token.startsWith("w/")) {
                    if (pretag) {
                        processedTokens.add((T) new TaggedWord("with"));
                        // PreTagging for POS Tagger
                        TaggedWord preTaggedToken = pretagToken(token.substring(2), tokenIsHashTag, tokenIsUser,
                                tokenIsURL);
                        processedTokens.add((T) preTaggedToken);
                    } else {
                        processedTokens.add((T) "with");
                        processedTokens.add((T) token.substring(2));
                    }
                    if (LOGGING) {
                        LOG.info("Slang Correction from '" + token + "' to " + "[with, " + token.substring(2) + "]");
                    }
                    return preprocessAccumulator(tokens, pretag, processedTokens);
                } else {
                    if (LOGGING) {
                        LOG.info("Slang Correction might be missing for '" + token + "'");
                    }
                }
            }
        }

        // Step 3) Check if there are punctuations between words
        // e.g., L.O.V.E
        if ((!tokenIsEmoticon) && (!tokenIsUser) && (!tokenIsHashTag) && (!tokenIsURL) && (!tokenIsNumeric)
                && (!tokenIsSpecialNumeric) && (!tokenIsSeparatedNumeric) && (!tokenIsEmail) && (!tokenIsPhone)) {
            // remove alternating letter dot pattern e.g., L.O.V.E
            Matcher m = RegexUtils.ALTERNATING_LETTER_DOT_PATTERN.matcher(token);
            if (m.matches()) {
                String newToken = token.replaceAll("\\.", "");
                // only accept the dotless form if it is a real dictionary word
                if (m_wordnet.contains(newToken)) {
                    if (LOGGING) {
                        LOG.info("Remove punctuations in word from '" + token + "' to '" + newToken + "'");
                    }
                    token = newToken;
                    if (pretag) {
                        // PreTagging for POS Tagger
                        TaggedWord preTaggedToken = pretagToken(token, tokenIsHashTag, tokenIsUser, tokenIsURL);
                        processedTokens.add((T) preTaggedToken);
                    } else {
                        processedTokens.add((T) token);
                    }
                    return preprocessAccumulator(tokens, pretag, processedTokens);
                }
            }
        }

        // Step 4) Add missing g in gerund forms e.g., goin
        if ((!tokenIsUser) && (!tokenIsHashTag) && (!tokenIsURL) && (token.endsWith("in"))
                && (!m_firstNames.isFirstName(token)) && (!m_wordnet.contains(token.toLowerCase()))) {
            // append "g" if a word ends with "in" and is not in the vocabulary
            if (LOGGING) {
                LOG.info("Add missing \"g\" from '" + token + "' to '" + token + "g'");
            }
            token = token + "g";
            if (pretag) {
                // PreTagging for POS Tagger, because it could be a interjection
                TaggedWord preTaggedToken = pretagToken(token, tokenIsHashTag, tokenIsUser, tokenIsURL);
                processedTokens.add((T) preTaggedToken);
            } else {
                processedTokens.add((T) token);
            }
            return preprocessAccumulator(tokens, pretag, processedTokens);
        }

        // Step 5) Remove elongations of characters (suuuper)
        // 'lollll' to 'loll' because 'loll' is found in dict
        // TODO 'AHHHHH' to 'AH'
        if ((!tokenIsEmoticon) && (!tokenIsUser) && (!tokenIsHashTag) && (!tokenIsURL) && (!tokenIsNumeric)
                && (!tokenIsSpecialNumeric) && (!tokenIsSeparatedNumeric) && (!tokenIsEmail) && (!tokenIsPhone)) {
            // remove repeating chars
            token = removeRepeatingChars(token);

            // Step 5b) Try Slang Correction again (the de-elongated form may match now)
            String[] slangCorrection = m_slangCorrection.getCorrection(token.toLowerCase());
            if (slangCorrection != null) {
                for (int i = 0; i < slangCorrection.length; i++) {
                    if (pretag) {
                        // PreTagging for POS Tagger
                        TaggedWord preTaggedToken = pretagToken(slangCorrection[i], tokenIsHashTag, tokenIsUser,
                                tokenIsURL);
                        processedTokens.add((T) preTaggedToken);
                    } else {
                        processedTokens.add((T) slangCorrection[i]);
                    }
                }
                if (LOGGING) {
                    LOG.info("Slang Correction from '" + token + "' to " + Arrays.toString(slangCorrection));
                }
                return preprocessAccumulator(tokens, pretag, processedTokens);
            }
        }

        // no step consumed the token: add it (possibly rewritten) to processed list
        if (pretag) {
            // PreTagging for POS Tagger
            TaggedWord preTaggedToken = pretagToken(token, tokenIsHashTag, tokenIsUser, tokenIsURL);
            processedTokens.add((T) preTaggedToken);
        } else {
            processedTokens.add((T) token);
        }
        return preprocessAccumulator(tokens, pretag, processedTokens);
    }
}
From source file:cc.clabs.stratosphere.mlp.types.PactWord.java
License:BEER-WARE LICENSE
/** * Returns this PactWord as a TaggedWord from the Stanford * NLP Project (@see edu.stanford.nlp.ling.TaggedWord). * /*from ww w . ja v a2 s . co m*/ * @return a TaggedWord */ public TaggedWord getTaggedWord() { return new TaggedWord(word.getValue(), tag.getValue()); }
From source file:edu.umn.biomedicus.gpl.stanford.parser.StanfordConstituencyParserModel.java
License:Open Source License
/**
 * Runs the shift-reduce constituency parser over one sentence and emits the
 * resulting parse as a Penn-Treebank-printed ConstituencyParse label.
 *
 * @param sentenceLabel            span of the sentence to parse
 * @param parseTokenLabelIndex     index of tokens; those inside the sentence are used
 * @param partOfSpeechLabelIndex   index supplying one POS tag per token position
 * @param constituencyParseLabeler sink that receives the finished parse
 */
public void parseSentence(TextRange sentenceLabel, LabelIndex<ParseToken> parseTokenLabelIndex,
        LabelIndex<PosTag> partOfSpeechLabelIndex, Labeler<ConstituencyParse> constituencyParseLabeler) {
    // Pair each in-sentence token with the POS tag found at its location.
    List<TaggedWord> taggedWords = new ArrayList<>();
    for (ParseToken tokenLabel : parseTokenLabelIndex.inside(sentenceLabel)) {
        PartOfSpeech partOfSpeech = partOfSpeechLabelIndex.firstAtLocation(tokenLabel).getPartOfSpeech();
        taggedWords.add(new TaggedWord(tokenLabel.getText(), PartsOfSpeech.tagForPartOfSpeech(partOfSpeech)));
    }
    // Parse, then render the tree in Penn Treebank notation.
    Tree parseTree = shiftReduceParser.apply(taggedWords);
    StringWriter treeWriter = new StringWriter();
    parseTree.pennPrint(new PrintWriter(treeWriter));
    constituencyParseLabeler.add(new ConstituencyParse(sentenceLabel, treeWriter.toString()));
}
From source file:edu.umn.biomedicus.gpl.stanford.parser.StanfordDependencyParserModel.java
License:Open Source License
public GrammaticalStructure parseToGrammaticalStructure(List<ParseToken> tokens, List<PosTag> posTags) { int size = tokens.size(); List<TaggedWord> taggedWordList = new ArrayList<>(size); for (int i = 0; i < size; i++) { ParseToken parseToken = tokens.get(i); PosTag posTag = posTags.get(i);//from w ww.j a v a 2s . co m TaggedWord taggedWord = new TaggedWord(parseToken.getText(), PartsOfSpeech.tagForPartOfSpeech(posTag.getPartOfSpeech())); taggedWordList.add(taggedWord); } return parser.predict(taggedWordList); }
From source file:gate.stanford.StanfordSentence.java
License:Open Source License
public StanfordSentence(Annotation sentence, String tokenType, AnnotationSet inputAS, boolean usePosTags) { startPosToOffset = new HashMap<Integer, Long>(); endPosToOffset = new HashMap<Integer, Long>(); startPosToToken = new HashMap<Integer, Annotation>(); startPosToString = new HashMap<Integer, String>(); sentenceStartOffset = sentence.getStartNode().getOffset(); sentenceEndOffset = sentence.getEndNode().getOffset(); nbrOfTokens = 0;//from w ww . j a va2 s .c om nbrOfMissingPosTags = 0; tokens = Utils.inDocumentOrder(inputAS.getContained(sentenceStartOffset, sentenceEndOffset).get(tokenType)); words = new ArrayList<Word>(); add(-1, sentence, "S"); int tokenNo = 0; for (Annotation token : tokens) { String tokenString = escapeToken(token.getFeatures().get(STRING_FEATURE).toString()); add(tokenNo, token, tokenString); /* The FAQ says the parser will automatically use existing POS tags * if the List elements are of type TaggedWord. * http://nlp.stanford.edu/software/parser-faq.shtml#f */ if (usePosTags) { words.add(new TaggedWord(tokenString, getEscapedPosTag(token))); } else { words.add(new Word(tokenString)); } tokenNo++; } nbrOfTokens = tokenNo; }
From source file:gate.stanford.Tagger.java
License:Open Source License
/**
 * POS-tags every token in the document, sentence by sentence, writing the
 * resulting category onto each Token annotation.
 *
 * Requires sentence and token annotations to already exist in the input
 * annotation set. Tokens are consumed in offset order; after the last
 * sentence, any leftover tokens are tagged as one trailing pseudo-sentence
 * (only when posTagAllTokens is set).
 *
 * @throws ExecutionException if required parameters are missing, if no
 *         sentences/tokens exist (and failOnMissingInputAnnotations is set),
 *         or if the tagger returns a different number of results than tokens
 */
@Override public void execute() throws ExecutionException {
    // check the parameters
    if (document == null)
        throw new ExecutionException("No document to process!");
    AnnotationSet inputAS = document.getAnnotations(inputASName);
    if (baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Token Annotation Type provided!");
    }
    if (baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Sentence Annotation Type provided!");
    }
    if (outputAnnotationType == null || outputAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No AnnotationType provided to store the new feature!");
    }
    AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
    AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
    if (sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null && tokensAS.size() > 0) {
        long startTime = System.currentTimeMillis();
        fireStatusChanged("POS tagging " + document.getName());
        fireProgressChanged(0);
        // prepare the input for MaxentTagger
        List<Word> sentenceForTagger = new ArrayList<Word>();
        // define a comparator for annotations by start offset
        OffsetComparator offsetComparator = new OffsetComparator();
        // read all the tokens and all the sentences, sorted by document order
        List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS);
        Collections.sort(sentencesList, offsetComparator);
        List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS);
        Collections.sort(tokensList, offsetComparator);
        Iterator<Annotation> sentencesIter = sentencesList.iterator();
        ListIterator<Annotation> tokensIter = tokensList.listIterator();
        List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>();
        // tokensAS.size() > 0 guarantees this first next() succeeds
        Annotation currentToken = tokensIter.next();
        int sentIndex = 0;
        int sentCnt = sentencesAS.size();
        while (sentencesIter.hasNext()) {
            Annotation currentSentence = sentencesIter.next();
            tokensInCurrentSentence.clear();
            sentenceForTagger.clear();
            // gather every token that ends at or before this sentence's end
            while (currentToken != null && currentToken.getEndNode().getOffset()
                    .compareTo(currentSentence.getEndNode().getOffset()) <= 0) {
                // If we're only POS tagging Tokens within baseSentenceAnnotationType,
                // don't add the sentence if the Tokens aren't within the span of
                // baseSentenceAnnotationType
                if (posTagAllTokens || currentToken.withinSpanOf(currentSentence)) {
                    tokensInCurrentSentence.add(currentToken);
                    // reuse an existing category feature when configured to do so
                    if (useExistingTags && currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) {
                        sentenceForTagger.add(new TaggedWord(
                                (String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME),
                                (String) currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME)));
                    } else {
                        sentenceForTagger.add(
                                new Word((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME)));
                    }
                }
                currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
            }
            // if the sentence doesn't contain any tokens (which is a bit weird but
            // is possible) then don't try running the POS tagger as you will get an
            // array index out of bounds exception
            if (sentenceForTagger.isEmpty())
                continue;
            // run the POS tagger
            ArrayList<TaggedWord> taggerResults = tagger.tagSentence(sentenceForTagger, useExistingTags);
            // add the results
            // make sure no malfunction occurred
            if (taggerResults.size() != tokensInCurrentSentence.size())
                throw new ExecutionException("POS Tagger malfunction: the output size (" + taggerResults.size()
                        + ") is different from the input size (" + tokensInCurrentSentence.size() + ")!");
            // walk results and tokens in lockstep, copying each tag onto its token
            Iterator<TaggedWord> resIter = taggerResults.iterator();
            Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();
            while (resIter.hasNext()) {
                Annotation annot = tokIter.next();
                addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String) resIter.next().tag()));
            }
            fireProgressChanged(sentIndex++ * 100 / sentCnt);
        } // while(sentencesIter.hasNext())
        if (currentToken != null && posTagAllTokens) {
            // Tag remaining Tokens if we are not considering those only within
            // baseSentenceAnnotationType
            // we have remaining tokens after the last sentence
            tokensInCurrentSentence.clear();
            sentenceForTagger.clear();
            while (currentToken != null) {
                tokensInCurrentSentence.add(currentToken);
                if (useExistingTags && currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) {
                    sentenceForTagger.add(
                            new TaggedWord((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME),
                                    (String) currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME)));
                } else {
                    sentenceForTagger
                            .add(new Word((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME)));
                }
                currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
            }
            // run the POS tagger on remaining tokens
            List<TaggedWord> taggerResults = tagger.tagSentence(sentenceForTagger, useExistingTags);
            // add the results and make sure no malfunction occurred
            if (taggerResults.size() != tokensInCurrentSentence.size())
                throw new ExecutionException("POS Tagger malfunction: the output size (" + taggerResults.size()
                        + ") is different from the input size (" + tokensInCurrentSentence.size() + ")!");
            Iterator<TaggedWord> resIter = taggerResults.iterator();
            Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();
            while (resIter.hasNext()) {
                Annotation annot = tokIter.next();
                addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String) resIter.next().tag()));
            }
        } // if(currentToken != null)
        fireProcessFinished();
        fireStatusChanged(document.getName() + " tagged in "
                + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000)
                + " seconds!");
    } else {
        if (failOnMissingInputAnnotations) {
            throw new ExecutionException("No sentences or tokens to process in document " + document.getName()
                    + "\n" + "Please run a sentence splitter " + "and tokeniser first!");
        } else {
            Utils.logOnce(logger, Level.INFO,
                    "POS tagger: no sentence or token annotations in input document - see debug log for details.");
            logger.debug("No input annotations in document " + document.getName());
        }
    }
}
From source file:gov.llnl.ontology.text.parse.StanfordParser.java
License:Open Source License
/** * {@inheritDoc}/* w w w . j av a2 s . c o m*/ */ public DependencyTreeNode[] parseText(String header, StringPair[] sentence) { List<HasWord> tokens = Lists.newArrayList(); for (StringPair word : sentence) if (word.x != null && word.y != null) tokens.add(new TaggedWord(word.x, word.y)); return parseTokens(header, tokens).toArray(new DependencyTreeNode[0]); }
From source file:it.cnr.jatecs.nlp.patterns.Parser.java
License:Open Source License
/** * L'idea che un inciso sia una frase che inizia con un pronome e sta tra due , *///w w w. j a v a2s . c om private Vector<ArrayList<TaggedWord>> findAppositives(ArrayList<TaggedWord> sentence) { boolean foundFirst = false; ArrayList<TaggedWord> mainSentence = new ArrayList<TaggedWord>(); ArrayList<TaggedWord> temp = new ArrayList<TaggedWord>(); Vector<ArrayList<TaggedWord>> sentences = new Vector<ArrayList<TaggedWord>>(); for (Iterator<TaggedWord> iterator = sentence.iterator(); iterator.hasNext();) { TaggedWord taggedWord = (TaggedWord) iterator.next(); //Trattamento speciale delle foreign word if (taggedWord.tag().startsWith("FW")) taggedWord.setTag("NN"); if (foundFirst) { if (taggedWord.tag().equals(",")) { foundFirst = false; sentences.add(temp); temp = new ArrayList<TaggedWord>(); } else temp.add(taggedWord); } else if (taggedWord.tag().equals(",") && iterator.hasNext()) { taggedWord = (TaggedWord) iterator.next(); if (taggedWord.tag().startsWith("W") || taggedWord.tag().startsWith("PRP")) { foundFirst = true; temp.add(taggedWord); } else { mainSentence.add(new TaggedWord(",", ",")); mainSentence.add(taggedWord); } } else mainSentence.add(taggedWord); } if (foundFirst) { mainSentence.add(new TaggedWord(",", ",")); mainSentence.addAll(temp); } sentences.add(mainSentence); return sentences; }
From source file:org.ets.research.nlp.stanford_thrift.general.CoreNLPThriftUtil.java
License:Open Source License
public static List<TaggedWord> convertTaggedTokensToTaggedWords(List<TaggedToken> taggedSentence) { List<TaggedWord> taggedSentenceWords = new ArrayList<TaggedWord>(); for (TaggedToken tt : taggedSentence) { taggedSentenceWords.add(new TaggedWord(tt.token, tt.tag)); }//from w w w. j a v a 2 s .co m return taggedSentenceWords; }
From source file:qmul.corpus.BNCCorpus.java
License:Open Source License
/**
 * Extracts the C5 tag (and, when present, the head word) from a BNC XML
 * word node, appending a TaggedWord for the surface form to
 * {@code taggedWords} and one for the lemma to {@code taggedLemmas}.
 * Nodes without a "c5" attribute are ignored.
 *
 * @param node         XML node carrying "c5" and optionally "hw" attributes
 * @param w            surface form of the word (trimmed before use)
 * @param taggedWords  receives the (word, c5-tag) pair
 * @param taggedLemmas receives the (head-word, c5-tag) pair when "hw" exists
 */
private void getTagAndHeadWord(Node node, String w, ArrayList<TaggedWord> taggedWords,
        ArrayList<TaggedWord> taggedLemmas) {
    Node c5Attribute = node.getAttributes().getNamedItem("c5");
    if (c5Attribute == null) {
        return; // no C5 tag: nothing to record
    }
    String tag = c5Attribute.getNodeValue();
    taggedWords.add(new TaggedWord(w.trim(), tag));
    Node headWordAttribute = node.getAttributes().getNamedItem("hw");
    if (headWordAttribute != null) {
        taggedLemmas.add(new TaggedWord(headWordAttribute.getNodeValue().trim(), tag));
    }
}