List of usage examples for the edu.stanford.nlp.ling.Word class
Constructor: public Word(Label lab)
From source file:DependencyParse.java
License:Apache License
public static void main(String[] args) throws Exception { Properties props = StringUtils.argsToProperties(args); if (!props.containsKey("tokpath") || !props.containsKey("parentpath") || !props.containsKey("relpath")) { System.err.println(//from w w w .java2 s . c om "usage: java DependencyParse -tokenize - -tokpath <tokpath> -parentpath <parentpath> -relpath <relpath>"); System.exit(1); } boolean tokenize = false; if (props.containsKey("tokenize")) { tokenize = true; } String tokPath = props.getProperty("tokpath"); String parentPath = props.getProperty("parentpath"); String relPath = props.getProperty("relpath"); BufferedWriter tokWriter = new BufferedWriter(new FileWriter(tokPath)); BufferedWriter parentWriter = new BufferedWriter(new FileWriter(parentPath)); BufferedWriter relWriter = new BufferedWriter(new FileWriter(relPath)); MaxentTagger tagger = new MaxentTagger(TAGGER_MODEL); DependencyParser parser = DependencyParser.loadFromModelFile(PARSER_MODEL); Scanner stdin = new Scanner(System.in); int count = 0; long start = System.currentTimeMillis(); while (stdin.hasNextLine()) { String line = stdin.nextLine(); List<HasWord> tokens = new ArrayList<>(); if (tokenize) { PTBTokenizer<Word> tokenizer = new PTBTokenizer(new StringReader(line), new WordTokenFactory(), ""); for (Word label; tokenizer.hasNext();) { tokens.add(tokenizer.next()); } } else { for (String word : line.split(" ")) { tokens.add(new Word(word)); } } List<TaggedWord> tagged = tagger.tagSentence(tokens); int len = tagged.size(); Collection<TypedDependency> tdl = parser.predict(tagged).typedDependencies(); int[] parents = new int[len]; for (int i = 0; i < len; i++) { // if a node has a parent of -1 at the end of parsing, then the node // has no parent. 
parents[i] = -1; } String[] relns = new String[len]; for (TypedDependency td : tdl) { // let root have index 0 int child = td.dep().index(); int parent = td.gov().index(); relns[child - 1] = td.reln().toString(); parents[child - 1] = parent; } // print tokens StringBuilder sb = new StringBuilder(); for (int i = 0; i < len - 1; i++) { if (tokenize) { sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word())); } else { sb.append(tokens.get(i).word()); } sb.append(' '); } if (tokenize) { sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word())); } else { sb.append(tokens.get(len - 1).word()); } sb.append('\n'); tokWriter.write(sb.toString()); // print parent pointers sb = new StringBuilder(); for (int i = 0; i < len - 1; i++) { sb.append(parents[i]); sb.append(' '); } sb.append(parents[len - 1]); sb.append('\n'); parentWriter.write(sb.toString()); // print relations sb = new StringBuilder(); for (int i = 0; i < len - 1; i++) { sb.append(relns[i]); sb.append(' '); } sb.append(relns[len - 1]); sb.append('\n'); relWriter.write(sb.toString()); count++; if (count % 1000 == 0) { double elapsed = (System.currentTimeMillis() - start) / 1000.0; System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed); } } long totalTimeMillis = System.currentTimeMillis() - start; System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n", count, totalTimeMillis / 1000.0, totalTimeMillis / (double) count); tokWriter.close(); parentWriter.close(); relWriter.close(); }
From source file:ConstituencyParse.java
License:Apache License
public List<HasWord> sentenceToTokens(String line) { List<HasWord> tokens = new ArrayList<>(); if (tokenize) { PTBTokenizer<Word> tokenizer = new PTBTokenizer(new StringReader(line), new WordTokenFactory(), ""); for (Word label; tokenizer.hasNext();) { tokens.add(tokenizer.next()); }// www . ja va2s . co m } else { for (String word : line.split(" ")) { tokens.add(new Word(word)); } } return tokens; }
From source file:com.daemon.sentiment.FeatureMatrix.java
License:Open Source License
/** * POS tagging features//from w w w .j av a 2 s. co m * * Words are tagged with their respective part-of-speech tag as determined * by the Stanford parser * * @param tokens * Tokenized text of the tweet * @param tokensPOSTagged * Tokenized text of the tweet, possibly with negations from the * previous step * @return Reference to the second parameter, which now has POS annotations, * e.g. "love $NN$" */ private List<String> addPOSTags(List<String> tokens, List<String> tokensPOSTagged) { Tree stanfordTree; // Parser needs the tokens-list in a HasWord format List<HasWord> sentence = new ArrayList<HasWord>(); for (String token : tokens) { sentence.add(new Word(token)); } // Parse the sentence stanfordTree = lexicalizedParser.apply(sentence); // add results (POS tags) in tokensPOSTagged-list int i = 0; for (CoreLabel label : stanfordTree.taggedLabeledYield()) { tokensPOSTagged.set(i, tokensPOSTagged.get(i) + " $" + label.toString("value") + "$"); i++; } return tokensPOSTagged; }
From source file:edu.iastate.airl.semtus.parser.Parser.java
License:Open Source License
/** * Get morphology base/* w ww . jav a 2 s.co m*/ * * @param thisString * string * @return morphology base */ static public String morphology(String thisString) { Word thisWord = new Word(thisString); return morphology(thisWord); }
From source file:edu.iastate.airl.semtus.processor.InputProcessor.java
License:Open Source License
public static ArrayList<Sentence<Word>> getSentences(final String[] params) { String processedInput = params[1].replaceAll("[^a-zA-Z0-9.?&\t\n\r\b:; ]", ""); processedInput = processedInput.replaceAll("&", "and"); processedInput = processedInput.replaceAll("[:;]", "."); processedInput = processedInput.replaceAll("\t\n\r\b", " "); StringTokenizer tokenizer = new StringTokenizer(processedInput, ".?"); ArrayList<Sentence<Word>> sentenceList = new ArrayList<Sentence<Word>>(); String sentence;// w ww . j av a2s.c om while (tokenizer.hasMoreTokens()) { sentence = tokenizer.nextToken(); if (sentence == null || sentence.trim().equals("") == true) { continue; } StringTokenizer wordTokenizer = new StringTokenizer(sentence, " "); Sentence<Word> sent = new Sentence<Word>(); while (wordTokenizer.hasMoreTokens()) { sent.add(new Word(wordTokenizer.nextToken())); } sentenceList.add(sent); } return sentenceList; }
From source file:edu.washington.phrasal.feature.SentenceIdPhrasalVerbId.java
private List<String> tagWordTokens(List<String> tokens) { List<HasWord> hw = tokens.stream().map((t) -> new Word(t)).collect(Collectors.toList()); /* convert tag back to string */ return fg.getTagger().apply(hw).stream().map((pos) -> pos.tag()).collect(Collectors.toList()); }
From source file:elkfed.mmax.pipeline.StanfordParser.java
License:Apache License
/** Adds a parse tree to forest for each sentence in the document */ protected void annotateDocument() { String[][] sentences = null;/*w ww . jav a 2 s .c o m*/ try { sentences = DiscourseUtils.getSentenceTokens(currentDocument); } catch (Exception mmax2e) { mmax2e.printStackTrace(); } for (int sentence = 0; sentence < sentences.length; sentence++) { List<Word> words = new ArrayList<Word>(); String[] tempSent = new String[sentences[sentence].length]; int i = 0; for (String tok : sentences[sentence]) { String s = tok.replaceAll("\\(", "-LRB-"); s = s.replaceAll("\\)", "-RRB-"); words.add(new Word(s)); } Tree parse = (Tree) lp.apply(words); forest.add(normalizeTree(parse)); } }
From source file:fyp_backend.Stemmer.java
/** * Stems <code>w</code> and returns stemmed <code>Word</code>. */ public Word stem(Word w) { return (new Word(stem(w.word()))); }
From source file:gate.stanford.StanfordSentence.java
License:Open Source License
public StanfordSentence(Annotation sentence, String tokenType, AnnotationSet inputAS, boolean usePosTags) { startPosToOffset = new HashMap<Integer, Long>(); endPosToOffset = new HashMap<Integer, Long>(); startPosToToken = new HashMap<Integer, Annotation>(); startPosToString = new HashMap<Integer, String>(); sentenceStartOffset = sentence.getStartNode().getOffset(); sentenceEndOffset = sentence.getEndNode().getOffset(); nbrOfTokens = 0;// ww w . ja va2 s . co m nbrOfMissingPosTags = 0; tokens = Utils.inDocumentOrder(inputAS.getContained(sentenceStartOffset, sentenceEndOffset).get(tokenType)); words = new ArrayList<Word>(); add(-1, sentence, "S"); int tokenNo = 0; for (Annotation token : tokens) { String tokenString = escapeToken(token.getFeatures().get(STRING_FEATURE).toString()); add(tokenNo, token, tokenString); /* The FAQ says the parser will automatically use existing POS tags * if the List elements are of type TaggedWord. * http://nlp.stanford.edu/software/parser-faq.shtml#f */ if (usePosTags) { words.add(new TaggedWord(tokenString, getEscapedPosTag(token))); } else { words.add(new Word(tokenString)); } tokenNo++; } nbrOfTokens = tokenNo; }
From source file:gate.stanford.Tagger.java
License:Open Source License
@Override public void execute() throws ExecutionException { // check the parameters if (document == null) throw new ExecutionException("No document to process!"); AnnotationSet inputAS = document.getAnnotations(inputASName); if (baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length() == 0) { throw new ExecutionException("No base Token Annotation Type provided!"); }//from w w w .j a v a 2 s . c o m if (baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length() == 0) { throw new ExecutionException("No base Sentence Annotation Type provided!"); } if (outputAnnotationType == null || outputAnnotationType.trim().length() == 0) { throw new ExecutionException("No AnnotationType provided to store the new feature!"); } AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType); AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType); if (sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null && tokensAS.size() > 0) { long startTime = System.currentTimeMillis(); fireStatusChanged("POS tagging " + document.getName()); fireProgressChanged(0); // prepare the input for MaxentTagger List<Word> sentenceForTagger = new ArrayList<Word>(); // define a comparator for annotations by start offset OffsetComparator offsetComparator = new OffsetComparator(); // read all the tokens and all the sentences List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS); Collections.sort(sentencesList, offsetComparator); List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS); Collections.sort(tokensList, offsetComparator); Iterator<Annotation> sentencesIter = sentencesList.iterator(); ListIterator<Annotation> tokensIter = tokensList.listIterator(); List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>(); Annotation currentToken = tokensIter.next(); int sentIndex = 0; int sentCnt = sentencesAS.size(); while (sentencesIter.hasNext()) { Annotation currentSentence = sentencesIter.next(); 
tokensInCurrentSentence.clear(); sentenceForTagger.clear(); while (currentToken != null && currentToken.getEndNode().getOffset() .compareTo(currentSentence.getEndNode().getOffset()) <= 0) { // If we're only POS tagging Tokens within baseSentenceAnnotationType, // don't add the sentence if the Tokens aren't within the span of // baseSentenceAnnotationType if (posTagAllTokens || currentToken.withinSpanOf(currentSentence)) { tokensInCurrentSentence.add(currentToken); if (useExistingTags && currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) { sentenceForTagger.add(new TaggedWord( (String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME), (String) currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME))); } else { sentenceForTagger.add( new Word((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME))); } } currentToken = (tokensIter.hasNext() ? tokensIter.next() : null); } // if the sentence doesn't contain any tokens (which is a bit weird but // is possible) then don't try running the POS tagger as you will get an // array index out of bounds exception if (sentenceForTagger.isEmpty()) continue; // run the POS tagger ArrayList<TaggedWord> taggerResults = tagger.tagSentence(sentenceForTagger, useExistingTags); // add the results // make sure no malfunction occurred if (taggerResults.size() != tokensInCurrentSentence.size()) throw new ExecutionException("POS Tagger malfunction: the output size (" + taggerResults.size() + ") is different from the input size (" + tokensInCurrentSentence.size() + ")!"); Iterator<TaggedWord> resIter = taggerResults.iterator(); Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator(); while (resIter.hasNext()) { Annotation annot = tokIter.next(); addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String) resIter.next().tag())); } fireProgressChanged(sentIndex++ * 100 / sentCnt); } // while(sentencesIter.hasNext()) if (currentToken != null && posTagAllTokens) { // Tag remaining Tokens if we are 
not considering those only within // baseSentenceAnnotationType // we have remaining tokens after the last sentence tokensInCurrentSentence.clear(); sentenceForTagger.clear(); while (currentToken != null) { tokensInCurrentSentence.add(currentToken); if (useExistingTags && currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) { sentenceForTagger.add( new TaggedWord((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME), (String) currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME))); } else { sentenceForTagger .add(new Word((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME))); } currentToken = (tokensIter.hasNext() ? tokensIter.next() : null); } // run the POS tagger on remaining tokens List<TaggedWord> taggerResults = tagger.tagSentence(sentenceForTagger, useExistingTags); // add the results and make sure no malfunction occurred if (taggerResults.size() != tokensInCurrentSentence.size()) throw new ExecutionException("POS Tagger malfunction: the output size (" + taggerResults.size() + ") is different from the input size (" + tokensInCurrentSentence.size() + ")!"); Iterator<TaggedWord> resIter = taggerResults.iterator(); Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator(); while (resIter.hasNext()) { Annotation annot = tokIter.next(); addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String) resIter.next().tag())); } } // if(currentToken != null) fireProcessFinished(); fireStatusChanged(document.getName() + " tagged in " + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000) + " seconds!"); } else { if (failOnMissingInputAnnotations) { throw new ExecutionException("No sentences or tokens to process in document " + document.getName() + "\n" + "Please run a sentence splitter " + "and tokeniser first!"); } else { Utils.logOnce(logger, Level.INFO, "POS tagger: no sentence or token annotations in input document - see debug log for details."); logger.debug("No input 
annotations in document " + document.getName()); } } }