Example usage for edu.stanford.nlp.ling Word Word

List of usage examples for edu.stanford.nlp.ling Word Word

Introduction

On this page you can find example usage for edu.stanford.nlp.ling Word Word.

Prototype

public Word(Label lab) 

Source Link

Document

Creates a new word whose word value is the value of any class that supports the Label interface.

Usage

From source file:DependencyParse.java

License:Apache License

/**
 * Reads one sentence per line from stdin, dependency-parses each, and writes
 * three line-aligned output files: the tokens, the parent index of each token
 * (0 denotes the root; -1 means no parent was assigned), and the dependency
 * relation label of each token.
 *
 * @param args command-line flags; requires -tokpath, -parentpath and -relpath,
 *             optionally -tokenize to run the PTB tokenizer on each line
 * @throws Exception if the tagger/parser models cannot be loaded or an
 *                   output file cannot be written
 */
public static void main(String[] args) throws Exception {
    Properties props = StringUtils.argsToProperties(args);
    if (!props.containsKey("tokpath") || !props.containsKey("parentpath") || !props.containsKey("relpath")) {
        System.err.println(
                "usage: java DependencyParse -tokenize - -tokpath <tokpath> -parentpath <parentpath> -relpath <relpath>");
        System.exit(1);
    }

    boolean tokenize = props.containsKey("tokenize");

    String tokPath = props.getProperty("tokpath");
    String parentPath = props.getProperty("parentpath");
    String relPath = props.getProperty("relpath");

    // try-with-resources: the original leaked all three writers (and the
    // Scanner) if anything below threw before the final close() calls.
    try (BufferedWriter tokWriter = new BufferedWriter(new FileWriter(tokPath));
            BufferedWriter parentWriter = new BufferedWriter(new FileWriter(parentPath));
            BufferedWriter relWriter = new BufferedWriter(new FileWriter(relPath));
            Scanner stdin = new Scanner(System.in)) {

        MaxentTagger tagger = new MaxentTagger(TAGGER_MODEL);
        DependencyParser parser = DependencyParser.loadFromModelFile(PARSER_MODEL);
        int count = 0;
        long start = System.currentTimeMillis();
        while (stdin.hasNextLine()) {
            String line = stdin.nextLine();
            List<HasWord> tokens = new ArrayList<>();
            if (tokenize) {
                // Typed tokenizer; the original used a raw type and declared
                // an unused loop variable ("Word label").
                PTBTokenizer<Word> tokenizer =
                        new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
                while (tokenizer.hasNext()) {
                    tokens.add(tokenizer.next());
                }
            } else {
                for (String word : line.split(" ")) {
                    tokens.add(new Word(word));
                }
            }

            // An input line that tokenizes to nothing used to crash later on
            // tokens.get(len - 1); emit empty rows instead so the three
            // output files stay line-aligned with the input.
            if (tokens.isEmpty()) {
                tokWriter.write("\n");
                parentWriter.write("\n");
                relWriter.write("\n");
                count++;
                continue;
            }

            List<TaggedWord> tagged = tagger.tagSentence(tokens);

            int len = tagged.size();
            Collection<TypedDependency> tdl = parser.predict(tagged).typedDependencies();

            // If a node still has parent -1 after parsing, it has no parent.
            int[] parents = new int[len];
            for (int i = 0; i < len; i++) {
                parents[i] = -1;
            }

            String[] relns = new String[len];
            for (TypedDependency td : tdl) {
                // Stanford dependency indices are 1-based; the root is 0.
                int child = td.dep().index();
                int parent = td.gov().index();
                relns[child - 1] = td.reln().toString();
                parents[child - 1] = parent;
            }

            // print tokens
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < len; i++) {
                if (i > 0) {
                    sb.append(' ');
                }
                String word = tokens.get(i).word();
                sb.append(tokenize ? PTBTokenizer.ptbToken2Text(word) : word);
            }
            sb.append('\n');
            tokWriter.write(sb.toString());

            // print parent pointers
            sb = new StringBuilder();
            for (int i = 0; i < len; i++) {
                if (i > 0) {
                    sb.append(' ');
                }
                sb.append(parents[i]);
            }
            sb.append('\n');
            parentWriter.write(sb.toString());

            // print relations
            sb = new StringBuilder();
            for (int i = 0; i < len; i++) {
                if (i > 0) {
                    sb.append(' ');
                }
                sb.append(relns[i]);
            }
            sb.append('\n');
            relWriter.write(sb.toString());

            count++;
            if (count % 1000 == 0) {
                double elapsed = (System.currentTimeMillis() - start) / 1000.0;
                System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed);
            }
        }

        long totalTimeMillis = System.currentTimeMillis() - start;
        // Math.max guards the per-line average against division by zero
        // when stdin was empty.
        System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n", count, totalTimeMillis / 1000.0,
                totalTimeMillis / (double) Math.max(count, 1));
    }
}

From source file:ConstituencyParse.java

License:Apache License

/**
 * Converts one line of text into a token list: either via the PTB tokenizer
 * (when the {@code tokenize} flag is set) or by splitting on single spaces.
 *
 * @param line raw sentence text
 * @return the tokens of {@code line}, each wrapped as a {@code Word}
 */
public List<HasWord> sentenceToTokens(String line) {
    List<HasWord> tokens = new ArrayList<>();
    if (tokenize) {
        // Typed tokenizer; the original used a raw type (unchecked warning)
        // and declared an unused loop variable ("Word label").
        PTBTokenizer<Word> tokenizer =
                new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
        while (tokenizer.hasNext()) {
            tokens.add(tokenizer.next());
        }
    } else {
        for (String word : line.split(" ")) {
            tokens.add(new Word(word));
        }
    }

    return tokens;
}

From source file:com.daemon.sentiment.FeatureMatrix.java

License:Open Source License

/**
 * POS tagging features//from   w  w w  .j  av  a  2 s.  co  m
 * 
 * Words are tagged with their respective part-of-speech tag as determined
 * by the Stanford parser
 * 
 * @param tokens
 *            Tokenized text of the tweet
 * @param tokensPOSTagged
 *            Tokenized text of the tweet, possibly with negations from the
 *            previous step
 * @return Reference to the second parameter, which now has POS annotations,
 *         e.g. "love $NN$"
 */
/**
 * Appends a part-of-speech annotation to every token, as determined by the
 * Stanford lexicalized parser, e.g. "love $NN$".
 *
 * @param tokens
 *            Tokenized text of the tweet
 * @param tokensPOSTagged
 *            Tokenized text of the tweet, possibly with negations from the
 *            previous step
 * @return Reference to the second parameter, which now has POS annotations
 */
private List<String> addPOSTags(List<String> tokens, List<String> tokensPOSTagged) {
    // The parser consumes HasWord instances, so wrap every raw token first.
    List<HasWord> parserInput = new ArrayList<HasWord>();
    for (String token : tokens) {
        parserInput.add(new Word(token));
    }

    // Parse the whole sentence in one call.
    Tree parseTree = lexicalizedParser.apply(parserInput);

    // Walk the tagged yield and append "$TAG$" to the matching entry.
    int position = 0;
    for (CoreLabel tagged : parseTree.taggedLabeledYield()) {
        tokensPOSTagged.set(position, tokensPOSTagged.get(position) + " $" + tagged.toString("value") + "$");
        position++;
    }

    return tokensPOSTagged;
}

From source file:edu.iastate.airl.semtus.parser.Parser.java

License:Open Source License

/**
 * Get morphology base/* w ww .  jav a 2 s.co m*/
 *
 * @param thisString
 *            string
 * @return morphology base
 */
/**
 * Returns the morphological base form of the given string.
 *
 * @param thisString
 *            surface string to reduce
 * @return morphology base
 */
static public String morphology(String thisString) {
    // Wrap the string as a Word and delegate to the Word-based overload.
    return morphology(new Word(thisString));
}

From source file:edu.iastate.airl.semtus.processor.InputProcessor.java

License:Open Source License

/**
 * Cleans raw input text and splits it into sentences of {@code Word}s.
 * Sentences are delimited by '.' and '?' (colons and semicolons are first
 * rewritten as periods); words are delimited by single spaces.
 *
 * @param params params[1] holds the raw input text
 * @return one {@code Sentence<Word>} per non-blank sentence found
 */
public static ArrayList<Sentence<Word>> getSentences(final String[] params) {

    // Keep only alphanumerics, basic punctuation, and whitespace/control chars.
    String processedInput = params[1].replaceAll("[^a-zA-Z0-9.?&\t\n\r\b:; ]", "");

    processedInput = processedInput.replaceAll("&", "and");

    // Treat colons and semicolons as sentence terminators.
    processedInput = processedInput.replaceAll("[:;]", ".");

    // BUG FIX: the original pattern "\t\n\r\b" only matched the literal
    // four-character sequence tab+newline+CR+backspace, which essentially
    // never occurs. A character class replaces each such character
    // individually with a space, as intended.
    processedInput = processedInput.replaceAll("[\t\n\r\b]", " ");

    StringTokenizer tokenizer = new StringTokenizer(processedInput, ".?");

    ArrayList<Sentence<Word>> sentenceList = new ArrayList<Sentence<Word>>();

    while (tokenizer.hasMoreTokens()) {

        String sentence = tokenizer.nextToken();

        // Skip blank sentences produced by consecutive delimiters.
        if (sentence == null || sentence.trim().isEmpty()) {
            continue;
        }

        StringTokenizer wordTokenizer = new StringTokenizer(sentence, " ");

        Sentence<Word> sent = new Sentence<Word>();

        while (wordTokenizer.hasMoreTokens()) {
            sent.add(new Word(wordTokenizer.nextToken()));
        }

        sentenceList.add(sent);
    }

    return sentenceList;
}

From source file:edu.washington.phrasal.feature.SentenceIdPhrasalVerbId.java

/**
 * POS-tags the given token strings and returns the tag strings in token order.
 *
 * @param tokens surface token strings
 * @return one POS tag per input token
 */
private List<String> tagWordTokens(List<String> tokens) {
    // Wrap each string as a Word so the tagger can consume it.
    List<HasWord> words = tokens.stream().map(Word::new).collect(Collectors.toList());
    // Run the tagger, then project each tagged word back to its tag string.
    return fg.getTagger().apply(words).stream().map((tagged) -> tagged.tag()).collect(Collectors.toList());
}

From source file:elkfed.mmax.pipeline.StanfordParser.java

License:Apache License

/** Adds a parse tree to forest for each sentence in the document */
/** Adds a normalized parse tree to the forest for each sentence in the document. */
protected void annotateDocument() {
    String[][] sentences;
    try {
        sentences = DiscourseUtils.getSentenceTokens(currentDocument);
    } catch (Exception mmax2e) {
        // Without sentence tokens there is nothing to parse; return instead
        // of dereferencing a null array (the original threw an NPE on
        // sentences.length right after printing the stack trace).
        mmax2e.printStackTrace();
        return;
    }

    // Removed the dead tempSent array and the never-incremented counter i
    // from the original loop body.
    for (int sentence = 0; sentence < sentences.length; sentence++) {
        List<Word> words = new ArrayList<Word>();
        for (String tok : sentences[sentence]) {
            // Penn Treebank escaping for parentheses, as the parser expects.
            String s = tok.replaceAll("\\(", "-LRB-");
            s = s.replaceAll("\\)", "-RRB-");
            words.add(new Word(s));
        }
        Tree parse = (Tree) lp.apply(words);
        forest.add(normalizeTree(parse));
    }
}

From source file:fyp_backend.Stemmer.java

/**
 * Stems <code>w</code> and returns stemmed <code>Word</code>.
 */

/**
 * Stems {@code w} and returns the stemmed {@code Word}.
 */
public Word stem(Word w) {
    // Stem the surface string, then re-wrap it as a Word.
    String stemmed = stem(w.word());
    return new Word(stemmed);
}

From source file:gate.stanford.StanfordSentence.java

License:Open Source License

/**
 * Builds the parser-facing view of one GATE sentence: collects the token
 * annotations contained in the sentence span, records position/offset
 * lookup tables, and builds the word list handed to the Stanford parser.
 *
 * @param sentence   the GATE sentence annotation
 * @param tokenType  annotation type used for tokens
 * @param inputAS    annotation set to read tokens from
 * @param usePosTags if true, wrap tokens as TaggedWord so the parser reuses
 *                   existing POS tags; otherwise plain Word
 */
public StanfordSentence(Annotation sentence, String tokenType, AnnotationSet inputAS, boolean usePosTags) {

    // Lookup tables mapping parser token positions back to GATE offsets,
    // token annotations, and token strings.
    startPosToOffset = new HashMap<Integer, Long>();
    endPosToOffset = new HashMap<Integer, Long>();
    startPosToToken = new HashMap<Integer, Annotation>();
    startPosToString = new HashMap<Integer, String>();

    sentenceStartOffset = sentence.getStartNode().getOffset();
    sentenceEndOffset = sentence.getEndNode().getOffset();

    nbrOfTokens = 0;
    nbrOfMissingPosTags = 0;

    // All token annotations inside the sentence span, in document order.
    tokens = Utils.inDocumentOrder(inputAS.getContained(sentenceStartOffset, sentenceEndOffset).get(tokenType));
    words = new ArrayList<Word>();

    // Register the sentence itself at position -1.
    add(-1, sentence, "S");

    int tokenNo = 0;

    for (Annotation token : tokens) {
        String tokenString = escapeToken(token.getFeatures().get(STRING_FEATURE).toString());
        add(tokenNo, token, tokenString);

        /* The FAQ says the parser will automatically use existing POS tags
         * if the List elements are of type TaggedWord.  
         * http://nlp.stanford.edu/software/parser-faq.shtml#f
         */

        if (usePosTags) {
            words.add(new TaggedWord(tokenString, getEscapedPosTag(token)));
        } else {
            words.add(new Word(tokenString));
        }

        tokenNo++;
    }

    nbrOfTokens = tokenNo;
}

From source file:gate.stanford.Tagger.java

License:Open Source License

/**
 * Runs the Stanford POS tagger over every sentence in the current GATE
 * document, writing each resulting tag into the token annotation's
 * category feature. Tokens after the last sentence are tagged separately
 * when posTagAllTokens is set.
 *
 * @throws ExecutionException if a required parameter is missing, the tagger
 *         output size differs from the token count, or (when configured to
 *         fail) the document has no sentence/token annotations
 */
@Override
public void execute() throws ExecutionException {
    // check the parameters
    if (document == null)
        throw new ExecutionException("No document to process!");

    AnnotationSet inputAS = document.getAnnotations(inputASName);

    if (baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Token Annotation Type provided!");
    }

    if (baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Sentence Annotation Type provided!");
    }

    if (outputAnnotationType == null || outputAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No AnnotationType provided to store the new feature!");
    }

    AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
    AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
    if (sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null && tokensAS.size() > 0) {
        long startTime = System.currentTimeMillis();
        fireStatusChanged("POS tagging " + document.getName());
        fireProgressChanged(0);
        // prepare the input for MaxentTagger
        List<Word> sentenceForTagger = new ArrayList<Word>();

        // define a comparator for annotations by start offset
        OffsetComparator offsetComparator = new OffsetComparator();

        // read all the tokens and all the sentences, sorted by start offset
        List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS);
        Collections.sort(sentencesList, offsetComparator);
        List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS);
        Collections.sort(tokensList, offsetComparator);

        Iterator<Annotation> sentencesIter = sentencesList.iterator();
        ListIterator<Annotation> tokensIter = tokensList.listIterator();

        // currentToken advances through tokensList across sentence
        // boundaries; both buffers below are reused per sentence.
        List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>();
        Annotation currentToken = tokensIter.next();
        int sentIndex = 0;
        int sentCnt = sentencesAS.size();
        while (sentencesIter.hasNext()) {
            Annotation currentSentence = sentencesIter.next();
            tokensInCurrentSentence.clear();
            sentenceForTagger.clear();
            // consume every token that ends at or before the sentence end
            while (currentToken != null && currentToken.getEndNode().getOffset()
                    .compareTo(currentSentence.getEndNode().getOffset()) <= 0) {
                // If we're only POS tagging Tokens within baseSentenceAnnotationType,
                // don't add the sentence if the Tokens aren't within the span of
                // baseSentenceAnnotationType
                if (posTagAllTokens || currentToken.withinSpanOf(currentSentence)) {
                    tokensInCurrentSentence.add(currentToken);

                    // reuse an existing POS tag (TaggedWord) when configured,
                    // otherwise hand the tagger a plain Word
                    if (useExistingTags
                            && currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) {
                        sentenceForTagger.add(new TaggedWord(
                                (String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME),
                                (String) currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME)));
                    } else {
                        sentenceForTagger.add(
                                new Word((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME)));
                    }
                }
                currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
            }

            // if the sentence doesn't contain any tokens (which is a bit weird but
            // is possible) then don't try running the POS tagger as you will get an
            // array index out of bounds exception
            if (sentenceForTagger.isEmpty())
                continue;

            // run the POS tagger
            ArrayList<TaggedWord> taggerResults = tagger.tagSentence(sentenceForTagger, useExistingTags);

            // add the results
            // make sure no malfunction occurred
            if (taggerResults.size() != tokensInCurrentSentence.size())
                throw new ExecutionException("POS Tagger malfunction: the output size (" + taggerResults.size()
                        + ") is different from the input size (" + tokensInCurrentSentence.size() + ")!");
            Iterator<TaggedWord> resIter = taggerResults.iterator();
            Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();
            while (resIter.hasNext()) {
                Annotation annot = tokIter.next();
                addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String) resIter.next().tag()));
            }
            fireProgressChanged(sentIndex++ * 100 / sentCnt);
        } // while(sentencesIter.hasNext())

        if (currentToken != null && posTagAllTokens) {
            // Tag remaining Tokens if we are not considering those only within
            // baseSentenceAnnotationType

            // we have remaining tokens after the last sentence
            tokensInCurrentSentence.clear();
            sentenceForTagger.clear();
            while (currentToken != null) {
                tokensInCurrentSentence.add(currentToken);
                if (useExistingTags && currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) {
                    sentenceForTagger.add(
                            new TaggedWord((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME),
                                    (String) currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME)));
                } else {
                    sentenceForTagger
                            .add(new Word((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME)));
                }
                currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
            }

            // run the POS tagger on remaining tokens
            List<TaggedWord> taggerResults = tagger.tagSentence(sentenceForTagger, useExistingTags);

            // add the results and make sure no malfunction occurred
            if (taggerResults.size() != tokensInCurrentSentence.size())
                throw new ExecutionException("POS Tagger malfunction: the output size (" + taggerResults.size()
                        + ") is different from the input size (" + tokensInCurrentSentence.size() + ")!");
            Iterator<TaggedWord> resIter = taggerResults.iterator();
            Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();
            while (resIter.hasNext()) {
                Annotation annot = tokIter.next();
                addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String) resIter.next().tag()));
            }
        } // if(currentToken != null)
        fireProcessFinished();
        fireStatusChanged(document.getName() + " tagged in "
                + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000)
                + " seconds!");
    } else {
        if (failOnMissingInputAnnotations) {
            throw new ExecutionException("No sentences or tokens to process in document " + document.getName()
                    + "\n" + "Please run a sentence splitter " + "and tokeniser first!");
        } else {
            Utils.logOnce(logger, Level.INFO,
                    "POS tagger: no sentence or token annotations in input document - see debug log for details.");
            logger.debug("No input annotations in document " + document.getName());
        }
    }

}