Example usage for edu.stanford.nlp.ling Word word

List of usage examples for edu.stanford.nlp.ling Word word

Introduction

On this page you can find example usages for edu.stanford.nlp.ling Word word.

Prototype

public String word();

Source Link

Document

Return the word value of the label (or null if none).

Usage

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java

License:Open Source License

/**
 * Tokenizes one text zone and writes {@code Token} and/or {@code Sentence}
 * annotations into the CAS, depending on the writeToken / writeSentence settings.
 *
 * @param aJCas      the CAS being annotated
 * @param aText      the text of the current zone
 * @param aZoneBegin offset of the zone within the CAS document text
 * @throws AnalysisEngineProcessException if a tokenizer result is of an unknown
 *             type or cannot be matched back to the CAS text
 */
@Override
protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException {
    List<Token> casTokens = null;

    // Use value from language parameter, document language or fallback language - whatever
    // is available
    String language = getLanguage(aJCas);

    if (isWriteToken()) {
        casTokens = new ArrayList<Token>();
        final String text = aText;
        final Tokenizer<?> tokenizer = getTokenizer(language, aText);
        int offsetInSentence = 0;

        List<?> tokens = tokenizer.tokenize();
        outer: for (int i = 0; i < tokens.size(); i++) {
            final Object token = tokens.get(i);
            String t = null;
            if (token instanceof String) {
                t = (String) token;
            }
            if (token instanceof CoreLabel) {
                // CoreLabels carry their own character offsets - trust them directly
                // instead of re-aligning against the zone text.
                CoreLabel l = (CoreLabel) token;
                t = l.word();
                int begin = l.get(CharacterOffsetBeginAnnotation.class);
                int end = l.get(CharacterOffsetEndAnnotation.class);

                casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i));
                offsetInSentence = end;
                continue;
            }
            if (token instanceof Word) {
                Word w = (Word) token;
                t = w.word();
            }

            if (t == null) {
                throw new AnalysisEngineProcessException(
                        new IllegalStateException("Unknown token type: " + token.getClass()));
            }

            // Skip whitespace. BUGFIX: test the bound BEFORE charAt() - the original
            // called text.charAt(offsetInSentence) first and threw
            // StringIndexOutOfBoundsException when the offset was already at the end
            // of the zone text.
            while (offsetInSentence < text.length() && isWhitespace(text.charAt(offsetInSentence))) {
                offsetInSentence++;
            }
            if (offsetInSentence >= text.length()) {
                break outer;
            }

            // Align the tokenizer output with the CAS text at the current offset.
            if (text.startsWith(t, offsetInSentence)) {
                casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence,
                        aZoneBegin + offsetInSentence + t.length(), i));
                offsetInSentence = offsetInSentence + t.length();
            } else {
                // BUGFIX: the CAS excerpt in the message was missing its closing "]".
                throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. Tokenizer: ["
                        + t + "] CAS: ["
                        + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length()))
                        + "]"));
            }
        }
    }

    if (isWriteSentence()) {
        if (casTokens == null) {
            casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length());
        }

        // Prepare the tokens for processing by WordToSentenceProcessor
        List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>();
        for (Token token : casTokens) {
            CoreLabel l = new CoreLabel();
            l.set(CharacterOffsetBeginAnnotation.class, token.getBegin());
            l.set(CharacterOffsetEndAnnotation.class, token.getEnd());
            l.setWord(token.getCoveredText());
            tokensInDocument.add(l);
        }

        // The sentence splitter (probably) requires the escaped text, so we prepare it here
        PTBEscapingProcessor escaper = new PTBEscapingProcessor();
        escaper.apply(tokensInDocument);

        // Apply the WordToSentenceProcessor to find the sentence boundaries
        WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex,
                boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex,
                newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences);

        List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument);
        for (List<CoreLabel> sentence : sentencesInDocument) {
            int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class);
            int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class);

            createSentence(aJCas, begin, end);
        }
    }
}

From source file:edu.iastate.airl.semtus.parser.Parser.java

License:Open Source License

/**
 * Get morphology base/*from  w  ww . j a v a2 s.c om*/
 *
 * @param thisWord
 *            word
 * @return morphology base
 */
static public String morphology(Word thisWord) {
    try {
        Morphology thisMorphology = new Morphology();
        Word thisBase = thisMorphology.stem(thisWord);
        return thisBase.word();
    } catch (Throwable e) {
        return null;
    }
}

From source file:edu.isi.mavuno.nlp.NLProcTools.java

License:Apache License

/**
 * Initializes the sentence state: stores the given words and rebuilds the
 * corresponding token list with 1-based positions.
 */
public void setSentence(List<Word> sentence) {
    // Replace the stored sentence words.
    mSentenceWords.clear();
    mSentenceWords.addAll(sentence);

    // Rebuild the token list from scratch, positions starting at 1.
    mSentenceTokens.clear();
    int position = 1;
    for (Word w : sentence) {
        mSentenceTokens.add(new Token(w.word(), position++));
    }
}

From source file:edu.isi.mavuno.nlp.NLProcTools.java

License:Apache License

/**
 * Runs the chunker over the current sentence and returns the chunk tags.
 * POS tags are computed lazily first, since the chunker depends on them.
 */
public List<String> getChunkTags() {
    // The chunker requires the output of the POS tagger.
    if (mPosTags.isEmpty()) {
        getPosTags();
    }

    mChunkerWords.clear();
    for (Word word : mSentenceWords) {
        mChunkerWords.add(word.word());
    }

    String[] wordArray = mChunkerWords.toArray(new String[0]);
    String[] tagArray = mPosTags.toArray(new String[0]);
    return Arrays.asList(mChunker.chunk(wordArray, tagArray));
}

From source file:org.linuxkernel.proof.digger.questiontypeanalysis.patternbased.MainPartExtracter.java

License:Open Source License

/**
 * Builds the main-part question structure from the question text and its
 * pre-segmented question words.
 *
 * @param question      the original question text
 * @param questionWords the whitespace-separated segmented words of the question
 * @return the extracted question structure
 */
public QuestionStructure getMainPart(String question, String questionWords) {
    List<Word> words = new ArrayList<>();
    for (String item : questionWords.split("\\s+")) {
        item = item.trim();
        if (item.isEmpty()) {
            // Skip empty fragments produced by splitting (e.g. leading whitespace).
            continue;
        }
        Word word = new Word();
        word.setWord(item); // already trimmed above - no need to trim again
        words.add(word);
    }
    return getMainPart(question, words);
}

From source file:phrasesentimentextractor.PhraseSentimentExtractor.java

/**
 * Entry point: extracts sentiment phrases for a set of product features from a
 * review file and writes one "feature phrases..." line per feature.
 *
 * @param args the command line arguments:
 *             args[0] = feature list file (one feature per line),
 *             args[1] = review text file,
 *             args[2] = output file for the collected phrases
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    // Stanford parser / tokenizer setup.
    DependencyTreeGenerator dr = DependencyTreeGenerator.getInstance();
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
            "invertible=true");

    // OpenNLP chunker. BUGFIX: close the model stream once the model is loaded
    // (the original leaked it).
    Path filepath = Paths.get("models/en-chunker.bin");
    ChunkerME chunkerME;
    try (InputStream is = new FileInputStream(filepath.toFile())) {
        chunkerME = new ChunkerME(new ChunkerModel(is));
    }

    // Features of interest, and the phrases collected so far for each feature.
    Set<String> features = new HashSet<>();
    HashMap<String, List<String>> featuresPhrases = new HashMap<>();
    try (Scanner scanner = new Scanner(new File(args[0]))) {
        while (scanner.hasNext()) {
            String feat = scanner.nextLine().trim();
            features.add(feat);
            featuresPhrases.put(feat, new ArrayList<String>());
        }
    }

    int num_lines = 0;
    // try-with-resources so the reader and writer are closed even on failure
    // (the original never closed the FileReader, and closed `out` only on success).
    try (FileReader fileReader = new FileReader(new File(args[1]));
            FileWriter fout = new FileWriter(new File(args[2]));
            PrintWriter out = new PrintWriter(fout)) {

        DocumentPreprocessor dp = new DocumentPreprocessor(fileReader);
        dp.setTokenizerFactory(tokenizerFactory);

        for (List line : dp) {
            String sentence = Sentence.listToString(line);

            // Which features occur in this sentence (word-bounded, case-insensitive)?
            Set<String> check_features = new HashSet<>();
            for (String feature : features) {
                Pattern pattern = Pattern.compile("\\b" + feature.toLowerCase() + "\\b",
                        Pattern.CASE_INSENSITIVE);
                Matcher matcher = pattern.matcher(sentence.toLowerCase());
                while (matcher.find()) {
                    check_features.add(feature);
                }
            }
            if (check_features.isEmpty()) {
                continue; // no feature present in this sentence
            }

            // Parse tree and typed-dependency tree for the sentence.
            Tree tr = dr.parse(sentence);
            DependencyTree depTree = dr.getTypedDependencyTree(tr);

            // Tokenized words.
            List<Word> word_list = tr.yieldWords();
            List<String> word_tokens = new ArrayList<>();
            for (Word word : word_list) {
                word_tokens.add(word.word());
            }
            String[] words = word_tokens.toArray(new String[0]);

            // POS tags.
            List<TaggedWord> postags = tr.taggedYield();
            List<String> tag_tokens = new ArrayList<>();
            for (TaggedWord postag : postags) {
                tag_tokens.add(postag.tag());
                System.out.print(postag.tag() + " ");
            }
            String[] tags = tag_tokens.toArray(new String[0]);

            // BIO chunk encoding for the sentence.
            String[] result = chunkerME.chunk(words, tags);
            for (String r : result) {
                System.out.print(r + " ");
            }

            // Group words into phrase spans (BIO-NP) and remember each word's
            // phrase index in span_map.
            HashMap<Integer, Integer> span_map = new HashMap<>();
            Span[] span = chunkerME.chunkAsSpans(words, tags);
            ArrayList<PhraseSet> pSets = new ArrayList<>();
            int j = 0;
            for (Span s : span) {
                ArrayList<String> phrase_words = new ArrayList<>();
                for (int i = s.getStart(); i < s.getEnd(); i++) {
                    System.out.print(words[i] + " ");
                    span_map.put(i, j);
                    phrase_words.add(words[i]);
                }
                pSets.add(new PhraseSet(j, s.toString(), phrase_words));
                j++;
            }

            // BFS over the dependency tree, filling in parent links, POS tags and
            // phrase indices. Vertex 0 is a dummy root; its first edge target is
            // the actual root.
            DependencyTreeNode rootNode = depTree.getVertex(0).edges.get(0).target;
            Queue<DependencyTreeNode> queue = new LinkedList<>();
            rootNode.parent = null;
            queue.add(rootNode);
            while (!queue.isEmpty()) {
                DependencyTreeNode u = queue.remove();
                u.pos = tags[u.index - 1];
                Integer phraseIndex = span_map.get(u.index - 1);
                u.phrase_index = (phraseIndex != null) ? phraseIndex : -1;
                for (DependencyTreeEdge e : u.edges) {
                    e.target.parent = u;
                    queue.add(e.target);
                }
            }

            // Collect sentiment phrases for the features found in this sentence.
            HashMap<String, List<String>> featurePhrases = SentimentExtract.getSentimentPhrases(check_features,
                    pSets, depTree);
            for (String chk_feat : check_features) {
                featuresPhrases.get(chk_feat).addAll(featurePhrases.get(chk_feat));
            }

            num_lines++;
        }

        System.out.println(num_lines);
        for (String f : features) {
            out.print(f + " ");
            out.print(String.join(" ", featuresPhrases.get(f)));
            out.println();
        }
    }

    System.out.println("Success");
}

From source file:qmul.util.parse.PennTreebankTokenizer.java

License:Open Source License

/**
 * Tokenizes a string into a list of words, e.g. "hello, bob." -> [hello, ,, bob, .].
 *
 * @param s the input string
 * @return the list of tokens
 */
public List<Word> getWordsFromString(String s) {
    if (splitPennAbbreviations) {
        // Detach trailing sentence punctuation from the preceding word.
        s = s.replaceAll("(\\w+)([.,?!;:])(\\s+|$)", "$1 $2");
    }
    List<Word> tokens = dp.getWordsFromString(s);
    for (Word token : tokens) {
        // The Penn tokeniser escapes "a/b" as "a\/b"; undo that escaping.
        String surface = token.word();
        if (surface.contains("/")) {
            token.setWord(surface.replaceAll("\\\\/", "/"));
        }
    }
    return tokens;
}