List of usage examples for the method edu.stanford.nlp.ling.Word#word()
public String word();
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java
License:Open Source License
/**
 * Segments one zone of the document: writes Token annotations (if enabled) by running the
 * Stanford tokenizer and aligning its tokens back against the CAS text, then writes Sentence
 * annotations (if enabled) by feeding the tokens through Stanford's WordToSentenceProcessor.
 *
 * @param aJCas      the CAS being annotated
 * @param aText      the text of the current zone
 * @param aZoneBegin offset of the zone within the full document text; all annotation offsets
 *                   are shifted by this amount
 * @throws AnalysisEngineProcessException on unknown token types or when a tokenizer token
 *                                        cannot be matched against the CAS text
 */
@Override
protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException {
    List<Token> casTokens = null;

    // Use value from language parameter, document language or fallback language - whatever
    // is available
    String language = getLanguage(aJCas);

    if (isWriteToken()) {
        casTokens = new ArrayList<Token>();
        final String text = aText;
        final Tokenizer<?> tokenizer = getTokenizer(language, aText);
        // Cursor into `text`; advanced as tokens are matched left-to-right.
        int offsetInSentence = 0;

        List<?> tokens = tokenizer.tokenize();
        outer: for (int i = 0; i < tokens.size(); i++) {
            final Object token = tokens.get(i);
            String t = null;

            // The tokenizer may emit plain Strings, CoreLabels, or Words depending on
            // its configuration; handle each representation.
            if (token instanceof String) {
                t = (String) token;
            }
            if (token instanceof CoreLabel) {
                // CoreLabels carry their own character offsets, so no text alignment
                // is needed — create the Token directly and move the cursor.
                CoreLabel l = (CoreLabel) token;
                t = l.word();
                int begin = l.get(CharacterOffsetBeginAnnotation.class);
                int end = l.get(CharacterOffsetEndAnnotation.class);
                casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i));
                offsetInSentence = end;
                continue;
            }
            if (token instanceof Word) {
                Word w = (Word) token;
                t = w.word();
            }

            if (t == null) {
                throw new AnalysisEngineProcessException(
                        new IllegalStateException("Unknown token type: " + token.getClass()));
            }

            // Skip whitespace between the previous token and this one.
            // NOTE(review): charAt is evaluated before the bounds check — if the cursor
            // already equals text.length() here, this throws StringIndexOutOfBoundsException.
            while (isWhitespace(text.charAt(offsetInSentence))) {
                offsetInSentence++;
                if (offsetInSentence >= text.length()) {
                    break outer;
                }
            }

            // Match the tokenizer's token against the CAS text at the cursor position.
            if (text.startsWith(t, offsetInSentence)) {
                casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence,
                        aZoneBegin + offsetInSentence + t.length(), i));
                offsetInSentence = offsetInSentence + t.length();
            }
            else {
                // NOTE(review): message reconstructed across a garbled line break in the
                // listing; the closing "]" after the CAS excerpt appears to be missing —
                // cosmetic only, confirm against the upstream source.
                throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. Tokenizer: ["
                        + t + "] CAS: ["
                        + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length()))));
            }
        }
    }

    if (isWriteSentence()) {
        // If tokens were not created above, reuse the Token annotations already in the CAS.
        if (casTokens == null) {
            casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length());
        }

        // Prepare the tokens for processing by WordToSentenceProcessor
        List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>();
        for (Token token : casTokens) {
            CoreLabel l = new CoreLabel();
            l.set(CharacterOffsetBeginAnnotation.class, token.getBegin());
            l.set(CharacterOffsetEndAnnotation.class, token.getEnd());
            l.setWord(token.getCoveredText());
            tokensInDocument.add(l);
        }

        // The sentence splitter (probably) requires the escaped text, so we prepare it here
        PTBEscapingProcessor escaper = new PTBEscapingProcessor();
        escaper.apply(tokensInDocument);

        // Apply the WordToSentenceProcessor to find the sentence boundaries
        WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex,
                boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex,
                newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences);

        List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument);
        for (List<CoreLabel> sentence : sentencesInDocument) {
            // A sentence spans from the first token's begin to the last token's end.
            int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class);
            int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class);
            createSentence(aJCas, begin, end);
        }
    }
}
From source file:edu.iastate.airl.semtus.parser.Parser.java
License:Open Source License
/** * Get morphology base/*from w ww . j a v a2 s.c om*/ * * @param thisWord * word * @return morphology base */ static public String morphology(Word thisWord) { try { Morphology thisMorphology = new Morphology(); Word thisBase = thisMorphology.stem(thisWord); return thisBase.word(); } catch (Throwable e) { return null; } }
From source file:edu.isi.mavuno.nlp.NLProcTools.java
License:Apache License
public void setSentence(List<Word> sentence) { // set the sentence words mSentenceWords.clear();/*from ww w . j av a 2 s . co m*/ mSentenceWords.addAll(sentence); // clear sentence tokens mSentenceTokens.clear(); // convert Words to Tokens int pos = 1; for (Word word : sentence) { Token tok = new Token(word.word(), pos); mSentenceTokens.add(tok); pos++; } }
From source file:edu.isi.mavuno.nlp.NLProcTools.java
License:Apache License
public List<String> getChunkTags() { // the chunker requires the output of the pos tagger if (mPosTags.size() == 0) { getPosTags();/*from w w w .ja va 2s .c o m*/ } mChunkerWords.clear(); for (Word w : mSentenceWords) { mChunkerWords.add(w.word()); } return Arrays.asList(mChunker.chunk(mChunkerWords.toArray(new String[mChunkerWords.size()]), mPosTags.toArray(new String[mPosTags.size()]))); }
From source file:org.linuxkernel.proof.digger.questiontypeanalysis.patternbased.MainPartExtracter.java
License:Open Source License
/**
 * Builds a list of {@code Word}s from a whitespace-separated token string and delegates
 * to {@code getMainPart(String, List)} to extract the question's main part.
 *
 * @param question      the original question text
 * @param questionWords the question's tokens, separated by whitespace
 * @return the extracted question structure
 */
public QuestionStructure getMainPart(String question, String questionWords) {
    List<Word> words = new ArrayList<>();
    for (String item : questionWords.split("\\s+")) {
        // Trim once (the original trimmed the same value twice) and skip empty tokens.
        item = item.trim();
        if (item.isEmpty()) {
            continue;
        }
        Word word = new Word();
        word.setWord(item);
        words.add(word);
    }
    return getMainPart(question, words);
}
From source file:phrasesentimentextractor.PhraseSentimentExtractor.java
/**
 * Entry point: extracts sentiment phrases for a list of product features from a review file.
 *
 * Command-line arguments:
 *   args[0] - feature list file, one feature per line
 *   args[1] - review text file, processed sentence by sentence
 *   args[2] - output file: one line per feature, followed by its extracted phrases
 *
 * Pipeline per sentence: match features (whole-word regex) -> Stanford parse ->
 * dependency tree annotation -> OpenNLP chunking (BIO spans) ->
 * SentimentExtract.getSentimentPhrases.
 *
 * @param args the command line arguments
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    // TODO code application logic here
    //Initialize all the models
    //Tokenizer model for the sentence from OpenNLP , tokenizes the sentence
    // InputStream is = new FileInputStream("en-token.bin");
    //
    // TokenizerModel model = new TokenizerModel(is);
    // Tokenizer tokenizer = new TokenizerME(model);
    //
    // //POS model from OpenNLP, gives the POS tags
    // POSModel posmodel = new POSModelLoader().load(new File("en-pos-maxent.bin"));
    // POSTaggerME tagger = new POSTaggerME(posmodel);

    // Stanford parser wrapper (project singleton) and an invertible PTB tokenizer factory.
    DependencyTreeGenerator dr = DependencyTreeGenerator.getInstance();
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
            "invertible=true");

    //chunker
    // NOTE(review): this InputStream is never closed — consider try-with-resources.
    Path filepath = Paths.get("models/en-chunker.bin");
    InputStream is = new FileInputStream(filepath.toFile());
    ChunkerModel cModel = new ChunkerModel(is);
    ChunkerME chunkerME = new ChunkerME(cModel);

    //Output file
    File output_phrases = new File(args[2]);
    FileWriter fout = new FileWriter(output_phrases);
    PrintWriter out = new PrintWriter(fout);

    //Start processing the review file
    //Extract all the features
    // features: the feature vocabulary; featuresPhrases: feature -> accumulated phrases.
    Set<String> features = new HashSet();
    HashMap<String, List<String>> featuresPhrases = new HashMap();
    File feat_input = new File(args[0]);
    Scanner scanner = new Scanner(feat_input); // NOTE(review): never closed
    int feat_counter = 0; // incremented but never read afterwards
    String feat = "";
    while (scanner.hasNext()) {
        feat = scanner.nextLine().trim();
        features.add(feat);
        List<String> f_phrases = new ArrayList();
        featuresPhrases.put(feat, f_phrases);
        feat_counter++;
    }

    String sentence = "";
    File review_text = new File(args[1]);
    FileReader fileReader = new FileReader(review_text);
    DocumentPreprocessor dp = new DocumentPreprocessor(fileReader);
    dp.setTokenizerFactory(tokenizerFactory);
    int num_lines = 0;

    for (List line : dp) {
        boolean feature_exists = false;
        // sentence = Sentence.listToString(line);
        // NOTE(review): the assignment above is commented out in this listing, so
        // `sentence` stays "" on every iteration and the regex matching / parsing below
        // operates on an empty string. It looks like the line should be live — confirm
        // against the original project.

        // Collect the features that occur in this sentence (whole-word, case-insensitive).
        Set<String> check_features = new HashSet();
        for (String feature : features) {
            Pattern pattern = Pattern.compile("\\b" + feature.toLowerCase() + "\\b",
                    Pattern.CASE_INSENSITIVE);
            Matcher matcher = pattern.matcher(sentence.toLowerCase());
            while (matcher.find()) {
                feature_exists = true;
                check_features.add(feature);
            }
        }
        if (!feature_exists) {
            //System.out.println("\n"+sentence);
            //System.out.println("No feature present!\n");
            continue;
        }

        //Features present
        //System.out.println("\nFeatures present\n");
        // for(String feature : check_features){
        // //System.out.print(feature+" ");
        // }

        //get parse tree and construct dependency tree
        Tree tr = dr.parse(sentence);
        DependencyTree depTree = dr.getTypedDependencyTree(tr);

        //get tokenized words
        //System.out.println("\nTokenized Words\n");
        List<Word> word_list = tr.yieldWords();
        List<String> word_tokens = new ArrayList();
        for (Word word : word_list) {
            word_tokens.add(word.word());
            //System.out.print(word.word()+" ");
        }
        String[] words = new String[word_tokens.size()];
        words = word_tokens.toArray(words);

        //System.out.println("\nPOS Tags\n");
        List<TaggedWord> postags = tr.taggedYield();
        List<String> tag_tokens = new ArrayList();
        for (TaggedWord postag : postags) {
            tag_tokens.add(postag.tag());
            System.out.print(postag.tag() + " ");
        }
        String[] tags = new String[tag_tokens.size()];
        tags = tag_tokens.toArray(tags);

        //System.out.println("\nBIO Encoding\n");
        //BIO encoding for sentence
        String result[] = chunkerME.chunk(words, tags);
        for (String r : result) {
            System.out.print(r + " ");
        }

        //System.out.println("\nPhrases\n");
        //Outputs spans of BIO-NP
        // span_map: word index -> chunk (phrase) index; pSets: the chunk phrases themselves.
        HashMap<Integer, Integer> span_map = new HashMap();
        Span[] span = chunkerME.chunkAsSpans(words, tags);
        int j = 0;
        ArrayList<PhraseSet> pSets = new ArrayList();
        for (Span s : span) {
            ArrayList<String> phrase_words = new ArrayList();
            //System.out.print("\n"+s.toString()+" ");
            int n = 0; // incremented but never read afterwards
            for (int i = s.getStart(); i < s.getEnd(); i++) {
                System.out.print(words[i] + " ");
                span_map.put(i, j);
                phrase_words.add(words[i]);
                n++;
            }
            PhraseSet pSet = new PhraseSet(j, s.toString(), phrase_words);
            pSets.add(pSet);
            j++;
        }

        //RootWord
        //Actual root is dummy
        // BFS over the dependency tree: assign each node its POS tag and chunk index and
        // wire up parent pointers. Vertex 0 is a dummy; its first edge's target is the root.
        DependencyTreeNode rootNode = depTree.getVertex(0).edges.get(0).target;
        Queue<DependencyTreeNode> queue = new LinkedList();
        rootNode.parent = null;
        queue.add(rootNode);
        while (!queue.isEmpty()) {
            DependencyTreeNode u = queue.remove();
            u.pos = tags[u.index - 1]; // dependency node indices are 1-based
            if (span_map.get(u.index - 1) != null) {
                u.phrase_index = span_map.get(u.index - 1);
            } else {
                u.phrase_index = -1; // word not covered by any chunk span
            }
            //System.out.println("\n"+u.word+"-"+u.phrase_index+"-"+tags[u.index-1]);
            for (DependencyTreeEdge e : u.edges) {
                e.target.parent = u;
                queue.add(e.target);
                //System.out.print(e.target.word+" ");
            }
        }

        // Delegate the actual phrase extraction, then accumulate results per feature.
        HashMap<String, List<String>> featurePhrases = SentimentExtract.getSentimentPhrases(check_features,
                pSets, depTree);
        for (String chk_feat : check_features) {
            featuresPhrases.get(chk_feat).addAll(featurePhrases.get(chk_feat));
        }
        num_lines++;
    }

    System.out.println(num_lines);

    // Write one line per feature: the feature name followed by its phrases, space-separated.
    for (String f : features) {
        out.print(f + " ");
        out.print(String.join(" ", featuresPhrases.get(f)));
        out.println();
    }
    System.out.println("Success");
    out.close();
}
From source file:qmul.util.parse.PennTreebankTokenizer.java
License:Open Source License
/** * @param s//from ww w.j a v a2 s . co m * @return a list of tokens "hello, bob." -> "[hello, ,, bob, .]" */ public List<Word> getWordsFromString(String s) { if (splitPennAbbreviations) { s = s.replaceAll("(\\w+)([.,?!;:])(\\s+|$)", "$1 $2"); } List<Word> words = dp.getWordsFromString(s); for (Word w : words) { // Penn tokeniser transforms "a/b" into "a\/b" if (w.word().contains("/")) { w.setWord(w.word().replaceAll("\\\\/", "/")); } } return words; }