List of usage examples for edu.stanford.nlp.process DocumentPreprocessor setTokenizerFactory
public void setTokenizerFactory(TokenizerFactory<? extends HasWord> newTokenizerFactory)
From source file:de.uni_stuttgart.ims.comparatives.nlp.SentenceSplitterStanford.java
License:Creative Commons License
/** * Split the string into sentences with Stanford. * @return List of spans with the start/end positions of each sentence. *///from w w w . j a v a 2 s.c om public TextSpan[] split(String document) { StringReader reader = new StringReader(document); DocumentPreprocessor dp = new DocumentPreprocessor(reader); dp.setTokenizerFactory(ptbTokenizerFactory); ArrayList<TextSpan> sentenceSpansList = new ArrayList<TextSpan>(); for (List<HasWord> sent : dp) { CoreLabel firstword = (CoreLabel) sent.get(0); CoreLabel lastword = (CoreLabel) sent.get(sent.size() - 1); String coveredText = ""; for (int i = 0; i < sent.size(); i++) { CoreLabel word = (CoreLabel) sent.get(i); coveredText += word.value() + " "; } sentenceSpansList.add(new TextSpan(firstword.beginPosition(), lastword.endPosition(), coveredText)); } return sentenceSpansList.toArray(new TextSpan[0]); }
From source file:flight_ranker.TaggerDemo2.java
public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("usage: java TaggerDemo2 modelFile fileToTag"); return;//from w ww . j a va 2 s . c om } MaxentTagger tagger = new MaxentTagger(args[0]); TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep"); BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8")); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8")); DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r); documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory); for (List<HasWord> sentence : documentPreprocessor) { List<TaggedWord> tSentence = tagger.tagSentence(sentence); pw.println(Sentence.listToString(tSentence, false)); } // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence. List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", "."); List<TaggedWord> taggedSent = tagger.tagSentence(sent); for (TaggedWord tw : taggedSent) { if (tw.tag().startsWith("JJ")) { pw.println(tw.word()); } } pw.close(); }
From source file:phrasesentimentextractor.PhraseSentimentExtractor.java
/**
 * Extracts sentiment phrases for a set of product features from a review file.
 *
 * <p>Pipeline per sentence: match configured features with word-boundary regexes,
 * parse the sentence (Stanford), build a typed-dependency tree, chunk it into
 * NP spans (OpenNLP), annotate each dependency node with its POS tag and NP-chunk
 * index, then hand everything to {@code SentimentExtract.getSentimentPhrases}.
 *
 * @param args {@code args[0]} = feature list file (one feature per line),
 *             {@code args[1]} = review text file,
 *             {@code args[2]} = output file for the extracted phrases
 * @throws FileNotFoundException if an input file is missing
 * @throws IOException if reading/writing fails
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    DependencyTreeGenerator dr = DependencyTreeGenerator.getInstance();
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");

    // Load the OpenNLP chunker model; close the stream once the model is read.
    ChunkerME chunkerME;
    Path filepath = Paths.get("models/en-chunker.bin");
    try (InputStream is = new FileInputStream(filepath.toFile())) {
        ChunkerModel cModel = new ChunkerModel(is);
        chunkerME = new ChunkerME(cModel);
    }

    // Output file
    File output_phrases = new File(args[2]);
    FileWriter fout = new FileWriter(output_phrases);
    PrintWriter out = new PrintWriter(fout);

    // Read all features; map each feature to its (initially empty) phrase list,
    // and precompile one word-boundary pattern per feature (the original
    // recompiled every pattern for every sentence).
    Set<String> features = new HashSet<>();
    HashMap<String, List<String>> featuresPhrases = new HashMap<>();
    HashMap<String, Pattern> featurePatterns = new HashMap<>();
    File feat_input = new File(args[0]);
    Scanner scanner = new Scanner(feat_input);
    while (scanner.hasNext()) {
        String feat = scanner.nextLine().trim();
        features.add(feat);
        featuresPhrases.put(feat, new ArrayList<String>());
        featurePatterns.put(feat,
                Pattern.compile("\\b" + feat.toLowerCase() + "\\b", Pattern.CASE_INSENSITIVE));
    }
    scanner.close();

    File review_text = new File(args[1]);
    FileReader fileReader = new FileReader(review_text);
    DocumentPreprocessor dp = new DocumentPreprocessor(fileReader);
    dp.setTokenizerFactory(tokenizerFactory);
    int num_lines = 0;
    for (List<HasWord> line : dp) {
        // BUG FIX: in the original this assignment was commented out, so
        // `sentence` stayed "" for every iteration and all feature matching
        // and parsing ran on the empty string.
        String sentence = Sentence.listToString(line);

        // Which configured features appear in this sentence?
        boolean feature_exists = false;
        Set<String> check_features = new HashSet<>();
        for (String feature : features) {
            Matcher matcher = featurePatterns.get(feature).matcher(sentence.toLowerCase());
            while (matcher.find()) {
                feature_exists = true;
                check_features.add(feature);
            }
        }
        if (!feature_exists) {
            continue; // no feature present — skip the expensive parse
        }

        // Parse tree and typed-dependency tree for the sentence.
        Tree tr = dr.parse(sentence);
        DependencyTree depTree = dr.getTypedDependencyTree(tr);

        // Tokenized words from the parse tree.
        List<Word> word_list = tr.yieldWords();
        List<String> word_tokens = new ArrayList<>();
        for (Word word : word_list) {
            word_tokens.add(word.word());
        }
        String[] words = word_tokens.toArray(new String[word_tokens.size()]);

        // POS tags (printed with a trailing space each, as before).
        List<TaggedWord> postags = tr.taggedYield();
        List<String> tag_tokens = new ArrayList<>();
        for (TaggedWord postag : postags) {
            tag_tokens.add(postag.tag());
            System.out.print(postag.tag() + " ");
        }
        String[] tags = new String[tag_tokens.size()];
        tags = tag_tokens.toArray(tags);

        // BIO chunk encoding for the sentence (debug print preserved).
        String result[] = chunkerME.chunk(words, tags);
        for (String r : result) {
            System.out.print(r + " ");
        }

        // NP-chunk spans: span_map maps word index -> phrase index.
        HashMap<Integer, Integer> span_map = new HashMap<>();
        Span[] span = chunkerME.chunkAsSpans(words, tags);
        int j = 0;
        ArrayList<PhraseSet> pSets = new ArrayList<>();
        for (Span s : span) {
            ArrayList<String> phrase_words = new ArrayList<>();
            for (int i = s.getStart(); i < s.getEnd(); i++) {
                System.out.print(words[i] + " ");
                span_map.put(i, j);
                phrase_words.add(words[i]);
            }
            PhraseSet pSet = new PhraseSet(j, s.toString(), phrase_words);
            pSets.add(pSet);
            j++;
        }

        // BFS over the dependency tree, annotating each node with its POS tag
        // and NP-chunk index (-1 if the word is in no chunk).
        // The actual root is a dummy node; its first edge targets the real root.
        DependencyTreeNode rootNode = depTree.getVertex(0).edges.get(0).target;
        Queue<DependencyTreeNode> queue = new LinkedList<>();
        rootNode.parent = null;
        queue.add(rootNode);
        while (!queue.isEmpty()) {
            DependencyTreeNode u = queue.remove();
            u.pos = tags[u.index - 1]; // node indices are 1-based
            if (span_map.get(u.index - 1) != null) {
                u.phrase_index = span_map.get(u.index - 1);
            } else {
                u.phrase_index = -1;
            }
            for (DependencyTreeEdge e : u.edges) {
                e.target.parent = u;
                queue.add(e.target);
            }
        }

        // Collect sentiment phrases for the features found in this sentence.
        HashMap<String, List<String>> featurePhrases =
                SentimentExtract.getSentimentPhrases(check_features, pSets, depTree);
        for (String chk_feat : check_features) {
            featuresPhrases.get(chk_feat).addAll(featurePhrases.get(chk_feat));
        }
        num_lines++;
    }
    fileReader.close();

    System.out.println(num_lines);
    // One output line per feature: the feature followed by all its phrases.
    for (String f : features) {
        out.print(f + " ");
        out.print(String.join(" ", featuresPhrases.get(f)));
        out.println();
    }
    System.out.println("Success");
    out.close();
}
From source file:reck.parser.lexparser.RECKLexicalizedParser.java
License:Open Source License
/**
 * Parses every sentence of {@code document} and builds a RECK parse tree
 * (typed dependencies + dependency tree + constituent tree, each with
 * hyphen/point splitting applied) for each parsable sentence.
 *
 * <p>NOTE(review): this method reads/writes several instance fields
 * ({@code TDs}, {@code reckTreeList}, {@code sentencePosition}, the
 * pparser/dparser/bparser parsers) — presumably {@code computePosition}
 * updates {@code sentencePosition} before it is used below; confirm against
 * the rest of the class.
 *
 * @param filename name of the file being parsed (used only for log output)
 * @param content raw document text, used to compute character positions
 * @param startSentence character offset at which the first sentence starts
 * @param tokenized if true, input is pre-tokenized and a whitespace tokenizer is forced
 * @param tokenizerFactory tokenizer to use; may be null (falls back to the language pack's)
 * @param document the sentences to parse
 * @param documentPreprocessor preprocessor to configure with tokenizer/punctuation/encoding
 * @param escaper unused here (kept for interface compatibility)
 * @param tagDelimiter unused here (kept for interface compatibility)
 * @return an ArrayList of RECKParseTreeImpl, one per successfully parsed sentence
 */
public ArrayList parseFile(String filename, String content, int startSentence, boolean tokenized,
        TokenizerFactory tokenizerFactory, List<List<? extends HasWord>> document,
        DocumentPreprocessor documentPreprocessor, Function<List<HasWord>, List<HasWord>> escaper,
        int tagDelimiter) {
    ArrayList treeList = new ArrayList();
    PrintWriter pwOut = op.tlpParams.pw();
    PrintWriter pwErr = op.tlpParams.pw(System.err);
    RECKTreePrint treePrint = getRECKTreePrint(op);
    // running counters for the summary printed at the end
    int numWords = 0;
    int numSents = 0;
    int numUnparsable = 0;
    int numNoMemory = 0;
    int numFallback = 0;
    int numSkipped = 0;
    Timing timer = new Timing();
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();

    // set the tokenizer: pre-tokenized input forces whitespace tokenization;
    // otherwise fall back to the language pack's default when none was given
    if (tokenized) {
        tokenizerFactory = WhitespaceTokenizer.factory();
    }
    if (tokenizerFactory == null) {
        tokenizerFactory = tlp.getTokenizerFactory();
    }
    if (Test.verbose) {
        System.err.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        System.err.println("Sentence final words are: " + Arrays.asList(tlp.sentenceFinalPunctuationWords()));
        System.err.println("File encoding is: " + op.tlpParams.getInputEncoding());
    }
    documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
    documentPreprocessor.setEncoding(op.tlpParams.getInputEncoding());
    boolean saidMemMessage = false;

    // evaluation setup: optional per-parser log-likelihood scoring,
    // driven by the Test.evals properties
    boolean runningAverages = Boolean.parseBoolean(Test.evals.getProperty("runningAverages"));
    boolean summary = Boolean.parseBoolean(Test.evals.getProperty("summary"));
    AbstractEval.ScoreEval pcfgLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("pcfgLL"))) {
        pcfgLL = new AbstractEval.ScoreEval("pcfgLL", runningAverages);
    }
    AbstractEval.ScoreEval depLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("depLL"))) {
        depLL = new AbstractEval.ScoreEval("depLL", runningAverages);
    }
    AbstractEval.ScoreEval factLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("factLL"))) {
        factLL = new AbstractEval.ScoreEval("factLL", runningAverages);
    }

    /** Hide for performance
    timer.start();
    System.out.println("Parsing file: " + filename + " with " + document.size() + " sentences.");*/

    PrintWriter pwo = pwOut;
    int num = 0, docIndex = startSentence;
    // main loop: parse each sentence, falling back to PCFG where possible
    for (List sentence : document) {
        // System.out.println(sentence.toString());
        num++;
        numSents++;
        int len = sentence.size();
        numWords += len;
        Tree ansTree = null;
        try {
            if (!parse(sentence)) {
                pwErr.print("Sentence couldn't be parsed by grammar.");
                if (pparser != null && pparser.hasParse() && fallbackToPCFG) {
                    pwErr.println("... falling back to PCFG parse.");
                    ansTree = getBestPCFGParse();
                    numFallback++;
                } else {
                    pwErr.println();
                    numUnparsable++;
                }
            } else {
                // System.out.println("Score: " + lp.pparser.bestScore);
                ansTree = getBestParse();
            }
            // record optional evaluation scores for whichever parsers ran
            if (pcfgLL != null && pparser != null) {
                pcfgLL.recordScore(pparser, pwErr);
            }
            if (depLL != null && dparser != null) {
                depLL.recordScore(dparser, pwErr);
            }
            if (factLL != null && bparser != null) {
                factLL.recordScore(bparser, pwErr);
            }
        } catch (OutOfMemoryError e) {
            if (Test.maxLength != -0xDEADBEEF) {
                // this means they explicitly asked for a length they cannot handle. Throw exception.
                pwErr.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                pwo.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                throw e;
            } else {
                if (!saidMemMessage) {
                    printOutOfMemory(pwErr);
                    saidMemMessage = true;
                }
                // try a cheaper PCFG parse before giving up on the sentence
                if (pparser.hasParse() && fallbackToPCFG) {
                    try {
                        String what = "dependency";
                        if (dparser.hasParse()) {
                            what = "factored";
                        }
                        pwErr.println(
                                "Sentence too long for " + what + " parser. Falling back to PCFG parse...");
                        ansTree = getBestPCFGParse();
                        numFallback++;
                    } catch (OutOfMemoryError oome) {
                        oome.printStackTrace();
                        numNoMemory++;
                        pwErr.println("No memory to gather PCFG parse. Skipping...");
                        pwo.println("Sentence skipped: no PCFG fallback.");
                        pparser.nudgeDownArraySize();
                    }
                } else {
                    pwErr.println(
                            "Sentence has no parse using PCFG grammar (or no PCFG fallback). Skipping...");
                    pwo.println("Sentence skipped: no PCFG fallback.");
                    numSkipped++;
                }
            }
        } catch (UnsupportedOperationException uEx) {
            pwErr.println("Sentence too long (or zero words).");
            pwo.println("Sentence skipped: too long (or zero words).");
            numWords -= len;
            numSkipped++;
        }
        // build the RECK dependency + constituent trees for a successful parse
        if (ansTree != null) {
            computePosition(docIndex, (Sentence) sentence, content);
            TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
            if (TDs.size() > 0)
                TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
            RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                    sentencePosition);
            DPTree = this.splitHyphen_Dependency(DPTree);
            DPTree = this.splitPoint_Dependency(DPTree);
            RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
            CTTree = this.splitHyphen_Constituent(CTTree);
            CTTree = this.splitPoint_Constituent(CTTree);
            RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree, CTTree);
            treeList.add(rpTree);
        }
        // crude addition of k-best tree printing
        // NOTE(review): these two branches repeat the block above verbatim and
        // re-add the same sentence when k-best printing is enabled — presumably
        // intentional in the original, but worth confirming.
        if (Test.printPCFGkBest > 0 && pparser.hasParse()) {
            if (ansTree != null) {
                computePosition(docIndex, (Sentence) sentence, content);
                TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
                if (TDs.size() > 0)
                    TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
                RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                        sentencePosition);
                DPTree = this.splitHyphen_Dependency(DPTree);
                DPTree = this.splitPoint_Dependency(DPTree);
                RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
                CTTree = this.splitHyphen_Constituent(CTTree);
                CTTree = this.splitPoint_Constituent(CTTree);
                RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree,
                        CTTree);
                treeList.add(rpTree);
            }
        } else if (Test.printFactoredKGood > 0 && bparser.hasParse()) {
            // DZ: debug n best trees
            if (ansTree != null) {
                computePosition(docIndex, (Sentence) sentence, content);
                TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
                if (TDs.size() > 0)
                    TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
                RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                        sentencePosition);
                DPTree = this.splitHyphen_Dependency(DPTree);
                DPTree = this.splitPoint_Dependency(DPTree);
                RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
                CTTree = this.splitHyphen_Constituent(CTTree);
                CTTree = this.splitPoint_Constituent(CTTree);
                RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree,
                        CTTree);
                treeList.add(rpTree);
            }
        }
        // advance the character cursor to the end of this sentence
        docIndex = sentencePosition.getEnd().intValue();
    } // for sentence : document

    if (Test.writeOutputFiles) {
        pwo.close();
    }
    System.out.println("Parsed file: " + filename + " [" + num + " sentences].");

    /** Hide for performance
    long millis = timer.stop();
    if (summary) {
        if (pcfgLL != null) pcfgLL.display(false, pwErr);
        if (depLL != null) depLL.display(false, pwErr);
        if (factLL != null) factLL.display(false, pwErr);
    }*/

    if (saidMemMessage) {
        printOutOfMemory(pwErr);
    }

    /** Hide for performance
    double wordspersec = numWords / (((double) millis) / 1000);
    double sentspersec = numSents / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
    System.out.println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.format(wordspersec)
            + " wds/sec; " + nf.format(sentspersec) + " sents/sec).");
    */

    // final diagnostics about fallback/skipped sentences
    if (numFallback > 0) {
        pwErr.println(" " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
        pwErr.println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0) {
            pwErr.println(" " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0) {
            pwErr.println(" " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0) {
            pwErr.println(" " + numSkipped + " were skipped as length 0 or greater than " + Test.maxLength);
        }
    }
    return treeList;
}
From source file:weka.filters.unsupervised.attribute.PartOfSpeechTagging.java
License:Open Source License
/** * Obtains the sentences from the document. * * @param doc the document to turn into sentences. * @return the list of sentences/* w w w . j a va2s . co m*/ */ protected List<String> getSentences(String doc) { List<String> result; DocumentPreprocessor preProcessor; result = new ArrayList<String>(); preProcessor = new DocumentPreprocessor(new StringReader(doc)); preProcessor.setTokenizerFactory(getTokenizerFactory()); for (List sentence : preProcessor) result.add(StringUtils.joinWithOriginalWhiteSpace(sentence)); return result; }
From source file:weka.gui.explorer.NLPParseTreePanel.java
License:Open Source License
/** * Obtains the sentences from the document. * * @param doc the document to turn into sentences. * @return the list of sentences/* ww w .ja v a 2s . co m*/ */ protected List<String> getSentences(String doc) { List<String> result; DocumentPreprocessor preProcessor; result = new ArrayList<String>(); try { preProcessor = new DocumentPreprocessor(new StringReader(doc)); preProcessor.setTokenizerFactory(PartOfSpeechTagging.getTokenizerFactory()); for (List sentence : preProcessor) result.add(StringUtils.joinWithOriginalWhiteSpace(sentence)); } catch (Exception e) { showErrorMessage("Parsing error", "Failed to split document into sentences!", e); } return result; }