List of usage examples for edu.stanford.nlp.process.DocumentPreprocessor#iterator()
@Override
public Iterator<List<HasWord>> iterator()
From source file:BuildBinarizedDataset.java
/** * Turns a text file into trees for use in a RNTN classifier such as * the treebank used in the Sentiment project. * <br>//from w w w.j a v a 2 s . c o m * The expected input file is one sentence per line, with sentences * separated by blank lines. The first line has the main label of the sentence together with the full sentence. * Lines after the first sentence line but before * the blank line will be treated as labeled sub-phrases. The * labels should start with the label and then contain a list of * tokens the label applies to. All phrases that do not have their own label will take on the main sentence label! * For example: * <br> * <code> * 1 Today is not a good day.<br> * 3 good<br> * 3 good day <br> * 3 a good day <br> * <br> * (next block starts here) <br> * </code> * By default the englishPCFG parser is used. This can be changed * with the <code>-parserModel</code> flag. Specify an input file * with <code>-input</code>. * <br> * If a sentiment model is provided with -sentimentModel, that model * will be used to prelabel the sentences. Any spans with given * labels will then be used to adjust those labels. 
*/ public static void main(String[] arg) throws IOException { CollapseUnaryTransformer transformer = new CollapseUnaryTransformer(); // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true); String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"; String args[] = { "-input", "D:\\parse.txt", "-sentimentModel", "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" }; String inputPath = "D:\\dataset\\good.txt"; String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz"; SentimentModel sentimentModel = null; /* for (int argIndex = 0; argIndex < args.length; ) { if (args[argIndex].equalsIgnoreCase("-input")) { inputPath = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-parserModel")) { parserModel = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) { sentimentModelPath = args[argIndex + 1]; argIndex += 2; } else { System.err.println("Unknown argument " + args[argIndex]); System.exit(2); } }*/ if (inputPath == null) { throw new IllegalArgumentException("Must specify input file with -input"); } LexicalizedParser parser = LexicalizedParser.loadModel(parserModel); TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack()); if (sentimentModelPath != null) { sentimentModel = SentimentModel.loadSerialized(sentimentModelPath); } String text = IOUtils.slurpFileNoExceptions(inputPath); String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk for (String chunk : chunks) { if (chunk.trim().isEmpty()) { continue; } // The expected format is that line 0 will be the text of the // sentence, and each subsequence line, if any, will be a value // followed by the sequence of tokens that get that value. // Here we take the first line and tokenize it as one sentence. 
String[] lines = chunk.trim().split("\\n"); String sentence = lines[0]; StringReader sin = new StringReader(sentence); DocumentPreprocessor document = new DocumentPreprocessor(sin); document.setSentenceFinalPuncWords(new String[] { "\n" }); List<HasWord> tokens = document.iterator().next(); Integer mainLabel = new Integer(tokens.get(0).word()); //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; "); tokens = tokens.subList(1, tokens.size()); //System.err.println(tokens); Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap(); for (int i = 1; i < lines.length; ++i) { extractLabels(spanToLabels, tokens, lines[i]); } // TODO: add an option which treats the spans as constraints when parsing Tree tree = parser.apply(tokens); Tree binarized = binarizer.transformTree(tree); Tree collapsedUnary = transformer.transformTree(binarized); // if there is a sentiment model for use in prelabeling, we // label here and then use the user given labels to adjust if (sentimentModel != null) { Trees.convertToCoreLabels(collapsedUnary); SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null); scorer.forwardPropagateTree(collapsedUnary); setPredictedLabels(collapsedUnary); } else { setUnknownLabels(collapsedUnary, mainLabel); //collapsedUnary.label().setValue(mainLabel.toString()); //System.out.println("Root"+collapsedUnary.getNodeNumber(1)); } Trees.convertToCoreLabels(collapsedUnary); collapsedUnary.indexSpans(); for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) { setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue()); } String x = collapsedUnary.toString(); //x.replaceAll("\\s",""); x = x.replace("(", "["); x = x.replace(")", "]"); //writer.write(x); //writer.write("\r\n"); System.out.println(x); //System.out.println(); } //writer.close(); }
From source file:com.diskoverorta.osdep.StanfordNLP.java
License:Apache License
public List<String> splitSentencesINDocument(String sDoc) { Reader reader = new StringReader(sDoc); DocumentPreprocessor dp = new DocumentPreprocessor(reader); List<String> sentenceList = new ArrayList<String>(); Iterator<List<HasWord>> it = dp.iterator(); while (it.hasNext()) { StringBuilder sentenceSb = new StringBuilder(); List<HasWord> sentence = it.next(); for (HasWord token : sentence) { if (sentenceSb.length() > 1) { sentenceSb.append(" "); }//w w w . j a v a 2 s . c o m sentenceSb.append(token); } sentenceList.add(sentenceSb.toString().trim()); } return sentenceList; }
From source file:com.summarizer.Utilities.java
License:Apache License
public static String[] sentenceTokonizer(String entireDoc) { Reader reader = new StringReader(entireDoc); DocumentPreprocessor dp = new DocumentPreprocessor(reader); List<String> sentenceList = new LinkedList<String>(); Iterator<List<HasWord>> it = dp.iterator(); while (it.hasNext()) { StringBuilder sentenceSb = new StringBuilder(); List<HasWord> sentence = it.next(); for (HasWord token : sentence) { if (sentenceSb.length() > 1) { sentenceSb.append(" "); }//from w w w . java 2s. c om sentenceSb.append(token.word()); } sentenceList.add(sentenceSb.toString()); } return (String[]) sentenceList.toArray(new String[sentenceList.size()]); }
From source file:delic.Document.java
/**
 * Stanford NLP library based sentence segmentation. Each detected
 * sentence longer than 20 characters is split into 20-character
 * pieces before being wrapped in a {@code Sentence}.
 *
 * @return iterator over the resulting {@code Sentence} objects
 */
public Iterator<Sentence> getSentencePOSBased() {
    Reader reader = new StringReader(this.docText);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    ArrayList<Sentence> sentenceList = new ArrayList<Sentence>();
    Iterator<List<HasWord>> it = dp.iterator();
    while (it.hasNext()) {
        StringBuilder sentenceSb = new StringBuilder();
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            if (sentenceSb.length() >= 1) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        String sentenceStr = sentenceSb.toString();
        if (sentenceStr.length() > 20) {
            // Fix: the original used parts = length / 20 and dropped the
            // trailing length % 20 characters; this loop keeps the remainder.
            for (int start = 0; start < sentenceStr.length(); start += 20) {
                int end = Math.min(start + 20, sentenceStr.length());
                sentenceList.add(new Sentence(sentenceStr.substring(start, end)));
            }
        } else {
            sentenceList.add(new Sentence(sentenceStr));
        }
    }
    return sentenceList.iterator();
}
From source file:File.SplitintoSentences.java
public List<String> getSentences(String filetext) { List<String> sentences = new ArrayList<String>(); Reader reader = new StringReader(filetext); DocumentPreprocessor dp = new DocumentPreprocessor(reader); List<String> sentenceList = new LinkedList<String>(); Iterator<List<HasWord>> it = dp.iterator(); while (it.hasNext()) { StringBuilder sentenceSb = new StringBuilder(); List<HasWord> sentence = it.next(); for (HasWord token : sentence) { if (sentenceSb.length() > 1) { sentenceSb.append(" "); }/*from w ww .j a v a2s . com*/ sentenceSb.append(token); } sentenceList.add(sentenceSb.toString()); } for (String sentence : sentenceList) { // System.out.println(sentence); sentences.add(sentence); } return sentences; }
From source file:fr.lipn.yasemir.ontology.annotation.SentenceBasedAnnotator.java
License:Open Source License
/** * Method that uses DocumentPreprocessor from Stanford Parser to split text into sentences * @param text/* w w w . j av a 2 s . co m*/ * @return */ private Vector<String> getSentences(String text) { Vector<String> sentenceList = new Vector<String>(); DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text)); Iterator<List<HasWord>> it = dp.iterator(); while (it.hasNext()) { StringBuilder sentenceSb = new StringBuilder(); List<HasWord> sentence = it.next(); for (HasWord token : sentence) { if (sentenceSb.length() > 1) { sentenceSb.append(" "); } sentenceSb.append(token); } sentenceList.add(sentenceSb.toString()); } /* for(String sentence:sentenceList) { System.err.println(sentence); } */ return sentenceList; }
From source file:org.ets.research.nlp.stanford_thrift.parser.StanfordParserThrift.java
License:Open Source License
public List<ParseTree> parse_text(String text, List<String> outputFormat) throws TApplicationException { List<ParseTree> results = new ArrayList<ParseTree>(); try {// www . j a va2s . c om treePrinter = ParserUtil.setOptions(outputFormat, tlp); // assume no tokenization was done; use Stanford's default org.ets.research.nlp.stanford_thrift.tokenizer DocumentPreprocessor preprocess = new DocumentPreprocessor(new StringReader(text)); Iterator<List<HasWord>> foundSentences = preprocess.iterator(); while (foundSentences.hasNext()) { Tree parseTree = parser.apply(foundSentences.next()); results.add( new ParseTree(ParserUtil.TreeObjectToString(parseTree, treePrinter), parseTree.score())); } } catch (Exception e) { // FIXME throw new TApplicationException(TApplicationException.INTERNAL_ERROR, e.getMessage()); } return results; }
From source file:org.ets.research.nlp.stanford_thrift.tagger.StanfordTaggerThrift.java
License:Open Source License
public List<List<TaggedToken>> tag_text(String untokenizedText) { List<List<TaggedToken>> taggedAndTokenizedSentences = new ArrayList<List<TaggedToken>>(); // assume no tokenization was done; use Stanford's default org.ets.research.nlp.stanford_thrift.tokenizer DocumentPreprocessor preprocess = new DocumentPreprocessor(new StringReader(untokenizedText)); Iterator<List<HasWord>> foundSentences = preprocess.iterator(); while (foundSentences.hasNext()) { taggedAndTokenizedSentences.add(tagSingleSentence(foundSentences.next())); }//w ww . ja v a 2 s. c om return taggedAndTokenizedSentences; }
From source file:org.ets.research.nlp.stanford_thrift.tokenizer.StanfordTokenizerThrift.java
License:Open Source License
/**
 * Splits arbitrary text into sentences and tokenizes each one,
 * returning the tokens as plain strings.
 *
 * @param arbitraryText raw input text
 * @return one list of token strings per detected sentence
 */
public List<List<String>> tokenizeText(String arbitraryText) {
    List<List<String>> tokenizedSentences = new ArrayList<List<String>>();
    DocumentPreprocessor preprocess = new DocumentPreprocessor(new StringReader(arbitraryText));
    for (Iterator<List<HasWord>> sentences = preprocess.iterator(); sentences.hasNext();) {
        List<String> tokenStrings = new ArrayList<String>();
        for (HasWord token : sentences.next()) {
            tokenStrings.add(token.word());
        }
        tokenizedSentences.add(tokenStrings);
    }
    return tokenizedSentences;
}
From source file:parser.StanfordTokenizer.java
public static ArrayList<String> tokenizeSents(String sent) { Reader reader = new StringReader(sent); DocumentPreprocessor dp = new DocumentPreprocessor(reader); ArrayList<String> sentenceList = new ArrayList<String>(); Iterator<List<HasWord>> it = dp.iterator(); while (it.hasNext()) { StringBuilder sentenceSb = new StringBuilder(); List<HasWord> sentence = it.next(); for (HasWord token : sentence) { if (sentenceSb.length() > 1) { sentenceSb.append(" "); }//www . j a va 2 s. com sentenceSb.append(token); } sentenceList.add(sentenceSb.toString()); } return sentenceList; }