List of usage examples for edu.stanford.nlp.process.DocumentPreprocessor: the DocumentPreprocessor constructor
public DocumentPreprocessor(String docPath)
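Most of the examples below construct DocumentPreprocessor from a Reader; the String overload above instead takes the path of a file to read and split into sentences. As a minimal standalone sketch of that overload (the file name is a placeholder, not from the sources below):

import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;

public class DocumentPreprocessorPathExample {
    public static void main(String[] args) {
        // "input.txt" is a placeholder path. DocumentPreprocessor reads the
        // file and iterates over its sentences, each a List<HasWord>.
        DocumentPreprocessor dp = new DocumentPreprocessor("input.txt");
        for (List<HasWord> sentence : dp) {
            System.out.println(sentence);
        }
    }
}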
From source file:Dependency.java
public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

    Scanner sc = new Scanner(System.in);
    String text = sc.nextLine();

    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
        List<TaggedWord> tagged = tagger.tagSentence(sentence);
        Object[] x = tagged.toArray();
        GrammaticalStructure gs = parser.predict(tagged);
        Collection<TypedDependency> s = gs.typedDependenciesCollapsedTree();
        Object[] z = s.toArray();
        System.out.println(tagged.toString());

        // Split each "word/POS" pair on its last '/' into token and tag.
        // Sized by x.length (the number of tagged words, which is what the
        // loop iterates over), not z.length as in the original.
        String[] token = new String[x.length];
        String[] pos = new String[x.length];
        int k = 0;
        for (Object i : x) {
            String str = i.toString();
            int index = str.lastIndexOf('/');
            token[k] = str.substring(0, index);
            pos[k] = str.substring(index + 1);
            k++;
        }

        // Parse each typed dependency "rel(gov-N, dep-M)" into the relation
        // name and the two word forms.
        String[] rels = new String[z.length];
        String[] word1 = new String[z.length];
        String[] word2 = new String[z.length];
        int j = 0;
        for (Object i : z) {
            System.out.println(i);
            String temp = i.toString();
            Matcher m0 = Pattern.compile("(.*)(?=\\()").matcher(temp);
            Matcher m1 = Pattern.compile("(?<=\\()(.*?)(?=-)").matcher(temp);
            Matcher m2 = Pattern.compile("(?<=, )(.*)(?=-)").matcher(temp);
            if (m0.find()) {
                rels[j] = m0.group(0);
            }
            if (m1.find()) {
                word1[j] = m1.group(0);
            }
            if (m2.find()) {
                word2[j] = m2.group(0);
            }
            j++;
        }

        // Rules for feature extraction. The helpers toIntArray, grRecog,
        // posrecog and compgrRecog are defined elsewhere in Dependency.java
        // (see the sketch after this example).

        // Rule 1: nsubj whose dependent is a noun.
        int[] q = toIntArray(grRecog(rels, "nsubj"));
        if (q.length != 0) {
            if (posrecog(token, pos, word2[q[0]]).equals("NN")) {
                int[] w = toIntArray(grRecog(rels, "compound"));
                if (w.length != 0) {
                    System.out.println(word1[q[0]] + "," + word2[q[0]] + "," + word2[w[0]]);
                } else {
                    int conjAndIndex = compgrRecog(rels, word1, word2, "conj:and", word2[q[0]]);
                    if (conjAndIndex != -1) {
                        System.out.println(word1[conjAndIndex] + "," + word2[conjAndIndex] + "," + word2[q[0]]);
                    } else {
                        System.out.println(word1[q[0]] + "," + word2[q[0]]);
                    }
                }
            }
            // Rule 2: adjectival governor.
            else if (posrecog(token, pos, word1[q[0]]).equals("JJ")) {
                int a = compgrRecog(rels, word1, word2, "xcomp", word1[q[0]]);
                if (a != -1) {
                    int b = compgrRecog(rels, word1, word2, "dobj", word2[a]);
                    if (b != -1) {
                        int c = compgrRecog(rels, word1, word2, "compound", word2[b]);
                        if (c != -1) {
                            System.out.println(word1[q[0]] + "," + word1[c] + "," + word2[c]);
                        }
                    }
                }
                // Rule 3: clausal complement. Note: this branch assumes a
                // second nsubj exists (q[1]).
                else {
                    int[] b = toIntArray(grRecog(rels, "ccomp"));
                    if (b.length != 0) {
                        System.out.println(word1[q[1]] + "," + word2[q[1]] + "," + word1[b[0]]);
                    }
                }
            }
            // Rule 4: finite verb governor.
            else if (posrecog(token, pos, word1[q[0]]).equals("VBZ")) {
                int vbpDobjIndex = compgrRecog(rels, word1, word2, "dobj", word2[q[0]]);
                if (vbpDobjIndex != -1) {
                    System.out.println(word1[vbpDobjIndex] + "," + word2[vbpDobjIndex]);
                } else {
                    int vbpXcompIndex = compgrRecog(rels, word1, word2, "xcomp", word1[q[0]]);
                    if (vbpXcompIndex != -1) {
                        System.out.println(word1[vbpXcompIndex] + "," + word2[vbpXcompIndex]);
                    } else {
                        int vbpAcompIndex = compgrRecog(rels, word1, word2, "acomp", word1[q[0]]);
                        if (vbpAcompIndex != -1) {
                            System.out.println(word1[q[0]] + "," + word1[vbpAcompIndex] + "," + word2[vbpAcompIndex]);
                        } else {
                            System.out.println(word1[q[0]]);
                        }
                    }
                }
            }

            // Adjectival modifiers and their conjuncts.
            int[] f = toIntArray(grRecog(rels, "amod"));
            if (f.length != 0) {
                for (int i : f) {
                    System.out.println(word1[i] + "," + word2[i]);
                }
                int[] cj = toIntArray(grRecog(rels, "conj:and"));
                if (cj.length != 0) {
                    for (int i : cj) {
                        System.out.println(word1[i] + "," + word2[i]);
                    }
                }
            }

            // Negations.
            int[] neg = toIntArray(grRecog(rels, "neg"));
            if (neg.length != 0) {
                for (int i : neg) {
                    System.out.println(word1[i] + "," + word2[i]);
                }
            }
        } else {
            // No nsubj: fall back to amod, conj:and and neg relations.
            int[] f = toIntArray(grRecog(rels, "amod"));
            if (f.length != 0) {
                for (int i : f) {
                    System.out.print(word1[i] + "," + word2[i]);
                }
                int[] cj = toIntArray(grRecog(rels, "conj:and"));
                if (cj.length != 0) {
                    for (int i : cj) {
                        System.out.println(word2[i]);
                    }
                }
            }
            int[] neg = toIntArray(grRecog(rels, "neg"));
            if (neg.length != 0) {
                for (int i : neg) {
                    System.out.println(word1[i] + "," + word2[i]);
                }
            }
        }
    }
}
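The snippet above calls four helpers (toIntArray, grRecog, posrecog, compgrRecog) that live elsewhere in Dependency.java and are not shown on this page. The following is a rough sketch of what they plausibly do, inferred only from the call sites above; the bodies are guesses, not the original code:

// Hypothetical reconstructions, placed in Dependency.java alongside main.

// Indices of all dependencies whose relation equals rel.
static List<Integer> grRecog(String[] rels, String rel) {
    List<Integer> hits = new ArrayList<Integer>();
    for (int i = 0; i < rels.length; i++) {
        if (rel.equals(rels[i])) {
            hits.add(i);
        }
    }
    return hits;
}

static int[] toIntArray(List<Integer> list) {
    int[] out = new int[list.size()];
    for (int i = 0; i < out.length; i++) {
        out[i] = list.get(i);
    }
    return out;
}

// POS tag of the given word, or "" if the word is not in the sentence.
static String posrecog(String[] token, String[] pos, String word) {
    for (int i = 0; i < token.length; i++) {
        if (token[i].equals(word)) {
            return pos[i];
        }
    }
    return "";
}

// Index of the first dependency with the given relation whose governor
// matches the given word, or -1 if none. (Whether the original matched on
// the governor or the dependent is not visible from the call sites.)
static int compgrRecog(String[] rels, String[] word1, String[] word2, String rel, String gov) {
    for (int i = 0; i < rels.length; i++) {
        if (rel.equals(rels[i]) && gov.equals(word1[i])) {
            return i;
        }
    }
    return -1;
}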
From source file:DependencyParserDemo.java
public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

    for (int argIndex = 0; argIndex < args.length;) {
        switch (args[argIndex]) {
        case "-tagger":
            taggerPath = args[argIndex + 1];
            argIndex += 2;
            break;
        case "-model":
            modelPath = args[argIndex + 1];
            argIndex += 2;
            break;
        default:
            throw new RuntimeException("Unknown argument " + args[argIndex]);
        }
    }

    String text = "I can almost always tell when movies use fake dinosaurs.";

    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
        List<TaggedWord> tagged = tagger.tagSentence(sentence);
        GrammaticalStructure gs = parser.predict(tagged);

        // Print typed dependencies
        System.err.println(gs);
    }
}
From source file:Dependency2.java
public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

    Scanner sc = new Scanner(System.in);
    // readCsv(), multifeatures() and getFeature() are defined elsewhere in
    // Dependency2.java (see the sketch after this example).
    readCsv();
    String text = sc.nextLine();

    if (multifeatures(text)) {
        System.out.println("Multiple features present");
        MaxentTagger tagger = new MaxentTagger(taggerPath);
        DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

        DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
        for (List<HasWord> sentence : tokenizer) {
            List<TaggedWord> tagged = tagger.tagSentence(sentence);
            GrammaticalStructure gs = parser.predict(tagged);
            Collection<TypedDependency> s = gs.typedDependenciesCollapsedTree();

            Object[] z = s.toArray();
            String[] rels = new String[z.length];
            String[] word1 = new String[z.length];
            String[] word2 = new String[z.length];
            int j = 0;
            for (Object i : z) {
                String temp = i.toString();
                System.out.println(temp);
                // Parse "rel(gov-N,dep-M)" into relation and word forms.
                Matcher m0 = Pattern.compile("(.*)(?=\\()").matcher(temp);
                Matcher m1 = Pattern.compile("(?<=\\()(.*?)(?=-)").matcher(temp);
                Matcher m2 = Pattern.compile("(?<=,)(.*)(?=-)").matcher(temp);
                if (m0.find())
                    rels[j] = m0.group(0);
                if (m1.find())
                    word1[j] = m1.group(0);
                if (m2.find())
                    word2[j] = m2.group(0);
                // "amod" links a noun to its adjectival modifier; look either
                // word up in the feature lexicon loaded by readCsv(). The
                // null-safe equals avoids an NPE when no relation was matched.
                if ("amod".equals(rels[j])) {
                    String f1 = getFeature(word1[j]);
                    String f2 = getFeature(word2[j]);
                    String f = f1 != null ? f1 : f2;
                    if (f != null) {
                        System.out.println("Feature: " + f);
                    }
                }
                j++;
            }
        }
    } else {
        // sentence score is feature score
    }
}
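The feature-lexicon helpers readCsv(), getFeature() and multifeatures() are not shown on this page. A minimal sketch of what they could look like, assuming a two-column "word,feature" CSV; the file name, format and logic are all guesses for illustration:

// Hypothetical lexicon helpers; needs java.io.BufferedReader, java.io.FileReader,
// java.io.IOException, java.util.HashMap, java.util.Map.
static final Map<String, String> featureLexicon = new HashMap<String, String>();

// Load a hypothetical two-column "word,feature" CSV into the lexicon.
static void readCsv() {
    try (BufferedReader br = new BufferedReader(new FileReader("features.csv"))) {
        String line;
        while ((line = br.readLine()) != null) {
            String[] cols = line.split(",", 2);
            if (cols.length == 2) {
                featureLexicon.put(cols[0].trim(), cols[1].trim());
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

// Feature for a word, or null if the word is not in the lexicon.
static String getFeature(String word) {
    return featureLexicon.get(word);
}

// True if the text mentions more than one known feature word.
static boolean multifeatures(String text) {
    int count = 0;
    for (String token : text.split("\\s+")) {
        if (featureLexicon.containsKey(token)) {
            count++;
        }
    }
    return count > 1;
}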
From source file:TokenizeCorpus.java
public static void main(String[] args) throws IOException {
    String outfile = "/Users/nicole/Desktop/outputTokenizer";
    FileWriter out = new FileWriter(outfile);
    for (String arg : args) {
        // option #1: By sentence.
        DocumentPreprocessor dp = new DocumentPreprocessor(arg);
        for (List<HasWord> sentence : dp) {
            StringBuilder line = new StringBuilder();
            for (HasWord word : sentence) {
                line.append(word).append(" ");
            }
            System.out.println(sentence);
            out.append(line);
            out.append(System.lineSeparator());
        }
    }
    // Flush and close once, after all input files have been processed; the
    // original closed the writer inside the loop, which fails on a second file.
    out.flush();
    out.close();
}
From source file:BuildBinarizedDataset.java
/**
 * Turns a text file into trees for use in an RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input file is one sentence per line, with blocks
 * separated by blank lines. The first line of a block holds the main
 * label of the sentence together with the full sentence. Lines after
 * the first sentence line but before the blank line are treated as
 * labeled sub-phrases: each starts with a label followed by the list
 * of tokens the label applies to. All phrases that do not have their
 * own label take on the main sentence label. For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day<br>
 * 3 a good day<br>
 * <br>
 * (next block starts here)<br>
 * </code>
 * By default the englishPCFG parser is used. This can be changed
 * with the <code>-parserModel</code> flag. Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with -sentimentModel, that model
 * will be used to prelabel the sentences. Any spans with given
 * labels will then be used to adjust those labels.
 */
public static void main(String[] arg) throws IOException {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();

    // Paths are hardcoded here; the original command-line parsing of
    // -input, -parserModel and -sentimentModel is omitted.
    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String inputPath = "D:\\dataset\\good.txt";
    String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";
    SentimentModel sentimentModel = null;

    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
            parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.

        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        Integer mainLabel = Integer.valueOf(tokens.get(0).word());
        tokens = tokens.subList(1, tokens.size());

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing
        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // If there is a sentiment model for use in prelabeling, label here
        // and then use the user-given labels to adjust.
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
        }

        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();
        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }

        // Print the tree with square brackets instead of parentheses.
        String x = collapsedUnary.toString();
        x = x.replace("(", "[");
        x = x.replace(")", "]");
        System.out.println(x);
    }
}
From source file:Anaphora_Resolution.ParseAllXMLDocuments.java
public static void main(String[] args)
        throws IOException, SAXException, ParserConfigurationException, TransformerException {
    // The original program iterated over XML files in a "DataToPort" folder
    // and anaphora-resolved each <text> node, writing the result back into
    // the XML; that batch code is omitted here in favor of a hardcoded sample.
    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");

    String wholetext = "How about other changes for example the ways of doing the work and \n" + "\n"
            + "Iwould say security has , there 's more pressure put on people now than there used to be because obviously , especially after Locherbie , they tightened up on security and there 's a lot more pressure now especially from the ETR and stuff like that \n"
            + "People do n't feel valued any more , they feel I do n't know I think they feel that nobody cares about them really anyway \n";

    ArrayList<Tree> parseTrees = new ArrayList<Tree>();
    // Accumulates the Penn-format trees (originally written back into the XML).
    String asd = "";
    int j = 0;
    StringReader stringreader = new StringReader(wholetext);
    DocumentPreprocessor dp = new DocumentPreprocessor(stringreader);
    @SuppressWarnings("rawtypes")
    ArrayList<List> sentences = preprocess(dp);
    for (List sentence : sentences) {
        // Parse the sentence and add it to the list of parse trees.
        parseTrees.add(lp.apply(sentence));
        // Locate all pronouns to resolve in the sentence.
        ArrayList<Tree> PronounsList = findPronouns(parseTrees.get(j));
        for (Tree pronounTree : PronounsList) {
            // Resolve the coreference and modify the tree for each pronoun.
            parseTrees.set(parseTrees.size() - 1, HobbsResolve(pronounTree, parseTrees));
        }
        StringWriter strwr = new StringWriter();
        PrintWriter prwr = new PrintWriter(strwr);
        TreePrint tp = new TreePrint("penn");
        tp.printTree(parseTrees.get(j), prwr);
        prwr.flush();
        asd += strwr.toString();
        j++;
    }

    String armando = "";
    for (Tree sentence : parseTrees) {
        for (Tree leaf : Trees.leaves(sentence)) {
            armando += leaf + " ";
        }
    }

    System.out.println(wholetext);
    System.out.println();
    System.out.println("......");
    System.out.println(armando);
    System.out.println("All done.");
}
From source file:com.diskoverorta.osdep.StanfordNLP.java
License:Apache License
public List<String> splitSentencesINDocument(String sDoc) {
    Reader reader = new StringReader(sDoc);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();
    Iterator<List<HasWord>> it = dp.iterator();
    while (it.hasNext()) {
        StringBuilder sentenceSb = new StringBuilder();
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            if (sentenceSb.length() > 1) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        sentenceList.add(sentenceSb.toString().trim());
    }
    return sentenceList;
}
From source file:com.epictodo.controller.lexer.NLPSentenceLexer.java
License:Open Source License
@Override
public List<Sentence> tokenize(String _text) {
    List<Sentence> _sentences = new ArrayList<>();
    final DocumentPreprocessor document_preprocessor = new DocumentPreprocessor(new StringReader(_text));
    for (List<HasWord> sentence : document_preprocessor) {
        _sentences.add(new Sentence(sentence.toString()));
    }
    return _sentences;
}
From source file:com.github.kutschkem.Qgen.QuestionRankerByParseProbability.java
License:Open Source License
public List<Question> rank(List<Question> questions) {
    System.out.println("afterPipeline:" + questions.size());
    final Map<Question, Double> scores = new HashMap<Question, Double>();
    for (Question q : questions) {
        // Tokenize the question text as a single sentence.
        List<HasWord> tokens = new DocumentPreprocessor(new StringReader(q.Question)).iterator().next();
        LexicalizedParserQuery query = parser.parserQuery();
        query.parse(tokens);
        // Score the question by the average of its 3 best PCFG parse scores;
        // average(...) is defined elsewhere in the class (see the sketch
        // after this example).
        scores.put(q, average(query.getKBestPCFGParses(3)));
    }
    List<Question> result = new ArrayList<Question>(questions);
    // Sort by descending parse score.
    Collections.sort(result, new Comparator<Question>() {
        public int compare(Question o1, Question o2) {
            return -scores.get(o1).compareTo(scores.get(o2));
        }
    });
    for (Question q : result) {
        System.out.println(q.Question + " " + scores.get(q));
    }
    return result;
}
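The average(...) helper is not shown in this excerpt. Since getKBestPCFGParses(int) returns a List<ScoredObject<Tree>>, it presumably averages the parse scores; a possible sketch, as an assumption rather than the project's actual code:

// Assumed helper: mean score of the k-best parses. Needs
// edu.stanford.nlp.util.ScoredObject and edu.stanford.nlp.trees.Tree.
private static double average(List<ScoredObject<Tree>> parses) {
    if (parses.isEmpty()) {
        return 0.0;
    }
    double sum = 0.0;
    for (ScoredObject<Tree> parse : parses) {
        sum += parse.score();
    }
    return sum / parses.size();
}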
From source file:com.me.edu.Servlet.ElasticSearch_Backup.java
public static String getSentence(String input) {
    Reader reader = new StringReader(input);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();
    for (List<HasWord> sentence : dp) {
        // Sentence.listToString() joins the tokens back into plain text.
        String sentenceString = Sentence.listToString(sentence);
        sentenceList.add(sentenceString);
    }

    String sent = "";
    for (String sentence : sentenceList) {
        System.out.println(sentence);
        sent = sent + " " + sentence + "\n";
    }

    try {
        FileWriter file = new FileWriter("Sentences.txt");
        file.write(sent);
        file.flush();
        file.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return sent;
}