Example usage for edu.stanford.nlp.process DocumentPreprocessor DocumentPreprocessor

Introduction

On this page you can find example usages of the constructor edu.stanford.nlp.process.DocumentPreprocessor.DocumentPreprocessor, collected from open-source projects.

Prototype

public DocumentPreprocessor(String docPath) 
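
Before the collected examples, here is a minimal, self-contained sketch of this constructor in use. The file name is a hypothetical placeholder; the only behavior assumed is the documented one, namely that DocumentPreprocessor implements Iterable<List<HasWord>> and yields one tokenized sentence per iteration.

import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;

public class DocumentPreprocessorSketch {
    public static void main(String[] args) {
        // "input.txt" is a hypothetical plain-text file of ordinary prose.
        DocumentPreprocessor dp = new DocumentPreprocessor("input.txt");

        // Each iteration produces one tokenized sentence.
        for (List<HasWord> sentence : dp) {
            System.out.println(sentence);
        }
    }
}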

Usage

From source file:Dependency.java

public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
    Scanner sc = new Scanner(System.in);

    String text = "";
    text = sc.nextLine();
    // while(text!="exit"){

    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {

        List<TaggedWord> tagged = tagger.tagSentence(sentence);
        Object[] x = tagged.toArray();
        GrammaticalStructure gs = parser.predict(tagged);
        //System.out.println();

        Collection<TypedDependency> s = gs.typedDependenciesCollapsedTree();
        Object[] z = s.toArray();

        System.out.println(tagged.toString());
        // Size these by the number of tagged tokens (x.length), not by the
        // number of dependencies (z.length): the loop below fills one slot
        // per tagged token.
        String[] token = new String[x.length];
        String[] pos = new String[x.length];
        int k = 0;
        for (Object i : x) {
            String str = i.toString();
            /*String temp0="(.*?)(?=\\/)";
            String temp1="\\/(.*)";
                    
            System.out.println(str);
            Pattern t0 = Pattern.compile("(.*?)(?=\\/)");
            Pattern t1 = Pattern.compile("\\/(.*)");
            Matcher m0 = t0.matcher(str);
            Matcher m1 = t1.matcher(str);*/
            int index = str.lastIndexOf('/');
            token[k] = str.substring(0, index);
            pos[k] = str.substring(index + 1);
            //System.out.println(pos[k]);
            k++;
        }
        String rels[] = new String[z.length];
        String word1[] = new String[z.length];
        String word2[] = new String[z.length];
        int j = 0;
        for (Object i : z) {
            System.out.println(i);
            String temp = i.toString();
            String pattern0 = "(.*)(?=\\()";
            String pattern1 = "(?<=\\()(.*?)(?=-)";
            String pattern2 = "(?<=, )(.*)(?=-)";
            Pattern r0 = Pattern.compile(pattern0);
            Pattern r1 = Pattern.compile(pattern1);
            Pattern r2 = Pattern.compile(pattern2);
            Matcher m0 = r0.matcher(temp);
            Matcher m1 = r1.matcher(temp);
            Matcher m2 = r2.matcher(temp);
            if (m0.find()) {
                rels[j] = m0.group(0);
                //System.out.println(rels[j]);
            }
            if (m1.find()) {
                word1[j] = m1.group(0);
            }
            if (m2.find()) {
                word2[j] = m2.group(0);
            }
            j++;
        }
        //System.out.println(s);
        //Rules for feature extraction.
        //rule1:::::::::::::::::
        //System.out.println("1");
        int[] q = toIntArray(grRecog(rels, "nsubj"));
        //System.out.println("2");
        if (q.length != 0) {
            //System.out.println("3");
            if (posrecog(token, pos, word2[q[0]]).equals("NN")) {
                //System.out.println("4");
                int[] w = toIntArray(grRecog(rels, "compound"));
                //System.out.println("5");
                if (w.length != 0) {
                    System.out.println("6");
                    System.out.println(word1[q[0]] + "," + word2[q[0]] + "," + word2[w[0]]);
                } else {
                    int conj_and_index = compgrRecog(rels, word1, word2, "conj:and", word2[q[0]]);
                    if (conj_and_index != -1) {
                        System.out.println(
                                word1[conj_and_index] + "," + word2[conj_and_index] + "," + word2[q[0]]);
                    } else
                        System.out.println(word1[q[0]] + "," + word2[q[0]]);
                }
            }
            //RULE 2:::::::::::::
            else if (posrecog(token, pos, word1[q[0]]).equals("JJ")) {
                //System.out.println("aaaaa_JJ");
                int a = compgrRecog(rels, word1, word2, "xcomp", word1[q[0]]);
                if (a != -1) {
                    int b = compgrRecog(rels, word1, word2, "dobj", word2[a]);
                    if (b != -1) {
                        int c = compgrRecog(rels, word1, word2, "compound", word2[b]);
                        if (c != -1) {
                            System.out.println(word1[q[0]] + "," + word1[c] + "," + word2[c]);
                        }
                    }
                }
                //RULE 3::::::::::
                else {
                    int[] b = toIntArray(grRecog(rels, "ccomp"));
                    // This rule reads the second nsubj match (q[1]), so
                    // require that one exists.
                    if (b.length != 0 && q.length > 1) {
                        System.out.println(word1[q[1]] + "," + word2[q[1]] + "," + word1[b[0]]);
                    }

                }
            }
            //RULE 4::::::::::
            else if (posrecog(token, pos, word1[q[0]]).equals("VBZ")) {
                //System.out.println("aaaaa");
                int vbp_dobj_index = compgrRecog(rels, word1, word2, "dobj", word2[q[0]]);
                if (vbp_dobj_index != -1) {
                    System.out.println(word1[vbp_dobj_index] + "," + word2[vbp_dobj_index]);
                } else {
                    int vbp_xcomp_index = compgrRecog(rels, word1, word2, "xcomp", word1[q[0]]);
                    if (vbp_xcomp_index != -1) {

                        System.out.println(word1[vbp_xcomp_index] + "," + word2[vbp_xcomp_index]);
                    } else {
                        int vbp_acomp_index = compgrRecog(rels, word1, word2, "acomp", word1[q[0]]);
                        if (vbp_acomp_index != -1) {

                            System.out.println(
                                    word1[q[0]] + "," + word1[vbp_acomp_index] + "," + word2[vbp_acomp_index]);
                        } else
                            System.out.println(word1[q[0]]);

                    }

                }

            }
            int[] f = toIntArray(grRecog(rels, "amod"));
            if (f.length != 0) {
                for (int i : f) {
                    System.out.println(word1[i] + "," + word2[i]);
                }
                int cj[] = toIntArray(grRecog(rels, "conj:and"));
                if (cj.length != 0) {
                    for (int i : cj) {
                        System.out.println(word1[i] + "," + word2[i]);
                    }
                }
            }
            int[] neg = toIntArray(grRecog(rels, "neg"));
            if (neg.length != 0) {
                for (int i : neg) {
                    System.out.println(word1[i] + "," + word2[i]);
                }

            }

        } else {
            int[] f = toIntArray(grRecog(rels, "amod"));
            if (f.length != 0) {
                for (int i : f) {
                    System.out.print(word1[i] + "," + word2[i]);
                    String qwe = word1[i] + "," + word2[i];
                }
                int cj[] = toIntArray(grRecog(rels, "conj:and"));
                if (cj.length != 0) {
                    for (int i : cj) {
                        System.out.println(word2[i]);

                    }
                }
            }
            int[] neg = toIntArray(grRecog(rels, "neg"));
            if (neg.length != 0) {
                for (int i : neg) {
                    System.out.println(word1[i] + "," + word2[i]);
                }

            }

        }

        //RULE 2:::::::::::::

    }

    //  text=sc.nextLine();
    //}
}
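
Dependency.java calls several helpers that are not included in this excerpt (toIntArray, grRecog, compgrRecog, posrecog). As a hedged sketch only, here is one plausible reading of grRecog, inferred from how its result is consumed above; the signature and return type are assumptions, not the original code.

// Hypothetical reconstruction: collect the indices of all dependencies
// whose relation name equals rel. The surrounding toIntArray(...) calls
// suggest the original returns a List<Integer>.
private static List<Integer> grRecog(String[] rels, String rel) {
    List<Integer> matches = new ArrayList<>();
    for (int i = 0; i < rels.length; i++) {
        if (rel.equals(rels[i])) {
            matches.add(i);
        }
    }
    return matches;
}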

From source file:DependencyParserDemo.java

public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

    for (int argIndex = 0; argIndex < args.length;) {
        switch (args[argIndex]) {
        case "-tagger":
            taggerPath = args[argIndex + 1];
            argIndex += 2;
            break;
        case "-model":
            modelPath = args[argIndex + 1];
            argIndex += 2;
            break;
        default:
            throw new RuntimeException("Unknown argument " + args[argIndex]);
        }
    }

    String text = "I can almost always tell when movies use fake dinosaurs.";

    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
        List<TaggedWord> tagged = tagger.tagSentence(sentence);
        GrammaticalStructure gs = parser.predict(tagged);

        // Print typed dependencies
        System.err.println(gs);
    }
}

From source file:Dependency2.java

public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
    Scanner sc = new Scanner(System.in);

    readCsv();
    String text = "";
    text = sc.nextLine();
    if (multifeatures(text)) {
        System.out.println("Multiple features present");
        MaxentTagger tagger = new MaxentTagger(taggerPath);
        DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

        DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
        for (List<HasWord> sentence : tokenizer) {
            List<TaggedWord> tagged = tagger.tagSentence(sentence);
            GrammaticalStructure gs = parser.predict(tagged);

            Collection<TypedDependency> s = gs.typedDependenciesCollapsedTree();
            Map<Character, Pair<Character, Character>> map = new HashMap<Character, Pair<Character, Character>>();
            Object[] z = s.toArray();
            String rels[] = new String[z.length];
            String word1[] = new String[z.length];
            String word2[] = new String[z.length];
            int j = 0;
            String f, f1, f2;
            for (Object i : z) {
                //System.out.println(i);
                String temp = i.toString();
                System.out.println(temp);
                String pattern0 = "(.*)(?=\\()";
                String pattern1 = "(?<=\\()(.*?)(?=-)";
                String pattern2 = "(?<=,)(.*)(?=-)";
                Pattern r0 = Pattern.compile(pattern0);
                Pattern r1 = Pattern.compile(pattern1);
                Pattern r2 = Pattern.compile(pattern2);
                Matcher m0 = r0.matcher(temp);
                Matcher m1 = r1.matcher(temp);
                Matcher m2 = r2.matcher(temp);
                if (m0.find())
                    rels[j] = m0.group(0);
                if (m1.find())
                    word1[j] = m1.group(0);
                if (m2.find())
                    word2[j] = m2.group(0);
                // Null-safe: rels[j] stays unset if the relation pattern found no match.
                if ("amod".equals(rels[j])) {
                    f1 = getFeature(word1[j]);
                    f2 = getFeature(word2[j]);
                    f = (f1 != null) ? f1 : f2;
                    if (f != null) {
                        System.out.println("Feature: " + f);

                    }

                }

                j++;
            }
            //System.out.println(Arrays.toString(rels));
        }
    } else {
        //sentence score is feature score
    }

}

From source file:TokenizeCorpus.java

public static void main(String[] args) throws IOException {
    String line = "";
    String outfile = "/Users/nicole/Desktop/outputTokenizer";
    FileWriter out = new FileWriter(outfile);

    for (String arg : args) {
        // option #1: By sentence.
        DocumentPreprocessor dp = new DocumentPreprocessor(arg);
        for (List<HasWord> sentence : dp) {
            for (HasWord word : sentence) {
                line = line.concat(word + " ");
            }

            System.out.println(sentence);
            out.append(line);
            out.append(System.lineSeparator());
            line = "";

        }

    }

    // Flush and close only after every input file has been processed;
    // the writer must stay open across loop iterations.
    out.flush();
    out.close();
}

From source file:BuildBinarizedDataset.java

/**
 * Turns a text file into trees for use in an RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input has one sentence per block, with blocks separated
 * by blank lines. The first line of a block holds the main label of the
 * sentence together with the full sentence. Lines after the first line
 * but before the blank line are treated as labeled sub-phrases; each
 * such line should start with the label, followed by the list of tokens
 * the label applies to. All phrases that do not have their own label
 * take on the main sentence label! For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * By default the englishPCFG parser is used.  This can be changed
 * with the <code>-parserModel</code> flag.  Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with -sentimentModel, that model
 * will be used to prelabel the sentences.  Any spans with given
 * labels will then be used to adjust those labels.
 */
public static void main(String[] arg) throws IOException {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true);
    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String args[] = { "-input", "D:\\parse.txt", "-sentimentModel",
            "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" };
    String inputPath = "D:\\dataset\\good.txt";

    String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";
    SentimentModel sentimentModel = null;

    /* for (int argIndex = 0; argIndex < args.length; ) {
       if (args[argIndex].equalsIgnoreCase("-input")) {
         inputPath = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
         parserModel = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
         sentimentModelPath = args[argIndex + 1];
         argIndex += 2;
       } else {
         System.err.println("Unknown argument " + args[argIndex]);
         System.exit(2);
       }
     }*/

    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
            parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequence line, if any, will be a value
        // followed by the sequence of tokens that get that value.

        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        Integer mainLabel = Integer.valueOf(tokens.get(0).word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.subList(1, tokens.size());
        //System.err.println(tokens);

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing

        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
            //collapsedUnary.label().setValue(mainLabel.toString());
            //System.out.println("Root"+collapsedUnary.getNodeNumber(1));
        }

        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();

        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }
        String x = collapsedUnary.toString();
        //x.replaceAll("\\s","");
        x = x.replace("(", "[");
        x = x.replace(")", "]");
        //writer.write(x);
        //writer.write("\r\n"); 
        System.out.println(x);
        //System.out.println();
    }
    //writer.close();
}

From source file:Anaphora_Resolution.ParseAllXMLDocuments.java

public static void main(String[] args)
        throws IOException, SAXException, ParserConfigurationException, TransformerException {
    //      File dataFolder = new File("DataToPort");
    //      File[] documents;
    String grammar = "grammar/englishPCFG.ser.gz";
    String[] options = { "-maxLength", "100", "-retainTmpSubcategories" };
    //LexicalizedParser lp =  new LexicalizedParser(grammar, options);
    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    //      if (dataFolder.isDirectory()) {
    //          documents = dataFolder.listFiles();
    //      } else {
    //          documents = new File[] {dataFolder};
    //      }
    //      int currfile = 0;
    //      int totfiles = documents.length;
    //      for (File paper : documents) {
    //          currfile++;
    //          if (paper.getName().equals(".DS_Store")||paper.getName().equals(".xml")) {
    //              currfile--;
    //              totfiles--;
    //              continue;
    //          }
    //          System.out.println("Working on "+paper.getName()+" (file "+currfile+" out of "+totfiles+").");
    //
    //          DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); // This is for XML
    //          DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
    //          Document doc = docBuilder.parse(paper.getAbsolutePath());
    //
    //          NodeList textlist = doc.getElementsByTagName("text");
    //          for(int i=0; i < textlist.getLength(); i++) {
    //              Node currentnode = textlist.item(i);
    //              String wholetext = textlist.item(i).getTextContent();
    String wholetext = "How about other changes for example the ways of doing the work and \n" + "\n"
            + "Iwould say security has , there 's more pressure put on people now than there used to be because obviously , especially after Locherbie , they tightened up on security and there 's a lot more pressure now especially from the ETR and stuff like that \n"
            + "People do n't feel valued any more , they feel  I do n't know I think they feel that nobody cares about them really anyway \n";

    //System.out.println(wholetext);
    //Iterable<List<? extends HasWord>> sentences;

    ArrayList<Tree> parseTrees = new ArrayList<Tree>();
    String asd = "";
    int j = 0;
    StringReader stringreader = new StringReader(wholetext);
    DocumentPreprocessor dp = new DocumentPreprocessor(stringreader);
    @SuppressWarnings("rawtypes")
    ArrayList<List> sentences = preprocess(dp);
    for (List sentence : sentences) {
        parseTrees.add(lp.apply(sentence)); // Parsing a new sentence and adding it to the parsed tree
        ArrayList<Tree> PronounsList = findPronouns(parseTrees.get(j)); // Locating all pronouns to resolve in the sentence
        Tree corefedTree;
        for (Tree pronounTree : PronounsList) {
            parseTrees.set(parseTrees.size() - 1, HobbsResolve(pronounTree, parseTrees)); // Resolving the coref and modifying the tree for each pronoun
        }
        StringWriter strwr = new StringWriter();
        PrintWriter prwr = new PrintWriter(strwr);
        TreePrint tp = new TreePrint("penn");
        tp.printTree(parseTrees.get(j), prwr);
        prwr.flush();
        asd += strwr.toString();
        j++;
    }
    String armando = "";
    for (Tree sentence : parseTrees) {
        for (Tree leaf : Trees.leaves(sentence))
            armando += leaf + " ";
    }
    System.out.println(wholetext);

    System.out.println();
    System.out.println("......");
    System.out.println(armando);
    System.out.println("All done.");
    //              currentnode.setTextContent(asd);
    //          }
    //          TransformerFactory transformerFactory = TransformerFactory.newInstance();
    //          Transformer transformer = transformerFactory.newTransformer();
    //          DOMSource source = new DOMSource(doc);
    //          StreamResult result = new StreamResult(paper);
    //          transformer.transform(source, result);
    //
    //          System.out.println("Done");
    //      }
}

From source file:com.diskoverorta.osdep.StanfordNLP.java

License:Apache License

public List<String> splitSentencesINDocument(String sDoc) {
    Reader reader = new StringReader(sDoc);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();
    Iterator<List<HasWord>> it = dp.iterator();

    while (it.hasNext()) {
        StringBuilder sentenceSb = new StringBuilder();
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            // Insert a separating space before every token except the first.
            if (sentenceSb.length() > 0) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        sentenceList.add(sentenceSb.toString().trim());
    }
    return sentenceList;
}
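
A hypothetical caller of this helper might look like the sketch below. Only splitSentencesINDocument comes from the source above; the no-argument StanfordNLP constructor and the sample text are assumptions.

// Hypothetical usage sketch; assumes StanfordNLP has a no-arg constructor.
StanfordNLP nlp = new StanfordNLP();
List<String> sentences = nlp.splitSentencesINDocument(
        "Dr. Smith went to Washington. He arrived on Monday morning.");
for (String s : sentences) {
    System.out.println(s);
}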

From source file:com.epictodo.controller.lexer.NLPSentenceLexer.java

License:Open Source License

@Override
public List<Sentence> tokenize(String _text) {
    List<Sentence> _sentences = new ArrayList<>();
    final DocumentPreprocessor document_preprocessor = new DocumentPreprocessor(new StringReader(_text));

    for (List<HasWord> sentence : document_preprocessor) {
        _sentences.add(new Sentence(sentence.toString()));
    }

    return _sentences;
}

From source file:com.github.kutschkem.Qgen.QuestionRankerByParseProbability.java

License:Open Source License

public List<Question> rank(List<Question> questions) {

    System.out.println("afterPipeline:" + questions.size());

    final Map<Question, Double> scores = new HashMap<Question, Double>();
    for (Question q : questions) {
        List<HasWord> tokens = new DocumentPreprocessor(new StringReader(q.Question)).iterator().next();

        LexicalizedParserQuery query = parser.parserQuery();
        query.parse(tokens);
        scores.put(q, average(query.getKBestPCFGParses(3)));
    }

    List<Question> result = new ArrayList<Question>(questions);

    Collections.sort(result, new Comparator<Question>() {

        public int compare(Question o1, Question o2) {
            return -scores.get(o1).compareTo(scores.get(o2));
        }

    });

    for (Question q : result) {
        System.out.println(q.Question + " " + scores.get(q));
    }

    return result;
}
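
The average helper used in rank is not included in this excerpt. Since getKBestPCFGParses returns a list of ScoredObject<Tree>, one plausible sketch is a mean over the parse scores; the signature and the empty-list handling are assumptions, not the original code.

// Hypothetical reconstruction: the mean score (log-probability) of the
// k-best parses returned by getKBestPCFGParses.
private static double average(List<ScoredObject<Tree>> parses) {
    if (parses.isEmpty()) {
        return Double.NEGATIVE_INFINITY; // assumption: unparsable ranks last
    }
    double sum = 0.0;
    for (ScoredObject<Tree> parse : parses) {
        sum += parse.score();
    }
    return sum / parses.size();
}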

From source file:com.me.edu.Servlet.ElasticSearch_Backup.java

public static String getSentence(String input) {
    String paragraph = input;
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();

    for (List<HasWord> sentence : dp) {
        String sentenceString = Sentence.listToString(sentence);
        sentenceList.add(sentenceString);
    }
    String sent = "";
    for (String sentence : sentenceList) {
        System.out.println(sentence);
        sent = sent + " " + sentence + "\n";
    }
    try {

        FileWriter file = new FileWriter("Sentences.txt");
        file.write(sent);
        file.flush();
        file.close();

    } catch (IOException e) {
        e.printStackTrace();
    }
    return sent;
}