Example usage for edu.stanford.nlp.process DocumentPreprocessor DocumentPreprocessor

Introduction

On this page you can find example usages of the constructor edu.stanford.nlp.process.DocumentPreprocessor.DocumentPreprocessor, collected from open-source projects.

Prototype

public DocumentPreprocessor(String docPath) 
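
Before the collected examples, here is a minimal, self-contained sketch of this constructor in use. The file name is a hypothetical placeholder; the only behavior assumed is the documented one, namely that DocumentPreprocessor implements Iterable<List<HasWord>> and yields one tokenized sentence per iteration.

import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;

public class DocumentPreprocessorSketch {
    public static void main(String[] args) {
        // "input.txt" is a hypothetical plain-text file of ordinary prose.
        DocumentPreprocessor dp = new DocumentPreprocessor("input.txt");

        // Each iteration produces one tokenized sentence.
        for (List<HasWord> sentence : dp) {
            System.out.println(sentence);
        }
    }
}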

Usage

From source file:Dependency.java

public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
    Scanner sc = new Scanner(System.in);

    String text = "";
    text = sc.nextLine();
    // while(text!="exit"){

    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {

        List<TaggedWord> tagged = tagger.tagSentence(sentence);
        Object[] x = tagged.toArray();
        GrammaticalStructure gs = parser.predict(tagged);
        //System.out.println();

        Collection<TypedDependency> s = gs.typedDependenciesCollapsedTree();
        Object[] z = s.toArray();

        System.out.println(tagged.toString());
        // Size these by the number of tagged tokens (x.length), not by the
        // number of dependencies (z.length): the loop below fills one slot
        // per tagged token.
        String[] token = new String[x.length];
        String[] pos = new String[x.length];
        int k = 0;
        for (Object i : x) {
            String str = i.toString();
            /*String temp0="(.*?)(?=\\/)";
            String temp1="\\/(.*)";
                    
            System.out.println(str);
            Pattern t0 = Pattern.compile("(.*?)(?=\\/)");
            Pattern t1 = Pattern.compile("\\/(.*)");
            Matcher m0 = t0.matcher(str);
            Matcher m1 = t1.matcher(str);*/
            int index = str.lastIndexOf('/');
            token[k] = str.substring(0, index);
            pos[k] = str.substring(index + 1);
            //System.out.println(pos[k]);
            k++;
        }
        String rels[] = new String[z.length];
        String word1[] = new String[z.length];
        String word2[] = new String[z.length];
        int j = 0;
        for (Object i : z) {
            System.out.println(i);
            String temp = i.toString();
            String pattern0 = "(.*)(?=\\()";
            String pattern1 = "(?<=\\()(.*?)(?=-)";
            String pattern2 = "(?<=, )(.*)(?=-)";
            Pattern r0 = Pattern.compile(pattern0);
            Pattern r1 = Pattern.compile(pattern1);
            Pattern r2 = Pattern.compile(pattern2);
            Matcher m0 = r0.matcher(temp);
            Matcher m1 = r1.matcher(temp);
            Matcher m2 = r2.matcher(temp);
            if (m0.find()) {
                rels[j] = m0.group(0);
                //System.out.println(rels[j]);
            }
            if (m1.find()) {
                word1[j] = m1.group(0);
            }
            if (m2.find()) {
                word2[j] = m2.group(0);
            }
            j++;
        }
        //System.out.println(s);
        //Rules for feature extraction.
        //rule1:::::::::::::::::
        //System.out.println("1");
        int[] q = toIntArray(grRecog(rels, "nsubj"));
        //System.out.println("2");
        if (q.length != 0) {
            //System.out.println("3");
            if (posrecog(token, pos, word2[q[0]]).equals("NN")) {
                //System.out.println("4");
                int[] w = toIntArray(grRecog(rels, "compound"));
                //System.out.println("5");
                if (w.length != 0) {
                    System.out.println("6");
                    System.out.println(word1[q[0]] + "," + word2[q[0]] + "," + word2[w[0]]);
                } else {
                    int conj_and_index = compgrRecog(rels, word1, word2, "conj:and", word2[q[0]]);
                    if (conj_and_index != -1) {
                        System.out.println(
                                word1[conj_and_index] + "," + word2[conj_and_index] + "," + word2[q[0]]);
                    } else
                        System.out.println(word1[q[0]] + "," + word2[q[0]]);
                }
            }
            //RULE 2:::::::::::::
            else if (posrecog(token, pos, word1[q[0]]).equals("JJ")) {
                //System.out.println("aaaaa_JJ");
                int a = compgrRecog(rels, word1, word2, "xcomp", word1[q[0]]);
                if (a != -1) {
                    int b = compgrRecog(rels, word1, word2, "dobj", word2[a]);
                    if (b != -1) {
                        int c = compgrRecog(rels, word1, word2, "compound", word2[b]);
                        if (c != -1) {
                            System.out.println(word1[q[0]] + "," + word1[c] + "," + word2[c]);
                        }
                    }
                }
                //RULE 3::::::::::
                else {
                    int[] b = toIntArray(grRecog(rels, "ccomp"));
                    // This rule reads the second nsubj match (q[1]), so
                    // require that one exists.
                    if (b.length != 0 && q.length > 1) {
                        System.out.println(word1[q[1]] + "," + word2[q[1]] + "," + word1[b[0]]);
                    }

                }
            }
            //RULE 4::::::::::
            else if (posrecog(token, pos, word1[q[0]]).equals("VBZ")) {
                //System.out.println("aaaaa");
                int vbp_dobj_index = compgrRecog(rels, word1, word2, "dobj", word2[q[0]]);
                if (vbp_dobj_index != -1) {
                    System.out.println(word1[vbp_dobj_index] + "," + word2[vbp_dobj_index]);
                } else {
                    int vbp_xcomp_index = compgrRecog(rels, word1, word2, "xcomp", word1[q[0]]);
                    if (vbp_xcomp_index != -1) {

                        System.out.println(word1[vbp_xcomp_index] + "," + word2[vbp_xcomp_index]);
                    } else {
                        int vbp_acomp_index = compgrRecog(rels, word1, word2, "acomp", word1[q[0]]);
                        if (vbp_acomp_index != -1) {

                            System.out.println(
                                    word1[q[0]] + "," + word1[vbp_acomp_index] + "," + word2[vbp_acomp_index]);
                        } else
                            System.out.println(word1[q[0]]);

                    }

                }

            }
            int[] f = toIntArray(grRecog(rels, "amod"));
            if (f.length != 0) {
                for (int i : f) {
                    System.out.println(word1[i] + "," + word2[i]);
                }
                int cj[] = toIntArray(grRecog(rels, "conj:and"));
                if (cj.length != 0) {
                    for (int i : cj) {
                        System.out.println(word1[i] + "," + word2[i]);
                    }
                }
            }
            int[] neg = toIntArray(grRecog(rels, "neg"));
            if (neg.length != 0) {
                for (int i : neg) {
                    System.out.println(word1[i] + "," + word2[i]);
                }

            }

        } else {
            int[] f = toIntArray(grRecog(rels, "amod"));
            if (f.length != 0) {
                for (int i : f) {
                    System.out.print(word1[i] + "," + word2[i]);
                    String qwe = word1[i] + "," + word2[i];
                }
                int cj[] = toIntArray(grRecog(rels, "conj:and"));
                if (cj.length != 0) {
                    for (int i : cj) {
                        System.out.println(word2[i]);

                    }
                }
            }
            int[] neg = toIntArray(grRecog(rels, "neg"));
            if (neg.length != 0) {
                for (int i : neg) {
                    System.out.println(word1[i] + "," + word2[i]);
                }

            }

        }

        //RULE 2:::::::::::::

    }

    //  text=sc.nextLine();
    //}
}
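
Dependency.java calls several helpers that are not included in this excerpt (toIntArray, grRecog, compgrRecog, posrecog). As a hedged sketch only, here is one plausible reading of grRecog, inferred from how its result is consumed above; the signature and return type are assumptions, not the original code.

// Hypothetical reconstruction: collect the indices of all dependencies
// whose relation name equals rel. The surrounding toIntArray(...) calls
// suggest the original returns a List<Integer>.
private static List<Integer> grRecog(String[] rels, String rel) {
    List<Integer> matches = new ArrayList<>();
    for (int i = 0; i < rels.length; i++) {
        if (rel.equals(rels[i])) {
            matches.add(i);
        }
    }
    return matches;
}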

From source file:DependencyParserDemo.java

public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

    for (int argIndex = 0; argIndex < args.length;) {
        switch (args[argIndex]) {
        case "-tagger":
            taggerPath = args[argIndex + 1];
            argIndex += 2;
            break;
        case "-model":
            modelPath = args[argIndex + 1];
            argIndex += 2;
            break;
        default:
            throw new RuntimeException("Unknown argument " + args[argIndex]);
        }
    }

    String text = "I can almost always tell when movies use fake dinosaurs.";

    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
        List<TaggedWord> tagged = tagger.tagSentence(sentence);
        GrammaticalStructure gs = parser.predict(tagged);

        // Print typed dependencies
        System.err.println(gs);
    }
}

From source file:Dependency2.java

public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
    Scanner sc = new Scanner(System.in);

    readCsv();
    String text = "";
    text = sc.nextLine();
    if (multifeatures(text)) {
        System.out.println("Multiple features present");
        MaxentTagger tagger = new MaxentTagger(taggerPath);
        DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

        DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
        for (List<HasWord> sentence : tokenizer) {
            List<TaggedWord> tagged = tagger.tagSentence(sentence);
            GrammaticalStructure gs = parser.predict(tagged);

            Collection<TypedDependency> s = gs.typedDependenciesCollapsedTree();
            Map<Character, Pair<Character, Character>> map = new HashMap<Character, Pair<Character, Character>>();
            Object[] z = s.toArray();
            String rels[] = new String[z.length];
            String word1[] = new String[z.length];
            String word2[] = new String[z.length];
            int j = 0;
            String f, f1, f2;
            for (Object i : z) {
                //System.out.println(i);
                String temp = i.toString();
                System.out.println(temp);
                String pattern0 = "(.*)(?=\\()";
                String pattern1 = "(?<=\\()(.*?)(?=-)";
                String pattern2 = "(?<=,)(.*)(?=-)";
                Pattern r0 = Pattern.compile(pattern0);
                Pattern r1 = Pattern.compile(pattern1);
                Pattern r2 = Pattern.compile(pattern2);
                Matcher m0 = r0.matcher(temp);
                Matcher m1 = r1.matcher(temp);
                Matcher m2 = r2.matcher(temp);
                if (m0.find())
                    rels[j] = m0.group(0);
                if (m1.find())
                    word1[j] = m1.group(0);
                if (m2.find())
                    word2[j] = m2.group(0);
                // Null-safe: rels[j] stays unset if the relation pattern found no match.
                if ("amod".equals(rels[j])) {
                    f1 = getFeature(word1[j]);
                    f2 = getFeature(word2[j]);
                    f = (f1 != null) ? f1 : f2;
                    if (f != null) {
                        System.out.println("Feature: " + f);

                    }

                }

                j++;
            }
            //System.out.println(Arrays.toString(rels));
        }
    } else {
        //sentence score is feature score
    }

}

From source file:TokenizeCorpus.java

public static void main(String[] args) throws IOException {
    String line = "";
    String outfile = "/Users/nicole/Desktop/outputTokenizer";
    FileWriter out = new FileWriter(outfile);

    for (String arg : args) {
        // option #1: By sentence.
        DocumentPreprocessor dp = new DocumentPreprocessor(arg);
        for (List<HasWord> sentence : dp) {
            for (HasWord word : sentence) {
                line = line.concat(word + " ");
            }

            System.out.println(sentence);
            out.append(line);
            out.append(System.lineSeparator());
            line = "";

        }

    }

    // Flush and close only after every input file has been processed;
    // the writer must stay open across loop iterations.
    out.flush();
    out.close();
}

From source file:BuildBinarizedDataset.java

/**
 * Turns a text file into trees for use in an RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input has one sentence per block, with blocks separated
 * by blank lines. The first line of a block holds the main label of the
 * sentence together with the full sentence. Lines after the first line
 * but before the blank line are treated as labeled sub-phrases; each
 * such line should start with the label, followed by the list of tokens
 * the label applies to. All phrases that do not have their own label
 * take on the main sentence label! For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * By default the englishPCFG parser is used.  This can be changed
 * with the <code>-parserModel</code> flag.  Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with -sentimentModel, that model
 * will be used to prelabel the sentences.  Any spans with given
 * labels will then be used to adjust those labels.
 */
public static void main(String[] arg) throws IOException {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true);
    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String args[] = { "-input", "D:\\parse.txt", "-sentimentModel",
            "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" };
    String inputPath = "D:\\dataset\\good.txt";

    String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";
    SentimentModel sentimentModel = null;

    /* for (int argIndex = 0; argIndex < args.length; ) {
       if (args[argIndex].equalsIgnoreCase("-input")) {
         inputPath = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
         parserModel = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
         sentimentModelPath = args[argIndex + 1];
         argIndex += 2;
       } else {
         System.err.println("Unknown argument " + args[argIndex]);
         System.exit(2);
       }
     }*/

    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
            parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequence line, if any, will be a value
        // followed by the sequence of tokens that get that value.

        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        Integer mainLabel = Integer.valueOf(tokens.get(0).word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.subList(1, tokens.size());
        //System.err.println(tokens);

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing

        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
            //collapsedUnary.label().setValue(mainLabel.toString());
            //System.out.println("Root"+collapsedUnary.getNodeNumber(1));
        }

        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();

        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }
        String x = collapsedUnary.toString();
        //x.replaceAll("\\s","");
        x = x.replace("(", "[");
        x = x.replace(")", "]");
        //writer.write(x);
        //writer.write("\r\n"); 
        System.out.println(x);
        //System.out.println();
    }
    //writer.close();
}

From source file:Anaphora_Resolution.ParseAllXMLDocuments.java

public static void main(String[] args)
        throws IOException, SAXException, ParserConfigurationException, TransformerException {
    //      File dataFolder = new File("DataToPort");
    //      File[] documents;
    String grammar = "grammar/englishPCFG.ser.gz";
    String[] options = { "-maxLength", "100", "-retainTmpSubcategories" };
    //LexicalizedParser lp =  new LexicalizedParser(grammar, options);
    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    //      if (dataFolder.isDirectory()) {
    //          documents = dataFolder.listFiles();
    //      } else {
    //          documents = new File[] {dataFolder};
    //      }
    //      int currfile = 0;
    //      int totfiles = documents.length;
    //      for (File paper : documents) {
    //          currfile++;
    //          if (paper.getName().equals(".DS_Store")||paper.getName().equals(".xml")) {
    //              currfile--;
    //              totfiles--;
    //              continue;
    //          }
    //          System.out.println("Working on "+paper.getName()+" (file "+currfile+" out of "+totfiles+").");
    //
    //          DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); // This is for XML
    //          DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
    //          Document doc = docBuilder.parse(paper.getAbsolutePath());
    //
    //          NodeList textlist = doc.getElementsByTagName("text");
    //          for(int i=0; i < textlist.getLength(); i++) {
    //              Node currentnode = textlist.item(i);
    //              String wholetext = textlist.item(i).getTextContent();
    String wholetext = "How about other changes for example the ways of doing the work and \n" + "\n"
            + "Iwould say security has , there 's more pressure put on people now than there used to be because obviously , especially after Locherbie , they tightened up on security and there 's a lot more pressure now especially from the ETR and stuff like that \n"
            + "People do n't feel valued any more , they feel  I do n't know I think they feel that nobody cares about them really anyway \n";

    //System.out.println(wholetext);
    //Iterable<List<? extends HasWord>> sentences;

    ArrayList<Tree> parseTrees = new ArrayList<Tree>();
    String asd = "";
    int j = 0;
    StringReader stringreader = new StringReader(wholetext);
    DocumentPreprocessor dp = new DocumentPreprocessor(stringreader);
    @SuppressWarnings("rawtypes")
    ArrayList<List> sentences = preprocess(dp);
    for (List sentence : sentences) {
        parseTrees.add(lp.apply(sentence)); // Parsing a new sentence and adding it to the parsed tree
        ArrayList<Tree> PronounsList = findPronouns(parseTrees.get(j)); // Locating all pronouns to resolve in the sentence
        Tree corefedTree;
        for (Tree pronounTree : PronounsList) {
            parseTrees.set(parseTrees.size() - 1, HobbsResolve(pronounTree, parseTrees)); // Resolving the coref and modifying the tree for each pronoun
        }
        StringWriter strwr = new StringWriter();
        PrintWriter prwr = new PrintWriter(strwr);
        TreePrint tp = new TreePrint("penn");
        tp.printTree(parseTrees.get(j), prwr);
        prwr.flush();
        asd += strwr.toString();
        j++;
    }
    String armando = "";
    for (Tree sentence : parseTrees) {
        for (Tree leaf : Trees.leaves(sentence))
            armando += leaf + " ";
    }
    System.out.println(wholetext);

    System.out.println();
    System.out.println("......");
    System.out.println(armando);
    System.out.println("All done.");
    //              currentnode.setTextContent(asd);
    //          }
    //          TransformerFactory transformerFactory = TransformerFactory.newInstance();
    //          Transformer transformer = transformerFactory.newTransformer();
    //          DOMSource source = new DOMSource(doc);
    //          StreamResult result = new StreamResult(paper);
    //          transformer.transform(source, result);
    //
    //          System.out.println("Done");
    //      }
}

From source file:com.diskoverorta.osdep.StanfordNLP.java

License:Apache License

public List<String> splitSentencesINDocument(String sDoc) {
    Reader reader = new StringReader(sDoc);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();
    Iterator<List<HasWord>> it = dp.iterator();

    while (it.hasNext()) {
        StringBuilder sentenceSb = new StringBuilder();
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            // Insert a separating space before every token except the first.
            if (sentenceSb.length() > 0) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        sentenceList.add(sentenceSb.toString().trim());
    }
    return sentenceList;
}
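
A hypothetical caller of this helper might look like the sketch below. Only splitSentencesINDocument comes from the source above; the no-argument StanfordNLP constructor and the sample text are assumptions.

// Hypothetical usage sketch; assumes StanfordNLP has a no-arg constructor.
StanfordNLP nlp = new StanfordNLP();
List<String> sentences = nlp.splitSentencesINDocument(
        "Dr. Smith went to Washington. He arrived on Monday morning.");
for (String s : sentences) {
    System.out.println(s);
}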

From source file:com.epictodo.controller.lexer.NLPSentenceLexer.java

License:Open Source License

@Override
public List<Sentence> tokenize(String _text) {
    List<Sentence> _sentences = new ArrayList<>();
    final DocumentPreprocessor document_preprocessor = new DocumentPreprocessor(new StringReader(_text));

    for (List<HasWord> sentence : document_preprocessor) {
        _sentences.add(new Sentence(sentence.toString()));
    }

    return _sentences;
}

From source file:com.github.kutschkem.Qgen.QuestionRankerByParseProbability.java

License:Open Source License

public List<Question> rank(List<Question> questions) {

    System.out.println("afterPipeline:" + questions.size());

    final Map<Question, Double> scores = new HashMap<Question, Double>();
    for (Question q : questions) {
        List<HasWord> tokens = new DocumentPreprocessor(new StringReader(q.Question)).iterator().next();

        LexicalizedParserQuery query = parser.parserQuery();
        query.parse(tokens);
        scores.put(q, average(query.getKBestPCFGParses(3)));
    }

    List<Question> result = new ArrayList<Question>(questions);

    Collections.sort(result, new Comparator<Question>() {

        public int compare(Question o1, Question o2) {
            return -scores.get(o1).compareTo(scores.get(o2));
        }

    });

    for (Question q : result) {
        System.out.println(q.Question + " " + scores.get(q));
    }

    return result;
}
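
The average helper used in rank is not included in this excerpt. Since getKBestPCFGParses returns a list of ScoredObject<Tree>, one plausible sketch is a mean over the parse scores; the signature and the empty-list handling are assumptions, not the original code.

// Hypothetical reconstruction: the mean score (log-probability) of the
// k-best parses returned by getKBestPCFGParses.
private static double average(List<ScoredObject<Tree>> parses) {
    if (parses.isEmpty()) {
        return Double.NEGATIVE_INFINITY; // assumption: unparsable ranks last
    }
    double sum = 0.0;
    for (ScoredObject<Tree> parse : parses) {
        sum += parse.score();
    }
    return sum / parses.size();
}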

From source file:com.me.edu.Servlet.ElasticSearch_Backup.java

public static String getSentence(String input) {
    String paragraph = input;
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();

    for (List<HasWord> sentence : dp) {
        String sentenceString = Sentence.listToString(sentence);
        sentenceList.add(sentenceString);
    }
    String sent = "";
    for (String sentence : sentenceList) {
        System.out.println(sentence);
        sent = sent + " " + sentence + "\n";
    }
    try {

        FileWriter file = new FileWriter("Sentences.txt");
        file.write(sent);
        file.flush();
        file.close();

    } catch (IOException e) {
        e.printStackTrace();
    }
    return sent;
}