Example usage for edu.stanford.nlp.process DocumentPreprocessor setTokenizerFactory

List of usage examples for edu.stanford.nlp.process DocumentPreprocessor setTokenizerFactory

Introduction

In this page you can find the example usage for edu.stanford.nlp.process DocumentPreprocessor setTokenizerFactory.

Prototype

public void setTokenizerFactory(TokenizerFactory<? extends HasWord> newTokenizerFactory) 

Source Link

Document

Sets the factory from which to produce a Tokenizer.

Usage

From source file:de.uni_stuttgart.ims.comparatives.nlp.SentenceSplitterStanford.java

License:Creative Commons License

/**
 * Splits the given string into sentences with the Stanford
 * {@code DocumentPreprocessor}, using this class's PTB tokenizer factory.
 *
 * @param document the raw text to split.
 * @return array of spans with the start/end character positions of each
 *         sentence and the space-joined covered text (with a trailing space,
 *         as produced by the original implementation).
 */
public TextSpan[] split(String document) {
    StringReader reader = new StringReader(document);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    dp.setTokenizerFactory(ptbTokenizerFactory);

    ArrayList<TextSpan> sentenceSpansList = new ArrayList<TextSpan>();
    for (List<HasWord> sent : dp) {
        // Guard against a degenerate empty sentence so get(0) cannot throw.
        if (sent.isEmpty()) {
            continue;
        }
        // Tokens are CoreLabels because the factory was built with a
        // CoreLabelTokenFactory, which records character offsets.
        CoreLabel firstword = (CoreLabel) sent.get(0);
        CoreLabel lastword = (CoreLabel) sent.get(sent.size() - 1);
        // StringBuilder instead of String += in a loop (avoids O(n^2) copying).
        StringBuilder coveredText = new StringBuilder();
        for (int i = 0; i < sent.size(); i++) {
            CoreLabel word = (CoreLabel) sent.get(i);
            coveredText.append(word.value()).append(' ');
        }
        sentenceSpansList.add(new TextSpan(firstword.beginPosition(), lastword.endPosition(),
                coveredText.toString()));
    }

    return sentenceSpansList.toArray(new TextSpan[0]);

}

From source file:flight_ranker.TaggerDemo2.java

/**
 * Tags every sentence of a file with a MaxentTagger and prints the tagged
 * sentences; then demonstrates extracting adjectives (JJ* tags) from one
 * hard-coded tagged sentence.
 *
 * @param args args[0] = tagger model file, args[1] = text file to tag.
 * @throws Exception if the model cannot be loaded or I/O fails.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    // "untokenizable=noneKeep" keeps characters the tokenizer cannot handle
    // instead of silently dropping them.
    TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
            "untokenizable=noneKeep");
    // try-with-resources: reader and writer are closed even if tagging throws
    // (the original leaked both on any exception).
    try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
            PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
        documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
        for (List<HasWord> sentence : documentPreprocessor) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            pw.println(Sentence.listToString(tSentence, false));
        }

        // Print the adjectives in one more sentence. This shows how to get at
        // words and tags in a tagged sentence.
        List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",",
                "green", "grass", ".");
        List<TaggedWord> taggedSent = tagger.tagSentence(sent);
        for (TaggedWord tw : taggedSent) {
            if (tw.tag().startsWith("JJ")) {
                pw.println(tw.word());
            }
        }
    }
}

From source file:phrasesentimentextractor.PhraseSentimentExtractor.java

/**
 * Command-line entry point: reads a list of product features and a review
 * file, finds sentences mentioning a feature, and extracts sentiment
 * phrases per feature via dependency parsing and OpenNLP NP chunking.
 *
 * NOTE(review): argument layout inferred from usage below — confirm:
 *   args[0] = feature list file (one feature per line)
 *   args[1] = review text file (split into sentences by DocumentPreprocessor)
 *   args[2] = output file (one line per feature: feature then its phrases)
 *
 * @param args the command line arguments
 * @throws FileNotFoundException if an input or model file is missing
 * @throws IOException on any other I/O failure
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    // TODO code application logic here

    //Initialize all the models
    //Tokenizer model for the sentence from OpenNLP , tokenizes the sentence
    //        InputStream is = new FileInputStream("en-token.bin");
    //                        
    //        TokenizerModel model = new TokenizerModel(is);
    //   Tokenizer tokenizer = new TokenizerME(model);
    //        
    //        //POS model from OpenNLP, gives the POS tags
    //        POSModel posmodel = new POSModelLoader().load(new File("en-pos-maxent.bin"));
    //        POSTaggerME tagger = new POSTaggerME(posmodel);

    // Project-local dependency parser wrapper; "invertible=true" lets the
    // PTB tokenizer recover original character offsets from tokens.
    DependencyTreeGenerator dr = DependencyTreeGenerator.getInstance();
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
            "invertible=true");

    // OpenNLP chunker model: produces BIO chunk tags and NP spans.
    // NOTE(review): `is`, `scanner`, `fileReader` below are never closed —
    // resource leaks; consider try-with-resources in a follow-up.
    Path filepath = Paths.get("models/en-chunker.bin");
    InputStream is = new FileInputStream(filepath.toFile());
    ChunkerModel cModel = new ChunkerModel(is);
    ChunkerME chunkerME = new ChunkerME(cModel);

    // Output file for the per-feature phrase lists.
    File output_phrases = new File(args[2]);
    FileWriter fout = new FileWriter(output_phrases);
    PrintWriter out = new PrintWriter(fout);

    //Start processing the review file

    // Read all feature names; featuresPhrases maps each feature to the
    // phrases accumulated for it across all sentences.
    Set<String> features = new HashSet();
    HashMap<String, List<String>> featuresPhrases = new HashMap();

    File feat_input = new File(args[0]);
    Scanner scanner = new Scanner(feat_input);
    int feat_counter = 0;
    String feat = "";
    while (scanner.hasNext()) {
        feat = scanner.nextLine().trim();
        features.add(feat);
        List<String> f_phrases = new ArrayList();
        featuresPhrases.put(feat, f_phrases);
        feat_counter++;
    }
    String sentence = "";

    File review_text = new File(args[1]);
    FileReader fileReader = new FileReader(review_text);

    // Sentence-split the review text with Stanford's DocumentPreprocessor.
    DocumentPreprocessor dp = new DocumentPreprocessor(fileReader);
    dp.setTokenizerFactory(tokenizerFactory);
    int num_lines = 0;

    for (List line : dp) {
        boolean feature_exists = false;
        //            
        sentence = Sentence.listToString(line);
        // Collect every feature mentioned in this sentence (whole-word,
        // case-insensitive match); skip the sentence if none match.
        Set<String> check_features = new HashSet();
        for (String feature : features) {
            Pattern pattern = Pattern.compile("\\b" + feature.toLowerCase() + "\\b", Pattern.CASE_INSENSITIVE);
            Matcher matcher = pattern.matcher(sentence.toLowerCase());
            while (matcher.find()) {
                feature_exists = true;
                check_features.add(feature);
            }

        }
        if (!feature_exists) {
            //System.out.println("\n"+sentence);
            //System.out.println("No feature present!\n");
            continue;
        }

        //Features present
        //System.out.println("\nFeatures present\n");
        //                for(String feature : check_features){
        //                    //System.out.print(feature+" ");
        //                }

        // Constituency parse, then convert to a typed-dependency tree.
        Tree tr = dr.parse(sentence);
        DependencyTree depTree = dr.getTypedDependencyTree(tr);

        // Tokenized words from the parse tree's yield.
        //System.out.println("\nTokenized Words\n");
        List<Word> word_list = tr.yieldWords();
        List<String> word_tokens = new ArrayList();
        for (Word word : word_list) {
            word_tokens.add(word.word());
            //System.out.print(word.word()+" ");
        }
        String[] words = new String[word_tokens.size()];
        words = word_tokens.toArray(words);

        // POS tags from the parse tree's tagged yield.
        //System.out.println("\nPOS Tags\n");
        List<TaggedWord> postags = tr.taggedYield();
        List<String> tag_tokens = new ArrayList();
        for (TaggedWord postag : postags) {
            tag_tokens.add(postag.tag());
            System.out.print(postag.tag() + " ");
        }
        String[] tags = new String[tag_tokens.size()];
        tags = tag_tokens.toArray(tags);

        // BIO chunk encoding for the sentence (debug-printed to stdout).
        String result[] = chunkerME.chunk(words, tags);
        for (String r : result) {
            System.out.print(r + " ");
        }

        // Chunk spans: span_map maps each token index to the index of the
        // phrase (chunk) containing it; pSets collects the phrases.
        HashMap<Integer, Integer> span_map = new HashMap();
        Span[] span = chunkerME.chunkAsSpans(words, tags);
        int j = 0;

        ArrayList<PhraseSet> pSets = new ArrayList();

        for (Span s : span) {

            ArrayList<String> phrase_words = new ArrayList();
            //System.out.print("\n"+s.toString()+" ");
            int n = 0;
            for (int i = s.getStart(); i < s.getEnd(); i++) {
                System.out.print(words[i] + " ");
                span_map.put(i, j);
                phrase_words.add(words[i]);
                n++;
            }

            PhraseSet pSet = new PhraseSet(j, s.toString(), phrase_words);
            pSets.add(pSet);

            j++;
        }

        // BFS over the dependency tree starting from the real root (the
        // tree's vertex 0 is a dummy node); annotates every node with its
        // POS tag, containing phrase index (-1 if in no chunk), and parent.
        DependencyTreeNode rootNode = depTree.getVertex(0).edges.get(0).target;
        Queue<DependencyTreeNode> queue = new LinkedList();
        rootNode.parent = null;
        queue.add(rootNode);

        while (!queue.isEmpty()) {

            DependencyTreeNode u = queue.remove();
            // Node indices are 1-based; tags[] is 0-based.
            u.pos = tags[u.index - 1];
            if (span_map.get(u.index - 1) != null) {
                u.phrase_index = span_map.get(u.index - 1);

            } else {
                u.phrase_index = -1;
            }
            //System.out.println("\n"+u.word+"-"+u.phrase_index+"-"+tags[u.index-1]);
            for (DependencyTreeEdge e : u.edges) {
                e.target.parent = u;
                queue.add(e.target);
                //System.out.print(e.target.word+" ");
            }

        }

        // Delegate sentiment-phrase extraction for this sentence's features
        // and accumulate the results per feature.
        HashMap<String, List<String>> featurePhrases = SentimentExtract.getSentimentPhrases(check_features,
                pSets, depTree);
        for (String chk_feat : check_features) {
            featuresPhrases.get(chk_feat).addAll(featurePhrases.get(chk_feat));
        }

        num_lines++;
    }

    // Report the number of processed sentences, then write one line per
    // feature: the feature name followed by all its extracted phrases.
    System.out.println(num_lines);
    for (String f : features) {
        out.print(f + " ");

        out.print(String.join(" ", featuresPhrases.get(f)));
        out.println();
    }

    System.out.println("Success");
    out.close();

}

From source file:reck.parser.lexparser.RECKLexicalizedParser.java

License:Open Source License

/** Parses every sentence of an already-preprocessed document, producing a
 *  list of RECKParseTreeImpl objects (one per successfully parsed sentence).
 *
 *  NOTE(review): the original javadoc ("Parse the files with names given in
 *  the String array args...") was stale — copied from Stanford's parseFiles;
 *  this method parses the sentences in {@code document}, not files.
 *
 *  Side effects: mutates the instance fields {@code TDs} and (via
 *  computePosition) presumably {@code sentencePosition} — confirm against
 *  the rest of the class, which is outside this view.
 *
 *  @param filename      name used only for the final progress message
 *  @param content       full document text, used to compute character positions
 *  @param startSentence starting character index for the first sentence
 *  @param tokenized     if true, input is pre-tokenized (whitespace tokenizer)
 *  @param tokenizerFactory tokenizer to use; may be null (falls back to the
 *                        treebank language pack's factory)
 *  @param document      the sentences to parse
 *  @param documentPreprocessor preprocessor to configure with the chosen
 *                        tokenizer, sentence-final punctuation, and encoding
 *  @param escaper       unused in this method body — TODO confirm intent
 *  @param tagDelimiter  unused in this method body — TODO confirm intent
 *  @return ArrayList of RECKParseTreeImpl, one per parsed sentence
 */
public ArrayList parseFile(String filename, String content, int startSentence, boolean tokenized,
        TokenizerFactory tokenizerFactory, List<List<? extends HasWord>> document,
        DocumentPreprocessor documentPreprocessor, Function<List<HasWord>, List<HasWord>> escaper,
        int tagDelimiter) {
    ArrayList treeList = new ArrayList();
    PrintWriter pwOut = op.tlpParams.pw();
    PrintWriter pwErr = op.tlpParams.pw(System.err);
    RECKTreePrint treePrint = getRECKTreePrint(op);
    int numWords = 0;
    int numSents = 0;
    int numUnparsable = 0;
    int numNoMemory = 0;
    int numFallback = 0;
    int numSkipped = 0;
    Timing timer = new Timing();
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();
    // set the tokenizer: explicit pre-tokenization wins, then the caller's
    // factory, then the language pack's default.
    if (tokenized) {
        tokenizerFactory = WhitespaceTokenizer.factory();
    }
    if (tokenizerFactory == null) {
        tokenizerFactory = tlp.getTokenizerFactory();
    }
    if (Test.verbose) {
        System.err.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        System.err.println("Sentence final words are: " + Arrays.asList(tlp.sentenceFinalPunctuationWords()));
        System.err.println("File encoding is: " + op.tlpParams.getInputEncoding());
    }
    documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
    documentPreprocessor.setEncoding(op.tlpParams.getInputEncoding());
    boolean saidMemMessage = false;

    // evaluation setup: optional log-likelihood scorers controlled by the
    // Test.evals properties.
    boolean runningAverages = Boolean.parseBoolean(Test.evals.getProperty("runningAverages"));
    boolean summary = Boolean.parseBoolean(Test.evals.getProperty("summary"));
    AbstractEval.ScoreEval pcfgLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("pcfgLL"))) {
        pcfgLL = new AbstractEval.ScoreEval("pcfgLL", runningAverages);
    }
    AbstractEval.ScoreEval depLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("depLL"))) {
        depLL = new AbstractEval.ScoreEval("depLL", runningAverages);
    }
    AbstractEval.ScoreEval factLL = null;
    if (Boolean.parseBoolean(Test.evals.getProperty("factLL"))) {
        factLL = new AbstractEval.ScoreEval("factLL", runningAverages);
    }

    /** Hide for performance
    timer.start();
            
    System.out.println("Parsing file: " + filename + " with " + document.size() + " sentences.");*/
    PrintWriter pwo = pwOut;

    // Main parse loop: docIndex tracks the character position of the current
    // sentence within `content`, advancing as sentences are consumed.
    int num = 0, docIndex = startSentence;
    for (List sentence : document) {
        // System.out.println(sentence.toString());
        num++;
        numSents++;
        int len = sentence.size();
        numWords += len;

        Tree ansTree = null;
        try {
            // Try the full parse; on failure fall back to the PCFG parse if
            // one is available and fallback is enabled.
            if (!parse(sentence)) {
                pwErr.print("Sentence couldn't be parsed by grammar.");
                if (pparser != null && pparser.hasParse() && fallbackToPCFG) {
                    pwErr.println("... falling back to PCFG parse.");
                    ansTree = getBestPCFGParse();
                    numFallback++;
                } else {
                    pwErr.println();
                    numUnparsable++;
                }
            } else {
                // System.out.println("Score: " + lp.pparser.bestScore);
                ansTree = getBestParse();
            }
            if (pcfgLL != null && pparser != null) {
                pcfgLL.recordScore(pparser, pwErr);
            }
            if (depLL != null && dparser != null) {
                depLL.recordScore(dparser, pwErr);
            }
            if (factLL != null && bparser != null) {
                factLL.recordScore(bparser, pwErr);
            }
        } catch (OutOfMemoryError e) {
            // -0xDEADBEEF is the "no explicit max length" sentinel; if the
            // caller set a real max length, rethrow rather than degrade.
            if (Test.maxLength != -0xDEADBEEF) {
                // this means they explicitly asked for a length they cannot handle. Throw exception.
                pwErr.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                pwo.println("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH " + Test.maxLength);
                throw e;
            } else {
                if (!saidMemMessage) {
                    printOutOfMemory(pwErr);
                    saidMemMessage = true;
                }
                if (pparser.hasParse() && fallbackToPCFG) {
                    try {
                        String what = "dependency";
                        if (dparser.hasParse()) {
                            what = "factored";
                        }
                        pwErr.println(
                                "Sentence too long for " + what + " parser.  Falling back to PCFG parse...");
                        ansTree = getBestPCFGParse();
                        numFallback++;
                    } catch (OutOfMemoryError oome) {
                        oome.printStackTrace();
                        numNoMemory++;
                        pwErr.println("No memory to gather PCFG parse. Skipping...");
                        pwo.println("Sentence skipped:  no PCFG fallback.");
                        pparser.nudgeDownArraySize();
                    }
                } else {
                    pwErr.println(
                            "Sentence has no parse using PCFG grammar (or no PCFG fallback).  Skipping...");
                    pwo.println("Sentence skipped: no PCFG fallback.");
                    numSkipped++;
                }
            }
        } catch (UnsupportedOperationException uEx) {
            pwErr.println("Sentence too long (or zero words).");
            pwo.println("Sentence skipped: too long (or zero words).");
            numWords -= len;
            numSkipped++;
        }

        // Convert the parse into RECK dependency/constituent trees, applying
        // the hyphen/point splitting passes, and collect the result.
        if (ansTree != null) {
            computePosition(docIndex, (Sentence) sentence, content);
            TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
            if (TDs.size() > 0)
                TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
            RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                    sentencePosition);
            DPTree = this.splitHyphen_Dependency(DPTree);
            DPTree = this.splitPoint_Dependency(DPTree);
            RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
            CTTree = this.splitHyphen_Constituent(CTTree);
            CTTree = this.splitPoint_Constituent(CTTree);
            RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree, CTTree);
            treeList.add(rpTree);
        }
        // crude addition of k-best tree printing
        // NOTE(review): these two branches duplicate the conversion above and
        // add a second copy of the same tree to treeList — confirm intended.
        if (Test.printPCFGkBest > 0 && pparser.hasParse()) {
            if (ansTree != null) {
                computePosition(docIndex, (Sentence) sentence, content);
                TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
                if (TDs.size() > 0)
                    TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
                RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                        sentencePosition);
                DPTree = this.splitHyphen_Dependency(DPTree);
                DPTree = this.splitPoint_Dependency(DPTree);
                RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
                CTTree = this.splitHyphen_Constituent(CTTree);
                CTTree = this.splitPoint_Constituent(CTTree);
                RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree,
                        CTTree);
                treeList.add(rpTree);
            }

        } else if (Test.printFactoredKGood > 0 && bparser.hasParse()) {
            // DZ: debug n best trees
            if (ansTree != null) {
                computePosition(docIndex, (Sentence) sentence, content);
                TDs = treePrint.getDependencies(ansTree, reckTreeList, sentencePosition);
                if (TDs.size() > 0)
                    TDs = treePrint.orderDependencies(TDs, ansTree.getLeaves().size());
                RECKDPTreeNodeImpl DPTree = treePrint.convertToDependencyTree(ansTree, reckTreeList,
                        sentencePosition);
                DPTree = this.splitHyphen_Dependency(DPTree);
                DPTree = this.splitPoint_Dependency(DPTree);
                RECKCTTreeNodeImpl CTTree = convertToRECKTree(ansTree, docIndex, content);
                CTTree = this.splitHyphen_Constituent(CTTree);
                CTTree = this.splitPoint_Constituent(CTTree);
                RECKParseTreeImpl rpTree = new RECKParseTreeImpl(sentence, TDs, sentencePosition, DPTree,
                        CTTree);
                treeList.add(rpTree);
            }
        }

        // Advance the document cursor to the end of this sentence.
        docIndex = sentencePosition.getEnd().intValue();

    } // for sentence : document

    if (Test.writeOutputFiles) {
        pwo.close();
    }
    System.out.println("Parsed file: " + filename + " [" + num + " sentences].");

    /** Hide for performance
    long millis = timer.stop();
            
    if (summary) {
    if (pcfgLL != null) pcfgLL.display(false, pwErr);
    if (depLL != null) depLL.display(false, pwErr);
    if (factLL != null) factLL.display(false, pwErr);
    }*/

    if (saidMemMessage) {
        printOutOfMemory(pwErr);
    }
    /** Hide for performance
    double wordspersec = numWords / (((double) millis) / 1000);
    double sentspersec = numSents / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
            
    System.out.println("Parsed " + numWords + " words in " + numSents +
        " sentences (" + nf.format(wordspersec) + " wds/sec; " +
        nf.format(sentspersec) + " sents/sec).");
     */
    // Summary of failures, mirroring Stanford's parser statistics output.
    if (numFallback > 0) {
        pwErr.println("  " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
        pwErr.println("  " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0) {
            pwErr.println("    " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0) {
            pwErr.println("    " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0) {
            pwErr.println("    " + numSkipped + " were skipped as length 0 or greater than " + Test.maxLength);
        }
    }

    return treeList;
}

From source file:weka.filters.unsupervised.attribute.PartOfSpeechTagging.java

License:Open Source License

/**
 * Obtains the sentences from the document.
 *
 * @param doc   the document to turn into sentences.
 * @return the list of sentences/* w w w  .  j a  va2s  .  co  m*/
 */
protected List<String> getSentences(String doc) {
    List<String> result;
    DocumentPreprocessor preProcessor;

    result = new ArrayList<String>();

    preProcessor = new DocumentPreprocessor(new StringReader(doc));
    preProcessor.setTokenizerFactory(getTokenizerFactory());

    for (List sentence : preProcessor)
        result.add(StringUtils.joinWithOriginalWhiteSpace(sentence));

    return result;
}

From source file:weka.gui.explorer.NLPParseTreePanel.java

License:Open Source License

/**
 * Obtains the sentences from the document.
 *
 * @param doc   the document to turn into sentences.
 * @return the list of sentences/* ww  w .ja v a  2s  . co  m*/
 */
protected List<String> getSentences(String doc) {
    List<String> result;
    DocumentPreprocessor preProcessor;

    result = new ArrayList<String>();

    try {
        preProcessor = new DocumentPreprocessor(new StringReader(doc));
        preProcessor.setTokenizerFactory(PartOfSpeechTagging.getTokenizerFactory());

        for (List sentence : preProcessor)
            result.add(StringUtils.joinWithOriginalWhiteSpace(sentence));
    } catch (Exception e) {
        showErrorMessage("Parsing error", "Failed to split document into sentences!", e);
    }

    return result;
}