Example usage for edu.stanford.nlp.process PTBTokenizer factory

List of usage examples for edu.stanford.nlp.process PTBTokenizer factory

Introduction

On this page you can find example usages of edu.stanford.nlp.process PTBTokenizer factory.

Prototype

public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> factory, String options) 

Source Link

Document

Get a TokenizerFactory that does Penn Treebank tokenization.

Usage

From source file:com.panot.JavaCoref.MyMUCMentionExtractor.java

License:Open Source License

/**
 * Constructs a MUC mention extractor: slurps the MUC corpus file named by the
 * {@code Constants.MUC_PROP} property, configures invertible PTB tokenization,
 * loads the Stanford preprocessing pipeline, and reads the experiment flags.
 *
 * @param dict      dictionaries shared with the coreference system
 * @param props     configuration properties; must contain the MUC file path
 * @param semantics semantic resources forwarded to the superclass
 * @throws Exception if the corpus file cannot be read or the pipeline fails to load
 */
public MyMUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
    super(dict, semantics);
    String fileName = props.getProperty(Constants.MUC_PROP);
    fileContents = IOUtils.slurpFile(fileName);
    currentOffset = 0;
    // "invertible" keeps enough information to map tokens back to the raw text.
    tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "invertible");
    stanfordProcessor = loadStanfordProcessor(props);

    // Presence of the property alone switches gold-mention mode on.
    useGoldMention = props.containsKey(MyConstants.USE_GOLD_MENTION_PROP);
    System.err.println(useGoldMention ? "Using Gold Mention" : "Not Using Gold Mention");

    // getProperty already returns null when the key is absent, matching the
    // previous containsKey/else-null logic.
    experimentType = props.getProperty(MyConstants.EXP_TYPE_PROP);

    tte_type = props.getProperty(MyConstants.TTE_TYPE);

    // Term-as-mention mode needs the mode flag set to "use" AND a model path.
    // Constant-first equals() is null-safe should tte_type ever be null here.
    if (props.containsKey(MyConstants.TTE_TYPE) && MyConstants.TTE_TYPE_USE.equals(tte_type)
            && props.containsKey(MyConstants.TTE_MODEL)) {
        System.err.println("MUC Extract Use term");
        use_term = true;
        System.err.println(tte_type);

        termAsMentionFinder = new TermAsMentionFinder();
    }

    this.props = props;
}

From source file:csav2.pkg0.ParserTagging.java

/**
 * Annotates {@code text} with the pipeline, parses each sentence, and returns the
 * CC-processed typed dependencies of every sentence, one sentence per line.
 *
 * @param text raw input text to annotate and parse
 * @return newline-terminated typed-dependency collections; empty string when
 *         annotation fails and no sentences are available
 */
public String dependency(String text) {
    Annotation document = new Annotation(text);
    try {
        pipeline.annotate(document);
    } catch (Exception e) {
        System.out.println("Exception while calling method annotate(Method dependency):" + e);
    }
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    if (sentences == null) {
        // annotate() failed above and was logged; nothing to parse.
        return "";
    }
    // Hoisted out of the loop: one tokenizer factory serves every sentence.
    TokenizerFactory tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    // StringBuilder avoids O(n^2) string concatenation across sentences.
    StringBuilder output = new StringBuilder();
    for (CoreMap sentence : sentences) {
        List wordList = tokenizerFactory.getTokenizer(new StringReader(sentence.toString())).tokenize();
        Tree tree = lp.apply(wordList);
        GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
        Collection cc = gs.typedDependenciesCCprocessed(true);
        output.append(cc).append('\n');
    }
    return output.toString();
}

From source file:DependencyParser.Parser.java

/**
 * Parses {@code text} with the English PCFG model, prints its CC-processed typed
 * dependencies, writes the Penn tree plus collapsed dependencies to a file, and
 * renders a dependency image via {@code Main.writeImage}.
 *
 * NOTE(review): output paths are hard-coded to this machine ("H:\\Thesis ...");
 * consider making them parameters or configuration.
 *
 * @param text the sentence to parse (parser is capped at 500 tokens)
 */
public void CallParser(String text) {
    try {
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        LexicalizedParser lp = LexicalizedParser
                .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
        lp.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" });
        TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();
        Tree tree = lp.apply(wordList);

        GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
        Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(true);
        System.out.println(tdl);

        // try-with-resources guarantees the writer is closed even if printing throws.
        try (PrintWriter pw = new PrintWriter(
                "H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\Text-Parsed.txt")) {
            TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
            tp.printTree(tree, pw);
        }
        Main.writeImage(tree, tdl, "H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\image.png", 3);
        assert (new File("image.png").exists());
    } catch (FileNotFoundException f) {
        // Was silently swallowed; surface the failure so it can be diagnosed.
        Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, "Could not create output file", f);
    } catch (Exception ex) {
        Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:Engines.Test.StanfordParser.TreeHandling.java

License:Open Source License

/**
 * Smoke-tests the Stanford parsing pipeline: tokenizes {@code text}, parses it
 * with the English PCFG model, and derives its CC-processed typed dependencies.
 * All results are discarded; the method only exercises the pipeline end-to-end.
 */
public static void test(String text) {
    LexicalizedParser parser = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    parser.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" });
    TreebankLanguagePack langPack = new PennTreebankLanguagePack();
    GrammaticalStructureFactory structureFactory = langPack.grammaticalStructureFactory();
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();
    Tree parseTree = parser.apply(tokens);
    GrammaticalStructure structure = structureFactory.newGrammaticalStructure(parseTree);
    Collection<TypedDependency> dependencies = structure.typedDependenciesCCprocessed(true);
}

From source file:englishparser.EnglishParser.java

/**
 * demoAPI demonstrates other ways of calling the parser with already
 * tokenized text, or in some cases, raw text that needs to be tokenized as
 * a single sentence. Output is handled with a TreePrint object. Note that
 * the options used when creating the TreePrint can determine what results
 * to print out. Once again, one can capture the output by passing a
 * PrintWriter to TreePrint.printTree./*from   w  w w . ja v  a  2 s.  co  m*/
 */
/**
 * demoAPI demonstrates other ways of calling the parser: with already
 * tokenized text, or with raw text that first needs to be tokenized as a
 * single sentence. Output is handled with a TreePrint object; the options
 * used when creating the TreePrint determine what gets printed. Output can
 * also be captured by passing a PrintWriter to TreePrint.printTree.
 */
public static void demoAPI(LexicalizedParser lp) {
    // First variant: hand the parser a list of pre-tokenized words.
    String[] sent = { "This", "is", "an", "easy", "sentence", "." };
    Tree parse = lp.apply(Sentence.toCoreLabelList(sent));
    parse.pennPrint();
    System.out.println();

    // Second variant: tokenize raw text explicitly before parsing.
    String sent2 = "This is another sentence.";
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> rawWords2 = tokenizerFactory.getTokenizer(new StringReader(sent2)).tokenize();
    parse = lp.apply(rawWords2);

    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructure gs = tlp.grammaticalStructureFactory().newGrammaticalStructure(parse);
    System.out.println(gs.typedDependenciesCCprocessed());
    System.out.println();

    // A TreePrint can emit both the Penn tree and its collapsed dependencies.
    new TreePrint("penn,typedDependenciesCollapsed").printTree(parse);
}

From source file:flight_ranker.TaggerDemo2.java

/**
 * Tags a UTF-8 text file with a Maxent POS model, printing one tagged sentence
 * per line, then demonstrates word/tag access by printing the adjectives of a
 * fixed example sentence.
 *
 * @param args args[0] = tagger model path, args[1] = UTF-8 text file to tag
 * @throws Exception if the model cannot be loaded or the input cannot be read
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    // "untokenizable=noneKeep": keep characters the tokenizer cannot handle,
    // without logging them (per PTBTokenizer options).
    TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
            "untokenizable=noneKeep");
    // try-with-resources closes both streams even if tagging throws (the
    // original leaked them on any exception).
    try (BufferedReader r = new BufferedReader(
            new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
            PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
        documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
        for (List<HasWord> sentence : documentPreprocessor) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            pw.println(Sentence.listToString(tSentence, false));
        }

        // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
        List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",",
                "green", "grass", ".");
        List<TaggedWord> taggedSent = tagger.tagSentence(sent);
        for (TaggedWord tw : taggedSent) {
            if (tw.tag().startsWith("JJ")) {
                pw.println(tw.word());
            }
        }
    }
}

From source file:io.anserini.qa.RetrieveSentences.java

License:Apache License

/**
 * Retrieves documents for {@code args}, PTB-tokenizes every sentence of each
 * retrieved document, scores the sentences against the tokenized query, and
 * prints the top-ranked passages with their scores.
 *
 * @param args retrieval arguments (index path, query, ...)
 * @throws Exception if index access or scoring fails
 */
public void getRankedPassages(Args args) throws Exception {
    Map<String, Float> scoredDocs = retrieveDocuments(args);
    Map<String, Float> sentencesMap = new LinkedHashMap<>();
    IndexUtils util = new IndexUtils(args.index);
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

    for (Map.Entry<String, Float> doc : scoredDocs.entrySet()) {
        for (Sentence sent : util.getSentDocument(doc.getKey())) {
            String answerTokens = tokenizerFactory.getTokenizer(new StringReader(sent.text()))
                    .tokenize().stream()
                    .map(CoreLabel::toString)
                    .collect(Collectors.joining(" "));
            // Map.put: later sentences with identical token strings overwrite earlier scores.
            sentencesMap.put(answerTokens, doc.getValue());
        }
    }

    String queryTokens = tokenizerFactory.getTokenizer(new StringReader(args.query)).tokenize().stream()
            .map(CoreLabel::toString).collect(Collectors.joining(" "));
    scorer.score(queryTokens, sentencesMap);

    for (ScoredPassage s : scorer.extractTopPassages()) {
        System.out.println(s.getSentence() + " " + s.getScore());
    }
}

From source file:knu.univ.lingvo.coref.MUCMentionExtractor.java

License:Open Source License

/**
 * Constructs a MUC mention extractor: slurps the MUC corpus file named by the
 * {@code Constants.MUC_PROP} property, sets up PTB tokenization, and loads the
 * Stanford preprocessing pipeline.
 *
 * @param dict      dictionaries shared with the coreference system
 * @param props     configuration properties; must contain the MUC file path
 * @param semantics semantic resources forwarded to the superclass
 * @throws Exception if the corpus file cannot be read or the pipeline fails to load
 */
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
    super(dict, semantics);
    String fileName = props.getProperty(Constants.MUC_PROP);
    fileContents = IOUtils.slurpFile(fileName);
    currentOffset = 0;
    // NOTE(review): the 'false' flag presumably disables token character offsets —
    // confirm against CoreLabelTokenFactory docs.
    tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
    stanfordProcessor = loadStanfordProcessor(props);
}

From source file:nlpedit.ui.NLPEditPanel.java

License:Open Source License

/**
 * Builds the editor panel: constructs the GUI, prepares the text highlight
 * styles, loads the English PCFG parser plus a PTB tokenizer factory, and
 * starts with no project open.
 */
public NLPEditPanel() {
    buildGUI();

    // Yellow background marks highlighted spans; the normal style restores the
    // text pane's own background color.
    highlightStyle = new SimpleAttributeSet();
    normalStyle = new SimpleAttributeSet();
    StyleConstants.setBackground(highlightStyle, Color.yellow);
    StyleConstants.setBackground(normalStyle, textPane.getBackground());

    lp = LexicalizedParser.loadModel("englishPCFG.ser.gz");
    lpq = lp.parserQuery();
    tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

    project = null;
}

From source file:nlpedit.ui.NLPTreeEx.java

License:Open Source License

/**
 * Builds the tree-explorer frame: initializes the UI components, loads the
 * English PCFG parser and a PTB tokenizer factory, then shows the window.
 * Closing the frame exits the application (EXIT_ON_CLOSE).
 */
public NLPTreeEx() {
    initComponents();
    setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
    lp = LexicalizedParser.loadModel("englishPCFG.ser.gz");
    lpq = lp.parserQuery();
    tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    setVisible(true);
}