Usage examples for the `factory` method of `edu.stanford.nlp.process.PTBTokenizer`
public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> factory, String options)
From source file:com.panot.JavaCoref.MyMUCMentionExtractor.java
License:Open Source License
/**
 * Builds the extractor: slurps the whole MUC corpus file named by
 * {@code Constants.MUC_PROP}, resets the character offset, prepares an
 * invertible PTB tokenizer and the Stanford pipeline, and reads the
 * gold-mention / experiment-type / term-extraction switches from {@code props}.
 */
public MyMUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
    super(dict, semantics);
    String corpusPath = props.getProperty(Constants.MUC_PROP);
    fileContents = IOUtils.slurpFile(corpusPath);
    currentOffset = 0;
    // "invertible" keeps character offsets so tokens can be mapped back to the raw text.
    tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "invertible");
    stanfordProcessor = loadStanfordProcessor(props);
    useGoldMention = props.containsKey(MyConstants.USE_GOLD_MENTION_PROP);
    if (useGoldMention) {
        System.err.println("Using Gold Mention");
    } else {
        System.err.println("Not Using Gold Mention");
    }
    experimentType = props.containsKey(MyConstants.EXP_TYPE_PROP)
            ? props.getProperty(MyConstants.EXP_TYPE_PROP)
            : null;
    tte_type = props.getProperty(MyConstants.TTE_TYPE);
    // Term extraction is enabled only when the type property is present, says "use",
    // and a model is configured.
    boolean wantsTermExtraction = props.containsKey(MyConstants.TTE_TYPE)
            && tte_type.equals(MyConstants.TTE_TYPE_USE)
            && props.containsKey(MyConstants.TTE_MODEL);
    if (wantsTermExtraction) {
        System.err.println("MUC Extract Use term");
        use_term = true;
        System.err.println(tte_type);
        termAsMentionFinder = new TermAsMentionFinder();
    }
    this.props = props;
}
From source file:csav2.pkg0.ParserTagging.java
public String dependency(String text) { String output = ""; Annotation document = new Annotation(text); try {/*from w w w. j a v a 2s . co m*/ pipeline.annotate(document); } catch (Exception e) { System.out.println("Exception while calling method annotate(Method dependency):" + e); } List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { TokenizerFactory tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); List wordList = tokenizerFactory.getTokenizer(new StringReader(sentence.toString())).tokenize(); //System.out.println(wordList.toString()); Tree tree = lp.apply(wordList); GrammaticalStructure gs = gsf.newGrammaticalStructure(tree); Collection cc = gs.typedDependenciesCCprocessed(true); output += cc.toString() + "\n"; } //System.out.println(output); return output; }
From source file:DependencyParser.Parser.java
public void CallParser(String text) // start of the main method { try {/*from w ww. j ava 2 s .com*/ TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); LexicalizedParser lp = LexicalizedParser .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); lp.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" }); TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize(); Tree tree = lp.apply(wordList); GrammaticalStructure gs = gsf.newGrammaticalStructure(tree); Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(true); System.out.println(tdl); PrintWriter pw = new PrintWriter("H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\Text-Parsed.txt"); TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); tp.printTree(tree, pw); pw.close(); Main.writeImage(tree, tdl, "H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\image.png", 3); assert (new File("image.png").exists()); } catch (FileNotFoundException f) { } catch (Exception ex) { Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:Engines.Test.StanfordParser.TreeHandling.java
License:Open Source License
/**
 * Demo: tokenizes {@code text} with the PTB tokenizer, parses it with the English
 * PCFG parser, and derives the CC-processed typed dependencies. Results are
 * computed but not returned or printed.
 */
public static void test(String text) {
    TreebankLanguagePack languagePack = new PennTreebankLanguagePack();
    GrammaticalStructureFactory structureFactory = languagePack.grammaticalStructureFactory();
    LexicalizedParser parser = LexicalizedParser
            .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    parser.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" });
    TokenizerFactory<CoreLabel> tokenizers = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = tokenizers.getTokenizer(new StringReader(text)).tokenize();
    Tree parseTree = parser.apply(tokens);
    GrammaticalStructure structure = structureFactory.newGrammaticalStructure(parseTree);
    Collection<TypedDependency> dependencies = structure.typedDependenciesCCprocessed(true);
}
From source file:englishparser.EnglishParser.java
/** * demoAPI demonstrates other ways of calling the parser with already * tokenized text, or in some cases, raw text that needs to be tokenized as * a single sentence. Output is handled with a TreePrint object. Note that * the options used when creating the TreePrint can determine what results * to print out. Once again, one can capture the output by passing a * PrintWriter to TreePrint.printTree./*from w w w . ja v a 2 s. co m*/ */ public static void demoAPI(LexicalizedParser lp) { // This option shows parsing a list of correctly tokenized words String[] sent = { "This", "is", "an", "easy", "sentence", "." }; List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent); Tree parse = lp.apply(rawWords); parse.pennPrint(); System.out.println(); // This option shows loading and using an explicit tokenizer String sent2 = "This is another sentence."; TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2)); List<CoreLabel> rawWords2 = tok.tokenize(); parse = lp.apply(rawWords2); TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); System.out.println(tdl); System.out.println(); // You can also use a TreePrint object to print trees and dependencies TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); tp.printTree(parse); }
From source file:flight_ranker.TaggerDemo2.java
public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("usage: java TaggerDemo2 modelFile fileToTag"); return;//from ww w . ja va 2 s . c o m } MaxentTagger tagger = new MaxentTagger(args[0]); TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep"); BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8")); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8")); DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r); documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory); for (List<HasWord> sentence : documentPreprocessor) { List<TaggedWord> tSentence = tagger.tagSentence(sentence); pw.println(Sentence.listToString(tSentence, false)); } // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence. List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", "."); List<TaggedWord> taggedSent = tagger.tagSentence(sent); for (TaggedWord tw : taggedSent) { if (tw.tag().startsWith("JJ")) { pw.println(tw.word()); } } pw.close(); }
From source file:io.anserini.qa.RetrieveSentences.java
License:Apache License
/**
 * Retrieves documents for {@code args.query}, splits each into sentences,
 * tokenizes every sentence (and the query itself) with the PTB tokenizer, scores
 * the candidate sentences against the query, and prints the top-ranked passages
 * with their scores.
 */
public void getRankedPassages(Args args) throws Exception {
    Map<String, Float> documentScores = retrieveDocuments(args);
    // LinkedHashMap keeps candidate sentences in retrieval order.
    Map<String, Float> candidateSentences = new LinkedHashMap<>();
    IndexUtils indexUtils = new IndexUtils(args.index);
    TokenizerFactory<CoreLabel> tokenizers = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    for (Map.Entry<String, Float> scoredDoc : documentScores.entrySet()) {
        for (Sentence sentence : indexUtils.getSentDocument(scoredDoc.getKey())) {
            List<CoreLabel> tokens = tokenizers.getTokenizer(new StringReader(sentence.text())).tokenize();
            // Join tokens with single spaces; each sentence inherits its document's score.
            StringBuilder joined = new StringBuilder();
            for (CoreLabel token : tokens) {
                if (joined.length() > 0) {
                    joined.append(' ');
                }
                joined.append(token.toString());
            }
            candidateSentences.put(joined.toString(), scoredDoc.getValue());
        }
    }
    // Tokenize the query the same way so it is comparable to the sentences.
    StringBuilder queryJoined = new StringBuilder();
    for (CoreLabel token : tokenizers.getTokenizer(new StringReader(args.query)).tokenize()) {
        if (queryJoined.length() > 0) {
            queryJoined.append(' ');
        }
        queryJoined.append(token.toString());
    }
    scorer.score(queryJoined.toString(), candidateSentences);
    for (ScoredPassage passage : scorer.extractTopPassages()) {
        System.out.println(passage.getSentence() + " " + passage.getScore());
    }
}
From source file:knu.univ.lingvo.coref.MUCMentionExtractor.java
License:Open Source License
/**
 * Reads the entire MUC corpus file named by {@code Constants.MUC_PROP} into
 * memory, resets the character offset, and prepares the PTB tokenizer and the
 * Stanford processing pipeline.
 */
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
    super(dict, semantics);
    String mucFile = props.getProperty(Constants.MUC_PROP);
    fileContents = IOUtils.slurpFile(mucFile);
    currentOffset = 0;
    tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
    stanfordProcessor = loadStanfordProcessor(props);
}
From source file:nlpedit.ui.NLPEditPanel.java
License:Open Source License
/**
 * Sets up the editor panel: builds the UI, prepares the highlight/normal text
 * styles, loads the English PCFG parser with its query object, and creates a PTB
 * tokenizer. No project is open initially.
 */
public NLPEditPanel() {
    buildGUI();
    // Highlighted spans get a yellow background; the "normal" style restores the
    // pane's own background color.
    highlightStyle = new SimpleAttributeSet();
    StyleConstants.setBackground(highlightStyle, Color.yellow);
    normalStyle = new SimpleAttributeSet();
    StyleConstants.setBackground(normalStyle, textPane.getBackground());
    lp = LexicalizedParser.loadModel("englishPCFG.ser.gz");
    lpq = lp.parserQuery();
    tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    project = null;
}
From source file:nlpedit.ui.NLPTreeEx.java
License:Open Source License
/**
 * Builds the frame, loads the English PCFG parser plus a parser query and a PTB
 * tokenizer, and shows the window. Closing the window exits the application.
 */
public NLPTreeEx() {
    initComponents();
    setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
    lp = LexicalizedParser.loadModel("englishPCFG.ser.gz");
    lpq = lp.parserQuery();
    tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    setVisible(true);
}