Usage examples for the no-argument constructor of edu.stanford.nlp.process.CoreLabelTokenFactory
public CoreLabelTokenFactory()
From source file:csav2.pkg0.ParserTagging.java
public String dependency(String text) { String output = ""; Annotation document = new Annotation(text); try {/*from www . j a v a 2s. c o m*/ pipeline.annotate(document); } catch (Exception e) { System.out.println("Exception while calling method annotate(Method dependency):" + e); } List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { TokenizerFactory tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); List wordList = tokenizerFactory.getTokenizer(new StringReader(sentence.toString())).tokenize(); //System.out.println(wordList.toString()); Tree tree = lp.apply(wordList); GrammaticalStructure gs = gsf.newGrammaticalStructure(tree); Collection cc = gs.typedDependenciesCCprocessed(true); output += cc.toString() + "\n"; } //System.out.println(output); return output; }
From source file:de.andreasschoknecht.LS3.PNMLReader.java
License:Open Source License
/** * Creates the term lists for a process model (LS3Document) in a model collection. Adds the terms to the document itself as Bag-of-Words and adds the terms to * the HashSet of terms of the document collection. This method is used when parsing a document collection. * * @param labels The labels contained in the PNML file * @param ls3Document The LS3Document representation of the PNML file for updating the term list of the document * @param documentCollection The DocumentCollection for updating the term list of the whole collection * @throws IOException if stop word file could not be read */// w w w.j av a 2s. c o m private void createTermLists(List<Object> labels, LS3Document ls3Document, DocumentCollection documentCollection) throws IOException { initializeWordList(); ArrayList<String> tokens = new ArrayList<String>(); String label = ""; for (Object temp : labels) { Element value = (Element) temp; label = label + value.getText() + " "; } PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(label), new CoreLabelTokenFactory(), "untokenizable=allKeep"); while (ptbt.hasNext()) { tokens.add(ptbt.next().value()); } for (int i = 0, j = tokens.size(); i < j; i++) { String bereinigt = tokens.get(i).toLowerCase(); // Clear tokens of empty tokens, stop words, and automatic tool labels if (!bereinigt.matches("(p|t)*([0-9]+)") && !stopwords.contains(bereinigt) && !bereinigt.equals("")) { String term = bereinigt.replaceAll("[0-9]+", ""); ls3Document.addTerm(stemString(term)); documentCollection.addTerm(stemString(term)); } } }
From source file:de.andreasschoknecht.LS3.PNMLReader.java
License:Open Source License
/** * Creates the term list for a process model (LS3Document). It only adds the terms to the document itself as Bag-of-Words. * This method is used when parsing a query model. * * @param labels The labels contained in the PNML file * @param ls3Document The LS3Document representation of the PNML file for updating the term list of the document * @throws IOException if stop word file could not be read *//*from ww w . j a v a 2s.co m*/ private void createTermLists(List<Object> labels, LS3Document ls3Document) throws IOException { initializeWordList(); ArrayList<String> tokens = new ArrayList<String>(); String label = ""; for (Object temp : labels) { Element value = (Element) temp; label = label + value.getText() + " "; } PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(label), new CoreLabelTokenFactory(), "untokenizable=allKeep"); while (ptbt.hasNext()) { tokens.add(ptbt.next().value()); } for (int i = 0, j = tokens.size(); i < j; i++) { String bereinigt = tokens.get(i).toLowerCase(); // Clear tokens of empty tokens, stop words, and automatic tool labels if (!bereinigt.matches("(p|t)*([0-9]+)") && !stopwords.contains(bereinigt) && !bereinigt.equals("")) { String term = bereinigt.replaceAll("[0-9]+", ""); ls3Document.addTerm(stemString(term)); } } }
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPtbTransformer.java
License:Open Source License
@Override public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException { Tokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(new StringReader(aInput.getDocumentText()), new CoreLabelTokenFactory(), "invertible"); for (CoreLabel label : tokenizer.tokenize()) { replace(label.beginPosition(), label.endPosition(), label.word()); }//w w w .j av a 2 s. c o m }
From source file:DependencyParser.Parser.java
public void CallParser(String text) // start of the main method { try {/* www. j a v a 2s.c o m*/ TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); LexicalizedParser lp = LexicalizedParser .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); lp.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" }); TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize(); Tree tree = lp.apply(wordList); GrammaticalStructure gs = gsf.newGrammaticalStructure(tree); Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(true); System.out.println(tdl); PrintWriter pw = new PrintWriter("H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\Text-Parsed.txt"); TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); tp.printTree(tree, pw); pw.close(); Main.writeImage(tree, tdl, "H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\image.png", 3); assert (new File("image.png").exists()); } catch (FileNotFoundException f) { } catch (Exception ex) { Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:edu.illinois.cs.cogcomp.pipeline.handlers.StanfordParseHandler.java
License:Open Source License
/**
 * Converts a CogComp {@code TextAnnotation} into Stanford {@code CoreMap}
 * sentences by walking the TOKENS view in order and flushing accumulated
 * {@code CoreLabel}s into a sentence whenever a token starts at or beyond the
 * current sentence's end span.
 *
 * Assumes the SENTENCE view has at least one constituent and that every token
 * falls inside some sentence; the final sentence is flushed after the loop.
 */
static List<CoreMap> buildStanfordSentences(TextAnnotation ta) {
    View tokens = ta.getView(ViewNames.TOKENS);
    View sentences = ta.getView(ViewNames.SENTENCE);
    String rawText = ta.getText();

    List<CoreMap> stanfordSentences = new LinkedList<>();
    List<CoreLabel> stanfordTokens = new LinkedList<>();
    int tokIndex = 0;
    int sentIndex = 0;
    // Start with the first sentence and its raw-text slice.
    Constituent currentSentence = sentences.getConstituents().get(0);
    String sentText = rawText.substring(currentSentence.getStartCharOffset(),
            currentSentence.getEndCharOffset());

    CoreLabelTokenFactory tf = new CoreLabelTokenFactory();

    for (Constituent tok : tokens.getConstituents()) {
        // Token belongs to a later sentence: flush the current one and advance.
        if (tok.getStartSpan() >= currentSentence.getEndSpan()) {
            CoreMap stanfordSentence = buildStanfordSentence(currentSentence, sentText, sentIndex++,
                    stanfordTokens);
            stanfordSentences.add(stanfordSentence);
            stanfordTokens = new LinkedList<>();
            currentSentence = sentences.getConstituents().get(sentIndex);
            sentText = rawText.substring(currentSentence.getStartCharOffset(),
                    currentSentence.getEndCharOffset());
        }
        // Build the Stanford token from character offsets into the raw text.
        int tokStart = tok.getStartCharOffset();
        int tokLength = tok.getEndCharOffset() - tokStart;
        String form = rawText.substring(tokStart, tok.getEndCharOffset());
        CoreLabel stanfordTok = tf.makeToken(form, tokStart, tokLength);
        // NOTE(review): tokIndex increments across sentence boundaries and is
        // 0-based; Stanford token indices are conventionally 1-based per
        // sentence — confirm downstream consumers expect this numbering.
        stanfordTok.setIndex(tokIndex++);
        stanfordTokens.add(stanfordTok);
    }
    // should be one last sentence
    CoreMap stanfordSentence = buildStanfordSentence(currentSentence, sentText, sentIndex, stanfordTokens);
    stanfordSentences.add(stanfordSentence);
    return stanfordSentences;
}
From source file:Engines.Test.StanfordParser.TreeHandling.java
License:Open Source License
/**
 * Exercises the full parse pipeline on {@code text}: loads the English PCFG
 * model, tokenizes, parses, and derives CC-processed typed dependencies.
 * All results are discarded; this method only verifies the pipeline runs.
 */
public static void test(String text) {
    LexicalizedParser parser = LexicalizedParser
            .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    parser.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" });

    // Tokenize the raw text into CoreLabels, then parse.
    TokenizerFactory<CoreLabel> tokFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = tokFactory.getTokenizer(new StringReader(text)).tokenize();
    Tree parseTree = parser.apply(tokens);

    // Derive the grammatical structure and its CC-processed dependencies.
    TreebankLanguagePack langPack = new PennTreebankLanguagePack();
    GrammaticalStructureFactory structureFactory = langPack.grammaticalStructureFactory();
    GrammaticalStructure structure = structureFactory.newGrammaticalStructure(parseTree);
    Collection<TypedDependency> dependencies = structure.typedDependenciesCCprocessed(true);
}
From source file:englishparser.EnglishParser.java
/** * demoAPI demonstrates other ways of calling the parser with already * tokenized text, or in some cases, raw text that needs to be tokenized as * a single sentence. Output is handled with a TreePrint object. Note that * the options used when creating the TreePrint can determine what results * to print out. Once again, one can capture the output by passing a * PrintWriter to TreePrint.printTree./*from w w w. j a v a 2s .c o m*/ */ public static void demoAPI(LexicalizedParser lp) { // This option shows parsing a list of correctly tokenized words String[] sent = { "This", "is", "an", "easy", "sentence", "." }; List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent); Tree parse = lp.apply(rawWords); parse.pennPrint(); System.out.println(); // This option shows loading and using an explicit tokenizer String sent2 = "This is another sentence."; TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2)); List<CoreLabel> rawWords2 = tok.tokenize(); parse = lp.apply(rawWords2); TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); System.out.println(tdl); System.out.println(); // You can also use a TreePrint object to print trees and dependencies TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); tp.printTree(parse); }
From source file:flight_ranker.TaggerDemo2.java
public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("usage: java TaggerDemo2 modelFile fileToTag"); return;/*www. j ava2 s .c om*/ } MaxentTagger tagger = new MaxentTagger(args[0]); TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep"); BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8")); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8")); DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r); documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory); for (List<HasWord> sentence : documentPreprocessor) { List<TaggedWord> tSentence = tagger.tagSentence(sentence); pw.println(Sentence.listToString(tSentence, false)); } // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence. List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", "."); List<TaggedWord> taggedSent = tagger.tagSentence(sent); for (TaggedWord tw : taggedSent) { if (tw.tag().startsWith("JJ")) { pw.println(tw.word()); } } pw.close(); }
From source file:gate.stanford.Tokenizer.java
License:Open Source License
@Override public void execute() throws ExecutionException { // check the parameters if (document == null) throw new ExecutionException("No document to process!"); AnnotationSet inputAS = document.getAnnotations(inputASName); AnnotationSet outputAS = document.getAnnotations(outputASName); long startTime = System.currentTimeMillis(); fireStatusChanged("Tokenising " + document.getName()); fireProgressChanged(0);/*from w w w. jav a 2 s . c om*/ // tokenising goes here String rawText = ""; try { rawText = document.getContent().getContent(new Long(0), document.getContent().size()).toString(); } catch (Exception e) { System.out.println("Document content offsets wrong: " + e); } PTBTokenizer<CoreLabel> ptbt; try { ptbt = new PTBTokenizer<CoreLabel>(new StringReader(rawText), new CoreLabelTokenFactory(), "invertible=true"); } catch (Exception e) { System.out.println("Failed when calling tokenizer: " + e); return; } Long tokenStart; Long tokenEnd; Long prevTokenEnd = new Long(0); // this default value lets us capture leading spaces for (CoreLabel label; ptbt.hasNext();) { label = ptbt.next(); tokenStart = new Long(label.beginPosition()); tokenEnd = new Long(label.endPosition()); SimpleFeatureMapImpl tokenMap = new SimpleFeatureMapImpl(); // add the token annotation try { tokenMap.put(TOKEN_STRING_FEATURE, document.getContent().getContent(tokenStart, tokenEnd).toString()); outputAS.add(tokenStart, tokenEnd, tokenLabel, tokenMap); } catch (InvalidOffsetException e) { System.out.println("Token alignment problem:" + e); } // do we need to add a space annotation? 
if (tokenStart > prevTokenEnd) { try { outputAS.add(prevTokenEnd, tokenStart, spaceLabel, new SimpleFeatureMapImpl()); } catch (InvalidOffsetException e) { System.out.println("Space token alignment problem:" + e); } } prevTokenEnd = tokenEnd; } fireProcessFinished(); fireStatusChanged(document.getName() + " tokenised in " + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000) + " seconds!"); }