List of usage examples for the edu.stanford.nlp.process.PTBTokenizer constructor
public PTBTokenizer(final Reader r, final LexedTokenFactory<T> tokenFactory, final String options)
From source file:DependencyParse.java
License:Apache License
public static void main(String[] args) throws Exception { Properties props = StringUtils.argsToProperties(args); if (!props.containsKey("tokpath") || !props.containsKey("parentpath") || !props.containsKey("relpath")) { System.err.println(// w ww . ja v a2 s.co m "usage: java DependencyParse -tokenize - -tokpath <tokpath> -parentpath <parentpath> -relpath <relpath>"); System.exit(1); } boolean tokenize = false; if (props.containsKey("tokenize")) { tokenize = true; } String tokPath = props.getProperty("tokpath"); String parentPath = props.getProperty("parentpath"); String relPath = props.getProperty("relpath"); BufferedWriter tokWriter = new BufferedWriter(new FileWriter(tokPath)); BufferedWriter parentWriter = new BufferedWriter(new FileWriter(parentPath)); BufferedWriter relWriter = new BufferedWriter(new FileWriter(relPath)); MaxentTagger tagger = new MaxentTagger(TAGGER_MODEL); DependencyParser parser = DependencyParser.loadFromModelFile(PARSER_MODEL); Scanner stdin = new Scanner(System.in); int count = 0; long start = System.currentTimeMillis(); while (stdin.hasNextLine()) { String line = stdin.nextLine(); List<HasWord> tokens = new ArrayList<>(); if (tokenize) { PTBTokenizer<Word> tokenizer = new PTBTokenizer(new StringReader(line), new WordTokenFactory(), ""); for (Word label; tokenizer.hasNext();) { tokens.add(tokenizer.next()); } } else { for (String word : line.split(" ")) { tokens.add(new Word(word)); } } List<TaggedWord> tagged = tagger.tagSentence(tokens); int len = tagged.size(); Collection<TypedDependency> tdl = parser.predict(tagged).typedDependencies(); int[] parents = new int[len]; for (int i = 0; i < len; i++) { // if a node has a parent of -1 at the end of parsing, then the node // has no parent. 
parents[i] = -1; } String[] relns = new String[len]; for (TypedDependency td : tdl) { // let root have index 0 int child = td.dep().index(); int parent = td.gov().index(); relns[child - 1] = td.reln().toString(); parents[child - 1] = parent; } // print tokens StringBuilder sb = new StringBuilder(); for (int i = 0; i < len - 1; i++) { if (tokenize) { sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word())); } else { sb.append(tokens.get(i).word()); } sb.append(' '); } if (tokenize) { sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word())); } else { sb.append(tokens.get(len - 1).word()); } sb.append('\n'); tokWriter.write(sb.toString()); // print parent pointers sb = new StringBuilder(); for (int i = 0; i < len - 1; i++) { sb.append(parents[i]); sb.append(' '); } sb.append(parents[len - 1]); sb.append('\n'); parentWriter.write(sb.toString()); // print relations sb = new StringBuilder(); for (int i = 0; i < len - 1; i++) { sb.append(relns[i]); sb.append(' '); } sb.append(relns[len - 1]); sb.append('\n'); relWriter.write(sb.toString()); count++; if (count % 1000 == 0) { double elapsed = (System.currentTimeMillis() - start) / 1000.0; System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed); } } long totalTimeMillis = System.currentTimeMillis() - start; System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n", count, totalTimeMillis / 1000.0, totalTimeMillis / (double) count); tokWriter.close(); parentWriter.close(); relWriter.close(); }
From source file:ConstituencyParse.java
License:Apache License
public List<HasWord> sentenceToTokens(String line) { List<HasWord> tokens = new ArrayList<>(); if (tokenize) { PTBTokenizer<Word> tokenizer = new PTBTokenizer(new StringReader(line), new WordTokenFactory(), ""); for (Word label; tokenizer.hasNext();) { tokens.add(tokenizer.next()); }//from w w w. j av a2s . c o m } else { for (String word : line.split(" ")) { tokens.add(new Word(word)); } } return tokens; }
From source file:de.andreasschoknecht.LS3.PNMLReader.java
License:Open Source License
/** * Creates the term lists for a process model (LS3Document) in a model collection. Adds the terms to the document itself as Bag-of-Words and adds the terms to * the HashSet of terms of the document collection. This method is used when parsing a document collection. * * @param labels The labels contained in the PNML file * @param ls3Document The LS3Document representation of the PNML file for updating the term list of the document * @param documentCollection The DocumentCollection for updating the term list of the whole collection * @throws IOException if stop word file could not be read *//*w w w .ja va2 s .c om*/ private void createTermLists(List<Object> labels, LS3Document ls3Document, DocumentCollection documentCollection) throws IOException { initializeWordList(); ArrayList<String> tokens = new ArrayList<String>(); String label = ""; for (Object temp : labels) { Element value = (Element) temp; label = label + value.getText() + " "; } PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(label), new CoreLabelTokenFactory(), "untokenizable=allKeep"); while (ptbt.hasNext()) { tokens.add(ptbt.next().value()); } for (int i = 0, j = tokens.size(); i < j; i++) { String bereinigt = tokens.get(i).toLowerCase(); // Clear tokens of empty tokens, stop words, and automatic tool labels if (!bereinigt.matches("(p|t)*([0-9]+)") && !stopwords.contains(bereinigt) && !bereinigt.equals("")) { String term = bereinigt.replaceAll("[0-9]+", ""); ls3Document.addTerm(stemString(term)); documentCollection.addTerm(stemString(term)); } } }
From source file:de.andreasschoknecht.LS3.PNMLReader.java
License:Open Source License
/** * Creates the term list for a process model (LS3Document). It only adds the terms to the document itself as Bag-of-Words. * This method is used when parsing a query model. * * @param labels The labels contained in the PNML file * @param ls3Document The LS3Document representation of the PNML file for updating the term list of the document * @throws IOException if stop word file could not be read */// w ww.j a v a 2s.com private void createTermLists(List<Object> labels, LS3Document ls3Document) throws IOException { initializeWordList(); ArrayList<String> tokens = new ArrayList<String>(); String label = ""; for (Object temp : labels) { Element value = (Element) temp; label = label + value.getText() + " "; } PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(label), new CoreLabelTokenFactory(), "untokenizable=allKeep"); while (ptbt.hasNext()) { tokens.add(ptbt.next().value()); } for (int i = 0, j = tokens.size(); i < j; i++) { String bereinigt = tokens.get(i).toLowerCase(); // Clear tokens of empty tokens, stop words, and automatic tool labels if (!bereinigt.matches("(p|t)*([0-9]+)") && !stopwords.contains(bereinigt) && !bereinigt.equals("")) { String term = bereinigt.replaceAll("[0-9]+", ""); ls3Document.addTerm(stemString(term)); } } }
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPtbTransformer.java
License:Open Source License
/**
 * Tokenizes the input CAS text with the Stanford PTB tokenizer and replaces each
 * token span in the output view with the tokenizer's (possibly normalized) text
 * for that token.
 */
@Override
public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException {
    // "invertible" keeps the original character offsets on every CoreLabel,
    // which the replace() calls below rely on
    StringReader documentReader = new StringReader(aInput.getDocumentText());
    Tokenizer<CoreLabel> ptbTokenizer =
            new PTBTokenizer<CoreLabel>(documentReader, new CoreLabelTokenFactory(), "invertible");
    for (CoreLabel token : ptbTokenizer.tokenize()) {
        replace(token.beginPosition(), token.endPosition(), token.word());
    }
}
From source file:gate.stanford.Tokenizer.java
License:Open Source License
@Override public void execute() throws ExecutionException { // check the parameters if (document == null) throw new ExecutionException("No document to process!"); AnnotationSet inputAS = document.getAnnotations(inputASName); AnnotationSet outputAS = document.getAnnotations(outputASName); long startTime = System.currentTimeMillis(); fireStatusChanged("Tokenising " + document.getName()); fireProgressChanged(0);/*from w w w. j a v a2 s . c o m*/ // tokenising goes here String rawText = ""; try { rawText = document.getContent().getContent(new Long(0), document.getContent().size()).toString(); } catch (Exception e) { System.out.println("Document content offsets wrong: " + e); } PTBTokenizer<CoreLabel> ptbt; try { ptbt = new PTBTokenizer<CoreLabel>(new StringReader(rawText), new CoreLabelTokenFactory(), "invertible=true"); } catch (Exception e) { System.out.println("Failed when calling tokenizer: " + e); return; } Long tokenStart; Long tokenEnd; Long prevTokenEnd = new Long(0); // this default value lets us capture leading spaces for (CoreLabel label; ptbt.hasNext();) { label = ptbt.next(); tokenStart = new Long(label.beginPosition()); tokenEnd = new Long(label.endPosition()); SimpleFeatureMapImpl tokenMap = new SimpleFeatureMapImpl(); // add the token annotation try { tokenMap.put(TOKEN_STRING_FEATURE, document.getContent().getContent(tokenStart, tokenEnd).toString()); outputAS.add(tokenStart, tokenEnd, tokenLabel, tokenMap); } catch (InvalidOffsetException e) { System.out.println("Token alignment problem:" + e); } // do we need to add a space annotation? 
if (tokenStart > prevTokenEnd) { try { outputAS.add(prevTokenEnd, tokenStart, spaceLabel, new SimpleFeatureMapImpl()); } catch (InvalidOffsetException e) { System.out.println("Space token alignment problem:" + e); } } prevTokenEnd = tokenEnd; } fireProcessFinished(); fireStatusChanged(document.getName() + " tokenised in " + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000) + " seconds!"); }
From source file:gr.aueb.cs.nlp.bioasq.classifiers.Baseline.java
public static ArrayList<String> tokenize(String sentence) { ArrayList<String> tokens = new ArrayList<String>(); PTBTokenizer ptbt = new PTBTokenizer(new StringReader(sentence), new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext();) { label = (CoreLabel) ptbt.next(); tokens.add(label.toString());//w ww.j av a2s. c o m } return tokens; }
From source file:gr.aueb.cs.nlp.similarity.string.utils.DocPreprocessing.java
public static ArrayList<String> tokenizer(String text) { ArrayList<String> tokens = new ArrayList<String>(); PTBTokenizer ptbt = new PTBTokenizer(new StringReader(text), new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext();) { label = (CoreLabel) ptbt.next(); tokens.add(label.word());//from www .j a v a2 s . co m } return tokens; }
From source file:hmt.hocmay.utility.Ultilities.java
public static ArrayList<String> getWords(String filePath) { ArrayList<String> str = new ArrayList<String>(); PTBTokenizer<CoreLabel> ptbt;/*ww w . java2s. co m*/ try { ptbt = new PTBTokenizer<>(new FileReader(filePath), new CoreLabelTokenFactory(), "untokenizable=allDelete"); while (ptbt.hasNext()) { CoreLabel label = ptbt.next(); str.add(label.toString().toLowerCase()); // System.out.println(label.toString().toLowerCase()); } //Sp xp t Collections.sort(str, new Comparator<String>() { public int compare(String str1, String str2) { // System.out.println(str1+" "+str2); if (str1.compareTo(str2) > 0) { return 1; } else if (str1.compareTo(str2) == 0) { return 0; } else { return -1; } } }); //Loi b? t ging nhau for (int i = 0; i < str.size() - 1; i++) { if (str.get(i).equals(str.get(i + 1))) { str.remove(i); i--; } } } catch (FileNotFoundException ex) { } return str; }
From source file:org.lambda3.text.simplification.discourse.utils.words.WordsUtils.java
License:Open Source License
/**
 * Splits a sentence into {@code Word} objects using the Stanford PTB tokenizer
 * with default options.
 *
 * @param sentence the raw sentence text
 * @return the resulting words, in sentence order
 */
public static List<Word> splitIntoWords(String sentence) {
    List<Word> result = new ArrayList<>();
    PTBTokenizer<CoreLabel> tokenizer =
            new PTBTokenizer<>(new StringReader(sentence), new CoreLabelTokenFactory(), "");
    while (tokenizer.hasNext()) {
        CoreLabel token = tokenizer.next();
        result.add(new Word(token));
    }
    return result;
}