List of usage examples for edu.stanford.nlp.io IOUtils readerFromString
public static BufferedReader readerFromString(String textFileOrUrl, String encoding) throws IOException
From source file: de.iisys.ocr.pos.CustomNERFeatureFactory.java
License: Open Source License
public void initGazette() { try {/*from w w w.jav a 2 s .co m*/ // read in gazettes if (flags.gazettes == null) { flags.gazettes = new ArrayList<String>(); } List<String> gazettes = flags.gazettes; for (String gazetteFile : gazettes) { BufferedReader r = IOUtils.readerFromString(gazetteFile, flags.inputEncoding); readGazette(r); r.close(); } } catch (IOException e) { throw new RuntimeException(e); } }
From source file: process.PTBTokenizer.java
License: Open Source License
private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException { final long start = System.nanoTime(); int numTokens = 0; int sz = inputFileList.size(); if (sz == 0) { Reader r = new InputStreamReader(System.in, charset); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset)); numTokens = ptb2Text(r, writer); writer.close();// w w w . j ava2 s . c o m } else { for (int j = 0; j < sz; j++) { Reader r = IOUtils.readerFromString(inputFileList.get(j), charset); BufferedWriter writer; if (outputFileList == null) { writer = new BufferedWriter(new OutputStreamWriter(System.out, charset)); } else { writer = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)); } numTokens += ptb2Text(r, writer); writer.close(); r.close(); } } final long duration = System.nanoTime() - start; final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0); System.err.printf("PTBTokenizer untokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec); }
From source file: process.PTBTokenizer.java
License: Open Source License
private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsidePattern, String options, boolean preserveLines, boolean dump, boolean lowerCase) throws IOException { final long start = System.nanoTime(); long numTokens = 0; int numFiles = inputFileList.size(); if (numFiles == 0) { Reader stdin = IOUtils.readerFromStdin(charset); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset)); numTokens += tokReader(stdin, writer, parseInsidePattern, options, preserveLines, dump, lowerCase); IOUtils.closeIgnoringExceptions(writer); } else {/* w w w . j a va2s.c o m*/ for (int j = 0; j < numFiles; j++) { Reader r = IOUtils.readerFromString(inputFileList.get(j), charset); BufferedWriter out = (outputFileList == null) ? new BufferedWriter(new OutputStreamWriter(System.out, charset)) : new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)); numTokens += tokReader(r, out, parseInsidePattern, options, preserveLines, dump, lowerCase); r.close(); IOUtils.closeIgnoringExceptions(out); } // end for j going through inputFileList } final long duration = System.nanoTime() - start; final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0); System.err.printf("PTBTokenizer tokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec); }
From source file: process.PTBTokenizer.java
License: Open Source License
/** * Reads files given as arguments and print their tokens, by default as one * per line. This is useful either for testing or to run standalone to turn * a corpus into a one-token-per-line file of tokens. This main method * assumes that the input file is in utf-8 encoding, unless an encoding is * specified./*from ww w.j a v a 2 s . c o m*/ * <p/> * Usage: <code> * java edu.stanford.nlp.process.PTBTokenizer [options] filename+ * </code> * <p/> * Options: * <ul> * <li>-options options Set various tokenization options (see the * documentation in the class javadoc) * <li>-preserveLines Produce space-separated tokens, except when the * original had a line break, not one-token-per-line * <li>-encoding encoding Specifies a character encoding. If you do not * specify one, the default is utf-8 (not the platform default). * <li>-lowerCase Lowercase all tokens (on tokenization) * <li>-parseInside regex Names an XML-style element or a regular expression * over such elements. The tokenizer will only tokenize inside elements that * match this regex. (This is done by regex matching, not an XML parser, but * works well for simple XML documents, or other SGML-style documents, such * as Linguistic Data Consortium releases, which adopt the convention that a * line of a file is either XML markup or character data but never both.) * <li>-ioFileList file* The remaining command-line arguments are treated as * filenames that themselves contain lists of pairs of input-output * filenames (2 column, whitespace separated). 
* <li>-dump Print the whole of each CoreLabel, not just the value (word) * <li>-untok Heuristically untokenize tokenized text * <li>-h, -help Print usage info * </ul> * * @param args * Command line arguments * @throws IOException * If any file I/O problem */ public static void main(String[] args) throws IOException { edu.stanford.nlp.process.PTBTokenizer<HasWord> abctesTokenizer; Properties options = StringUtils.argsToProperties(args, optionArgDefs()); boolean showHelp = PropertiesUtils.getBool(options, "help", false); showHelp = PropertiesUtils.getBool(options, "h", showHelp); if (showHelp) { System.err.println("Usage: java edu.stanford.nlp.process.PTBTokenizer [options]* filename*"); System.err.println( " options: -h|-preserveLines|-lowerCase|-dump|-ioFileList|-encoding|-parseInside|-options"); System.exit(0); } StringBuilder optionsSB = new StringBuilder(); String tokenizerOptions = options.getProperty("options", null); if (tokenizerOptions != null) { optionsSB.append(tokenizerOptions); } boolean preserveLines = PropertiesUtils.getBool(options, "preserveLines", false); if (preserveLines) { optionsSB.append(",tokenizeNLs"); } boolean inputOutputFileList = PropertiesUtils.getBool(options, "ioFileList", false); boolean lowerCase = PropertiesUtils.getBool(options, "lowerCase", false); boolean dump = PropertiesUtils.getBool(options, "dump", false); boolean untok = PropertiesUtils.getBool(options, "untok", false); String charset = options.getProperty("encoding", "utf-8"); String parseInsideKey = options.getProperty("parseInside", null); Pattern parseInsidePattern = null; if (parseInsideKey != null) { try { parseInsidePattern = Pattern.compile("<(/?)(?:" + parseInsideKey + ")(?:\\s[^>]*?)?>"); } catch (PatternSyntaxException e) { // just go with null parseInsidePattern } } // Other arguments are filenames String parsedArgStr = options.getProperty("", null); String[] parsedArgs = (parsedArgStr == null) ? 
null : parsedArgStr.split("\\s+"); ArrayList<String> inputFileList = new ArrayList<String>(); ArrayList<String> outputFileList = null; if (inputOutputFileList && parsedArgs != null) { outputFileList = new ArrayList<String>(); for (String fileName : parsedArgs) { BufferedReader r = IOUtils.readerFromString(fileName, charset); for (String inLine; (inLine = r.readLine()) != null;) { String[] fields = inLine.split("\\s+"); inputFileList.add(fields[0]); if (fields.length > 1) { outputFileList.add(fields[1]); } else { outputFileList.add(fields[0] + ".tok"); } } r.close(); } } else if (parsedArgs != null) { // Concatenate input files into a single output file inputFileList.addAll(Arrays.asList(parsedArgs)); } if (untok) { untok(inputFileList, outputFileList, charset); } else { tok(inputFileList, outputFileList, charset, parseInsidePattern, optionsSB.toString(), preserveLines, dump, lowerCase); } }