List of usage examples for edu.stanford.nlp.io IOUtils readerFromString
public static BufferedReader readerFromString(String textFileOrUrl, String encoding) throws IOException
From source file: de.iisys.ocr.pos.CustomNERFeatureFactory.java
License: Open Source License
public void initGazette() { try {/*from w w w.jav a 2 s .co m*/ // read in gazettes if (flags.gazettes == null) { flags.gazettes = new ArrayList<String>(); } List<String> gazettes = flags.gazettes; for (String gazetteFile : gazettes) { BufferedReader r = IOUtils.readerFromString(gazetteFile, flags.inputEncoding); readGazette(r); r.close(); } } catch (IOException e) { throw new RuntimeException(e); } }
From source file: process.PTBTokenizer.java
License: Open Source License
private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException { final long start = System.nanoTime(); int numTokens = 0; int sz = inputFileList.size(); if (sz == 0) { Reader r = new InputStreamReader(System.in, charset); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset)); numTokens = ptb2Text(r, writer); writer.close();// w w w . j ava2 s . c o m } else { for (int j = 0; j < sz; j++) { Reader r = IOUtils.readerFromString(inputFileList.get(j), charset); BufferedWriter writer; if (outputFileList == null) { writer = new BufferedWriter(new OutputStreamWriter(System.out, charset)); } else { writer = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)); } numTokens += ptb2Text(r, writer); writer.close(); r.close(); } } final long duration = System.nanoTime() - start; final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0); System.err.printf("PTBTokenizer untokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec); }
From source file: process.PTBTokenizer.java
License: Open Source License
private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsidePattern, String options, boolean preserveLines, boolean dump, boolean lowerCase) throws IOException { final long start = System.nanoTime(); long numTokens = 0; int numFiles = inputFileList.size(); if (numFiles == 0) { Reader stdin = IOUtils.readerFromStdin(charset); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset)); numTokens += tokReader(stdin, writer, parseInsidePattern, options, preserveLines, dump, lowerCase); IOUtils.closeIgnoringExceptions(writer); } else {/* w w w . j a va2s.c o m*/ for (int j = 0; j < numFiles; j++) { Reader r = IOUtils.readerFromString(inputFileList.get(j), charset); BufferedWriter out = (outputFileList == null) ? new BufferedWriter(new OutputStreamWriter(System.out, charset)) : new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)); numTokens += tokReader(r, out, parseInsidePattern, options, preserveLines, dump, lowerCase); r.close(); IOUtils.closeIgnoringExceptions(out); } // end for j going through inputFileList } final long duration = System.nanoTime() - start; final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0); System.err.printf("PTBTokenizer tokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec); }
From source file: process.PTBTokenizer.java
License: Open Source License
/** * Reads files given as arguments and print their tokens, by default as one * per line. This is useful either for testing or to run standalone to turn * a corpus into a one-token-per-line file of tokens. This main method * assumes that the input file is in utf-8 encoding, unless an encoding is * specified./*from ww w.j a v a 2 s . c o m*/ * <p/> * Usage: <code> * java edu.stanford.nlp.process.PTBTokenizer [options] filename+ * </code> * <p/> * Options: * <ul> * <li>-options options Set various tokenization options (see the * documentation in the class javadoc) * <li>-preserveLines Produce space-separated tokens, except when the * original had a line break, not one-token-per-line * <li>-encoding encoding Specifies a character encoding. If you do not * specify one, the default is utf-8 (not the platform default). * <li>-lowerCase Lowercase all tokens (on tokenization) * <li>-parseInside regex Names an XML-style element or a regular expression * over such elements. The tokenizer will only tokenize inside elements that * match this regex. (This is done by regex matching, not an XML parser, but * works well for simple XML documents, or other SGML-style documents, such * as Linguistic Data Consortium releases, which adopt the convention that a * line of a file is either XML markup or character data but never both.) * <li>-ioFileList file* The remaining command-line arguments are treated as * filenames that themselves contain lists of pairs of input-output * filenames (2 column, whitespace separated). 
* <li>-dump Print the whole of each CoreLabel, not just the value (word) * <li>-untok Heuristically untokenize tokenized text * <li>-h, -help Print usage info * </ul> * * @param args * Command line arguments * @throws IOException * If any file I/O problem */ public static void main(String[] args) throws IOException { edu.stanford.nlp.process.PTBTokenizer<HasWord> abctesTokenizer; Properties options = StringUtils.argsToProperties(args, optionArgDefs()); boolean showHelp = PropertiesUtils.getBool(options, "help", false); showHelp = PropertiesUtils.getBool(options, "h", showHelp); if (showHelp) { System.err.println("Usage: java edu.stanford.nlp.process.PTBTokenizer [options]* filename*"); System.err.println( " options: -h|-preserveLines|-lowerCase|-dump|-ioFileList|-encoding|-parseInside|-options"); System.exit(0); } StringBuilder optionsSB = new StringBuilder(); String tokenizerOptions = options.getProperty("options", null); if (tokenizerOptions != null) { optionsSB.append(tokenizerOptions); } boolean preserveLines = PropertiesUtils.getBool(options, "preserveLines", false); if (preserveLines) { optionsSB.append(",tokenizeNLs"); } boolean inputOutputFileList = PropertiesUtils.getBool(options, "ioFileList", false); boolean lowerCase = PropertiesUtils.getBool(options, "lowerCase", false); boolean dump = PropertiesUtils.getBool(options, "dump", false); boolean untok = PropertiesUtils.getBool(options, "untok", false); String charset = options.getProperty("encoding", "utf-8"); String parseInsideKey = options.getProperty("parseInside", null); Pattern parseInsidePattern = null; if (parseInsideKey != null) { try { parseInsidePattern = Pattern.compile("<(/?)(?:" + parseInsideKey + ")(?:\\s[^>]*?)?>"); } catch (PatternSyntaxException e) { // just go with null parseInsidePattern } } // Other arguments are filenames String parsedArgStr = options.getProperty("", null); String[] parsedArgs = (parsedArgStr == null) ? 
null : parsedArgStr.split("\\s+"); ArrayList<String> inputFileList = new ArrayList<String>(); ArrayList<String> outputFileList = null; if (inputOutputFileList && parsedArgs != null) { outputFileList = new ArrayList<String>(); for (String fileName : parsedArgs) { BufferedReader r = IOUtils.readerFromString(fileName, charset); for (String inLine; (inLine = r.readLine()) != null;) { String[] fields = inLine.split("\\s+"); inputFileList.add(fields[0]); if (fields.length > 1) { outputFileList.add(fields[1]); } else { outputFileList.add(fields[0] + ".tok"); } } r.close(); } } else if (parsedArgs != null) { // Concatenate input files into a single output file inputFileList.addAll(Arrays.asList(parsedArgs)); } if (untok) { untok(inputFileList, outputFileList, charset); } else { tok(inputFileList, outputFileList, charset, parseInsidePattern, optionsSB.toString(), preserveLines, dump, lowerCase); } }