Example usage for edu.stanford.nlp.io IOUtils readerFromStdin

List of usage examples for edu.stanford.nlp.io IOUtils readerFromStdin

Introduction

On this page you can find example usages of edu.stanford.nlp.io IOUtils readerFromStdin.

Prototype

public static BufferedReader readerFromStdin(String encoding) throws IOException 

Source Link

Document

Open a BufferedReader on stdin.

Usage

From source file:opennlp.tools.parse_thicket.opinion_processor.DefaultSentimentProcessor.java

License: Apache License

/**
 * Runs the tree-based sentiment model on some text.
 *
 * <p>Recognized flags: {@code -sentimentModel}, {@code -parserModel},
 * {@code -file}, {@code -fileList}, {@code -stdin}, {@code -input},
 * {@code -output}, {@code -filterUnknown}, {@code -tlppClass}, {@code -help}.
 * Exactly one of {@code -file}, {@code -fileList} or {@code -stdin} must be
 * supplied. {@code -fileList} output goes to {@code <file>.out} per input
 * file; the other modes write to stdout.
 *
 * @param args command-line style arguments selecting models, input source,
 *             input/output formats
 * @throws IOException if an input file or stdin cannot be read
 * @throws IllegalArgumentException on an unknown flag or an invalid
 *             combination of input-source flags
 */
public void processTextWithArgs(String[] args) throws IOException {
    String parserModel = null;
    String sentimentModel = null;

    String filename = null;
    String fileList = null;
    boolean stdin = false;

    boolean filterUnknown = false;

    // Defaults: print only the sentiment of the root node, plain-text input.
    List<Output> outputFormats = Collections.singletonList(Output.ROOT);
    Input inputFormat = Input.TEXT;

    String tlppClass = "DEFAULT_TLPP_CLASS";

    // Flags that take a value advance by 2; boolean flags advance by 1.
    for (int argIndex = 0; argIndex < args.length;) {
        if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
            sentimentModel = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
            parserModel = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-file")) {
            filename = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-fileList")) {
            fileList = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-stdin")) {
            stdin = true;
            argIndex++;
        } else if (args[argIndex].equalsIgnoreCase("-input")) {
            inputFormat = Input.valueOf(args[argIndex + 1].toUpperCase());
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-output")) {
            // Comma-separated list of output formats, e.g. "ROOT,PENNTREES".
            String[] formats = args[argIndex + 1].split(",");
            outputFormats = new ArrayList<>();
            for (String format : formats) {
                outputFormats.add(Output.valueOf(format.toUpperCase()));
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-filterUnknown")) {
            filterUnknown = true;
            argIndex++;
        } else if (args[argIndex].equalsIgnoreCase("-tlppClass")) {
            tlppClass = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-help")) {
            // NOTE(review): no usage text is printed before exiting — confirm
            // whether a help() call was dropped when this code was adapted.
            System.exit(0);
        } else {
            // Bug fix: report the offending flag itself (args[argIndex]), not
            // the token after it — the old args[argIndex + 1] named the wrong
            // argument and threw ArrayIndexOutOfBoundsException when the
            // unknown flag was the last element of args.
            log.info("Unknown argument " + args[argIndex]);
            throw new IllegalArgumentException("Unknown argument " + args[argIndex]);
        }
    }

    // We construct two pipelines. One handles tokenization, if
    // necessary. The other takes tokenized sentences and converts
    // them to sentiment trees.
    Properties pipelineProps = new Properties();
    Properties tokenizerProps = null;
    if (sentimentModel != null) {
        pipelineProps.setProperty("sentiment.model", sentimentModel);
    }
    if (parserModel != null) {
        pipelineProps.setProperty("parse.model", parserModel);
    }
    if (inputFormat == Input.TREES) {
        // Pre-parsed trees: no tokenizer needed, just binarize + sentiment.
        pipelineProps.setProperty("annotators", "binarizer, sentiment");
        pipelineProps.setProperty("customAnnotatorClass.binarizer",
                "edu.stanford.nlp.pipeline.BinarizerAnnotator");
        pipelineProps.setProperty("binarizer.tlppClass", tlppClass);
        pipelineProps.setProperty("enforceRequirements", "false");
    } else {
        pipelineProps.setProperty("annotators", "parse, sentiment");
        pipelineProps.setProperty("enforceRequirements", "false");
        tokenizerProps = new Properties();
        tokenizerProps.setProperty("annotators", "tokenize, ssplit");
    }

    if (stdin && tokenizerProps != null) {
        // In stdin mode each input line is treated as one sentence.
        tokenizerProps.setProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "true");
    }

    // Exactly one input source may be selected.
    int count = 0;
    if (filename != null)
        count++;
    if (fileList != null)
        count++;
    if (stdin)
        count++;
    if (count > 1) {
        throw new IllegalArgumentException("Please only specify one of -file, -fileList or -stdin");
    }
    if (count == 0) {
        throw new IllegalArgumentException("Please specify either -file, -fileList or -stdin");
    }

    StanfordCoreNLP tokenizer = (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProps);

    if (filename != null) {
        // Process a file. The pipeline will do tokenization, which
        // means it will split it into sentences as best as possible
        // with the tokenizer.
        List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, filename, filterUnknown);
        for (Annotation annotation : annotations) {
            pipeline.annotate(annotation);

            for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
                System.out.println(sentence);
                outputTree(System.out, sentence, outputFormats);
            }
        }
    } else if (fileList != null) {
        // Process multiple files. The pipeline will do tokenization,
        // which means it will split it into sentences as best as
        // possible with the tokenizer. Output will go to filename.out
        // for each file.
        for (String file : fileList.split(",")) {
            List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, file, filterUnknown);
            // try-with-resources so the output file is closed (and flushed)
            // even if annotation or tree output throws; the old code leaked
            // the stream on any exception inside the loop.
            try (FileOutputStream fout = new FileOutputStream(file + ".out");
                    PrintStream pout = new PrintStream(fout)) {
                for (Annotation annotation : annotations) {
                    pipeline.annotate(annotation);

                    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
                        pout.println(sentence);
                        outputTree(pout, sentence, outputFormats);
                    }
                }
                pout.flush();
            }
        }
    } else {
        // Process stdin. Each line will be treated as a single sentence.
        log.info("Reading in text from stdin.");
        log.info("Please enter one sentence per line.");
        log.info("Processing will end when EOF is reached.");
        BufferedReader reader = IOUtils.readerFromStdin("utf-8");

        for (String line; (line = reader.readLine()) != null;) {
            line = line.trim();
            if (!line.isEmpty()) {
                // NOTE(review): tokenizer is null when -input TREES was given;
                // combining -stdin with -input TREES would NPE here — confirm
                // whether that combination should be rejected up front.
                Annotation annotation = tokenizer.process(line);
                pipeline.annotate(annotation);
                for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
                    outputTree(System.out, sentence, outputFormats);
                }
            } else {
                // Output blank lines for blank lines so the tool can be
                // used for line-by-line text processing
                System.out.println();
            }
        }

    }
}

From source file:process.PTBTokenizer.java

License: Open Source License

/**
 * Tokenizes each input and writes the tokens out, then prints a
 * tokens-per-second summary to stderr.
 *
 * <p>With an empty {@code inputFileList}, reads from stdin and writes to
 * stdout. Otherwise each input file is tokenized to the corresponding entry
 * of {@code outputFileList}, or to stdout when that list is {@code null}.
 *
 * @param inputFileList input file names; empty means read stdin
 * @param outputFileList parallel output file names, or null for stdout
 * @param charset character encoding used for both reading and writing
 * @param parseInsidePattern only tokenize text inside this pattern, if set
 * @param options tokenizer options passed through to tokReader
 * @param preserveLines keep the input's line breaks in the output
 * @param dump dump full token information instead of plain tokens
 * @param lowerCase lowercase the output tokens
 * @throws IOException if any input or output stream fails
 */
private static void tok(List<String> inputFileList, List<String> outputFileList, String charset,
        Pattern parseInsidePattern, String options, boolean preserveLines, boolean dump, boolean lowerCase)
        throws IOException {
    final long start = System.nanoTime();
    long numTokens = 0;
    int numFiles = inputFileList.size();
    if (numFiles == 0) {
        // No input files: stdin -> stdout. Stdin itself is deliberately
        // left open; only the stdout wrapper is closed (which flushes it).
        Reader stdin = IOUtils.readerFromStdin(charset);
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
        numTokens += tokReader(stdin, writer, parseInsidePattern, options, preserveLines, dump, lowerCase);
        IOUtils.closeIgnoringExceptions(writer);

    } else {
        for (int j = 0; j < numFiles; j++) {
            Reader r = IOUtils.readerFromString(inputFileList.get(j), charset);
            // try/finally so neither the reader nor the writer leaks when
            // opening the output file or tokenizing throws; the old code
            // only closed them on the success path.
            try {
                BufferedWriter out = (outputFileList == null)
                        ? new BufferedWriter(new OutputStreamWriter(System.out, charset))
                        : new BufferedWriter(
                                new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset));
                try {
                    numTokens += tokReader(r, out, parseInsidePattern, options, preserveLines, dump, lowerCase);
                } finally {
                    IOUtils.closeIgnoringExceptions(out);
                }
            } finally {
                r.close();
            }
        } // end for j going through inputFileList
    }

    final long duration = System.nanoTime() - start;
    final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0);
    System.err.printf("PTBTokenizer tokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
}