Example usage for edu.stanford.nlp.pipeline StanfordCoreNLP NEWLINE_SPLITTER_PROPERTY

List of usage examples for edu.stanford.nlp.pipeline StanfordCoreNLP NEWLINE_SPLITTER_PROPERTY

Introduction

On this page you can find an example usage of edu.stanford.nlp.pipeline StanfordCoreNLP NEWLINE_SPLITTER_PROPERTY.

Prototype

String NEWLINE_SPLITTER_PROPERTY

To view the source code for edu.stanford.nlp.pipeline StanfordCoreNLP NEWLINE_SPLITTER_PROPERTY, click the Source Link.

Usage

From source file:opennlp.tools.parse_thicket.opinion_processor.DefaultSentimentProcessor.java

License:Apache License

/**
 * Runs the tree-based sentiment model on some text.
 *
 * <p>Recognized arguments: {@code -sentimentModel}, {@code -parserModel},
 * {@code -file}, {@code -fileList} (comma-separated), {@code -stdin},
 * {@code -input TEXT|TREES}, {@code -output} (comma-separated formats),
 * {@code -filterUnknown}, {@code -tlppClass}, {@code -help}.
 * Exactly one of {@code -file}, {@code -fileList}, {@code -stdin} must be given.
 *
 * @param args command-line style arguments, parsed as flag/value pairs
 * @throws IOException if reading an input file or stdin fails
 * @throws IllegalArgumentException on an unknown flag or an invalid
 *         combination of input sources
 */
public void processTextWithArgs(String[] args) throws IOException {
    String parserModel = null;
    String sentimentModel = null;

    String filename = null;
    String fileList = null;
    boolean stdin = false;

    boolean filterUnknown = false;

    List<Output> outputFormats = Collections.singletonList(Output.ROOT);
    Input inputFormat = Input.TEXT;

    String tlppClass = "DEFAULT_TLPP_CLASS";

    // Flag/value argument parsing: flags with a value advance by 2,
    // boolean flags advance by 1.
    for (int argIndex = 0; argIndex < args.length;) {
        if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
            sentimentModel = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
            parserModel = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-file")) {
            filename = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-fileList")) {
            fileList = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-stdin")) {
            stdin = true;
            argIndex++;
        } else if (args[argIndex].equalsIgnoreCase("-input")) {
            inputFormat = Input.valueOf(args[argIndex + 1].toUpperCase());
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-output")) {
            String[] formats = args[argIndex + 1].split(",");
            outputFormats = new ArrayList<>();
            for (String format : formats) {
                outputFormats.add(Output.valueOf(format.toUpperCase()));
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-filterUnknown")) {
            filterUnknown = true;
            argIndex++;
        } else if (args[argIndex].equalsIgnoreCase("-tlppClass")) {
            tlppClass = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-help")) {
            // NOTE(review): exits without printing usage; presumably help text
            // is printed elsewhere in the original tool — confirm.
            System.exit(0);
        } else {
            // Bug fix: report the flag itself (args[argIndex]), not the token
            // after it — the old args[argIndex + 1] printed the wrong value and
            // could throw ArrayIndexOutOfBoundsException when the unknown flag
            // was the last argument.
            log.info("Unknown argument " + args[argIndex]);
            throw new IllegalArgumentException("Unknown argument " + args[argIndex]);
        }
    }

    // We construct two pipelines. One handles tokenization, if
    // necessary. The other takes tokenized sentences and converts
    // them to sentiment trees.
    Properties pipelineProps = new Properties();
    Properties tokenizerProps = null;
    if (sentimentModel != null) {
        pipelineProps.setProperty("sentiment.model", sentimentModel);
    }
    if (parserModel != null) {
        pipelineProps.setProperty("parse.model", parserModel);
    }
    if (inputFormat == Input.TREES) {
        // Pre-parsed trees only need binarization before sentiment scoring.
        pipelineProps.setProperty("annotators", "binarizer, sentiment");
        pipelineProps.setProperty("customAnnotatorClass.binarizer",
                "edu.stanford.nlp.pipeline.BinarizerAnnotator");
        pipelineProps.setProperty("binarizer.tlppClass", tlppClass);
        pipelineProps.setProperty("enforceRequirements", "false");
    } else {
        pipelineProps.setProperty("annotators", "parse, sentiment");
        pipelineProps.setProperty("enforceRequirements", "false");
        tokenizerProps = new Properties();
        tokenizerProps.setProperty("annotators", "tokenize, ssplit");
    }

    if (stdin && tokenizerProps != null) {
        // On stdin each input line is treated as one sentence.
        tokenizerProps.setProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "true");
    }

    // Exactly one input source must be selected.
    int count = 0;
    if (filename != null)
        count++;
    if (fileList != null)
        count++;
    if (stdin)
        count++;
    if (count > 1) {
        throw new IllegalArgumentException("Please only specify one of -file, -fileList or -stdin");
    }
    if (count == 0) {
        throw new IllegalArgumentException("Please specify either -file, -fileList or -stdin");
    }

    StanfordCoreNLP tokenizer = (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProps);

    if (filename != null) {
        // Process a file. The pipeline will do tokenization, which
        // means it will split it into sentences as best as possible
        // with the tokenizer.
        List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, filename, filterUnknown);
        for (Annotation annotation : annotations) {
            pipeline.annotate(annotation);

            for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
                System.out.println(sentence);
                outputTree(System.out, sentence, outputFormats);
            }
        }
    } else if (fileList != null) {
        // Process multiple files. The pipeline will do tokenization,
        // which means it will split it into sentences as best as
        // possible with the tokenizer. Output will go to filename.out
        // for each file.
        for (String file : fileList.split(",")) {
            List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, file, filterUnknown);
            // try-with-resources: the old code leaked the stream on any
            // exception thrown while annotating/printing.
            try (PrintStream pout = new PrintStream(new FileOutputStream(file + ".out"))) {
                for (Annotation annotation : annotations) {
                    pipeline.annotate(annotation);

                    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
                        pout.println(sentence);
                        outputTree(pout, sentence, outputFormats);
                    }
                }
                pout.flush();
            }
        }
    } else {
        // Process stdin. Each line will be treated as a single sentence.
        log.info("Reading in text from stdin.");
        log.info("Please enter one sentence per line.");
        log.info("Processing will end when EOF is reached.");
        BufferedReader reader = IOUtils.readerFromStdin("utf-8");

        for (String line; (line = reader.readLine()) != null;) {
            line = line.trim();
            if (!line.isEmpty()) {
                // NOTE(review): tokenizer is null when -input TREES is combined
                // with -stdin; that combination would NPE here — confirm it is
                // unsupported upstream.
                Annotation annotation = tokenizer.process(line);
                pipeline.annotate(annotation);
                for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
                    outputTree(System.out, sentence, outputFormats);
                }
            } else {
                // Output blank lines for blank lines so the tool can be
                // used for line-by-line text processing
                System.out.println();
            }
        }

    }
}