Example usage for edu.stanford.nlp.io IOUtils slurpFileNoExceptions

Introduction

On this page you can find example usages of edu.stanford.nlp.io.IOUtils.slurpFileNoExceptions.

Prototype

public static String slurpFileNoExceptions(String filename)

Source Link

Document

Returns all the text in the given file. (The prototype shown above takes only a filename; it has no encoding parameter.)

Usage

From source file:BuildBinarizedDataset.java

/**
 * Turns a text file into trees for use in a RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>/*from w  w w .ja  va 2s  .co m*/
 * The expected input file is one sentence per line, with sentences
 * separated by blank lines. The first line has the main label of the sentence together with the full sentence.
 * Lines after the first sentence line but before
 * the blank line will be treated as labeled sub-phrases.  The
 * labels should start with the label and then contain a list of
 * tokens the label applies to. All phrases that do not have their own label will take on the main sentence label!
 *  For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * By default the englishPCFG parser is used.  This can be changed
 * with the <code>-parserModel</code> flag.  Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with -sentimentModel, that model
 * will be used to prelabel the sentences.  Any spans with given
 * labels will then be used to adjust those labels.
 */
public static void main(String[] arg) throws IOException {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true);
    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String args[] = { "-input", "D:\\parse.txt", "-sentimentModel",
            "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" };
    String inputPath = "D:\\dataset\\good.txt";

    String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";
    SentimentModel sentimentModel = null;

    /* for (int argIndex = 0; argIndex < args.length; ) {
       if (args[argIndex].equalsIgnoreCase("-input")) {
         inputPath = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
         parserModel = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
         sentimentModelPath = args[argIndex + 1];
         argIndex += 2;
       } else {
         System.err.println("Unknown argument " + args[argIndex]);
         System.exit(2);
       }
     }*/

    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
            parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequence line, if any, will be a value
        // followed by the sequence of tokens that get that value.

        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        Integer mainLabel = new Integer(tokens.get(0).word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.subList(1, tokens.size());
        //System.err.println(tokens);

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing

        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
            //collapsedUnary.label().setValue(mainLabel.toString());
            //System.out.println("Root"+collapsedUnary.getNodeNumber(1));
        }

        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();

        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }
        String x = collapsedUnary.toString();
        //x.replaceAll("\\s","");
        x = x.replace("(", "[");
        x = x.replace(")", "]");
        //writer.write(x);
        //writer.write("\r\n"); 
        System.out.println(x);
        //System.out.println();
    }
    //writer.close();
}

From source file:opennlp.tools.parse_thicket.opinion_processor.DefaultSentimentProcessor.java

License:Apache License

/**
 * Reads the given file and returns one {@link Annotation} per sentence.
 * <ul>
 * <li>{@code TEXT}: the whole file is read as text and annotated with
 * {@code tokenizer}; every sentence found is then wrapped in its own
 * annotation whose sentence list contains just that sentence.</li>
 * <li>{@code TREES}: trees are read from the file, either through
 * {@code SentimentUtils.readTreesWithGoldLabels} followed by
 * {@code SentimentUtils.filterUnknownRoots} (when {@code filterUnknown}
 * is set) or through a "utf-8" {@code MemoryTreebank}.  Each tree is
 * attached to a single sentence, which is wrapped in an annotation
 * with empty text.</li>
 * </ul>
 *
 * @param tokenizer     pipeline used to annotate raw text (only used for {@code TEXT})
 * @param inputFormat   how the file content should be interpreted
 * @param filename      path of the file to read
 * @param filterUnknown whether to load gold-labeled trees and filter unknown roots (only used for {@code TREES})
 * @return the per-sentence annotations, in file order
 * @throws IllegalArgumentException if {@code inputFormat} is neither {@code TEXT} nor {@code TREES}
 */
public static List<Annotation> getAnnotations(StanfordCoreNLP tokenizer, Input inputFormat, String filename,
        boolean filterUnknown) {
    switch (inputFormat) {
    case TREES: {
        List<Tree> loadedTrees;
        if (!filterUnknown) {
            // Plain load: copy every tree of the treebank into a list.
            loadedTrees = Generics.newArrayList();
            MemoryTreebank bank = new MemoryTreebank("utf-8");
            bank.loadPath(filename, null);
            for (Tree loaded : bank) {
                loadedTrees.add(loaded);
            }
        } else {
            loadedTrees = SentimentUtils.readTreesWithGoldLabels(filename);
            loadedTrees = SentimentUtils.filterUnknownRoots(loadedTrees);
        }

        List<Annotation> result = Generics.newArrayList();
        for (Tree current : loadedTrees) {
            // The sentence text is rebuilt from the tree's yield and carries the tree itself.
            CoreMap treeSentence = new Annotation(listToString(current.yield()));
            treeSentence.set(TreeCoreAnnotations.TreeAnnotation.class, current);

            Annotation wrapper = new Annotation("");
            List<CoreMap> single = Collections.singletonList(treeSentence);
            wrapper.set(CoreAnnotations.SentencesAnnotation.class, single);
            result.add(wrapper);
        }
        return result;
    }
    case TEXT: {
        // Annotate the whole file once, then split it into per-sentence annotations.
        Annotation whole = new Annotation(IOUtils.slurpFileNoExceptions(filename));
        tokenizer.annotate(whole);

        List<Annotation> result = Generics.newArrayList();
        for (CoreMap found : whole.get(CoreAnnotations.SentencesAnnotation.class)) {
            Annotation perSentence = new Annotation(found.get(CoreAnnotations.TextAnnotation.class));
            perSentence.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(found));
            result.add(perSentence);
        }
        return result;
    }
    default:
        throw new IllegalArgumentException("Unknown format " + inputFormat);
    }
}

From source file:org.exist.xquery.corenlp.Tokenize.java

License:Open Source License

/**
 * Extracts the text content of a local document.
 * <p>
 * ODT files are read through {@code ODPackage}, DOC and DOCX files
 * through the POI {@code ExtractorFactory}, and TXT files are read
 * whole with {@code IOUtils.slurpFileNoExceptions}.  Format-related
 * failures while extracting DOC/DOCX text are logged and result in an
 * empty string.
 *
 * @param textDocType   the kind of document stored at {@code localFilePath}
 * @param localFilePath path of the document to read
 * @return the extracted text; the empty string for a document type not
 *         handled here or when DOC/DOCX extraction fails
 * @throws IOException if opening, reading or closing the document stream fails
 */
private String readLocalTextDocument(final TextDocType textDocType, final String localFilePath)
        throws IOException {
    switch (textDocType) {
    case TXT: {
        final File plainFile = new Resource(localFilePath);
        return IOUtils.slurpFileNoExceptions(plainFile);
    }
    case ODT:
        try (InputStream odtStream = new Resource(localFilePath).getInputStream()) {
            final TextDocument odtDocument = ODPackage.createFromStream(odtStream, "UserTextDocument")
                    .getTextDocument();
            return odtDocument.getCharacterContent(true); //ooMode?
        }
    case DOCX:
    case DOC:
        // Both Word formats go through the same POI extractor factory.
        try (InputStream wordStream = new Resource(localFilePath).getInputStream()) {
            final POITextExtractor wordExtractor = ExtractorFactory.createExtractor(wordStream);
            return wordExtractor.getText();
        } catch (InvalidFormatException ife) {
            LOG.error(ife);
        } catch (OpenXML4JException ox4e) {
            LOG.error(ox4e);
        } catch (XmlException xe) {
            LOG.error(xe);
        }
        // Extraction failed and was logged; fall out to the empty result.
        break;
    }
    return "";
}

From source file:projetTAL.JPanelNew.java

/**
 * Handles the OK button: reads the file named in the file text field,
 * builds a {@code QCM} from its content using the numbers entered in
 * the question-count and choice-count fields, prints the QCM title and
 * text to standard output, and shows it in a fresh {@code JFrameQCM}
 * (disposing of a previously opened one first).
 *
 * @param evt the action event that triggered this handler (unused)
 * @throws NumberFormatException if either numeric text field does not contain an integer
 * @throws NullPointerException  if the file content could not be obtained
 * @throws FileNotFoundException declared for callers; not thrown directly by the code in this method
 */
private void jButtonOkActionPerformed(java.awt.event.ActionEvent evt)
        throws NumberFormatException, FileNotFoundException {//GEN-FIRST:event_jButtonOkActionPerformed
    String titre = this.jTextFieldFichier.getText();
    // Strings are immutable, so copying the content through new String(...) was
    // unnecessary.  The explicit null check keeps the fail-fast behaviour the
    // copy constructor provided when no content is returned.
    String texte = java.util.Objects.requireNonNull(IOUtils.slurpFileNoExceptions(titre),
            "Could not read file " + titre);
    this.qcm = new QCM(texte, titre, Integer.parseInt(this.jTextFieldNbQuestions.getText()),
            Integer.parseInt(this.jTextFieldNbChoix.getText()), this.pipeline);
    System.out.println(this.qcm.getTitre());

    System.out.print(this.qcm.toText());

    // Only one QCM window is kept open: close the previous one before showing the new one.
    if (jFrameQCM != null) {
        jFrameQCM.dispose();
    }
    jFrameQCM = new JFrameQCM(this.qcm);
    jFrameQCM.setVisible(true);
}