List of usage examples for edu.stanford.nlp.io.IOUtils.slurpFileNoExceptions
public static String slurpFileNoExceptions(String filename)
From source file:BuildBinarizedDataset.java
/** * Turns a text file into trees for use in a RNTN classifier such as * the treebank used in the Sentiment project. * <br>/*from w w w .ja va 2s .co m*/ * The expected input file is one sentence per line, with sentences * separated by blank lines. The first line has the main label of the sentence together with the full sentence. * Lines after the first sentence line but before * the blank line will be treated as labeled sub-phrases. The * labels should start with the label and then contain a list of * tokens the label applies to. All phrases that do not have their own label will take on the main sentence label! * For example: * <br> * <code> * 1 Today is not a good day.<br> * 3 good<br> * 3 good day <br> * 3 a good day <br> * <br> * (next block starts here) <br> * </code> * By default the englishPCFG parser is used. This can be changed * with the <code>-parserModel</code> flag. Specify an input file * with <code>-input</code>. * <br> * If a sentiment model is provided with -sentimentModel, that model * will be used to prelabel the sentences. Any spans with given * labels will then be used to adjust those labels. 
*/ public static void main(String[] arg) throws IOException { CollapseUnaryTransformer transformer = new CollapseUnaryTransformer(); // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true); String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"; String args[] = { "-input", "D:\\parse.txt", "-sentimentModel", "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" }; String inputPath = "D:\\dataset\\good.txt"; String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz"; SentimentModel sentimentModel = null; /* for (int argIndex = 0; argIndex < args.length; ) { if (args[argIndex].equalsIgnoreCase("-input")) { inputPath = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-parserModel")) { parserModel = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) { sentimentModelPath = args[argIndex + 1]; argIndex += 2; } else { System.err.println("Unknown argument " + args[argIndex]); System.exit(2); } }*/ if (inputPath == null) { throw new IllegalArgumentException("Must specify input file with -input"); } LexicalizedParser parser = LexicalizedParser.loadModel(parserModel); TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack()); if (sentimentModelPath != null) { sentimentModel = SentimentModel.loadSerialized(sentimentModelPath); } String text = IOUtils.slurpFileNoExceptions(inputPath); String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk for (String chunk : chunks) { if (chunk.trim().isEmpty()) { continue; } // The expected format is that line 0 will be the text of the // sentence, and each subsequence line, if any, will be a value // followed by the sequence of tokens that get that value. // Here we take the first line and tokenize it as one sentence. 
String[] lines = chunk.trim().split("\\n"); String sentence = lines[0]; StringReader sin = new StringReader(sentence); DocumentPreprocessor document = new DocumentPreprocessor(sin); document.setSentenceFinalPuncWords(new String[] { "\n" }); List<HasWord> tokens = document.iterator().next(); Integer mainLabel = new Integer(tokens.get(0).word()); //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; "); tokens = tokens.subList(1, tokens.size()); //System.err.println(tokens); Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap(); for (int i = 1; i < lines.length; ++i) { extractLabels(spanToLabels, tokens, lines[i]); } // TODO: add an option which treats the spans as constraints when parsing Tree tree = parser.apply(tokens); Tree binarized = binarizer.transformTree(tree); Tree collapsedUnary = transformer.transformTree(binarized); // if there is a sentiment model for use in prelabeling, we // label here and then use the user given labels to adjust if (sentimentModel != null) { Trees.convertToCoreLabels(collapsedUnary); SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null); scorer.forwardPropagateTree(collapsedUnary); setPredictedLabels(collapsedUnary); } else { setUnknownLabels(collapsedUnary, mainLabel); //collapsedUnary.label().setValue(mainLabel.toString()); //System.out.println("Root"+collapsedUnary.getNodeNumber(1)); } Trees.convertToCoreLabels(collapsedUnary); collapsedUnary.indexSpans(); for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) { setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue()); } String x = collapsedUnary.toString(); //x.replaceAll("\\s",""); x = x.replace("(", "["); x = x.replace(")", "]"); //writer.write(x); //writer.write("\r\n"); System.out.println(x); //System.out.println(); } //writer.close(); }
From source file:opennlp.tools.parse_thicket.opinion_processor.DefaultSentimentProcessor.java
License:Apache License
/**
 * Reads annotations from the given file using the requested input format.
 * <br>
 * For {@code TEXT} the whole file is run through {@code tokenizer} and one
 * annotation is returned per sentence found. For {@code TREES} the file is
 * read as trees and one annotation is returned per tree, carrying the tree
 * under {@code TreeCoreAnnotations.TreeAnnotation}.
 *
 * @param tokenizer     pipeline used to annotate the text; only used for {@code TEXT}
 * @param inputFormat   how the file content should be interpreted
 * @param filename      path of the file to read
 * @param filterUnknown for {@code TREES} only: when true the trees are read
 *                      with {@code SentimentUtils.readTreesWithGoldLabels} and
 *                      passed through {@code SentimentUtils.filterUnknownRoots}
 * @return the annotations, one per sentence or tree
 * @throws IllegalArgumentException if the format is neither {@code TEXT} nor {@code TREES}
 */
public static List<Annotation> getAnnotations(StanfordCoreNLP tokenizer, Input inputFormat, String filename,
        boolean filterUnknown) {
    switch (inputFormat) {
    case TEXT: {
        String fileContent = IOUtils.slurpFileNoExceptions(filename);
        Annotation document = new Annotation(fileContent);
        tokenizer.annotate(document);

        // Wrap every sentence of the document in its own single-sentence annotation.
        List<Annotation> perSentence = Generics.newArrayList();
        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            String sentenceText = sentence.get(CoreAnnotations.TextAnnotation.class);
            Annotation wrapped = new Annotation(sentenceText);
            wrapped.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
            perSentence.add(wrapped);
        }
        return perSentence;
    }
    case TREES: {
        List<Tree> trees;
        if (!filterUnknown) {
            // Plain load: take every tree in the file as-is.
            trees = Generics.newArrayList();
            MemoryTreebank treebank = new MemoryTreebank("utf-8");
            treebank.loadPath(filename, null);
            for (Tree loaded : treebank) {
                trees.add(loaded);
            }
        } else {
            List<Tree> labeled = SentimentUtils.readTreesWithGoldLabels(filename);
            trees = SentimentUtils.filterUnknownRoots(labeled);
        }

        List<Annotation> perTree = Generics.newArrayList();
        for (Tree tree : trees) {
            CoreMap sentence = new Annotation(listToString(tree.yield()));
            sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
            List<CoreMap> singleSentence = Collections.singletonList(sentence);

            Annotation wrapped = new Annotation("");
            wrapped.set(CoreAnnotations.SentencesAnnotation.class, singleSentence);
            perTree.add(wrapped);
        }
        return perTree;
    }
    default:
        throw new IllegalArgumentException("Unknown format " + inputFormat);
    }
}
From source file:org.exist.xquery.corenlp.Tokenize.java
License:Open Source License
private String readLocalTextDocument(final TextDocType textDocType, final String localFilePath) throws IOException { String text = ""; switch (textDocType) { case ODT:// www .ja va 2s . c o m try (InputStream is = new Resource(localFilePath).getInputStream()) { TextDocument utd = ODPackage.createFromStream(is, "UserTextDocument").getTextDocument(); text = utd.getCharacterContent(true); //ooMode? } break; case DOCX: try (InputStream is = new Resource(localFilePath).getInputStream()) { POITextExtractor extractor = ExtractorFactory.createExtractor(is); //XWPFWordExtractor extractor = new XWPFWordExtractor(is); text = extractor.getText(); } catch (InvalidFormatException ife) { LOG.error(ife); } catch (OpenXML4JException ox4e) { LOG.error(ox4e); } catch (XmlException xe) { LOG.error(xe); } break; case DOC: try (InputStream is = new Resource(localFilePath).getInputStream()) { POITextExtractor extractor = ExtractorFactory.createExtractor(is); //XWPFWordExtractor extractor = new XWPFWordExtractor(is); text = extractor.getText(); } catch (InvalidFormatException ife) { LOG.error(ife); } catch (OpenXML4JException ox4e) { LOG.error(ox4e); } catch (XmlException xe) { LOG.error(xe); } break; case TXT: File file = new Resource(localFilePath); text = IOUtils.slurpFileNoExceptions(file); break; } return text; }
From source file:projetTAL.JPanelNew.java
private void jButtonOkActionPerformed(java.awt.event.ActionEvent evt) throws NumberFormatException, FileNotFoundException {//GEN-FIRST:event_jButtonOkActionPerformed String titre = this.jTextFieldFichier.getText(); String texte = new String(IOUtils.slurpFileNoExceptions(titre)); this.qcm = new QCM(texte, titre, Integer.parseInt(this.jTextFieldNbQuestions.getText()), Integer.parseInt(this.jTextFieldNbChoix.getText()), this.pipeline); System.out.println(this.qcm.getTitre()); System.out.print(this.qcm.toText()); if (jFrameQCM != null) jFrameQCM.dispose();//from w w w. jav a 2 s . com jFrameQCM = new JFrameQCM(this.qcm); jFrameQCM.setVisible(true); // TODO add your handling code here: }