List of usage examples for edu.stanford.nlp.util.logging RedwoodConfiguration handlers
public RedwoodConfiguration handlers(Thunk... paths)
From source file: org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnescorer.StanfordNlpNeScorerNodeModel.java
License: Open Source License
/** * {@inheritDoc}/* w w w . j av a 2s . com*/ */ @Override protected PortObject[] execute(final PortObject[] inObjects, final ExecutionContext exec) throws Exception { m_inputModelPortObject = (StanfordNERModelPortObject) inObjects[1]; m_inputModel = m_inputModelPortObject.getNERModel(); m_usedDict = m_inputModelPortObject.getDictSet(); m_tag = m_inputModelPortObject.getTag(); m_tokenizerName = m_inputModelPortObject.getTokenizerName(); //create a BufferedDataContainer for the scoring values BufferedDataContainer accTable = exec.createDataContainer(new DataTableSpec(QUALITY_MEASURES_SPECS)); // build pattern set from dictionary DataTableSpec docTableSpec = (DataTableSpec) inObjects[0].getSpec(); BufferedDataTable docDataInput = (BufferedDataTable) inObjects[0]; Set<Pattern> knownEntitiesPatternSet = new LinkedHashSet<Pattern>(); for (String word : m_usedDict) { knownEntitiesPatternSet.add(Pattern.compile(word)); } // create dictionary tagger to tag the input documents with the dictionary used for building the model MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet, m_tag, true, m_tokenizerName); // create UUID to add them to the file path to avoid cases where two instances of the node model used the same file path at the same time String tempDir = KNIMEConstants.getKNIMETempDir() + "/"; String m_annotatedTestFilePath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv"; // create the annotated test file File m_annotatedTestFile = new File(m_annotatedTestFilePath); PrintWriter sentenceFileWriter = new PrintWriter(m_annotatedTestFile, "UTF-8"); int missingValueCounter = 0; // tag documents and transform sentences to strings while tagged terms get StanfordNLP annotation // iterate through columns for (int i = 0; i < docTableSpec.getNumColumns(); i++) { // iterate through rows if column with correct name has been found if (docTableSpec.getColumnSpec(i).getName().equals(m_docColumnModel.getStringValue())) { int 
counter = 0; Set<String> countMultiWordTerms = new HashSet<String>(); for (DataRow row : docDataInput) { //set progress bar counter++; double progress = (counter / (double) docDataInput.size()) / (3.0); exec.setProgress(progress, "Preparing documents for validation"); exec.checkCanceled(); if (!row.getCell(i).isMissing() && row.getCell(i).getType().isCompatible(DocumentValue.class)) { Document doc = ((DocumentValue) row.getCell(i)).getDocument(); Document taggedDoc = tagger.tag(doc); Iterator<Sentence> si = taggedDoc.sentenceIterator(); while (si.hasNext()) { Sentence s = si.next(); List<Term> termList = s.getTerms(); Iterator<Term> ti = termList.iterator(); while (ti.hasNext()) { Term t = ti.next(); String termText = t.getText(); String termTextWithWsSuffix = t.getTextWithWsSuffix(); if (m_usedDict.contains(termText) || m_usedDict.contains(termTextWithWsSuffix)) { if (t.getWords().size() > 1) { // multi-word terms should not be written in one line in the training file countMultiWordTerms.add(t.getText()); // so skip it by splitting the term and writing each word in one line for (Word w : t.getWords()) { sentenceFileWriter.println(w.getText() + "\t" + m_tag.getTagValue()); } } else { sentenceFileWriter.println(termText + "\t" + m_tag.getTagValue()); } } else if (!m_usedDict.contains(termText) || !m_usedDict.contains(termTextWithWsSuffix)) { sentenceFileWriter.println(termText + "\tO"); } } } } else { missingValueCounter++; } } } } if (missingValueCounter == 1) { setWarningMessage(missingValueCounter + " row has been ignored due to missing value."); } else if (missingValueCounter > 1) { setWarningMessage(missingValueCounter + " rows have been ignored due to missing values."); } sentenceFileWriter.close(); exec.setProgress(0.5, "Validate model"); // create logger configuration and catch the scores which will be printed to the log file File tmpLogFile = new File(KNIMEConstants.getKNIMETempDir() + "/scores.log"); RedwoodConfiguration conf = 
RedwoodConfiguration.empty(); conf.handlers(Handlers.chain(Handlers.hideDebug, Handlers.file(tmpLogFile))).apply(); // classify the documents with our model DocumentReaderAndWriter<CoreLabel> raw = m_inputModel.makeReaderAndWriter(); Triple<Double, Double, Double> prfScores = m_inputModel.classifyAndWriteAnswers(m_annotatedTestFilePath, new ByteArrayOutputStream(), raw, true); DataRow stats = new DefaultRow(new RowKey("Row0"), new DataCell[] { DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell() }); ReversedLinesFileReader logReader = new ReversedLinesFileReader(tmpLogFile, StandardCharsets.UTF_8); try { // get values from output stream String[] scores = logReader.readLine().split("\t"); if (scores.length >= 7) { Double precision = prfScores.first() / 100; Double recall = prfScores.second() / 100; Double f1 = prfScores.third() / 100; int tp = Integer.parseInt(scores[4].trim()); int fp = Integer.parseInt(scores[5].trim()); int fn = Integer.parseInt(scores[6].trim()); // create the scores row and add it to the BufferedDataContainer we created in the beginning stats = new DefaultRow(new RowKey("Row0"), new DataCell[] { new DoubleCell(precision), new DoubleCell(recall), new DoubleCell(f1), new IntCell(tp), new IntCell(fp), new IntCell(fn) }); if (tp == 0 && fp == 0 && fn == 0 && precision == 0 && recall == 1 && f1 == 0) { setWarningMessage("Could not parse quality measures of model validation."); } } } catch (NumberFormatException e) { setWarningMessage("Could not parse quality measures of model validation."); } finally { logReader.close(); tmpLogFile.delete(); m_annotatedTestFile.delete(); } accTable.addRowToTable(stats); accTable.close(); return new BufferedDataTable[] { accTable.getTable() }; }