Example usage for edu.stanford.nlp.util.logging RedwoodConfiguration handlers

Introduction

On this page you can find example usage of the handlers method of edu.stanford.nlp.util.logging.RedwoodConfiguration.

Prototype

public RedwoodConfiguration handlers(Thunk... paths)

Source Link

Document

Add handlers to Redwood.

Usage

From source file: org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnescorer.StanfordNlpNeScorerNodeModel.java

License: Open Source License

/**
 * {@inheritDoc}/* w w  w .  j  av  a  2s .  com*/
 */
@Override
protected PortObject[] execute(final PortObject[] inObjects, final ExecutionContext exec) throws Exception {

    m_inputModelPortObject = (StanfordNERModelPortObject) inObjects[1];
    m_inputModel = m_inputModelPortObject.getNERModel();
    m_usedDict = m_inputModelPortObject.getDictSet();
    m_tag = m_inputModelPortObject.getTag();
    m_tokenizerName = m_inputModelPortObject.getTokenizerName();

    //create a BufferedDataContainer for the scoring values
    BufferedDataContainer accTable = exec.createDataContainer(new DataTableSpec(QUALITY_MEASURES_SPECS));

    // build pattern set from dictionary
    DataTableSpec docTableSpec = (DataTableSpec) inObjects[0].getSpec();
    BufferedDataTable docDataInput = (BufferedDataTable) inObjects[0];
    Set<Pattern> knownEntitiesPatternSet = new LinkedHashSet<Pattern>();
    for (String word : m_usedDict) {
        knownEntitiesPatternSet.add(Pattern.compile(word));
    }

    // create dictionary tagger to tag the input documents with the dictionary used for building the model
    MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet, m_tag,
            true, m_tokenizerName);

    // create UUID to add them to the file path to avoid cases where two instances of the node model used the same file path at the same time
    String tempDir = KNIMEConstants.getKNIMETempDir() + "/";
    String m_annotatedTestFilePath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv";

    // create the annotated test file
    File m_annotatedTestFile = new File(m_annotatedTestFilePath);
    PrintWriter sentenceFileWriter = new PrintWriter(m_annotatedTestFile, "UTF-8");

    int missingValueCounter = 0;

    // tag documents and transform sentences to strings while tagged terms get StanfordNLP annotation
    // iterate through columns
    for (int i = 0; i < docTableSpec.getNumColumns(); i++) {
        // iterate through rows if column with correct name has been found
        if (docTableSpec.getColumnSpec(i).getName().equals(m_docColumnModel.getStringValue())) {
            int counter = 0;
            Set<String> countMultiWordTerms = new HashSet<String>();
            for (DataRow row : docDataInput) {
                //set progress bar
                counter++;
                double progress = (counter / (double) docDataInput.size()) / (3.0);
                exec.setProgress(progress, "Preparing documents for validation");
                exec.checkCanceled();

                if (!row.getCell(i).isMissing() && row.getCell(i).getType().isCompatible(DocumentValue.class)) {
                    Document doc = ((DocumentValue) row.getCell(i)).getDocument();
                    Document taggedDoc = tagger.tag(doc);
                    Iterator<Sentence> si = taggedDoc.sentenceIterator();
                    while (si.hasNext()) {
                        Sentence s = si.next();
                        List<Term> termList = s.getTerms();
                        Iterator<Term> ti = termList.iterator();
                        while (ti.hasNext()) {
                            Term t = ti.next();
                            String termText = t.getText();
                            String termTextWithWsSuffix = t.getTextWithWsSuffix();
                            if (m_usedDict.contains(termText) || m_usedDict.contains(termTextWithWsSuffix)) {
                                if (t.getWords().size() > 1) {
                                    // multi-word terms should not be written in one line in the training file
                                    countMultiWordTerms.add(t.getText());

                                    // so skip it by splitting the term and writing each word in one line
                                    for (Word w : t.getWords()) {
                                        sentenceFileWriter.println(w.getText() + "\t" + m_tag.getTagValue());
                                    }
                                } else {
                                    sentenceFileWriter.println(termText + "\t" + m_tag.getTagValue());
                                }
                            } else if (!m_usedDict.contains(termText)
                                    || !m_usedDict.contains(termTextWithWsSuffix)) {
                                sentenceFileWriter.println(termText + "\tO");
                            }
                        }
                    }
                } else {
                    missingValueCounter++;
                }
            }
        }
    }

    if (missingValueCounter == 1) {
        setWarningMessage(missingValueCounter + " row has been ignored due to missing value.");
    } else if (missingValueCounter > 1) {
        setWarningMessage(missingValueCounter + " rows have been ignored due to missing values.");
    }

    sentenceFileWriter.close();

    exec.setProgress(0.5, "Validate model");
    // create logger configuration and catch the scores which will be printed to the log file
    File tmpLogFile = new File(KNIMEConstants.getKNIMETempDir() + "/scores.log");
    RedwoodConfiguration conf = RedwoodConfiguration.empty();
    conf.handlers(Handlers.chain(Handlers.hideDebug, Handlers.file(tmpLogFile))).apply();

    // classify the documents with our model
    DocumentReaderAndWriter<CoreLabel> raw = m_inputModel.makeReaderAndWriter();
    Triple<Double, Double, Double> prfScores = m_inputModel.classifyAndWriteAnswers(m_annotatedTestFilePath,
            new ByteArrayOutputStream(), raw, true);

    DataRow stats = new DefaultRow(new RowKey("Row0"),
            new DataCell[] { DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell(),
                    DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell() });

    ReversedLinesFileReader logReader = new ReversedLinesFileReader(tmpLogFile, StandardCharsets.UTF_8);

    try {
        // get values from output stream
        String[] scores = logReader.readLine().split("\t");
        if (scores.length >= 7) {
            Double precision = prfScores.first() / 100;
            Double recall = prfScores.second() / 100;
            Double f1 = prfScores.third() / 100;
            int tp = Integer.parseInt(scores[4].trim());
            int fp = Integer.parseInt(scores[5].trim());
            int fn = Integer.parseInt(scores[6].trim());
            // create the scores row and add it to the BufferedDataContainer we created in the beginning
            stats = new DefaultRow(new RowKey("Row0"),
                    new DataCell[] { new DoubleCell(precision), new DoubleCell(recall), new DoubleCell(f1),
                            new IntCell(tp), new IntCell(fp), new IntCell(fn) });
            if (tp == 0 && fp == 0 && fn == 0 && precision == 0 && recall == 1 && f1 == 0) {
                setWarningMessage("Could not parse quality measures of model validation.");
            }
        }
    } catch (NumberFormatException e) {
        setWarningMessage("Could not parse quality measures of model validation.");
    } finally {
        logReader.close();
        tmpLogFile.delete();
        m_annotatedTestFile.delete();
    }
    accTable.addRowToTable(stats);

    accTable.close();

    return new BufferedDataTable[] { accTable.getTable() };
}