List of usage examples for edu.stanford.nlp.sequences.SeqClassifierFlags
public SeqClassifierFlags(Properties props)
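Both examples below follow the same pattern: populate a java.util.Properties with CRF training options, wrap it in SeqClassifierFlags (which parses the properties into its typed public fields), and hand the flags to a CRFClassifier. A minimal sketch of that pattern; the file names and the surrounding main method are illustrative, not taken from either example:

import java.util.Properties;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.sequences.SeqClassifierFlags;

public class TrainCrfSketch {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // Hypothetical paths; both usage examples below set these same properties.
        props.setProperty("trainFile", "train.tsv");          // token<TAB>label training data
        props.setProperty("serializeTo", "ner-model.ser.gz"); // where the trained model is written
        props.setProperty("map", "word=0,answer=1");          // column layout of the training file

        // SeqClassifierFlags exposes the parsed options as public fields.
        SeqClassifierFlags flags = new SeqClassifierFlags(props);

        CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
        crf.train();                                 // reads flags.trainFile
        crf.serializeClassifier(flags.serializeTo);  // writes the model to disk
    }
}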
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordNamedEntityRecognizerTrainer.java
License:Open Source License
@Override
public void collectionProcessComplete() throws AnalysisEngineProcessException {
    if (out != null) {
        IOUtils.closeQuietly(out);
    }

    // Load user-provided configuration
    Properties props = new Properties();
    try (InputStream is = new FileInputStream(propertiesFile)) {
        props.load(is);
    }
    catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }

    // Add/replace training file information
    props.setProperty("serializeTo", targetLocation.getAbsolutePath());

    // Set training data info
    props.setProperty("trainFile", tempData.getAbsolutePath());
    props.setProperty("map", "word=0,answer=1");

    SeqClassifierFlags flags = new SeqClassifierFlags(props);

    // Label set
    flags.entitySubclassification = entitySubClassification;
    // Whether the representation should be kept
    flags.retainEntitySubclassification = retainClassification;
    // Need to use this reader because the other ones do not recognize the previous
    // settings about the label set
    flags.readerAndWriter = "edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter";

    // Train
    CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
    getLogger().info("Starting to train...");
    crf.train();

    try {
        getLogger().info(String.format("Serializing classifier to target location: %s",
                targetLocation.getCanonicalPath()));
        crf.serializeClassifier(targetLocation.getAbsolutePath());
    }
    catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
}
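Reading the serialized model back is symmetric. A minimal sketch, assuming the model was written to the targetLocation above; the sample sentence is illustrative:

CRFClassifier<CoreLabel> loaded = CRFClassifier.getClassifier(targetLocation.getAbsolutePath());
System.out.println(loaded.classifyToString("Angela Merkel visited Paris ."));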
From source file:org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnelearner.StanfordNlpNeLearnerNodeModel.java
License:Open Source License
/**
 * {@inheritDoc}
 */
@Override
protected PortObject[] execute(final PortObject[] data, final ExecutionContext exec) throws Exception {
    // Check input data
    assert ((data != null) && (data[0] != null));

    // Get data table
    final BufferedDataTable docTable = (BufferedDataTable) data[0];

    // Get dictionary as string set and regex patterns
    final Set<String> knownEntitiesStringSet = getDictionary((BufferedDataTable) data[1]);
    final Set<Pattern> knownEntitiesPatternSet = knownEntitiesStringSet.stream()
            .map(s -> Pattern.compile(s))
            .collect(Collectors.toSet());

    // Create tag for document tagger
    final Tag tag = new Tag(m_tagValueModel.getStringValue(), m_tagTypeModel.getStringValue());

    // Create tagger based on known entities
    final MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet,
            tag, m_caseSensitivity.getBooleanValue(), m_tokenizer.getStringValue());

    // Add UUIDs to the file paths to avoid cases where two instances of the
    // node model use the same file path at the same time
    final String tempDir = KNIMEConstants.getKNIMETempDir() + "/";
    final String modelPath = tempDir + "oM-" + UUID.randomUUID().toString() + ".crf.ser.gz";
    final String annotatedDocPath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv";

    // Create files based on sentence list and known entities
    final File annotatedDocFile = new File(annotatedDocPath);
    int rowCounter = 0;
    int missingValueCounter = 0;
    try (final PrintWriter sentenceFileWriter = new PrintWriter(annotatedDocFile, "UTF-8")) {
        final int colIndex = docTable.getDataTableSpec().findColumnIndex(m_docColumnModel.getStringValue());
        // Tag documents and transform sentences to strings while tagged terms get StanfordNLP annotations
        for (final DataRow row : docTable) {
            // Set progress bar
            rowCounter++;
            final double progress = (rowCounter / (double) docTable.size()) / (2.0);
            exec.setProgress(progress, "Preparing documents");
            if (!row.getCell(colIndex).isMissing()
                    && row.getCell(colIndex).getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) row.getCell(colIndex)).getDocument();
                final Document taggedDoc = tagger.tag(doc);
                taggedDoc.sentenceIterator().forEachRemaining(
                        s -> writeAnnotationData(s, knownEntitiesStringSet, sentenceFileWriter));
            } else {
                missingValueCounter++;
            }
        }
    }

    // Train model
    exec.setProgress(0.75, "Learning model.");

    final Properties props = new StanfordNlpNeLearnerPropFileGenerator(annotatedDocPath,
            m_useClassFeature.getBooleanValue(), m_useWord.getBooleanValue(), m_useNGrams.getBooleanValue(),
            m_noMidNGrams.getBooleanValue(), m_maxNGramLeng.getIntValue(), m_usePrev.getBooleanValue(),
            m_useNext.getBooleanValue(), m_useDisjunctive.getBooleanValue(), m_useSequences.getBooleanValue(),
            m_usePrevSequences.getBooleanValue(), m_maxLeft.getIntValue(), m_useTypeSeqs.getBooleanValue(),
            m_useTypeSeqs2.getBooleanValue(), m_useTypeySequences.getBooleanValue(),
            m_wordShape.getStringValue()).getPropFile();
    final SeqClassifierFlags flags = new SeqClassifierFlags(props);
    final CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
    crf.train();
    crf.serializeClassifier(modelPath);

    final File outputModel = new File(modelPath);
    final byte[] modelOutputBuffer = Files.toByteArray(outputModel);

    // Delete temporary files
    java.nio.file.Files.delete(outputModel.toPath());
    java.nio.file.Files.delete(annotatedDocFile.toPath());

    // Set warning messages if necessary
    if (knownEntitiesPatternSet.isEmpty()) {
        setWarningMessage("Trained model on empty dictionary.");
    } else if (rowCounter == 0) {
        setWarningMessage("Node created an empty model.");
    } else if (missingValueCounter == 1) {
        setWarningMessage(missingValueCounter + " row has been ignored due to missing value.");
    } else if (missingValueCounter > 1) {
        setWarningMessage(missingValueCounter + " rows have been ignored due to missing values.");
    }

    return new PortObject[] { new StanfordNERModelPortObject(modelOutputBuffer, tag, knownEntitiesStringSet,
            m_tokenizer.getStringValue()) };
}