Example usage for edu.stanford.nlp.sequences SeqClassifierFlags SeqClassifierFlags

List of usage examples for edu.stanford.nlp.sequences SeqClassifierFlags SeqClassifierFlags

Introduction

In this page you can find the example usage for edu.stanford.nlp.sequences SeqClassifierFlags SeqClassifierFlags.

Prototype

public SeqClassifierFlags(Properties props) 

Source Link

Document

Create a new SeqClassifierFlags object and initialize it using values in the Properties object.

Usage

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordNamedEntityRecognizerTrainer.java

License: Open Source License

@Override
public void collectionProcessComplete() throws AnalysisEngineProcessException {

    // Release the writer that was used to collect the training data.
    if (out != null) {
        IOUtils.closeQuietly(out);
    }

    // Read the user-provided CRF configuration from disk.
    Properties trainingProps = new Properties();
    try (InputStream propertiesStream = new FileInputStream(propertiesFile)) {
        trainingProps.load(propertiesStream);
    } catch (IOException ex) {
        throw new AnalysisEngineProcessException(ex);
    }

    // Force output location and training-data settings, overriding whatever
    // the properties file may have specified for them.
    trainingProps.setProperty("serializeTo", targetLocation.getAbsolutePath());
    trainingProps.setProperty("trainFile", tempData.getAbsolutePath());
    trainingProps.setProperty("map", "word=0,answer=1");

    SeqClassifierFlags trainingFlags = new SeqClassifierFlags(trainingProps);
    // Configure the label set ...
    trainingFlags.entitySubclassification = entitySubClassification;
    // ... and whether the sub-classified representation should be kept.
    trainingFlags.retainEntitySubclassification = retainClassification;
    // The CoNLL reader is required here: the other readers do not honor the
    // label-set settings made above.
    trainingFlags.readerAndWriter = "edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter";

    // Train the CRF model on the collected data.
    CRFClassifier<CoreLabel> classifier = new CRFClassifier<>(trainingFlags);
    getLogger().info("Starting to train...");
    classifier.train();

    // Persist the trained model to the configured target location.
    try {
        getLogger().info(String.format("Serializing classifier to target location: %s",
                targetLocation.getCanonicalPath()));
        classifier.serializeClassifier(targetLocation.getAbsolutePath());
    } catch (IOException ex) {
        throw new AnalysisEngineProcessException(ex);
    }
}

From source file:org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnelearner.StanfordNlpNeLearnerNodeModel.java

License: Open Source License

/**
 * {@inheritDoc}
 */
@Override
protected PortObject[] execute(final PortObject[] data, final ExecutionContext exec) throws Exception {
    // check input data
    assert ((data != null) && (data[0] != null));

    // get data table
    final BufferedDataTable docTable = (BufferedDataTable) data[0];

    // get dictionary as string set and compile each entry to a regex pattern
    final Set<String> knownEntitiesStringSet = getDictionary((BufferedDataTable) data[1]);
    final Set<Pattern> knownEntitiesPatternSet = knownEntitiesStringSet.stream()//
            .map(Pattern::compile)//
            .collect(Collectors.toSet());

    // create tag for document tagger
    final Tag tag = new Tag(m_tagValueModel.getStringValue(), m_tagTypeModel.getStringValue());

    // create tagger based on known entities
    final MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet,
            tag, m_caseSensitivity.getBooleanValue(), m_tokenizer.getStringValue());

    // create UUID to add them to the file path to avoid cases..
    // .. where two instances of the node model used the same file path at the same time
    final String tempDir = KNIMEConstants.getKNIMETempDir() + "/";
    final String modelPath = tempDir + "oM-" + UUID.randomUUID().toString() + ".crf.ser.gz";
    final String annotatedDocPath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv";

    final File annotatedDocFile = new File(annotatedDocPath);
    final File outputModel = new File(modelPath);
    int rowCounter = 0;
    int missingValueCounter = 0;
    final byte[] modelOutputBuffer;
    // Guard the whole training pipeline so the temporary files are removed
    // even when document preparation, training, or serialization fails;
    // previously they leaked on any exception.
    try {
        // create annotated training file based on sentence list and known entities
        try (final PrintWriter sentenceFileWriter = new PrintWriter(annotatedDocFile, "UTF-8")) {
            final int colIndex = docTable.getDataTableSpec().findColumnIndex(m_docColumnModel.getStringValue());
            // tag documents and transform sentences to strings while tagged terms get stanfordnlp annotation
            for (final DataRow row : docTable) {
                // set progress bar (document preparation is the first half of the work)
                rowCounter++;
                final double progress = (rowCounter / (double) docTable.size()) / (2.0);
                exec.setProgress(progress, "Preparing documents");

                if (!row.getCell(colIndex).isMissing()
                        && row.getCell(colIndex).getType().isCompatible(DocumentValue.class)) {
                    final Document doc = ((DocumentValue) row.getCell(colIndex)).getDocument();
                    final Document taggedDoc = tagger.tag(doc);
                    taggedDoc.sentenceIterator().forEachRemaining(
                            s -> writeAnnotationData(s, knownEntitiesStringSet, sentenceFileWriter));
                } else {
                    missingValueCounter++;
                }
            }
        }

        // train model
        exec.setProgress(0.75, "Learning model.");

        final Properties props = new StanfordNlpNeLearnerPropFileGenerator(annotatedDocPath,
                m_useClassFeature.getBooleanValue(), m_useWord.getBooleanValue(), m_useNGrams.getBooleanValue(),
                m_noMidNGrams.getBooleanValue(), m_maxNGramLeng.getIntValue(), m_usePrev.getBooleanValue(),
                m_useNext.getBooleanValue(), m_useDisjunctive.getBooleanValue(), m_useSequences.getBooleanValue(),
                m_usePrevSequences.getBooleanValue(), m_maxLeft.getIntValue(), m_useTypeSeqs.getBooleanValue(),
                m_useTypeSeqs2.getBooleanValue(), m_useTypeySequences.getBooleanValue(),
                m_wordShape.getStringValue()).getPropFile();
        final SeqClassifierFlags flags = new SeqClassifierFlags(props);
        final CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
        crf.train();
        crf.serializeClassifier(modelPath);

        // read the serialized model back into memory for the output port object
        modelOutputBuffer = java.nio.file.Files.readAllBytes(outputModel.toPath());
    } finally {
        // delete temporary files; deleteIfExists tolerates files never created
        // because an earlier step failed
        java.nio.file.Files.deleteIfExists(outputModel.toPath());
        java.nio.file.Files.deleteIfExists(annotatedDocFile.toPath());
    }

    // set warning messages if necessary
    if (knownEntitiesPatternSet.isEmpty()) {
        setWarningMessage("Trained model on empty dictionary.");
    } else if (rowCounter == 0) {
        setWarningMessage("Node created an empty model.");
    } else if (missingValueCounter == 1) {
        setWarningMessage(missingValueCounter + " row has been ignored due to missing value.");
    } else if (missingValueCounter > 1) {
        setWarningMessage(missingValueCounter + " rows have been ignored due to missing values.");
    }

    return new PortObject[] { new StanfordNERModelPortObject(modelOutputBuffer, tag, knownEntitiesStringSet,
            m_tokenizer.getStringValue()) };
}