List of usage examples for edu.stanford.nlp.sequences.SeqClassifierFlags
public SeqClassifierFlags(Properties props)
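Both examples below follow the same pattern: populate a java.util.Properties with CRF training options, wrap it in SeqClassifierFlags (which parses the properties into its typed public fields), and hand the flags to a CRFClassifier. A minimal sketch of that pattern; the file names and the surrounding main method are illustrative, not taken from either example:

import java.util.Properties;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.sequences.SeqClassifierFlags;

public class TrainCrfSketch {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // Hypothetical paths; both usage examples below set these same properties.
        props.setProperty("trainFile", "train.tsv");          // token<TAB>label training data
        props.setProperty("serializeTo", "ner-model.ser.gz"); // where the trained model is written
        props.setProperty("map", "word=0,answer=1");          // column layout of the training file

        // SeqClassifierFlags exposes the parsed options as public fields.
        SeqClassifierFlags flags = new SeqClassifierFlags(props);

        CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
        crf.train();                                 // reads flags.trainFile
        crf.serializeClassifier(flags.serializeTo);  // writes the model to disk
    }
}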
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordNamedEntityRecognizerTrainer.java
License:Open Source License
@Override
public void collectionProcessComplete() throws AnalysisEngineProcessException {
    if (out != null) {
        IOUtils.closeQuietly(out);
    }

    // Load user-provided configuration
    Properties props = new Properties();
    try (InputStream is = new FileInputStream(propertiesFile)) {
        props.load(is);
    }
    catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }

    // Add/replace training file information
    props.setProperty("serializeTo", targetLocation.getAbsolutePath());

    // Set training data info
    props.setProperty("trainFile", tempData.getAbsolutePath());
    props.setProperty("map", "word=0,answer=1");

    SeqClassifierFlags flags = new SeqClassifierFlags(props);

    // Label set
    flags.entitySubclassification = entitySubClassification;
    // Whether the representation should be kept
    flags.retainEntitySubclassification = retainClassification;
    // Need to use this reader because the other ones do not recognize the previous
    // settings about the label set
    flags.readerAndWriter = "edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter";

    // Train
    CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
    getLogger().info("Starting to train...");
    crf.train();

    try {
        getLogger().info(String.format("Serializing classifier to target location: %s",
                targetLocation.getCanonicalPath()));
        crf.serializeClassifier(targetLocation.getAbsolutePath());
    }
    catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
}
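Reading the serialized model back is symmetric. A minimal sketch, assuming the model was written to the targetLocation above; the sample sentence is illustrative:

CRFClassifier<CoreLabel> loaded = CRFClassifier.getClassifier(targetLocation.getAbsolutePath());
System.out.println(loaded.classifyToString("Angela Merkel visited Paris ."));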
From source file:org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnelearner.StanfordNlpNeLearnerNodeModel.java
License:Open Source License
/**
 * {@inheritDoc}
 */
@Override
protected PortObject[] execute(final PortObject[] data, final ExecutionContext exec) throws Exception {
    // Check input data
    assert ((data != null) && (data[0] != null));

    // Get data table
    final BufferedDataTable docTable = (BufferedDataTable) data[0];

    // Get dictionary as string set and regex patterns
    final Set<String> knownEntitiesStringSet = getDictionary((BufferedDataTable) data[1]);
    final Set<Pattern> knownEntitiesPatternSet = knownEntitiesStringSet.stream()
            .map(s -> Pattern.compile(s))
            .collect(Collectors.toSet());

    // Create tag for document tagger
    final Tag tag = new Tag(m_tagValueModel.getStringValue(), m_tagTypeModel.getStringValue());

    // Create tagger based on known entities
    final MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet,
            tag, m_caseSensitivity.getBooleanValue(), m_tokenizer.getStringValue());

    // Add UUIDs to the file paths to avoid cases where two instances of the
    // node model use the same file path at the same time
    final String tempDir = KNIMEConstants.getKNIMETempDir() + "/";
    final String modelPath = tempDir + "oM-" + UUID.randomUUID().toString() + ".crf.ser.gz";
    final String annotatedDocPath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv";

    // Create files based on sentence list and known entities
    final File annotatedDocFile = new File(annotatedDocPath);
    int rowCounter = 0;
    int missingValueCounter = 0;
    try (final PrintWriter sentenceFileWriter = new PrintWriter(annotatedDocFile, "UTF-8")) {
        final int colIndex = docTable.getDataTableSpec().findColumnIndex(m_docColumnModel.getStringValue());
        // Tag documents and transform sentences to strings while tagged terms get StanfordNLP annotations
        for (final DataRow row : docTable) {
            // Set progress bar
            rowCounter++;
            final double progress = (rowCounter / (double) docTable.size()) / (2.0);
            exec.setProgress(progress, "Preparing documents");
            if (!row.getCell(colIndex).isMissing()
                    && row.getCell(colIndex).getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) row.getCell(colIndex)).getDocument();
                final Document taggedDoc = tagger.tag(doc);
                taggedDoc.sentenceIterator().forEachRemaining(
                        s -> writeAnnotationData(s, knownEntitiesStringSet, sentenceFileWriter));
            } else {
                missingValueCounter++;
            }
        }
    }

    // Train model
    exec.setProgress(0.75, "Learning model.");

    final Properties props = new StanfordNlpNeLearnerPropFileGenerator(annotatedDocPath,
            m_useClassFeature.getBooleanValue(), m_useWord.getBooleanValue(), m_useNGrams.getBooleanValue(),
            m_noMidNGrams.getBooleanValue(), m_maxNGramLeng.getIntValue(), m_usePrev.getBooleanValue(),
            m_useNext.getBooleanValue(), m_useDisjunctive.getBooleanValue(), m_useSequences.getBooleanValue(),
            m_usePrevSequences.getBooleanValue(), m_maxLeft.getIntValue(), m_useTypeSeqs.getBooleanValue(),
            m_useTypeSeqs2.getBooleanValue(), m_useTypeySequences.getBooleanValue(),
            m_wordShape.getStringValue()).getPropFile();
    final SeqClassifierFlags flags = new SeqClassifierFlags(props);
    final CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
    crf.train();
    crf.serializeClassifier(modelPath);

    final File outputModel = new File(modelPath);
    final byte[] modelOutputBuffer = Files.toByteArray(outputModel);

    // Delete temporary files
    java.nio.file.Files.delete(outputModel.toPath());
    java.nio.file.Files.delete(annotatedDocFile.toPath());

    // Set warning messages if necessary
    if (knownEntitiesPatternSet.isEmpty()) {
        setWarningMessage("Trained model on empty dictionary.");
    } else if (rowCounter == 0) {
        setWarningMessage("Node created an empty model.");
    } else if (missingValueCounter == 1) {
        setWarningMessage(missingValueCounter + " row has been ignored due to missing value.");
    } else if (missingValueCounter > 1) {
        setWarningMessage(missingValueCounter + " rows have been ignored due to missing values.");
    }

    return new PortObject[] { new StanfordNERModelPortObject(modelOutputBuffer, tag, knownEntitiesStringSet,
            m_tokenizer.getStringValue()) };
}