Usage examples for edu.stanford.nlp.ie.crf.CRFClassifier.serializeClassifier
@Override public void serializeClassifier(ObjectOutputStream oos)

(The examples below mostly call the companion overload, serializeClassifier(String serializePath), which opens the output file itself and gzips it when the path ends in .gz.)
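For orientation, here is a minimal, self-contained sketch of the ObjectOutputStream overload. The file names are illustrative, and the gzip wrapping mirrors the .ser.gz convention used throughout the examples below:

import java.io.FileOutputStream;
import java.io.ObjectOutputStream;
import java.util.zip.GZIPOutputStream;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

public class SerializeSketch {
    public static void main(String[] args) throws Exception {
        // Load an existing serialized model (path is illustrative)
        CRFClassifier<CoreLabel> crf = CRFClassifier.getClassifier("ner-model.ser.gz");
        // Write it back out through the ObjectOutputStream overload
        try (ObjectOutputStream oos = new ObjectOutputStream(
                new GZIPOutputStream(new FileOutputStream("ner-model-copy.ser.gz")))) {
            crf.serializeClassifier(oos);
        }
    }
}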
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordNamedEntityRecognizerTrainer.java
License:Open Source License
@Override
public void collectionProcessComplete() throws AnalysisEngineProcessException {
    if (out != null) {
        IOUtils.closeQuietly(out);
    }

    // Load user-provided configuration
    Properties props = new Properties();
    try (InputStream is = new FileInputStream(propertiesFile)) {
        props.load(is);
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }

    // Add/replace training file information
    props.setProperty("serializeTo", targetLocation.getAbsolutePath());

    // Set training data info
    props.setProperty("trainFile", tempData.getAbsolutePath());
    props.setProperty("map", "word=0,answer=1");

    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    // Label set
    flags.entitySubclassification = entitySubClassification;
    // Whether the representation should be kept
    flags.retainEntitySubclassification = retainClassification;
    // Need to use this reader because the other ones don't recognize the
    // previous settings about the label set
    flags.readerAndWriter = "edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter";

    // Train
    CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
    getLogger().info("Starting to train...");
    crf.train();

    try {
        getLogger().info(String.format("Serializing classifier to target location: %s",
                targetLocation.getCanonicalPath()));
        crf.serializeClassifier(targetLocation.getAbsolutePath());
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
}
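As a hedged follow-up to the example above: once serializeClassifier has written the model to targetLocation, the same path can be handed back to the static loader. The sample sentence is illustrative:

    // Reload the model written above and tag raw text (assumes training succeeded)
    CRFClassifier<CoreLabel> ner = CRFClassifier.getClassifier(targetLocation.getAbsolutePath());
    System.out.println(ner.classifyToString("Angela Merkel visited Paris."));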
From source file:edu.cmu.geolocator.nlp.ner.StanfordCRF.CRF_Learn.java
License:Apache License
public static void main(String[] argc) throws Exception {
    //StringUtils.printErrInvocationString("CRFClassifier", args);
    String[] args = new String[2];
    args[0] = "-prop";
    args[1] = "src/edu/cmu/geoparser/nlptools/ner/StanfordCRF/train.prop";
    Properties props = StringUtils.argsToProperties(args);
    CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(props);
    String testFile = crf.flags.testFile;
    String testFiles = crf.flags.testFiles;
    String textFile = crf.flags.textFile;
    String textFiles = crf.flags.textFiles;
    String loadPath = crf.flags.loadClassifier;
    String loadTextPath = crf.flags.loadTextClassifier;
    String serializeTo = crf.flags.serializeTo;
    String serializeToText = crf.flags.serializeToText;

    if (loadPath != null) {
        crf.loadClassifierNoExceptions(loadPath, props);
    } else if (loadTextPath != null) {
        System.err.println("Warning: this is now only tested for Chinese Segmenter");
        System.err.println("(Sun Dec 23 00:59:39 2007) (pichuan)");
        try {
            crf.loadTextClassifier(loadTextPath, props);
            // System.err.println("DEBUG: out from crf.loadTextClassifier");
        } catch (Exception e) {
            throw new RuntimeException("error loading " + loadTextPath, e);
        }
    } else if (crf.flags.loadJarClassifier != null) {
        crf.loadJarClassifier(crf.flags.loadJarClassifier, props);
    } else if (crf.flags.trainFile != null || crf.flags.trainFileList != null) {
        // Wei Zhang: This is where the program starts to train.
        crf.train();
    } else {
        crf.loadDefaultClassifier();
    }
    // System.err.println("Using " + crf.flags.featureFactory);
    // System.err.println("Using " + StringUtils.getShortClassName(crf.readerAndWriter));

    if (serializeTo != null) {
        // Wei Zhang: This is used.
        crf.serializeClassifier(serializeTo);
    }
    if (serializeToText != null) {
        crf.serializeTextClassifier(serializeToText);
    }

    if (testFile != null) {
        DocumentReaderAndWriter<CoreLabel> readerAndWriter = crf.defaultReaderAndWriter();
        if (crf.flags.searchGraphPrefix != null) {
            crf.classifyAndWriteViterbiSearchGraph(testFile, crf.flags.searchGraphPrefix,
                    crf.makeReaderAndWriter());
        } else if (crf.flags.printFirstOrderProbs) {
            crf.printFirstOrderProbs(testFile, readerAndWriter);
        } else if (crf.flags.printProbs) {
            crf.printProbs(testFile, readerAndWriter);
        } else if (crf.flags.useKBest) {
            int k = crf.flags.kBest;
            crf.classifyAndWriteAnswersKBest(testFile, k, readerAndWriter);
        } else if (crf.flags.printLabelValue) {
            crf.printLabelInformation(testFile, readerAndWriter);
        } else {
            crf.classifyAndWriteAnswers(testFile, readerAndWriter);
        }
    }
    if (testFiles != null) {
        List<File> files = new ArrayList<File>();
        for (String filename : testFiles.split(",")) {
            files.add(new File(filename));
        }
        crf.classifyAndWriteAnswers(files, crf.defaultReaderAndWriter());
    }
    if (textFile != null) {
        crf.classifyAndWriteAnswers(textFile);
    }
    if (textFiles != null) {
        List<File> files = new ArrayList<File>();
        for (String filename : textFiles.split(",")) {
            files.add(new File(filename));
        }
        crf.classifyAndWriteAnswers(files);
    }
    if (crf.flags.readStdin) {
        crf.classifyStdin();
    }
}
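The train.prop file referenced above is not shown. A minimal properties file for this train-then-serialize flow might look like the following; the paths and feature flags are illustrative, but the keys correspond to SeqClassifierFlags fields used in the code:

    trainFile = data/train.tsv
    serializeTo = models/ner-model.ser.gz
    map = word=0,answer=1
    useWord = true
    usePrev = true
    useNext = true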
From source file:org.exist.xquery.corenlp.TrainClassifier.java
License:Open Source License
private void trainClassifier(Collection<List<CoreLabel>> documents, final InputDocType inputFormat) {
    final Properties props = new Properties();
    // fixme! - check ocrTrain configurable under other name?
    //props.setProperty("ocrTrain", "true");
    //props.setProperty("serializeTo", tempOutFile.toAbsolutePath().toString());
    props.setProperty("useClassFeature", "true");
    props.setProperty("useWord", "true");
    props.setProperty("useNGrams", "true");
    props.setProperty("noMidNGrams", "true");
    props.setProperty("useDisjunctive", "true");
    props.setProperty("maxNGramLeng", "6");
    props.setProperty("usePrev", "true");
    props.setProperty("useNext", "true");
    props.setProperty("useSequences", "true");
    props.setProperty("usePrevSequences", "true");
    props.setProperty("maxLeft", "1");
    props.setProperty("useTypeSeqs", "true");
    props.setProperty("useTypeSeqs2", "true");
    props.setProperty("useTypeySequences", "true");
    props.setProperty("wordShape", "chris2useLC");

    CRFClassifier<CoreLabel> classifier = new CRFClassifier<>(props);
    classifier.train(documents, new ColumnDocumentReaderAndWriter());
    classifier.serializeClassifier(tempOutFile.toAbsolutePath().toString());
}
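ColumnDocumentReaderAndWriter reads tab-separated columns, one token per line, with blank lines between sentences. Since the properties above do not set map, the default column mapping applies (word=0,tag=1,answer=2, to the best of my knowledge), so the underlying training data would be laid out roughly like this (tokens and labels are illustrative):

    Stockholm   NNP   LOCATION
    is          VBZ   O
    cold        JJ    O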
From source file:org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnelearner.StanfordNlpNeLearnerNodeModel.java
License:Open Source License
/**
 * {@inheritDoc}
 */
@Override
protected PortObject[] execute(final PortObject[] data, final ExecutionContext exec) throws Exception {
    // Check input data
    assert ((data != null) && (data[0] != null));

    // Get data table
    final BufferedDataTable docTable = (BufferedDataTable) data[0];

    // Get dictionary as string and regex pattern
    final Set<String> knownEntitiesStringSet = getDictionary((BufferedDataTable) data[1]);
    final Set<Pattern> knownEntitiesPatternSet = knownEntitiesStringSet.stream()
            .map(s -> Pattern.compile(s))
            .collect(Collectors.toSet());

    // Create tag for document tagger
    final Tag tag = new Tag(m_tagValueModel.getStringValue(), m_tagTypeModel.getStringValue());

    // Create tagger based on known entities
    final MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet,
            tag, m_caseSensitivity.getBooleanValue(), m_tokenizer.getStringValue());

    // Create UUIDs to add to the file paths, to avoid two instances of the
    // node model using the same file path at the same time
    final String tempDir = KNIMEConstants.getKNIMETempDir() + "/";
    final String modelPath = tempDir + "oM-" + UUID.randomUUID().toString() + ".crf.ser.gz";
    final String annotatedDocPath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv";

    // Create files based on sentence list and known entities
    final File annotatedDocFile = new File(annotatedDocPath);
    int rowCounter = 0;
    int missingValueCounter = 0;
    try (final PrintWriter sentenceFileWriter = new PrintWriter(annotatedDocFile, "UTF-8")) {
        final int colIndex = docTable.getDataTableSpec().findColumnIndex(m_docColumnModel.getStringValue());
        // Tag documents and transform sentences to strings while tagged terms get StanfordNLP annotations
        for (final DataRow row : docTable) {
            // Set progress bar
            rowCounter++;
            final double progress = (rowCounter / (double) docTable.size()) / (2.0);
            exec.setProgress(progress, "Preparing documents");

            if (!row.getCell(colIndex).isMissing()
                    && row.getCell(colIndex).getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) row.getCell(colIndex)).getDocument();
                final Document taggedDoc = tagger.tag(doc);
                taggedDoc.sentenceIterator().forEachRemaining(
                        s -> writeAnnotationData(s, knownEntitiesStringSet, sentenceFileWriter));
            } else {
                missingValueCounter++;
            }
        }
    }

    // Train model
    exec.setProgress(0.75, "Learning model.");
    final Properties props = new StanfordNlpNeLearnerPropFileGenerator(annotatedDocPath,
            m_useClassFeature.getBooleanValue(), m_useWord.getBooleanValue(), m_useNGrams.getBooleanValue(),
            m_noMidNGrams.getBooleanValue(), m_maxNGramLeng.getIntValue(), m_usePrev.getBooleanValue(),
            m_useNext.getBooleanValue(), m_useDisjunctive.getBooleanValue(), m_useSequences.getBooleanValue(),
            m_usePrevSequences.getBooleanValue(), m_maxLeft.getIntValue(), m_useTypeSeqs.getBooleanValue(),
            m_useTypeSeqs2.getBooleanValue(), m_useTypeySequences.getBooleanValue(),
            m_wordShape.getStringValue()).getPropFile();
    final SeqClassifierFlags flags = new SeqClassifierFlags(props);
    final CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
    crf.train();
    crf.serializeClassifier(modelPath);

    final File outputModel = new File(modelPath);
    final byte[] modelOutputBuffer = Files.toByteArray(outputModel);

    // Delete temporary files
    java.nio.file.Files.delete(outputModel.toPath());
    java.nio.file.Files.delete(annotatedDocFile.toPath());

    // Set warning messages if necessary
    if (knownEntitiesPatternSet.isEmpty()) {
        setWarningMessage("Trained model on empty dictionary.");
    } else if (rowCounter == 0) {
        setWarningMessage("Node created an empty model.");
    } else if (missingValueCounter == 1) {
        setWarningMessage(missingValueCounter + " row has been ignored due to missing value.");
    } else if (missingValueCounter > 1) {
        setWarningMessage(missingValueCounter + " rows have been ignored due to missing values.");
    }

    return new PortObject[] { new StanfordNERModelPortObject(modelOutputBuffer, tag, knownEntitiesStringSet,
            m_tokenizer.getStringValue()) };
}
From source file:tr.edu.gsu.nerwip.recognition.internal.modelbased.stanford.StanfordTrainer.java
License:Open Source License
@Override
protected void train(List<List<CoreLabel>> data) throws Exception {
    logger.increaseOffset();
    logger.log("Init training objects");

    // Retrieve properties object
    Properties properties = setUpProperties();

    // Create classifier object
    CRFClassifier<CoreLabel> model = new CRFClassifier<CoreLabel>(properties);

    // Perform training
    logger.log("Perform training");
    model.train(data);

    // Record model
    logger.log("Record resulting model");
    String modelFile = modelName.getModelFile();
    model.serializeClassifier(modelFile);

    logger.log("Training and recording complete");
    logger.decreaseOffset();
}
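The List<List<CoreLabel>> passed to train(data) has to carry gold labels. A minimal, hedged sketch of how one such sentence might be assembled (the token and label values are illustrative; uses java.util.ArrayList and edu.stanford.nlp.ling.CoreAnnotations):

    // Build one training sentence: each token carries its word and a gold answer label
    List<CoreLabel> sentence = new ArrayList<>();
    CoreLabel token = new CoreLabel();
    token.setWord("Paris");
    token.set(CoreAnnotations.AnswerAnnotation.class, "LOCATION");
    sentence.add(token);

    List<List<CoreLabel>> data = new ArrayList<>();
    data.add(sentence);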