Usage examples for edu.stanford.nlp.ie.crf.CRFClassifier.serializeClassifier
@Override public void serializeClassifier(ObjectOutputStream oos)

(The examples below mostly call the companion overload, serializeClassifier(String serializePath), which opens the output file itself and gzips it when the path ends in .gz.)
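For orientation, here is a minimal, self-contained sketch of the ObjectOutputStream overload. The file names are illustrative, and the gzip wrapping mirrors the .ser.gz convention used throughout the examples below:

import java.io.FileOutputStream;
import java.io.ObjectOutputStream;
import java.util.zip.GZIPOutputStream;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

public class SerializeSketch {
    public static void main(String[] args) throws Exception {
        // Load an existing serialized model (path is illustrative)
        CRFClassifier<CoreLabel> crf = CRFClassifier.getClassifier("ner-model.ser.gz");
        // Write it back out through the ObjectOutputStream overload
        try (ObjectOutputStream oos = new ObjectOutputStream(
                new GZIPOutputStream(new FileOutputStream("ner-model-copy.ser.gz")))) {
            crf.serializeClassifier(oos);
        }
    }
}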
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordNamedEntityRecognizerTrainer.java
License:Open Source License
@Override
public void collectionProcessComplete() throws AnalysisEngineProcessException {
    if (out != null) {
        IOUtils.closeQuietly(out);
    }

    // Load user-provided configuration
    Properties props = new Properties();
    try (InputStream is = new FileInputStream(propertiesFile)) {
        props.load(is);
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }

    // Add/replace training file information
    props.setProperty("serializeTo", targetLocation.getAbsolutePath());

    // Set training data info
    props.setProperty("trainFile", tempData.getAbsolutePath());
    props.setProperty("map", "word=0,answer=1");

    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    // Label set
    flags.entitySubclassification = entitySubClassification;
    // Whether the representation should be kept
    flags.retainEntitySubclassification = retainClassification;
    // Need to use this reader because the other ones don't recognize the
    // previous settings about the label set
    flags.readerAndWriter = "edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter";

    // Train
    CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
    getLogger().info("Starting to train...");
    crf.train();

    try {
        getLogger().info(String.format("Serializing classifier to target location: %s",
                targetLocation.getCanonicalPath()));
        crf.serializeClassifier(targetLocation.getAbsolutePath());
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
}
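As a hedged follow-up to the example above: once serializeClassifier has written the model to targetLocation, the same path can be handed back to the static loader. The sample sentence is illustrative:

    // Reload the model written above and tag raw text (assumes training succeeded)
    CRFClassifier<CoreLabel> ner = CRFClassifier.getClassifier(targetLocation.getAbsolutePath());
    System.out.println(ner.classifyToString("Angela Merkel visited Paris."));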
From source file:edu.cmu.geolocator.nlp.ner.StanfordCRF.CRF_Learn.java
License:Apache License
public static void main(String[] argc) throws Exception {
    //StringUtils.printErrInvocationString("CRFClassifier", args);
    String[] args = new String[2];
    args[0] = "-prop";
    args[1] = "src/edu/cmu/geoparser/nlptools/ner/StanfordCRF/train.prop";
    Properties props = StringUtils.argsToProperties(args);
    CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(props);
    String testFile = crf.flags.testFile;
    String testFiles = crf.flags.testFiles;
    String textFile = crf.flags.textFile;
    String textFiles = crf.flags.textFiles;
    String loadPath = crf.flags.loadClassifier;
    String loadTextPath = crf.flags.loadTextClassifier;
    String serializeTo = crf.flags.serializeTo;
    String serializeToText = crf.flags.serializeToText;

    if (loadPath != null) {
        crf.loadClassifierNoExceptions(loadPath, props);
    } else if (loadTextPath != null) {
        System.err.println("Warning: this is now only tested for Chinese Segmenter");
        System.err.println("(Sun Dec 23 00:59:39 2007) (pichuan)");
        try {
            crf.loadTextClassifier(loadTextPath, props);
            // System.err.println("DEBUG: out from crf.loadTextClassifier");
        } catch (Exception e) {
            throw new RuntimeException("error loading " + loadTextPath, e);
        }
    } else if (crf.flags.loadJarClassifier != null) {
        crf.loadJarClassifier(crf.flags.loadJarClassifier, props);
    } else if (crf.flags.trainFile != null || crf.flags.trainFileList != null) {
        // Wei Zhang: This is where the program starts to train.
        crf.train();
    } else {
        crf.loadDefaultClassifier();
    }
    // System.err.println("Using " + crf.flags.featureFactory);
    // System.err.println("Using " + StringUtils.getShortClassName(crf.readerAndWriter));

    if (serializeTo != null) {
        // Wei Zhang: This is used.
        crf.serializeClassifier(serializeTo);
    }
    if (serializeToText != null) {
        crf.serializeTextClassifier(serializeToText);
    }

    if (testFile != null) {
        DocumentReaderAndWriter<CoreLabel> readerAndWriter = crf.defaultReaderAndWriter();
        if (crf.flags.searchGraphPrefix != null) {
            crf.classifyAndWriteViterbiSearchGraph(testFile, crf.flags.searchGraphPrefix,
                    crf.makeReaderAndWriter());
        } else if (crf.flags.printFirstOrderProbs) {
            crf.printFirstOrderProbs(testFile, readerAndWriter);
        } else if (crf.flags.printProbs) {
            crf.printProbs(testFile, readerAndWriter);
        } else if (crf.flags.useKBest) {
            int k = crf.flags.kBest;
            crf.classifyAndWriteAnswersKBest(testFile, k, readerAndWriter);
        } else if (crf.flags.printLabelValue) {
            crf.printLabelInformation(testFile, readerAndWriter);
        } else {
            crf.classifyAndWriteAnswers(testFile, readerAndWriter);
        }
    }
    if (testFiles != null) {
        List<File> files = new ArrayList<File>();
        for (String filename : testFiles.split(",")) {
            files.add(new File(filename));
        }
        crf.classifyAndWriteAnswers(files, crf.defaultReaderAndWriter());
    }
    if (textFile != null) {
        crf.classifyAndWriteAnswers(textFile);
    }
    if (textFiles != null) {
        List<File> files = new ArrayList<File>();
        for (String filename : textFiles.split(",")) {
            files.add(new File(filename));
        }
        crf.classifyAndWriteAnswers(files);
    }
    if (crf.flags.readStdin) {
        crf.classifyStdin();
    }
}
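The train.prop file referenced above is not shown. A minimal properties file for this train-then-serialize flow might look like the following; the paths and feature flags are illustrative, but the keys correspond to SeqClassifierFlags fields used in the code:

    trainFile = data/train.tsv
    serializeTo = models/ner-model.ser.gz
    map = word=0,answer=1
    useWord = true
    usePrev = true
    useNext = true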
From source file:org.exist.xquery.corenlp.TrainClassifier.java
License:Open Source License
private void trainClassifier(Collection<List<CoreLabel>> documents, final InputDocType inputFormat) {
    final Properties props = new Properties();
    // fixme! - check ocrTrain configurable under other name?
    //props.setProperty("ocrTrain", "true");
    //props.setProperty("serializeTo", tempOutFile.toAbsolutePath().toString());
    props.setProperty("useClassFeature", "true");
    props.setProperty("useWord", "true");
    props.setProperty("useNGrams", "true");
    props.setProperty("noMidNGrams", "true");
    props.setProperty("useDisjunctive", "true");
    props.setProperty("maxNGramLeng", "6");
    props.setProperty("usePrev", "true");
    props.setProperty("useNext", "true");
    props.setProperty("useSequences", "true");
    props.setProperty("usePrevSequences", "true");
    props.setProperty("maxLeft", "1");
    props.setProperty("useTypeSeqs", "true");
    props.setProperty("useTypeSeqs2", "true");
    props.setProperty("useTypeySequences", "true");
    props.setProperty("wordShape", "chris2useLC");

    CRFClassifier<CoreLabel> classifier = new CRFClassifier<>(props);
    classifier.train(documents, new ColumnDocumentReaderAndWriter());
    classifier.serializeClassifier(tempOutFile.toAbsolutePath().toString());
}
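ColumnDocumentReaderAndWriter reads tab-separated columns, one token per line, with blank lines between sentences. Since the properties above do not set map, the default column mapping applies (word=0,tag=1,answer=2, to the best of my knowledge), so the underlying training data would be laid out roughly like this (tokens and labels are illustrative):

    Stockholm   NNP   LOCATION
    is          VBZ   O
    cold        JJ    O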
From source file:org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnelearner.StanfordNlpNeLearnerNodeModel.java
License:Open Source License
/**
 * {@inheritDoc}
 */
@Override
protected PortObject[] execute(final PortObject[] data, final ExecutionContext exec) throws Exception {
    // Check input data
    assert ((data != null) && (data[0] != null));

    // Get data table
    final BufferedDataTable docTable = (BufferedDataTable) data[0];

    // Get dictionary as string and regex pattern
    final Set<String> knownEntitiesStringSet = getDictionary((BufferedDataTable) data[1]);
    final Set<Pattern> knownEntitiesPatternSet = knownEntitiesStringSet.stream()
            .map(s -> Pattern.compile(s))
            .collect(Collectors.toSet());

    // Create tag for document tagger
    final Tag tag = new Tag(m_tagValueModel.getStringValue(), m_tagTypeModel.getStringValue());

    // Create tagger based on known entities
    final MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet,
            tag, m_caseSensitivity.getBooleanValue(), m_tokenizer.getStringValue());

    // Create UUIDs to add to the file paths, to avoid two instances of the
    // node model using the same file path at the same time
    final String tempDir = KNIMEConstants.getKNIMETempDir() + "/";
    final String modelPath = tempDir + "oM-" + UUID.randomUUID().toString() + ".crf.ser.gz";
    final String annotatedDocPath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv";

    // Create files based on sentence list and known entities
    final File annotatedDocFile = new File(annotatedDocPath);
    int rowCounter = 0;
    int missingValueCounter = 0;
    try (final PrintWriter sentenceFileWriter = new PrintWriter(annotatedDocFile, "UTF-8")) {
        final int colIndex = docTable.getDataTableSpec().findColumnIndex(m_docColumnModel.getStringValue());
        // Tag documents and transform sentences to strings while tagged terms get StanfordNLP annotations
        for (final DataRow row : docTable) {
            // Set progress bar
            rowCounter++;
            final double progress = (rowCounter / (double) docTable.size()) / (2.0);
            exec.setProgress(progress, "Preparing documents");

            if (!row.getCell(colIndex).isMissing()
                    && row.getCell(colIndex).getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) row.getCell(colIndex)).getDocument();
                final Document taggedDoc = tagger.tag(doc);
                taggedDoc.sentenceIterator().forEachRemaining(
                        s -> writeAnnotationData(s, knownEntitiesStringSet, sentenceFileWriter));
            } else {
                missingValueCounter++;
            }
        }
    }

    // Train model
    exec.setProgress(0.75, "Learning model.");
    final Properties props = new StanfordNlpNeLearnerPropFileGenerator(annotatedDocPath,
            m_useClassFeature.getBooleanValue(), m_useWord.getBooleanValue(), m_useNGrams.getBooleanValue(),
            m_noMidNGrams.getBooleanValue(), m_maxNGramLeng.getIntValue(), m_usePrev.getBooleanValue(),
            m_useNext.getBooleanValue(), m_useDisjunctive.getBooleanValue(), m_useSequences.getBooleanValue(),
            m_usePrevSequences.getBooleanValue(), m_maxLeft.getIntValue(), m_useTypeSeqs.getBooleanValue(),
            m_useTypeSeqs2.getBooleanValue(), m_useTypeySequences.getBooleanValue(),
            m_wordShape.getStringValue()).getPropFile();
    final SeqClassifierFlags flags = new SeqClassifierFlags(props);
    final CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
    crf.train();
    crf.serializeClassifier(modelPath);

    final File outputModel = new File(modelPath);
    final byte[] modelOutputBuffer = Files.toByteArray(outputModel);

    // Delete temporary files
    java.nio.file.Files.delete(outputModel.toPath());
    java.nio.file.Files.delete(annotatedDocFile.toPath());

    // Set warning messages if necessary
    if (knownEntitiesPatternSet.isEmpty()) {
        setWarningMessage("Trained model on empty dictionary.");
    } else if (rowCounter == 0) {
        setWarningMessage("Node created an empty model.");
    } else if (missingValueCounter == 1) {
        setWarningMessage(missingValueCounter + " row has been ignored due to missing value.");
    } else if (missingValueCounter > 1) {
        setWarningMessage(missingValueCounter + " rows have been ignored due to missing values.");
    }

    return new PortObject[] { new StanfordNERModelPortObject(modelOutputBuffer, tag, knownEntitiesStringSet,
            m_tokenizer.getStringValue()) };
}
From source file:tr.edu.gsu.nerwip.recognition.internal.modelbased.stanford.StanfordTrainer.java
License:Open Source License
@Override
protected void train(List<List<CoreLabel>> data) throws Exception {
    logger.increaseOffset();
    logger.log("Init training objects");

    // Retrieve properties object
    Properties properties = setUpProperties();

    // Create classifier object
    CRFClassifier<CoreLabel> model = new CRFClassifier<CoreLabel>(properties);

    // Perform training
    logger.log("Perform training");
    model.train(data);

    // Record model
    logger.log("Record resulting model");
    String modelFile = modelName.getModelFile();
    model.serializeClassifier(modelFile);

    logger.log("Training and recording complete");
    logger.decreaseOffset();
}
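The List<List<CoreLabel>> passed to train(data) has to carry gold labels. A minimal, hedged sketch of how one such sentence might be assembled (the token and label values are illustrative; uses java.util.ArrayList and edu.stanford.nlp.ling.CoreAnnotations):

    // Build one training sentence: each token carries its word and a gold answer label
    List<CoreLabel> sentence = new ArrayList<>();
    CoreLabel token = new CoreLabel();
    token.setWord("Paris");
    token.set(CoreAnnotations.AnswerAnnotation.class, "LOCATION");
    sentence.add(token);

    List<List<CoreLabel>> data = new ArrayList<>();
    data.add(sentence);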