Example usage for edu.stanford.nlp.ie.crf CRFClassifier CRFClassifier

List of usage examples for edu.stanford.nlp.ie.crf CRFClassifier CRFClassifier

Introduction

In this page you can find the example usage for edu.stanford.nlp.ie.crf CRFClassifier CRFClassifier.

Prototype

public CRFClassifier(CRFClassifier<IN> crf) 

Source Link

Document

Makes a copy of the crf classifier

Usage

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordNamedEntityRecognizerTrainer.java

License: Open Source License

@Override
public void collectionProcessComplete() throws AnalysisEngineProcessException {
    // Release the writer that collected the training data while documents
    // were being processed.
    if (out != null) {
        IOUtils.closeQuietly(out);
    }

    // Read the user-supplied training configuration.
    Properties trainingProps = new Properties();
    try (InputStream propStream = new FileInputStream(propertiesFile)) {
        trainingProps.load(propStream);
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }

    // Point the trainer at the model target and the collected training data.
    trainingProps.setProperty("serializeTo", targetLocation.getAbsolutePath());
    trainingProps.setProperty("trainFile", tempData.getAbsolutePath());
    trainingProps.setProperty("map", "word=0,answer=1");

    SeqClassifierFlags trainingFlags = new SeqClassifierFlags(trainingProps);
    // Label-set handling: which subclassification scheme to train with and
    // whether the representation is kept in the output.
    trainingFlags.entitySubclassification = entitySubClassification;
    trainingFlags.retainEntitySubclassification = retainClassification;
    // This specific reader is required; the other readers do not honor the
    // label-set settings configured above.
    trainingFlags.readerAndWriter = "edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter";

    // Train the CRF model.
    CRFClassifier<CoreLabel> classifier = new CRFClassifier<>(trainingFlags);
    getLogger().info("Starting to train...");
    classifier.train();

    // Persist the trained model to the configured target location.
    try {
        getLogger().info(String.format("Serializing classifier to target location: %s",
                targetLocation.getCanonicalPath()));
        classifier.serializeClassifier(targetLocation.getAbsolutePath());
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
}

From source file:edu.albany.cubism.util.Segmenter.java

public Segmenter() {
    System.out.println("basedir: " + basedir);

    // Configuration for the Stanford CRF-based segmenter. The serialized
    // dictionary entry is needed because CTBSegDocumentIteratorFactory
    // accesses it.
    Properties segProps = new Properties();
    segProps.setProperty("sighanCorporaDict", basedir);
    segProps.setProperty("serDictionary", basedir + "/dict-chris6.ser.gz");
    segProps.setProperty("inputEncoding", "UTF-8");
    segProps.setProperty("sighanPostProcessing", "true");

    // Build the classifier, then load the CTB model shipped under basedir.
    this.segmenter = new CRFClassifier<CoreLabel>(segProps);
    this.segmenter.loadClassifierNoExceptions(basedir + "/ctb.gz", segProps);
}

From source file:edu.cmu.geolocator.nlp.ner.StanfordCRF.CRF_Learn.java

License: Apache License

/**
 * Trains or loads a Stanford CRF classifier and then runs whichever
 * serialization / evaluation steps the configuration requests.
 *
 * <p>The actual command-line arguments are ignored; all options come from a
 * fixed properties file (see below).
 *
 * @param argc ignored — configuration is read from the hard-coded prop file
 * @throws Exception if training, loading, or serialization fails
 */
public static void main(String[] argc) throws Exception {
    // Configuration is taken from a fixed properties file rather than argc.
    String[] args = new String[2];
    args[0] = "-prop";
    args[1] = "src/edu/cmu/geoparser/nlptools/ner/StanfordCRF/train.prop";

    Properties props = StringUtils.argsToProperties(args);
    CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(props);

    // Snapshot the flags that select which of the steps below run.
    String testFile = crf.flags.testFile;
    String testFiles = crf.flags.testFiles;
    String textFile = crf.flags.textFile;
    String textFiles = crf.flags.textFiles;
    String loadPath = crf.flags.loadClassifier;
    String loadTextPath = crf.flags.loadTextClassifier;
    String serializeTo = crf.flags.serializeTo;
    String serializeToText = crf.flags.serializeToText;

    // Obtain a model: load a serialized / text / jar classifier if configured,
    // otherwise train from the configured training data, otherwise fall back
    // to the default model.
    if (loadPath != null) {
        crf.loadClassifierNoExceptions(loadPath, props);
    } else if (loadTextPath != null) {
        System.err.println("Warning: this is now only tested for Chinese Segmenter");
        System.err.println("(Sun Dec 23 00:59:39 2007) (pichuan)");
        try {
            crf.loadTextClassifier(loadTextPath, props);
        } catch (Exception e) {
            throw new RuntimeException("error loading " + loadTextPath, e);
        }
    } else if (crf.flags.loadJarClassifier != null) {
        crf.loadJarClassifier(crf.flags.loadJarClassifier, props);
    } else if (crf.flags.trainFile != null || crf.flags.trainFileList != null) {
        // This is where training actually happens.
        crf.train();
    } else {
        crf.loadDefaultClassifier();
    }

    // Persist the model if requested, in binary and/or text form.
    if (serializeTo != null) {
        crf.serializeClassifier(serializeTo);
    }
    if (serializeToText != null) {
        crf.serializeTextClassifier(serializeToText);
    }

    // Apply the classifier to a single test file; the various flags select
    // mutually exclusive output modes (search graph, probabilities, k-best,
    // label values, or plain answers).
    if (testFile != null) {
        DocumentReaderAndWriter<CoreLabel> readerAndWriter = crf.defaultReaderAndWriter();
        if (crf.flags.searchGraphPrefix != null) {
            crf.classifyAndWriteViterbiSearchGraph(testFile, crf.flags.searchGraphPrefix,
                    crf.makeReaderAndWriter());
        } else if (crf.flags.printFirstOrderProbs) {
            crf.printFirstOrderProbs(testFile, readerAndWriter);
        } else if (crf.flags.printProbs) {
            crf.printProbs(testFile, readerAndWriter);
        } else if (crf.flags.useKBest) {
            int k = crf.flags.kBest;
            crf.classifyAndWriteAnswersKBest(testFile, k, readerAndWriter);
        } else if (crf.flags.printLabelValue) {
            crf.printLabelInformation(testFile, readerAndWriter);
        } else {
            crf.classifyAndWriteAnswers(testFile, readerAndWriter);
        }
    }

    // Batch modes: comma-separated lists of test/plain-text files.
    if (testFiles != null) {
        crf.classifyAndWriteAnswers(splitToFiles(testFiles), crf.defaultReaderAndWriter());
    }

    if (textFile != null) {
        crf.classifyAndWriteAnswers(textFile);
    }

    if (textFiles != null) {
        crf.classifyAndWriteAnswers(splitToFiles(textFiles));
    }

    if (crf.flags.readStdin) {
        crf.classifyStdin();
    }
}

/** Splits a comma-separated list of paths into {@link File} objects. */
private static List<File> splitToFiles(String commaSeparatedPaths) {
    List<File> files = new ArrayList<File>();
    for (String filename : commaSeparatedPaths.split(",")) {
        files.add(new File(filename));
    }
    return files;
}

From source file:edu.illinois.cs.cogcomp.tokenizer.ChineseTokenizer.java

License: Open Source License

public ChineseTokenizer(String basedir) {
    // Segmenter configuration pointing at the Sighan corpora dictionary and
    // the serialized dictionary shipped under basedir.
    Properties config = new Properties();
    config.setProperty("sighanCorporaDict", basedir);
    config.setProperty("serDictionary", basedir + "/dict-chris6.ser.gz");
    config.setProperty("inputEncoding", "UTF-8");
    config.setProperty("sighanPostProcessing", "true");

    // Load the CTB segmentation model, then the character conversion map.
    segmenter = new CRFClassifier<>(config);
    segmenter.loadClassifierNoExceptions(basedir + "/ctb.gz", config);
    loadConversionMap();
}

From source file:org.exist.xquery.corenlp.ChineseSegmenter.java

License: Open Source License

/**
 * Configures the Stanford CRF-based Chinese word segmenter from files shipped
 * in {@code dataDir} (normalization table, Sighan dictionary, CTB model).
 *
 * @param dataDir directory containing norm.simp.utf8, dict-chris6.ser.gz and ctb.gz
 * @throws XPathException if the serialized model cannot be loaded
 */
public ChineseSegmenter(Path dataDir) throws XPathException {
    Properties props = new Properties();
    props.setProperty("NormalizationTable", new File(dataDir.toFile(), "norm.simp.utf8").getAbsolutePath());
    props.setProperty("normTableEncoding", "UTF-8");
    props.setProperty("sighanCorporaDict", dataDir.toAbsolutePath().toString());
    props.setProperty("sighanPostProcessing", "true");
    props.setProperty("serDictionary", new File(dataDir.toFile(), "dict-chris6.ser.gz").getAbsolutePath());

    classifier = new CRFClassifier(props);
    try {
        classifier.loadClassifier(new File(dataDir.toFile(), "ctb.gz"), props);
    } catch (Exception e) {
        // The original IOException / ClassNotFoundException / Exception
        // branches were identical, so a single catch covers all of them.
        // NOTE(review): only the message is carried over — the cause is not
        // chained; confirm whether XPathException offers a (String, Throwable)
        // constructor to preserve it.
        throw new XPathException(e.getMessage());
    }
}

From source file:org.exist.xquery.corenlp.TrainClassifier.java

License: Open Source License

/**
 * Trains a CRF classifier on the given pre-labeled documents and serializes
 * the resulting model to {@code tempOutFile}.
 *
 * @param documents   training sentences, each a list of labeled tokens
 * @param inputFormat currently unused — presumably intended to select the
 *                    reader; TODO confirm against callers
 */
private void trainClassifier(Collection<List<CoreLabel>> documents, final InputDocType inputFormat) {
    final Properties props = new Properties();
    // fixme! - check ocrTrain configurable under other name?
    //props.setProperty("ocrTrain", "true");
    //props.setProperty("serializeTo", tempOutFile.toAbsolutePath().toString());
    // Standard NER-style feature set (class feature, word/nGram features,
    // neighboring-token and sequence features, chris2 word shapes).
    props.setProperty("useClassFeature", "true");
    props.setProperty("useWord", "true");
    props.setProperty("useNGrams", "true");
    props.setProperty("noMidNGrams", "true");
    props.setProperty("useDisjunctive", "true");
    props.setProperty("maxNGramLeng", "6");
    props.setProperty("usePrev", "true");
    props.setProperty("useNext", "true");
    props.setProperty("useSequences", "true");
    props.setProperty("usePrevSequences", "true");
    props.setProperty("maxLeft", "1");
    props.setProperty("useTypeSeqs", "true");
    props.setProperty("useTypeSeqs2", "true");
    props.setProperty("useTypeySequences", "true");
    props.setProperty("wordShape", "chris2useLC");

    // Fixed: was a raw-type instantiation (new CRFClassifier(props)).
    CRFClassifier<CoreLabel> classifier = new CRFClassifier<>(props);
    classifier.train(documents, new ColumnDocumentReaderAndWriter());
    classifier.serializeClassifier(tempOutFile.toAbsolutePath().toString());
}

From source file:org.jdmp.stanfordpos.StanfordTagger.java

License: Open Source License

/**
 * Creates a tagger backed by a CRF classifier deserialized from the given
 * model file. Load errors are swallowed by loadClassifierNoExceptions rather
 * than propagated.
 *
 * @param file serialized CRF model to load
 * @throws Exception declared by the signature; NOTE(review): the visible body
 *                   does not appear to throw checked exceptions — confirm
 */
public StanfordTagger(File file) throws Exception {
    crf = new CRFClassifier<CoreLabel>(new SeqClassifierFlags());
    crf.loadClassifierNoExceptions(file);
}

From source file:org.jdmp.stanfordpos.StanfordTagger.java

License: Open Source License

public void train(ListMatrix<ListMatrix<MapMatrix<String, String>>> listMatrix) throws Exception {
    // Convert the matrix representation into the sentence/token structure
    // the Stanford CRF trainer expects: one CoreLabel per token, carrying
    // the surface form ("Token") and the gold label ("Class").
    List<List<CoreLabel>> sentences = new ArrayList<List<CoreLabel>>();
    for (ListMatrix<MapMatrix<String, String>> sentence : listMatrix) {
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        sentences.add(tokens);
        for (MapMatrix<String, String> token : sentence) {
            CoreLabel label = new CoreLabel();
            label.set(CoreAnnotations.TextAnnotation.class, token.getAsString("Token"));
            label.set(CoreAnnotations.AnswerAnnotation.class, token.getAsString("Class"));
            tokens.add(label);
        }
    }

    // Feature configuration for training.
    SeqClassifierFlags trainingFlags = new SeqClassifierFlags();
    trainingFlags.maxLeft = 3;
    trainingFlags.useClassFeature = true;
    trainingFlags.useWord = true;
    trainingFlags.useNGrams = true;
    trainingFlags.maxNGramLeng = 6;
    trainingFlags.usePrev = true;
    trainingFlags.useNext = true;
    trainingFlags.useDisjunctive = true;
    trainingFlags.useSequences = true;
    trainingFlags.usePrevSequences = true;
    trainingFlags.useTypeSeqs = true;
    trainingFlags.useTypeSeqs2 = true;
    trainingFlags.useTypeySequences = true;
    trainingFlags.wordShape = WordShapeClassifier.WORDSHAPECHRIS2;

    // Train with the default reader (null reader/writer argument).
    crf = new CRFClassifier<CoreLabel>(trainingFlags);
    crf.train(sentences, null);
}

From source file:org.knime.ext.textprocessing.language.chinese.nodes.tokenization.tokenizer.word.StanfordNlpChineseTokenizer.java

License: Open Source License

/**
 * Creates a new tokenizer backed by a Stanford CRF classifier, loading the
 * CTB segmentation model from {@code BASEDIR} with the shared
 * {@code PROPERTIES} configuration. Load errors are swallowed by
 * loadClassifierNoExceptions rather than propagated.
 */
public StanfordNlpChineseTokenizer() {
    m_tokenizer = new CRFClassifier<>(PROPERTIES);
    m_tokenizer.loadClassifierNoExceptions(BASEDIR + "/ctb.gz", PROPERTIES);
}

From source file:org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnelearner.StanfordNlpNeLearnerNodeModel.java

License: Open Source License

/**
 * {@inheritDoc}
 *
 * <p>Tags the input documents with a dictionary-driven regex tagger, writes
 * the annotated sentences to a temporary TSV file, trains a Stanford CRF
 * model from it, and returns the serialized model bytes as a port object.
 * Temporary files are deleted after the model bytes have been read.
 */
@Override
protected PortObject[] execute(final PortObject[] data, final ExecutionContext exec) throws Exception {
    // check input data
    assert ((data != null) && (data[0] != null));

    // get data table
    final BufferedDataTable docTable = (BufferedDataTable) data[0];

    // get dictionary as string and regex pattern
    final Set<String> knownEntitiesStringSet = getDictionary((BufferedDataTable) data[1]);
    final Set<Pattern> knownEntitiesPatternSet = knownEntitiesStringSet.stream()//
            .map(Pattern::compile)//
            .collect(Collectors.toSet());

    // create tag for document tagger
    final Tag tag = new Tag(m_tagValueModel.getStringValue(), m_tagTypeModel.getStringValue());

    // create tagger based on known entities
    final MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet,
            tag, m_caseSensitivity.getBooleanValue(), m_tokenizer.getStringValue());

    // Random UUIDs in the file names keep two concurrently executing node
    // instances from clobbering each other's temp files.
    final String tempDir = KNIMEConstants.getKNIMETempDir() + "/";
    final String modelPath = tempDir + "oM-" + UUID.randomUUID().toString() + ".crf.ser.gz";
    final String annotatedDocPath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv";

    // Write one annotated-sentence file covering all input documents.
    final File annotatedDocFile = new File(annotatedDocPath);
    int rowCounter = 0;
    int missingValueCounter = 0;
    try (final PrintWriter sentenceFileWriter = new PrintWriter(annotatedDocFile, "UTF-8")) {
        final int colIndex = docTable.getDataTableSpec().findColumnIndex(m_docColumnModel.getStringValue());
        // tag documents and transform sentences to strings while tagged terms get stanfordnlp annotation
        for (final DataRow row : docTable) {
            // The document-preparation phase accounts for the first half of
            // the progress bar.
            rowCounter++;
            final double progress = (rowCounter / (double) docTable.size()) / (2.0);
            exec.setProgress(progress, "Preparing documents");

            if (!row.getCell(colIndex).isMissing()
                    && row.getCell(colIndex).getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) row.getCell(colIndex)).getDocument();
                final Document taggedDoc = tagger.tag(doc);
                taggedDoc.sentenceIterator().forEachRemaining(
                        s -> writeAnnotationData(s, knownEntitiesStringSet, sentenceFileWriter));
            } else {
                // Missing / incompatible cells are skipped and only counted
                // for the warning message below.
                missingValueCounter++;
            }
        }
    }

    // train model
    exec.setProgress(0.75, "Learning model.");

    // Generate a prop file from the node's feature settings and train.
    final Properties props = new StanfordNlpNeLearnerPropFileGenerator(annotatedDocPath,
            m_useClassFeature.getBooleanValue(), m_useWord.getBooleanValue(), m_useNGrams.getBooleanValue(),
            m_noMidNGrams.getBooleanValue(), m_maxNGramLeng.getIntValue(), m_usePrev.getBooleanValue(),
            m_useNext.getBooleanValue(), m_useDisjunctive.getBooleanValue(), m_useSequences.getBooleanValue(),
            m_usePrevSequences.getBooleanValue(), m_maxLeft.getIntValue(), m_useTypeSeqs.getBooleanValue(),
            m_useTypeSeqs2.getBooleanValue(), m_useTypeySequences.getBooleanValue(),
            m_wordShape.getStringValue()).getPropFile();
    final SeqClassifierFlags flags = new SeqClassifierFlags(props);
    final CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
    crf.train();
    crf.serializeClassifier(modelPath);

    // Read the serialized model into memory so the temp file can be removed.
    final File outputModel = new File(modelPath);
    final byte[] modelOutputBuffer = Files.toByteArray(outputModel);

    // delete temporary files
    java.nio.file.Files.delete(outputModel.toPath());
    java.nio.file.Files.delete(annotatedDocFile.toPath());

    // set warning messages if necessary
    if (knownEntitiesPatternSet.isEmpty()) {
        setWarningMessage("Trained model on empty dictionary.");
    } else if (rowCounter == 0) {
        setWarningMessage("Node created an empty model.");
    } else if (missingValueCounter == 1) {
        setWarningMessage(missingValueCounter + " row has been ignored due to missing value.");
    } else if (missingValueCounter > 1) {
        setWarningMessage(missingValueCounter + " rows have been ignored due to missing values.");
    }

    return new PortObject[] { new StanfordNERModelPortObject(modelOutputBuffer, tag, knownEntitiesStringSet,
            m_tokenizer.getStringValue()) };
}