List of usage examples for the edu.stanford.nlp.ie.crf.CRFClassifier constructor
public CRFClassifier(CRFClassifier<IN> crf)
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordNamedEntityRecognizerTrainer.java
License:Open Source License
@Override public void collectionProcessComplete() throws AnalysisEngineProcessException { if (out != null) { IOUtils.closeQuietly(out);//from w ww .ja va2s . com } // Load user-provided configuration Properties props = new Properties(); try (InputStream is = new FileInputStream(propertiesFile)) { props.load(is); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } // Add/replace training file information props.setProperty("serializeTo", targetLocation.getAbsolutePath()); // set training data info props.setProperty("trainFile", tempData.getAbsolutePath()); props.setProperty("map", "word=0,answer=1"); SeqClassifierFlags flags = new SeqClassifierFlags(props); // label set flags.entitySubclassification = entitySubClassification; // if representation should be kept flags.retainEntitySubclassification = retainClassification; // need to use this reader because the other ones don't recognize the previous settings // about the label set flags.readerAndWriter = "edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter"; // Train CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags); getLogger().info("Starting to train..."); crf.train(); try { getLogger().info(String.format("Serializing classifier to target location: %s", targetLocation.getCanonicalPath())); crf.serializeClassifier(targetLocation.getAbsolutePath()); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } }
From source file:edu.albany.cubism.util.Segmenter.java
public Segmenter() {
    System.out.println("basedir: " + basedir);

    // Configure the Stanford Chinese segmenter from the local data directory.
    final Properties props = new Properties();
    props.setProperty("sighanCorporaDict", basedir);
    // CTBSegDocumentIteratorFactory requires the serialized dictionary below.
    props.setProperty("serDictionary", basedir + "/dict-chris6.ser.gz");
    props.setProperty("inputEncoding", "UTF-8");
    props.setProperty("sighanPostProcessing", "true");

    this.segmenter = new CRFClassifier<>(props);
    segmenter.loadClassifierNoExceptions(basedir + "/ctb.gz", props);
}
From source file:edu.cmu.geolocator.nlp.ner.StanfordCRF.CRF_Learn.java
License:Apache License
public static void main(String[] argc) throws Exception { //StringUtils.printErrInvocationString("CRFClassifier", args); String[] args = new String[2]; args[0] = "-prop"; args[1] = "src/edu/cmu/geoparser/nlptools/ner/StanfordCRF/train.prop"; Properties props = StringUtils.argsToProperties(args); CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(props); String testFile = crf.flags.testFile; String testFiles = crf.flags.testFiles; String textFile = crf.flags.textFile; String textFiles = crf.flags.textFiles; String loadPath = crf.flags.loadClassifier; String loadTextPath = crf.flags.loadTextClassifier; String serializeTo = crf.flags.serializeTo; String serializeToText = crf.flags.serializeToText; if (loadPath != null) { crf.loadClassifierNoExceptions(loadPath, props); } else if (loadTextPath != null) { System.err.println("Warning: this is now only tested for Chinese Segmenter"); System.err.println("(Sun Dec 23 00:59:39 2007) (pichuan)"); try {/*from w w w .j ava 2 s.com*/ crf.loadTextClassifier(loadTextPath, props); // System.err.println("DEBUG: out from crf.loadTextClassifier"); } catch (Exception e) { throw new RuntimeException("error loading " + loadTextPath, e); } } else if (crf.flags.loadJarClassifier != null) { crf.loadJarClassifier(crf.flags.loadJarClassifier, props); } else if (crf.flags.trainFile != null || crf.flags.trainFileList != null) { //Wei Zhang: This is where the program starts to train. crf.train(); //////////// } else { crf.loadDefaultClassifier(); } // System.err.println("Using " + crf.flags.featureFactory); // System.err.println("Using " + // StringUtils.getShortClassName(crf.readerAndWriter)); if (serializeTo != null) { //Wei Zhang: This is used. 
crf.serializeClassifier(serializeTo); ///////////////////////// } if (serializeToText != null) { crf.serializeTextClassifier(serializeToText); } if (testFile != null) { DocumentReaderAndWriter<CoreLabel> readerAndWriter = crf.defaultReaderAndWriter(); if (crf.flags.searchGraphPrefix != null) { crf.classifyAndWriteViterbiSearchGraph(testFile, crf.flags.searchGraphPrefix, crf.makeReaderAndWriter()); } else if (crf.flags.printFirstOrderProbs) { crf.printFirstOrderProbs(testFile, readerAndWriter); } else if (crf.flags.printProbs) { crf.printProbs(testFile, readerAndWriter); } else if (crf.flags.useKBest) { int k = crf.flags.kBest; crf.classifyAndWriteAnswersKBest(testFile, k, readerAndWriter); } else if (crf.flags.printLabelValue) { crf.printLabelInformation(testFile, readerAndWriter); } else { crf.classifyAndWriteAnswers(testFile, readerAndWriter); } } if (testFiles != null) { List<File> files = new ArrayList<File>(); for (String filename : testFiles.split(",")) { files.add(new File(filename)); } crf.classifyAndWriteAnswers(files, crf.defaultReaderAndWriter()); } if (textFile != null) { crf.classifyAndWriteAnswers(textFile); } if (textFiles != null) { List<File> files = new ArrayList<File>(); for (String filename : textFiles.split(",")) { files.add(new File(filename)); } crf.classifyAndWriteAnswers(files); } if (crf.flags.readStdin) { crf.classifyStdin(); } }
From source file:edu.illinois.cs.cogcomp.tokenizer.ChineseTokenizer.java
License:Open Source License
public ChineseTokenizer(String basedir) {
    // Configure the Stanford CRF segmenter with model files under basedir.
    Properties props = new Properties();
    props.setProperty("sighanCorporaDict", basedir);
    props.setProperty("serDictionary", basedir + "/dict-chris6.ser.gz");
    props.setProperty("inputEncoding", "UTF-8");
    props.setProperty("sighanPostProcessing", "true");

    segmenter = new CRFClassifier<>(props);
    segmenter.loadClassifierNoExceptions(basedir + "/ctb.gz", props);

    loadConversionMap();
}
From source file:org.exist.xquery.corenlp.ChineseSegmenter.java
License:Open Source License
public ChineseSegmenter(Path dataDir) throws XPathException { // "ctb.gz"// w w w . j a v a2 s. co m Properties props = new Properties(); props.setProperty("NormalizationTable", new File(dataDir.toFile(), "norm.simp.utf8").getAbsolutePath()); props.setProperty("normTableEncoding", "UTF-8"); props.setProperty("sighanCorporaDict", dataDir.toAbsolutePath().toString()); props.setProperty("sighanPostProcessing", "true"); props.setProperty("serDictionary", new File(dataDir.toFile(), "dict-chris6.ser.gz").getAbsolutePath()); classifier = new CRFClassifier(props); try { classifier.loadClassifier(new File(dataDir.toFile(), "ctb.gz"), props); } catch (IOException e) { throw new XPathException(e.getMessage()); } catch (ClassNotFoundException e) { throw new XPathException(e.getMessage()); } catch (Exception e) { throw new XPathException(e.getMessage()); } }
From source file:org.exist.xquery.corenlp.TrainClassifier.java
License:Open Source License
private void trainClassifier(Collection<List<CoreLabel>> documents, final InputDocType inputFormat) { final Properties props = new Properties(); // fixme! - check ocrTrain configurable under other name? //props.setProperty("ocrTrain", "true"); //props.setProperty("serializeTo", tempOutFile.toAbsolutePath().toString()); props.setProperty("useClassFeature", "true"); props.setProperty("useWord", "true"); props.setProperty("useNGrams", "true"); props.setProperty("noMidNGrams", "true"); props.setProperty("useDisjunctive", "true"); props.setProperty("maxNGramLeng", "6"); props.setProperty("usePrev", "true"); props.setProperty("useNext", "true"); props.setProperty("useSequences", "true"); props.setProperty("usePrevSequences", "true"); props.setProperty("maxLeft", "1"); props.setProperty("useTypeSeqs", "true"); props.setProperty("useTypeSeqs2", "true"); props.setProperty("useTypeySequences", "true"); props.setProperty("wordShape", "chris2useLC"); CRFClassifier<CoreLabel> classifier = new CRFClassifier(props); classifier.train(documents, new ColumnDocumentReaderAndWriter()); classifier.serializeClassifier(tempOutFile.toAbsolutePath().toString()); }
From source file:org.jdmp.stanfordpos.StanfordTagger.java
License:Open Source License
/**
 * Creates a tagger by constructing a CRF with default flags and then
 * loading the serialized classifier from the given file.
 *
 * @param file serialized classifier model to load
 */
public StanfordTagger(File file) throws Exception {
    SeqClassifierFlags defaultFlags = new SeqClassifierFlags();
    crf = new CRFClassifier<CoreLabel>(defaultFlags);
    crf.loadClassifierNoExceptions(file);
}
From source file:org.jdmp.stanfordpos.StanfordTagger.java
License:Open Source License
/**
 * Trains the internal CRF on labeled data given as nested matrices:
 * outer entries are sentences, inner entries are token rows carrying a
 * "Token" string and a "Class" label.
 */
public void train(ListMatrix<ListMatrix<MapMatrix<String, String>>> listMatrix) throws Exception {
    // Convert the matrix structure into Stanford's sentence/token representation.
    List<List<CoreLabel>> sentences = new ArrayList<>();
    for (ListMatrix<MapMatrix<String, String>> sentence : listMatrix) {
        List<CoreLabel> tokens = new ArrayList<>();
        sentences.add(tokens);
        for (MapMatrix<String, String> tokenRow : sentence) {
            CoreLabel label = new CoreLabel();
            label.set(CoreAnnotations.TextAnnotation.class, tokenRow.getAsString("Token"));
            label.set(CoreAnnotations.AnswerAnnotation.class, tokenRow.getAsString("Class"));
            tokens.add(label);
        }
    }

    // Fixed feature configuration for training.
    SeqClassifierFlags flags = new SeqClassifierFlags();
    flags.maxLeft = 3;
    flags.useClassFeature = true;
    flags.useWord = true;
    flags.maxNGramLeng = 6;
    flags.usePrev = true;
    flags.useNext = true;
    flags.useDisjunctive = true;
    flags.useSequences = true;
    flags.usePrevSequences = true;
    flags.useTypeSeqs = true;
    flags.useTypeSeqs2 = true;
    flags.useTypeySequences = true;
    flags.wordShape = WordShapeClassifier.WORDSHAPECHRIS2;
    flags.useNGrams = true;

    crf = new CRFClassifier<>(flags);
    crf.train(sentences, null);
}
From source file:org.knime.ext.textprocessing.language.chinese.nodes.tokenization.tokenizer.word.StanfordNlpChineseTokenizer.java
License:Open Source License
/**
 * Creates a new tokenizer backed by a Stanford CRF segmenter, loading the
 * classifier model from {@code BASEDIR} with the shared {@code PROPERTIES}.
 */
public StanfordNlpChineseTokenizer() {
    m_tokenizer = new CRFClassifier<>(PROPERTIES);
    m_tokenizer.loadClassifierNoExceptions(BASEDIR + "/ctb.gz", PROPERTIES);
}
From source file:org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnelearner.StanfordNlpNeLearnerNodeModel.java
License:Open Source License
/**
 * {@inheritDoc}
 *
 * Tags the input documents with the dictionary entities, writes the tagged
 * sentences to a temporary TSV training file, trains a Stanford CRF model on
 * it, and returns the serialized model bytes as a port object.
 */
@Override
protected PortObject[] execute(final PortObject[] data, final ExecutionContext exec) throws Exception {
    // check input data
    assert ((data != null) && (data[0] != null));

    // get data table
    final BufferedDataTable docTable = (BufferedDataTable) data[0];

    // get dictionary as string and regex pattern
    final Set<String> knownEntitiesStringSet = getDictionary((BufferedDataTable) data[1]);
    final Set<Pattern> knownEntitiesPatternSet = knownEntitiesStringSet.stream()//
            .map(s -> Pattern.compile(s))//
            .collect(Collectors.toSet());

    // create tag for document tagger
    final Tag tag = new Tag(m_tagValueModel.getStringValue(), m_tagTypeModel.getStringValue());

    // create tagger based on known entities
    final MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet,
            tag, m_caseSensitivity.getBooleanValue(), m_tokenizer.getStringValue());

    // create UUID to add them to the file path to avoid cases..
    // .. where two instances of the node model used the same file path at the same time
    final String tempDir = KNIMEConstants.getKNIMETempDir() + "/";
    final String modelPath = tempDir + "oM-" + UUID.randomUUID().toString() + ".crf.ser.gz";
    final String annotatedDocPath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv";

    // create files based on sentence list and known entities
    final File annotatedDocFile = new File(annotatedDocPath);
    int rowCounter = 0;
    int missingValueCounter = 0;
    try (final PrintWriter sentenceFileWriter = new PrintWriter(annotatedDocFile, "UTF-8")) {
        final int colIndex = docTable.getDataTableSpec().findColumnIndex(m_docColumnModel.getStringValue());
        // tag documents and transform sentences to strings while tagged terms get stanfordnlp annotation
        for (final DataRow row : docTable) {
            // set progress bar; document preparation is the first half of the range
            rowCounter++;
            final double progress = (rowCounter / (double) docTable.size()) / (2.0);
            exec.setProgress(progress, "Preparing documents");
            // skip rows whose document cell is missing or not a DocumentValue
            if (!row.getCell(colIndex).isMissing()
                    && row.getCell(colIndex).getType().isCompatible(DocumentValue.class)) {
                final Document doc = ((DocumentValue) row.getCell(colIndex)).getDocument();
                final Document taggedDoc = tagger.tag(doc);
                taggedDoc.sentenceIterator().forEachRemaining(
                        s -> writeAnnotationData(s, knownEntitiesStringSet, sentenceFileWriter));
            } else {
                missingValueCounter++;
            }
        }
    }

    // train model
    exec.setProgress(0.75, "Learning model.");
    final Properties props = new StanfordNlpNeLearnerPropFileGenerator(annotatedDocPath,
            m_useClassFeature.getBooleanValue(), m_useWord.getBooleanValue(), m_useNGrams.getBooleanValue(),
            m_noMidNGrams.getBooleanValue(), m_maxNGramLeng.getIntValue(), m_usePrev.getBooleanValue(),
            m_useNext.getBooleanValue(), m_useDisjunctive.getBooleanValue(), m_useSequences.getBooleanValue(),
            m_usePrevSequences.getBooleanValue(), m_maxLeft.getIntValue(), m_useTypeSeqs.getBooleanValue(),
            m_useTypeSeqs2.getBooleanValue(), m_useTypeySequences.getBooleanValue(),
            m_wordShape.getStringValue()).getPropFile();
    final SeqClassifierFlags flags = new SeqClassifierFlags(props);
    final CRFClassifier<CoreLabel> crf = new CRFClassifier<>(flags);
    crf.train();
    crf.serializeClassifier(modelPath);

    // read the serialized model back into memory for the output port object
    final File outputModel = new File(modelPath);
    final byte[] modelOutputBuffer = Files.toByteArray(outputModel);

    // delete temporary files
    java.nio.file.Files.delete(outputModel.toPath());
    java.nio.file.Files.delete(annotatedDocFile.toPath());

    // set warning messages if necessary
    if (knownEntitiesPatternSet.isEmpty()) {
        setWarningMessage("Trained model on empty dictionary.");
    } else if (rowCounter == 0) {
        setWarningMessage("Node created an empty model.");
    } else if (missingValueCounter == 1) {
        setWarningMessage(missingValueCounter + " row has been ignored due to missing value.");
    } else if (missingValueCounter > 1) {
        setWarningMessage(missingValueCounter + " rows have been ignored due to missing values.");
    }

    return new PortObject[] { new StanfordNERModelPortObject(modelOutputBuffer, tag, knownEntitiesStringSet,
            m_tokenizer.getStringValue()) };
}