List of usage examples for edu.stanford.nlp.ie.AbstractSequenceClassifier#makeObjectBankFromFile
public ObjectBank<List<IN>> makeObjectBankFromFile(String filename,
DocumentReaderAndWriter<IN> readerAndWriter)
From source file: lv.lumii.morphotagger.MorphoCRF.java
License:Open Source License
/** * @param args/*from w ww . j a v a 2s.c o m*/ * @throws IOException * @throws ClassNotFoundException * @throws ClassCastException */ public static void main(String[] args) throws IOException, ClassCastException, ClassNotFoundException { String trainfile = "MorphoCRF/train_dev.txt"; String testfile = "MorphoCRF/test.txt"; boolean train = false; for (int i = 0; i < args.length; i++) { if (args[i].equalsIgnoreCase("-train")) { train = true; } if (args[i].equalsIgnoreCase("-dev")) { trainfile = "MorphoCRF/train.txt"; testfile = "MorphoCRF/dev.txt"; } if (args[i].equalsIgnoreCase("-production")) { trainfile = "MorphoCRF/all.txt"; testfile = "MorphoCRF/test.txt"; } } String pretrainedModel = "models/lv-morpho-model.ser.gz"; String classifierOutput = "MorphoCRF/lv-morpho-model.ser.gz"; //Properties props = StringUtils.propFileToProperties("/Users/pet/Documents/java/PaikensNER/MorfoCRF/lv-PP.prop"); Properties props = new Properties(); props.setProperty("useLVMorphoAnalyzer", "true"); props.setProperty("LVMorphoAnalyzerTag", AttributeNames.i_PartOfSpeech); //props.setProperty("LVMorphoAnalyzerTag", AttributeNames.i_Case); props.setProperty("useLVMorphoAnalyzerPOS", "true"); props.setProperty("useLVMorphoAnalyzerTag", "true"); props.setProperty("useLVMorphoAnalyzerPrev", "true"); props.setProperty("useLVMorphoAnalyzerNext", "true"); props.setProperty("useLVMorphoAnalyzerItemIDs", "true"); props.setProperty("saveFeatureIndexToDisk", "true"); props.setProperty("maxLeft", "1"); props.setProperty("useWord", "true"); //props.setProperty("use2W", "true"); //props.setProperty("usePrevSequences", "true"); //props.setProperty("useClassFeature", "true"); //props.setProperty("useTypeSeqs2", "true"); //props.setProperty("useSequences", "true"); props.setProperty("wordShape", "dan2useLC"); //props.setProperty("useTypeySequences", "true"); //props.setProperty("useDisjunctive", "true"); props.setProperty("noMidNGrams", "true"); props.setProperty("maxNGramLeng", "6"); 
props.setProperty("useNGrams", "true"); //props.setProperty("usePrev", "true"); //props.setProperty("useNext", "true"); //props.setProperty("useTypeSeqs", "true"); props.setProperty("readerAndWriter", "edu.stanford.nlp.sequences.LVMorphologyReaderAndWriter"); props.setProperty("map", "word=0,answer=1,lemma=2"); AbstractSequenceClassifier<CoreLabel> crf = new CMMClassifier<CoreLabel>(props); DocumentReaderAndWriter reader = crf.makeReaderAndWriter(); if (train) { ObjectBank<List<CoreLabel>> documents = crf.makeObjectBankFromFile(trainfile, reader); crf.train(documents, reader); //atbilstoi props datiem crf.serializeClassifier(classifierOutput); } else { crf = CMMClassifier.getClassifier(pretrainedModel); } testData(crf, testfile, reader); }
From source file: lv.lumii.morphotagger.MorphoCRF.java
License:Open Source License
private static void testData(AbstractSequenceClassifier<CoreLabel> crf, String filename, DocumentReaderAndWriter<CoreLabel> reader) { try {// ww w . j a va 2 s . co m PrintWriter izeja = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8")); ObjectBank<List<CoreLabel>> documents = crf.makeObjectBankFromFile(filename, reader); int correct_tag = 0; int correct_lemma = 0; int correct_all = 0; int total = 0; Collection<AttributeValues> errors = new LinkedList<AttributeValues>(); for (List<CoreLabel> document : documents) { List<CoreLabel> out = crf.classify(document); System.out.println("-----"); for (CoreLabel word : out) { String token = word.word(); if (token.contains("<s>") || token.contains("</s>")) continue; String answer = word.get(AnswerAnnotation.class); Word analysis = word.get(LVMorphologyAnalysis.class); Wordform maxwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false); //complain about potential lemma errors String lemma = maxwf.getValue(AttributeNames.i_Lemma); String gold_tag = word.get(GoldAnswerAnnotation.class); String gold_lemma = word.get(LemmaAnnotation.class); // The lemma that's written in the test data AttributeValues gold_tags = MarkupConverter.fromKamolsMarkup(gold_tag); AttributeValues found_tags = MarkupConverter.fromKamolsMarkup(answer); errors.add(compareAVs(gold_tags, found_tags)); total++; if (gold_lemma == null || gold_lemma.equalsIgnoreCase(lemma)) correct_lemma++; else { //System.out.println(String.format("word: %s, tag:%s, gold_lemma: '%s', lemma: '%s'", token, answer, gold_lemma, lemma)); } if (match(gold_tags, found_tags)) { correct_tag++; if (gold_lemma == null) System.out.println("Nav lemmas? 
" + token); if (gold_lemma != null && gold_lemma.equalsIgnoreCase(lemma)) correct_all++; } else { System.out.println( "v?rds: " + token + ", pareizais: " + gold_tag + ", autom?tiskais: " + answer); //compareAVs(pareizie, atrastie).describe(new PrintWriter(System.out)); } } } izeja.printf("\nEvaluation results:\n"); izeja.printf("\tCorrect tag:\t%4.1f%%\t%d\n", correct_tag * 100.0 / total, total - correct_tag); izeja.printf("\tCorrect lemma:\t%4.1f%%\t%d\n", correct_lemma * 100.0 / total, total - correct_lemma); izeja.printf("\tCorrect all:\t%4.1f%%\t%d\n", correct_all * 100.0 / total, total - correct_all); summarizeErrors(errors, izeja); izeja.flush(); } catch (IOException e) { e.printStackTrace(); } }