Example usage for edu.stanford.nlp.ie.AbstractSequenceClassifier.makeObjectBankFromFile

List of usage examples for edu.stanford.nlp.ie.AbstractSequenceClassifier.makeObjectBankFromFile

Introduction

On this page you can find example usages of edu.stanford.nlp.ie.AbstractSequenceClassifier.makeObjectBankFromFile.

Prototype

public ObjectBank<List<IN>> makeObjectBankFromFile(String filename,
            DocumentReaderAndWriter<IN> readerAndWriter) 

Source Link

Usage

From source file:lv.lumii.morphotagger.MorphoCRF.java

License:Open Source License

/**
 * @param args/*from w  ww  .  j  a v  a 2s.c o m*/
 * @throws IOException 
 * @throws ClassNotFoundException 
 * @throws ClassCastException 
 */
public static void main(String[] args) throws IOException, ClassCastException, ClassNotFoundException {
    String trainfile = "MorphoCRF/train_dev.txt";
    String testfile = "MorphoCRF/test.txt";

    boolean train = false;
    for (int i = 0; i < args.length; i++) {
        if (args[i].equalsIgnoreCase("-train")) {
            train = true;
        }
        if (args[i].equalsIgnoreCase("-dev")) {
            trainfile = "MorphoCRF/train.txt";
            testfile = "MorphoCRF/dev.txt";
        }
        if (args[i].equalsIgnoreCase("-production")) {
            trainfile = "MorphoCRF/all.txt";
            testfile = "MorphoCRF/test.txt";
        }
    }

    String pretrainedModel = "models/lv-morpho-model.ser.gz";
    String classifierOutput = "MorphoCRF/lv-morpho-model.ser.gz";

    //Properties props = StringUtils.propFileToProperties("/Users/pet/Documents/java/PaikensNER/MorfoCRF/lv-PP.prop");
    Properties props = new Properties();

    props.setProperty("useLVMorphoAnalyzer", "true");
    props.setProperty("LVMorphoAnalyzerTag", AttributeNames.i_PartOfSpeech);
    //props.setProperty("LVMorphoAnalyzerTag", AttributeNames.i_Case);
    props.setProperty("useLVMorphoAnalyzerPOS", "true");
    props.setProperty("useLVMorphoAnalyzerTag", "true");
    props.setProperty("useLVMorphoAnalyzerPrev", "true");
    props.setProperty("useLVMorphoAnalyzerNext", "true");
    props.setProperty("useLVMorphoAnalyzerItemIDs", "true");

    props.setProperty("saveFeatureIndexToDisk", "true");
    props.setProperty("maxLeft", "1");

    props.setProperty("useWord", "true");
    //props.setProperty("use2W", "true");
    //props.setProperty("usePrevSequences", "true");
    //props.setProperty("useClassFeature", "true");
    //props.setProperty("useTypeSeqs2", "true");
    //props.setProperty("useSequences", "true");
    props.setProperty("wordShape", "dan2useLC");
    //props.setProperty("useTypeySequences", "true");
    //props.setProperty("useDisjunctive", "true");      
    props.setProperty("noMidNGrams", "true");
    props.setProperty("maxNGramLeng", "6");
    props.setProperty("useNGrams", "true");
    //props.setProperty("usePrev", "true");
    //props.setProperty("useNext", "true");
    //props.setProperty("useTypeSeqs", "true");

    props.setProperty("readerAndWriter", "edu.stanford.nlp.sequences.LVMorphologyReaderAndWriter");
    props.setProperty("map", "word=0,answer=1,lemma=2");

    AbstractSequenceClassifier<CoreLabel> crf = new CMMClassifier<CoreLabel>(props);
    DocumentReaderAndWriter reader = crf.makeReaderAndWriter();
    if (train) {
        ObjectBank<List<CoreLabel>> documents = crf.makeObjectBankFromFile(trainfile, reader);
        crf.train(documents, reader); //atbilstoi props datiem

        crf.serializeClassifier(classifierOutput);
    } else {
        crf = CMMClassifier.getClassifier(pretrainedModel);
    }

    testData(crf, testfile, reader);
}

From source file:lv.lumii.morphotagger.MorphoCRF.java

License:Open Source License

/**
 * Tags every document read from {@code filename} with {@code crf}, compares the predicted tag
 * and lemma of each token against the gold values, and prints per-token mismatches followed
 * by an accuracy summary to standard output.
 *
 * <p>All output goes through a single UTF-8 writer so that non-ASCII tokens and the summary
 * share one encoding. Tokens containing {@code <s>} or {@code </s>} are skipped. A token
 * without a gold lemma counts as a correct lemma, but not towards "correct all".
 *
 * @param crf      classifier used to tag the documents
 * @param filename path of the annotated test file
 * @param reader   reader used to parse {@code filename} into documents
 */
private static void testData(AbstractSequenceClassifier<CoreLabel> crf, String filename,
        DocumentReaderAndWriter<CoreLabel> reader) {
    try {
        // Auto-flushing, so diagnostics appear as soon as they are produced (as they did with
        // System.out.println). Deliberately never closed: closing would close System.out.
        PrintWriter report = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true);

        ObjectBank<List<CoreLabel>> documents = crf.makeObjectBankFromFile(filename, reader);

        int correctTag = 0;
        int correctLemma = 0;
        int correctAll = 0;
        int total = 0;
        Collection<AttributeValues> errors = new LinkedList<AttributeValues>();

        for (List<CoreLabel> document : documents) {
            List<CoreLabel> tagged = crf.classify(document);

            report.println("-----");
            for (CoreLabel word : tagged) {
                String token = word.word();
                if (token.contains("<s>") || token.contains("</s>")) {
                    continue; // sentence boundary markers are not evaluated
                }

                String answer = word.get(AnswerAnnotation.class);
                Word analysis = word.get(LVMorphologyAnalysis.class);
                // 'false' as in the original call: complain about potential lemma errors
                Wordform maxwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
                String lemma = maxwf.getValue(AttributeNames.i_Lemma);

                String goldTag = word.get(GoldAnswerAnnotation.class);
                String goldLemma = word.get(LemmaAnnotation.class); // the lemma written in the test data

                AttributeValues goldTags = MarkupConverter.fromKamolsMarkup(goldTag);
                AttributeValues foundTags = MarkupConverter.fromKamolsMarkup(answer);
                errors.add(compareAVs(goldTags, foundTags));

                total++;

                boolean lemmaMatches = goldLemma != null && goldLemma.equalsIgnoreCase(lemma);
                if (goldLemma == null || lemmaMatches) {
                    correctLemma++;
                }

                if (match(goldTags, foundTags)) {
                    correctTag++;
                    if (goldLemma == null) {
                        report.println("Nav lemmas? " + token);
                    }
                    if (lemmaMatches) {
                        correctAll++;
                    }
                } else {
                    // Unicode escapes keep the Latvian letters intact regardless of the
                    // encoding this source file is stored or compiled in.
                    report.println(
                            "v\u0101rds: " + token + ", pareizais: " + goldTag + ", autom\u0101tiskais: " + answer);
                }
            }
        }

        report.printf("\nEvaluation results:\n");
        if (total == 0) {
            // Avoid printing NaN percentages when the test file yielded no tokens.
            report.printf("\tNo tokens were evaluated.\n");
        } else {
            report.printf("\tCorrect tag:\t%4.1f%%\t%d\n", correctTag * 100.0 / total, total - correctTag);
            report.printf("\tCorrect lemma:\t%4.1f%%\t%d\n", correctLemma * 100.0 / total, total - correctLemma);
            report.printf("\tCorrect all:\t%4.1f%%\t%d\n", correctAll * 100.0 / total, total - correctAll);
        }
        summarizeErrors(errors, report);
        report.flush();
    } catch (IOException e) {
        e.printStackTrace();
    }
}