Example usage for edu.stanford.nlp.stats ClassicCounter ClassicCounter

List of usage examples for edu.stanford.nlp.stats ClassicCounter ClassicCounter

Introduction

In this page you can find the example usage for edu.stanford.nlp.stats ClassicCounter ClassicCounter.

Prototype

public ClassicCounter() 

Source Link

Document

Constructs a new (empty) Counter backed by a HashMap.

Usage

From source file:gr.aueb.cs.nlp.wordtagger.classifier.SVMWindows64Factory.java

License:Open Source License

/**
 * Reads in a model file in svm light format.  It needs to know if its multiclass or not
 * because it affects the number of header lines.  Maybe there is another way to tell and we
 * can remove this flag?/*from  w  w  w.  ja  v a2s .  c  o  m*/
 */
private static Pair<Double, ClassicCounter<Integer>> readModel(File modelFile, boolean multiclass) {
    int modelLineCount = 0;
    try {

        int numLinesToSkip = multiclass ? 13 : 10;
        String stopToken = "#";

        BufferedReader in = new BufferedReader(new FileReader(modelFile));

        for (int i = 0; i < numLinesToSkip; i++) {
            in.readLine();
            modelLineCount++;
        }

        List<Pair<Double, ClassicCounter<Integer>>> supportVectors = new ArrayList<Pair<Double, ClassicCounter<Integer>>>();
        // Read Threshold
        String thresholdLine = in.readLine();
        modelLineCount++;
        String[] pieces = thresholdLine.split("\\s+");
        double threshold = Double.parseDouble(pieces[0]);
        // Read Support Vectors
        while (in.ready()) {
            String svLine = in.readLine();
            modelLineCount++;
            pieces = svLine.split("\\s+");
            // First Element is the alpha_i * y_i
            double alpha = Double.parseDouble(pieces[0]);
            ClassicCounter<Integer> supportVector = new ClassicCounter<Integer>();
            for (int i = 1; i < pieces.length; ++i) {
                String piece = pieces[i];
                if (piece.equals(stopToken))
                    break;
                // Each in featureIndex:num class
                String[] indexNum = piece.split(":");
                String featureIndex = indexNum[0];
                // mihai: we may see "qid" as indexNum[0]. just skip this piece. this is the block id useful only for reranking, which we don't do here.
                if (!featureIndex.equals("qid")) {
                    double count = Double.parseDouble(indexNum[1]);
                    supportVector.incrementCount(Integer.valueOf(featureIndex), count);
                }
            }
            supportVectors.add(new Pair<Double, ClassicCounter<Integer>>(alpha, supportVector));
        }

        in.close();

        return new Pair<Double, ClassicCounter<Integer>>(threshold, getWeights(supportVectors));
    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Error reading SVM model (line " + modelLineCount + " in file "
                + modelFile.getAbsolutePath() + ")");
    }
}

From source file:gr.aueb.cs.nlp.wordtagger.classifier.SVMWindows64Factory.java

License:Open Source License

/**
 * Combines all support vectors, each scaled by its alpha, into a single weight
 * vector usable by a vanilla LinearClassifier.  This only works because a linear
 * kernel is used.  The resulting Counter is keyed by svm_light's 1-based feature
 * indices (not by the features themselves).
 */
private static ClassicCounter<Integer> getWeights(List<Pair<Double, ClassicCounter<Integer>>> supportVectors) {
    ClassicCounter<Integer> weights = new ClassicCounter<Integer>();
    for (Pair<Double, ClassicCounter<Integer>> supportVector : supportVectors) {
        // scale a copy of the support vector by its alpha, then accumulate it
        ClassicCounter<Integer> scaled = new ClassicCounter<Integer>(supportVector.second());
        Counters.multiplyInPlace(scaled, supportVector.first());
        Counters.addInPlace(weights, scaled);
    }
    return weights;
}

From source file:gr.aueb.cs.nlp.wordtagger.classifier.SVMWindows64Factory.java

License:Open Source License

/**
 * Converts the svm_light weight Counter (keyed by 1-based feature indices) into a
 * weight Counter over actual (feature, label) pairs.  Because this is svm_light,
 * and not svm_struct, the weights for the +1 class (labelIndex.get(0)) and the
 * -1 class (labelIndex.get(1)) are just the negation of one another.
 */
private ClassicCounter<Pair<F, L>> convertSVMLightWeights(ClassicCounter<Integer> weights,
        Index<F> featureIndex, Index<L> labelIndex) {
    ClassicCounter<Pair<F, L>> converted = new ClassicCounter<Pair<F, L>>();
    L positiveLabel = labelIndex.get(0); // the +1 class
    L negativeLabel = labelIndex.get(1); // the -1 class
    for (int index : weights.keySet()) {
        // svm_light feature indices are 1-based; our featureIndex is 0-based
        F feature = featureIndex.get(index - 1);
        double weight = weights.getCount(index);
        converted.incrementCount(new Pair<F, L>(feature, positiveLabel), weight);
        converted.incrementCount(new Pair<F, L>(feature, negativeLabel), -weight);
    }
    return converted;
}

From source file:gr.aueb.cs.nlp.wordtagger.classifier.SVMWindows64Factory.java

License:Open Source License

/**
 * Converts the svm_struct weight Counter — in which the index for a feature/label
 * pair is ((labelIndex * numFeatures) + (featureIndex + 1)) — into a weight
 * Counter keyed by the actual features and labels.
 */
private ClassicCounter<Pair<F, L>> convertSVMStructWeights(ClassicCounter<Integer> weights,
        Index<F> featureIndex, Index<L> labelIndex) {
    int numFeatures = featureIndex.size();
    ClassicCounter<Pair<F, L>> converted = new ClassicCounter<Pair<F, L>>();
    for (int index : weights.keySet()) {
        int zeroBased = index - 1;
        // integer division recovers the label block; the remainder is the feature slot
        L label = labelIndex.get(zeroBased / numFeatures);
        F feature = featureIndex.get(zeroBased % numFeatures);
        converted.incrementCount(new Pair<F, L>(feature, label), weights.getCount(index));
    }
    return converted;
}

From source file:gr.aueb.cs.nlp.wordtagger.classifier.SVMWindows64Factory.java

License:Open Source License

/**
 * Trains a classifier by shelling out to the external svm_light / svm_perf /
 * svm_struct learner binaries: the dataset is written in svm_light format to a
 * temp file, the learner is run on it, and the resulting model file is read back
 * and converted into a linear-weight SVMLightClassifier.
 *
 * @param dataset the training data; more than two classes selects svm_struct
 * @return the trained classifier (Platt-scaled if useSigmoid is set)
 */
public SVMLightClassifier<L, F> trainClassifierBasic(GeneralDataset<L, F> dataset) {
    Index<L> labelIndex = dataset.labelIndex();
    Index<F> featureIndex = dataset.featureIndex;
    // svm_light itself is binary-only; multiclass problems go through svm_struct
    boolean multiclass = (dataset.numClasses() > 2);
    try {

        // this is the file that the model will be saved to
        File modelFile = File.createTempFile("svm-", ".model");
        if (deleteTempFilesOnExit) {
            modelFile.deleteOnExit();
        }

        // this is the file that the svm light formated dataset
        // will be printed to
        File dataFile = File.createTempFile("svm-", ".data");
        if (deleteTempFilesOnExit) {
            dataFile.deleteOnExit();
        }

        // print the dataset
        PrintWriter pw = new PrintWriter(new FileWriter(dataFile));
        dataset.printSVMLightFormat(pw);
        pw.close();

        // -v 0 makes it not verbose
        // -m gives it a larger cache, for faster training
        String cmd = (multiclass ? svmStructLearn : (useSVMPerf ? svmPerfLearn : svmLightLearn)) + " -v "
                + svmLightVerbosity + " -m 5000 -w 3 -t 0 -g 7 ";

        // set the value of C if we have one specified
        if (C > 0.0)
            cmd = cmd + " -c " + C + " "; // C value
        else if (useSVMPerf)
            cmd = cmd + " -c " + 0.01 + " "; //It's required to specify this parameter for SVM perf

        // Alpha File
        if (useAlphaFile) {
            File newAlphaFile = File.createTempFile("svm-", ".alphas");
            if (deleteTempFilesOnExit) {
                newAlphaFile.deleteOnExit();
            }
            cmd = cmd + " -a " + newAlphaFile.getAbsolutePath();
            if (alphaFile != null) {
                // -y: feed the alphas saved by the previous run back in — presumably a warm start; confirm against svm_light docs
                cmd = cmd + " -y " + alphaFile.getAbsolutePath();
            }
            alphaFile = newAlphaFile;
        }

        // File and Model Data
        cmd = cmd + " " + dataFile.getAbsolutePath() + " " + modelFile.getAbsolutePath();

        if (verbose)
            System.err.println("<< " + cmd + " >>");

        // run the external learner; its stdout and stderr are forwarded to System.err
        SystemUtils.run(new ProcessBuilder(whitespacePattern.split(cmd)), new PrintWriter(System.err),
                new PrintWriter(System.err));

        if (doEval) {
            // optionally run the matching classify binary over the training data
            File predictFile = File.createTempFile("svm-", ".pred");
            if (deleteTempFilesOnExit) {
                predictFile.deleteOnExit();
            }
            String evalCmd = (multiclass ? svmStructClassify
                    : (useSVMPerf ? svmPerfClassify : svmLightClassify)) + " " + dataFile.getAbsolutePath()
                    + " " + modelFile.getAbsolutePath() + " " + predictFile.getAbsolutePath();
            if (verbose)
                System.err.println("<< " + evalCmd + " >>");
            SystemUtils.run(new ProcessBuilder(whitespacePattern.split(evalCmd)), new PrintWriter(System.err),
                    new PrintWriter(System.err));
        }
        // read in the model file
        Pair<Double, ClassicCounter<Integer>> weightsAndThresh = readModel(modelFile, multiclass);
        double threshold = weightsAndThresh.first();
        ClassicCounter<Pair<F, L>> weights = convertWeights(weightsAndThresh.second(), featureIndex, labelIndex,
                multiclass);
        ClassicCounter<L> thresholds = new ClassicCounter<L>();
        if (!multiclass) {
            // binary case: the two labels get opposite thresholds
            thresholds.setCount(labelIndex.get(0), -threshold);
            thresholds.setCount(labelIndex.get(1), threshold);
        }
        SVMLightClassifier<L, F> classifier = new SVMLightClassifier<L, F>(weights, thresholds);
        if (doEval) {
            // also score the training data with the freshly built classifier, for comparison
            File predictFile = File.createTempFile("svm-", ".pred2");
            if (deleteTempFilesOnExit) {
                predictFile.deleteOnExit();
            }
            PrintWriter pw2 = new PrintWriter(predictFile);
            NumberFormat nf = NumberFormat.getNumberInstance();
            nf.setMaximumFractionDigits(5);
            for (Datum<L, F> datum : dataset) {
                Counter<L> scores = classifier.scoresOf(datum);
                pw2.println(Counters.toString(scores, nf));
            }
            pw2.close();
        }

        if (useSigmoid) {
            if (verbose)
                System.out.print("fitting sigmoid...");
            classifier.setPlatt(fitSigmoid(classifier, dataset));
            if (verbose)
                System.out.println("done");
        }

        return classifier;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:gr.aueb.cs.nlp.wordtagger.data.structure.WordSet.java

License:Open Source License

/**
 * Converts any List with words to a Stanford set;
 * @param words/*from  w w w .j  a va  2 s.  c o  m*/
 * @return, a list of real valued datums
 */
public static List<RVFDatum<String, String>> toStanfordSet(List<Word> words) {
    List<RVFDatum<String, String>> trainignData = new ArrayList<>();
    for (Word w : words) {
        List<Double> feats = Arrays.asList(ArrayUtils.toObject(w.getFeatureVec().getValues()));
        ClassicCounter<String> cc = new ClassicCounter<>();
        for (int i = 0; i < feats.size(); i++) {
            cc.incrementCount("feature" + i, feats.get(i));
        }
        if (w.getCategory() != null) {
            RVFDatum<String, String> dtm = new RVFDatum<>(cc, w.getCategory());
            trainignData.add(dtm);
        }
    }
    System.out.println("Converted List to classifier trainset");
    return trainignData;
}

From source file:gr.aueb.cs.nlp.wordtagger.data.structure.WordSet.java

License:Open Source License

/**
 * Converts a Word to a Stanford real-valued datum.
 * @param w the word to convert
 * @return an RVFDatum whose features "feature0".."featureN" carry the word's
 *         feature-vector values, labelled with the word's category
 */
public static RVFDatum<String, String> word2Datum(Word w) {
    ClassicCounter<String> counter = new ClassicCounter<>();
    Double[] values = ArrayUtils.toObject(w.getFeatureVec().getValues());
    for (int i = 0; i < values.length; i++) {
        counter.incrementCount("feature" + i, values[i]);
    }
    return new RVFDatum<>(counter, w.getCategory());
}

From source file:knu.univ.lingvo.coref.ACEMentionExtractor.java

License:Open Source License

/**
 * Logs a raw rendition of the document with mention boundaries marked by square
 * brackets.  For gold mentions, closing brackets of non-singleton clusters are
 * annotated with the gold coref cluster id ("]_42").
 *
 * @param sentences   the document's sentences
 * @param allMentions mentions per sentence, parallel to {@code sentences}
 * @param filename    unused here; kept for interface compatibility
 * @param gold        true when printing gold mentions (enables cluster-id suffixes)
 * @throws FileNotFoundException declared for interface compatibility
 */
private static void printRawDoc(List<CoreMap> sentences, List<List<Mention>> allMentions, String filename,
        boolean gold) throws FileNotFoundException {
    StringBuilder doc = new StringBuilder();
    int previousOffset = 0;
    // count mentions per gold cluster: singleton clusters are printed without ids
    Counter<Integer> mentionCount = new ClassicCounter<Integer>();
    for (List<Mention> l : allMentions) {
        for (Mention m : l) {
            mentionCount.incrementCount(m.goldCorefClusterID);
        }
    }

    for (int i = 0; i < sentences.size(); i++) {
        CoreMap sentence = sentences.get(i);
        List<Mention> mentions = allMentions.get(i);

        String[] tokens = sentence.get(CoreAnnotations.TextAnnotation.class).split(" ");
        // build each sentence in a StringBuilder rather than O(n^2) String concatenation
        StringBuilder sent = new StringBuilder();
        List<CoreLabel> t = sentence.get(CoreAnnotations.TokensAnnotation.class);
        // a gap in character offsets between sentences indicates a paragraph break
        if (previousOffset + 2 < t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class))
            sent.append("\n");
        previousOffset = t.get(t.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        Counter<Integer> startCounts = new ClassicCounter<Integer>();
        Counter<Integer> endCounts = new ClassicCounter<Integer>();
        Map<Integer, Set<Integer>> endID = Generics.newHashMap();
        for (Mention m : mentions) {
            startCounts.incrementCount(m.startIndex);
            endCounts.incrementCount(m.endIndex);
            if (!endID.containsKey(m.endIndex))
                endID.put(m.endIndex, Generics.<Integer>newHashSet());
            endID.get(m.endIndex).add(m.goldCorefClusterID);
        }
        for (int j = 0; j < tokens.length; j++) {
            if (endID.containsKey(j)) {
                for (Integer id : endID.get(j)) {
                    // suffix the cluster id unless the cluster is a singleton or we're not in gold mode
                    if (mentionCount.getCount(id) != 1 && gold)
                        sent.append("]_").append(id);
                    else
                        sent.append("]");
                }
            }
            for (int k = 0; k < startCounts.getCount(j); k++) {
                // no space between consecutive opening brackets
                if (sent.length() == 0 || sent.charAt(sent.length() - 1) != '[')
                    sent.append(" ");
                sent.append("[");
            }
            sent.append(" ");
            sent.append(tokens[j]);
        }
        // close any mentions ending at the sentence boundary
        for (int k = 0; k < endCounts.getCount(tokens.length); k++) {
            sent.append("]");
        }
        sent.append("\n");
        doc.append(sent);
    }
    if (gold)
        logger.fine("New DOC: (GOLD MENTIONS) ==================================================");
    else
        logger.fine("New DOC: (Predicted Mentions) ==================================================");
    logger.fine(doc.toString());
}

From source file:knu.univ.lingvo.coref.SieveCoreferenceSystem.java

License:Open Source License

/**
 * Prints a coref link's full diagnostic context to the logger: for both the source
 * mention and its antecedent it logs the mention's attributes, the sentence with
 * the mention bracketed, the gold mentions in that sentence, the dependency graph,
 * and the parse tree.
 *
 * @param logger    destination for all output (fine/finer levels)
 * @param header    label passed through to printLink
 * @param src       (sentence, mention) position of the source mention
 * @param dst       (sentence, mention) position of the antecedent
 * @param document  the document holding mentions and annotations
 * @param semantics unused here; kept for interface compatibility
 */
private static void printLinkWithContext(Logger logger, String header, IntTuple src, IntTuple dst,
        Document document, Semantics semantics) {
    List<List<Mention>> orderedMentionsBySentence = document.getOrderedMentions();
    List<List<Mention>> goldOrderedMentionsBySentence = document.goldOrderedMentionsBySentence;

    Mention srcMention = orderedMentionsBySentence.get(src.get(0)).get(src.get(1));
    Mention dstMention = orderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
    List<CoreLabel> srcSentence = srcMention.sentenceWords;
    List<CoreLabel> dstSentence = dstMention.sentenceWords;

    printLink(logger, header, src, dst, orderedMentionsBySentence);

    printList(logger, "Mention:" + srcMention.spanToString(), "Gender:" + srcMention.gender.toString(),
            "Number:" + srcMention.number.toString(), "Animacy:" + srcMention.animacy.toString(),
            "Person:" + srcMention.person.toString(), "NER:" + srcMention.nerString,
            "Head:" + srcMention.headString, "Type:" + srcMention.mentionType.toString(),
            "utter: " + srcMention.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
            "speakerID: " + srcMention.headWord.get(CoreAnnotations.SpeakerAnnotation.class),
            "twinless:" + srcMention.twinless);
    logger.fine("Context:");

    // render the source sentence with the source mention bracketed
    String p = "";
    for (int i = 0; i < srcSentence.size(); i++) {
        if (i == srcMention.startIndex) {
            p += "[";
        }
        if (i == srcMention.endIndex) {
            p += "]";
        }
        p += srcSentence.get(i).word() + " ";
    }
    logger.fine(p);

    StringBuilder golds = new StringBuilder();
    golds.append("Gold mentions in the sentence:\n");
    // bracket counts per token index for the gold mentions of the source sentence
    Counter<Integer> mBegin = new ClassicCounter<Integer>();
    Counter<Integer> mEnd = new ClassicCounter<Integer>();

    for (Mention m : goldOrderedMentionsBySentence.get(src.get(0))) {
        mBegin.incrementCount(m.startIndex);
        mEnd.incrementCount(m.endIndex);
    }
    List<CoreLabel> l = document.annotation.get(CoreAnnotations.SentencesAnnotation.class).get(src.get(0))
            .get(CoreAnnotations.TokensAnnotation.class);
    for (int i = 0; i < l.size(); i++) {
        // emit closing brackets before opening ones at the same token position
        for (int j = 0; j < mEnd.getCount(i); j++) {
            golds.append("]");
        }
        for (int j = 0; j < mBegin.getCount(i); j++) {
            golds.append("[");
        }
        golds.append(l.get(i).get(CoreAnnotations.TextAnnotation.class));
        golds.append(" ");
    }
    logger.fine(golds.toString());

    // same treatment for the antecedent mention and its sentence
    printList(logger, "\nAntecedent:" + dstMention.spanToString(), "Gender:" + dstMention.gender.toString(),
            "Number:" + dstMention.number.toString(), "Animacy:" + dstMention.animacy.toString(),
            "Person:" + dstMention.person.toString(), "NER:" + dstMention.nerString,
            "Head:" + dstMention.headString, "Type:" + dstMention.mentionType.toString(),
            "utter: " + dstMention.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
            "speakerID: " + dstMention.headWord.get(CoreAnnotations.SpeakerAnnotation.class),
            "twinless:" + dstMention.twinless);
    logger.fine("Context:");

    p = "";
    for (int i = 0; i < dstSentence.size(); i++) {
        if (i == dstMention.startIndex) {
            p += "[";
        }
        if (i == dstMention.endIndex) {
            p += "]";
        }
        p += dstSentence.get(i).word() + " ";
    }
    logger.fine(p);

    golds = new StringBuilder();
    golds.append("Gold mentions in the sentence:\n");
    mBegin = new ClassicCounter<Integer>();
    mEnd = new ClassicCounter<Integer>();

    for (Mention m : goldOrderedMentionsBySentence.get(dst.get(0))) {
        mBegin.incrementCount(m.startIndex);
        mEnd.incrementCount(m.endIndex);
    }
    l = document.annotation.get(CoreAnnotations.SentencesAnnotation.class).get(dst.get(0))
            .get(CoreAnnotations.TokensAnnotation.class);
    for (int i = 0; i < l.size(); i++) {
        for (int j = 0; j < mEnd.getCount(i); j++) {
            golds.append("]");
        }
        for (int j = 0; j < mBegin.getCount(i); j++) {
            golds.append("[");
        }
        golds.append(l.get(i).get(CoreAnnotations.TextAnnotation.class));
        golds.append(" ");
    }
    logger.fine(golds.toString());

    logger.finer("\nMention:: --------------------------------------------------------");
    try {
        logger.finer(srcMention.dependency.toString());
    } catch (Exception e) {
        // best-effort: dependency may be unavailable for some mentions; skip logging it
    }
    logger.finer("Parse:");
    logger.finer(formatPennTree(srcMention.contextParseTree));
    logger.finer("\nAntecedent:: -----------------------------------------------------");
    try {
        logger.finer(dstMention.dependency.toString());
    } catch (Exception e) {
        // best-effort: same as above for the antecedent
    }
    logger.finer("Parse:");
    logger.finer(formatPennTree(dstMention.contextParseTree));
}

From source file:knu.univ.lingvo.coref.SieveCoreferenceSystem.java

License:Open Source License

/**
 * Print raw document for analysis: logs every sentence with mention boundaries
 * bracketed and each closing bracket suffixed with its coref chain id ("]_42").
 *
 * @param document the document whose mentions are printed
 * @param gold     true to print gold mentions (and gold cluster ids),
 *                 false to print predicted mentions
 * @throws FileNotFoundException declared for interface compatibility
 */
public static void printRawDoc(Document document, boolean gold) throws FileNotFoundException {
    List<CoreMap> sentences = document.annotation.get(CoreAnnotations.SentencesAnnotation.class);
    List<List<Mention>> allMentions;
    if (gold) {
        allMentions = document.goldOrderedMentionsBySentence;
    } else {
        allMentions = document.predictedOrderedMentionsBySentence;
    }
    //    String filename = document.annotation.get()

    StringBuilder doc = new StringBuilder();
    int previousOffset = 0;

    for (int i = 0; i < sentences.size(); i++) {
        CoreMap sentence = sentences.get(i);
        List<Mention> mentions = allMentions.get(i);

        List<CoreLabel> t = sentence.get(CoreAnnotations.TokensAnnotation.class);
        // CoreLabel indices are 1-based; rebuild a 0-based token array
        String[] tokens = new String[t.size()];
        for (CoreLabel c : t) {
            tokens[c.index() - 1] = c.word();
        }
        // a gap in character offsets between sentences indicates a paragraph break
        if (previousOffset + 2 < t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
            doc.append("\n");
        }
        previousOffset = t.get(t.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        // bracket counts per token index, and which mentions end at each index
        Counter<Integer> startCounts = new ClassicCounter<Integer>();
        Counter<Integer> endCounts = new ClassicCounter<Integer>();
        Map<Integer, Set<Mention>> endMentions = Generics.newHashMap();
        for (Mention m : mentions) {
            startCounts.incrementCount(m.startIndex);
            endCounts.incrementCount(m.endIndex);
            if (!endMentions.containsKey(m.endIndex)) {
                endMentions.put(m.endIndex, Generics.<Mention>newHashSet());
            }
            endMentions.get(m.endIndex).add(m);
        }
        for (int j = 0; j < tokens.length; j++) {
            // close mentions ending before token j, tagging each with its chain id
            if (endMentions.containsKey(j)) {
                for (Mention m : endMentions.get(j)) {
                    int corefChainId = (gold) ? m.goldCorefClusterID : m.corefClusterID;
                    doc.append("]_").append(corefChainId);
                }
            }
            for (int k = 0; k < startCounts.getCount(j); k++) {
                // no space between consecutive opening brackets
                if (doc.length() > 0 && doc.charAt(doc.length() - 1) != '[') {
                    doc.append(" ");
                }
                doc.append("[");
            }
            if (doc.length() > 0 && doc.charAt(doc.length() - 1) != '[') {
                doc.append(" ");
            }
            doc.append(tokens[j]);
        }
        // close any mentions that end at the sentence boundary
        if (endMentions.containsKey(tokens.length)) {
            for (Mention m : endMentions.get(tokens.length)) {
                int corefChainId = (gold) ? m.goldCorefClusterID : m.corefClusterID;
                doc.append("]_").append(corefChainId); //append("_").append(m.mentionID);
            }
        }

        doc.append("\n");
    }
    logger.fine(document.annotation.get(CoreAnnotations.DocIDAnnotation.class));
    if (gold) {
        logger.fine("New DOC: (GOLD MENTIONS) ==================================================");
    } else {
        logger.fine("New DOC: (Predicted Mentions) ==================================================");
    }
    logger.fine(doc.toString());
}