Usage examples for the edu.stanford.nlp.stats.ClassicCounter no-argument constructor.
Signature: public ClassicCounter()
From source file:gr.aueb.cs.nlp.wordtagger.classifier.SVMWindows64Factory.java
License:Open Source License
/** * Reads in a model file in svm light format. It needs to know if its multiclass or not * because it affects the number of header lines. Maybe there is another way to tell and we * can remove this flag?/*from w w w. ja v a2s . c o m*/ */ private static Pair<Double, ClassicCounter<Integer>> readModel(File modelFile, boolean multiclass) { int modelLineCount = 0; try { int numLinesToSkip = multiclass ? 13 : 10; String stopToken = "#"; BufferedReader in = new BufferedReader(new FileReader(modelFile)); for (int i = 0; i < numLinesToSkip; i++) { in.readLine(); modelLineCount++; } List<Pair<Double, ClassicCounter<Integer>>> supportVectors = new ArrayList<Pair<Double, ClassicCounter<Integer>>>(); // Read Threshold String thresholdLine = in.readLine(); modelLineCount++; String[] pieces = thresholdLine.split("\\s+"); double threshold = Double.parseDouble(pieces[0]); // Read Support Vectors while (in.ready()) { String svLine = in.readLine(); modelLineCount++; pieces = svLine.split("\\s+"); // First Element is the alpha_i * y_i double alpha = Double.parseDouble(pieces[0]); ClassicCounter<Integer> supportVector = new ClassicCounter<Integer>(); for (int i = 1; i < pieces.length; ++i) { String piece = pieces[i]; if (piece.equals(stopToken)) break; // Each in featureIndex:num class String[] indexNum = piece.split(":"); String featureIndex = indexNum[0]; // mihai: we may see "qid" as indexNum[0]. just skip this piece. this is the block id useful only for reranking, which we don't do here. 
if (!featureIndex.equals("qid")) { double count = Double.parseDouble(indexNum[1]); supportVector.incrementCount(Integer.valueOf(featureIndex), count); } } supportVectors.add(new Pair<Double, ClassicCounter<Integer>>(alpha, supportVector)); } in.close(); return new Pair<Double, ClassicCounter<Integer>>(threshold, getWeights(supportVectors)); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException("Error reading SVM model (line " + modelLineCount + " in file " + modelFile.getAbsolutePath() + ")"); } }
From source file:gr.aueb.cs.nlp.wordtagger.classifier.SVMWindows64Factory.java
License:Open Source License
/** * Takes all the support vectors, and their corresponding alphas, and computes a weight * vector that can be used in a vanilla LinearClassifier. This only works because * we are using a linear kernel. The Counter is over the feature indices (+1 cos for * some reason svm_light is 1-indexed), not features. *///www.ja va 2 s .com private static ClassicCounter<Integer> getWeights(List<Pair<Double, ClassicCounter<Integer>>> supportVectors) { ClassicCounter<Integer> weights = new ClassicCounter<Integer>(); for (Pair<Double, ClassicCounter<Integer>> sv : supportVectors) { ClassicCounter<Integer> c = new ClassicCounter<Integer>(sv.second()); Counters.multiplyInPlace(c, sv.first()); Counters.addInPlace(weights, c); } return weights; }
From source file:gr.aueb.cs.nlp.wordtagger.classifier.SVMWindows64Factory.java
License:Open Source License
/** * Converts the svm_light weight Counter (which uses feature indices) into a weight Counter * using the actual features and labels. Because this is svm_light, and not svm_struct, the * weights for the +1 class (which correspond to labelIndex.get(0)) and the -1 class * (which correspond to labelIndex.get(1)) are just the negation of one another. */// w w w . j a va2 s . co m private ClassicCounter<Pair<F, L>> convertSVMLightWeights(ClassicCounter<Integer> weights, Index<F> featureIndex, Index<L> labelIndex) { ClassicCounter<Pair<F, L>> newWeights = new ClassicCounter<Pair<F, L>>(); for (int i : weights.keySet()) { F f = featureIndex.get(i - 1); double w = weights.getCount(i); // the first guy in the labelIndex was the +1 class and the second guy // was the -1 class newWeights.incrementCount(new Pair<F, L>(f, labelIndex.get(0)), w); newWeights.incrementCount(new Pair<F, L>(f, labelIndex.get(1)), -w); } return newWeights; }
From source file:gr.aueb.cs.nlp.wordtagger.classifier.SVMWindows64Factory.java
License:Open Source License
/** * Converts the svm_struct weight Counter (in which the weight for a feature/label pair * correspondes to ((labelIndex * numFeatures)+(featureIndex+1))) into a weight Counter * using the actual features and labels. *///from w w w. j a v a 2s. c o m private ClassicCounter<Pair<F, L>> convertSVMStructWeights(ClassicCounter<Integer> weights, Index<F> featureIndex, Index<L> labelIndex) { // int numLabels = labelIndex.size(); int numFeatures = featureIndex.size(); ClassicCounter<Pair<F, L>> newWeights = new ClassicCounter<Pair<F, L>>(); for (int i : weights.keySet()) { L l = labelIndex.get((i - 1) / numFeatures); // integer division on purpose F f = featureIndex.get((i - 1) % numFeatures); double w = weights.getCount(i); newWeights.incrementCount(new Pair<F, L>(f, l), w); } return newWeights; }
From source file:gr.aueb.cs.nlp.wordtagger.classifier.SVMWindows64Factory.java
License:Open Source License
/**
 * Trains a classifier by writing the dataset in svm_light format to a temp file,
 * shelling out to the appropriate command-line learner (svm_struct for multiclass,
 * otherwise svm_perf or svm_light depending on {@code useSVMPerf}), then reading
 * the resulting model file back into an {@link SVMLightClassifier}.
 *
 * @param dataset the training data; its label/feature indices are used to decode the model
 * @return the trained classifier (with Platt scaling fitted if {@code useSigmoid} is set)
 * @throws RuntimeException wrapping any I/O or subprocess failure
 */
public SVMLightClassifier<L, F> trainClassifierBasic(GeneralDataset<L, F> dataset) {
    Index<L> labelIndex = dataset.labelIndex();
    Index<F> featureIndex = dataset.featureIndex;
    // more than 2 classes forces the svm_struct code path
    boolean multiclass = (dataset.numClasses() > 2);
    try {
        // this is the file that the model will be saved to
        File modelFile = File.createTempFile("svm-", ".model");
        if (deleteTempFilesOnExit) {
            modelFile.deleteOnExit();
        }
        // this is the file that the svm light formated dataset will be printed to
        File dataFile = File.createTempFile("svm-", ".data");
        if (deleteTempFilesOnExit) {
            dataFile.deleteOnExit();
        }
        // print the dataset
        PrintWriter pw = new PrintWriter(new FileWriter(dataFile));
        dataset.printSVMLightFormat(pw);
        pw.close();
        // -v controls verbosity; -m gives a larger cache for faster training.
        // NOTE(review): -w/-t/-g look like kernel/window options hard-coded here — confirm
        // against the svm_light docs before changing them.
        String cmd = (multiclass ? svmStructLearn : (useSVMPerf ? svmPerfLearn : svmLightLearn)) + " -v "
                + svmLightVerbosity + " -m 5000 -w 3 -t 0 -g 7 ";
        // set the value of C if we have one specified
        if (C > 0.0)
            cmd = cmd + " -c " + C + " "; // C value
        else if (useSVMPerf)
            cmd = cmd + " -c " + 0.01 + " "; // it's required to specify this parameter for SVM perf
        // Alpha file: dump alphas to a fresh temp file, optionally seeding from a previous one
        if (useAlphaFile) {
            File newAlphaFile = File.createTempFile("svm-", ".alphas");
            if (deleteTempFilesOnExit) {
                newAlphaFile.deleteOnExit();
            }
            cmd = cmd + " -a " + newAlphaFile.getAbsolutePath();
            if (alphaFile != null) {
                cmd = cmd + " -y " + alphaFile.getAbsolutePath();
            }
            alphaFile = newAlphaFile;
        }
        // finally: input data file and output model file
        cmd = cmd + " " + dataFile.getAbsolutePath() + " " + modelFile.getAbsolutePath();
        if (verbose)
            System.err.println("<< " + cmd + " >>");
        /*Process p = Runtime.getRuntime().exec(cmd);
        p.waitFor();
        if (p.exitValue() != 0)
            throw new RuntimeException("Error Training SVM Light exit value: " + p.exitValue());
        p.destroy();
        */
        // run the learner, forwarding its stdout/stderr to our stderr
        SystemUtils.run(new ProcessBuilder(whitespacePattern.split(cmd)), new PrintWriter(System.err),
                new PrintWriter(System.err));
        if (doEval) {
            // optionally run the matching classify binary on the training data itself
            File predictFile = File.createTempFile("svm-", ".pred");
            if (deleteTempFilesOnExit) {
                predictFile.deleteOnExit();
            }
            String evalCmd = (multiclass ? svmStructClassify : (useSVMPerf ? svmPerfClassify : svmLightClassify))
                    + " " + dataFile.getAbsolutePath() + " " + modelFile.getAbsolutePath() + " "
                    + predictFile.getAbsolutePath();
            if (verbose)
                System.err.println("<< " + evalCmd + " >>");
            SystemUtils.run(new ProcessBuilder(whitespacePattern.split(evalCmd)), new PrintWriter(System.err),
                    new PrintWriter(System.err));
        }
        // read in the model file and decode it against our feature/label indices
        Pair<Double, ClassicCounter<Integer>> weightsAndThresh = readModel(modelFile, multiclass);
        double threshold = weightsAndThresh.first();
        ClassicCounter<Pair<F, L>> weights = convertWeights(weightsAndThresh.second(), featureIndex, labelIndex,
                multiclass);
        ClassicCounter<L> thresholds = new ClassicCounter<L>();
        if (!multiclass) {
            // binary case: symmetric thresholds for the +1 class (index 0) and -1 class (index 1)
            thresholds.setCount(labelIndex.get(0), -threshold);
            thresholds.setCount(labelIndex.get(1), threshold);
        }
        SVMLightClassifier<L, F> classifier = new SVMLightClassifier<L, F>(weights, thresholds);
        if (doEval) {
            // second eval pass: score the training data with the reconstructed classifier
            File predictFile = File.createTempFile("svm-", ".pred2");
            if (deleteTempFilesOnExit) {
                predictFile.deleteOnExit();
            }
            PrintWriter pw2 = new PrintWriter(predictFile);
            NumberFormat nf = NumberFormat.getNumberInstance();
            nf.setMaximumFractionDigits(5);
            for (Datum<L, F> datum : dataset) {
                Counter<L> scores = classifier.scoresOf(datum);
                pw2.println(Counters.toString(scores, nf));
            }
            pw2.close();
        }
        if (useSigmoid) {
            // fit a Platt sigmoid so scores can be interpreted as probabilities
            if (verbose)
                System.out.print("fitting sigmoid...");
            classifier.setPlatt(fitSigmoid(classifier, dataset));
            if (verbose)
                System.out.println("done");
        }
        return classifier;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:gr.aueb.cs.nlp.wordtagger.data.structure.WordSet.java
License:Open Source License
/** * Converts any List with words to a Stanford set; * @param words/*from w w w .j a va 2 s. c o m*/ * @return, a list of real valued datums */ public static List<RVFDatum<String, String>> toStanfordSet(List<Word> words) { List<RVFDatum<String, String>> trainignData = new ArrayList<>(); for (Word w : words) { List<Double> feats = Arrays.asList(ArrayUtils.toObject(w.getFeatureVec().getValues())); ClassicCounter<String> cc = new ClassicCounter<>(); for (int i = 0; i < feats.size(); i++) { cc.incrementCount("feature" + i, feats.get(i)); } if (w.getCategory() != null) { RVFDatum<String, String> dtm = new RVFDatum<>(cc, w.getCategory()); trainignData.add(dtm); } } System.out.println("Converted List to classifier trainset"); return trainignData; }
From source file:gr.aueb.cs.nlp.wordtagger.data.structure.WordSet.java
License:Open Source License
/**
 * Converts a word to a Stanford real-valued datum (RVFDatum).
 *
 * @param w the word to convert; its feature vector supplies the feature values and
 *          its category becomes the datum label
 * @return a datum whose features are named "feature0", "feature1", ...
 */
public static RVFDatum<String, String> word2Datum(Word w) {
    // iterate the primitive feature array directly instead of boxing every value
    // through ArrayUtils.toObject + Arrays.asList
    double[] feats = w.getFeatureVec().getValues();
    ClassicCounter<String> cc = new ClassicCounter<>();
    for (int i = 0; i < feats.length; i++) {
        cc.incrementCount("feature" + i, feats[i]);
    }
    return new RVFDatum<>(cc, w.getCategory());
}
From source file:knu.univ.lingvo.coref.ACEMentionExtractor.java
License:Open Source License
/**
 * Logs the document text with mentions marked inline as bracketed spans:
 * "[" opens a mention, "]" closes one, and — in gold mode — a close bracket is
 * suffixed with "_&lt;clusterId&gt;" when that gold coref cluster contains more
 * than one mention.
 * NOTE(review): the {@code filename} parameter is never used in this method.
 *
 * @param sentences   the document's sentences
 * @param allMentions mentions per sentence, parallel to {@code sentences}
 * @param filename    unused
 * @param gold        true to annotate with gold coref cluster ids
 */
private static void printRawDoc(List<CoreMap> sentences, List<List<Mention>> allMentions, String filename,
        boolean gold) throws FileNotFoundException {
    StringBuilder doc = new StringBuilder();
    int previousOffset = 0;
    // count how many mentions each gold cluster has, across the whole document
    Counter<Integer> mentionCount = new ClassicCounter<Integer>();
    for (List<Mention> l : allMentions) {
        for (Mention m : l) {
            mentionCount.incrementCount(m.goldCorefClusterID);
        }
    }
    for (int i = 0; i < sentences.size(); i++) {
        CoreMap sentence = sentences.get(i);
        List<Mention> mentions = allMentions.get(i);
        String[] tokens = sentence.get(CoreAnnotations.TextAnnotation.class).split(" ");
        String sent = "";
        List<CoreLabel> t = sentence.get(CoreAnnotations.TokensAnnotation.class);
        // a gap in character offsets (> 2) between sentences marks a paragraph break
        if (previousOffset + 2 < t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class))
            sent += "\n";
        previousOffset = t.get(t.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        // per-sentence: how many mentions start/end at each token index,
        // and which gold cluster ids end at each index
        Counter<Integer> startCounts = new ClassicCounter<Integer>();
        Counter<Integer> endCounts = new ClassicCounter<Integer>();
        Map<Integer, Set<Integer>> endID = Generics.newHashMap();
        for (Mention m : mentions) {
            startCounts.incrementCount(m.startIndex);
            endCounts.incrementCount(m.endIndex);
            if (!endID.containsKey(m.endIndex))
                endID.put(m.endIndex, Generics.<Integer>newHashSet());
            endID.get(m.endIndex).add(m.goldCorefClusterID);
        }
        for (int j = 0; j < tokens.length; j++) {
            // close brackets first (mention end index is exclusive of the token at j)
            if (endID.containsKey(j)) {
                for (Integer id : endID.get(j)) {
                    // only label the cluster id when gold and the cluster is non-singleton
                    if (mentionCount.getCount(id) != 1 && gold)
                        sent += "]_" + id;
                    else
                        sent += "]";
                }
            }
            // then open brackets for mentions starting at j; avoid "[ [" spacing
            for (int k = 0; k < startCounts.getCount(j); k++) {
                if (!sent.endsWith("["))
                    sent += " ";
                sent += "[";
            }
            sent += " ";
            sent = sent + tokens[j];
        }
        // close any mentions ending at the sentence boundary
        for (int k = 0; k < endCounts.getCount(tokens.length); k++) {
            sent += "]";
        }
        sent += "\n";
        doc.append(sent);
    }
    if (gold)
        logger.fine("New DOC: (GOLD MENTIONS) ==================================================");
    else
        logger.fine("New DOC: (Predicted Mentions) ==================================================");
    logger.fine(doc.toString());
}
From source file:knu.univ.lingvo.coref.SieveCoreferenceSystem.java
License:Open Source License
/**
 * Prints a coref link (mention and its antecedent) to the logger, including the
 * attributes of both mentions, their sentence context with the span bracketed,
 * the gold mentions of each sentence, and (at FINER level) dependency and parse
 * trees. The same reporting is done first for the source mention, then for the
 * destination (antecedent) mention.
 * NOTE(review): the {@code semantics} parameter is never used in this method.
 *
 * @param logger   destination for all output
 * @param header   label passed through to printLink
 * @param src      (sentence, mention) position of the anaphor
 * @param dst      (sentence, mention) position of the antecedent
 * @param document the document holding mentions and annotations
 * @param semantics unused
 */
private static void printLinkWithContext(Logger logger, String header, IntTuple src, IntTuple dst,
        Document document, Semantics semantics) {
    List<List<Mention>> orderedMentionsBySentence = document.getOrderedMentions();
    List<List<Mention>> goldOrderedMentionsBySentence = document.goldOrderedMentionsBySentence;
    // IntTuple encodes (sentence index, mention index within sentence)
    Mention srcMention = orderedMentionsBySentence.get(src.get(0)).get(src.get(1));
    Mention dstMention = orderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
    List<CoreLabel> srcSentence = srcMention.sentenceWords;
    List<CoreLabel> dstSentence = dstMention.sentenceWords;
    printLink(logger, header, src, dst, orderedMentionsBySentence);
    // dump the source mention's attributes
    printList(logger, "Mention:" + srcMention.spanToString(), "Gender:" + srcMention.gender.toString(),
            "Number:" + srcMention.number.toString(), "Animacy:" + srcMention.animacy.toString(),
            "Person:" + srcMention.person.toString(), "NER:" + srcMention.nerString,
            "Head:" + srcMention.headString, "Type:" + srcMention.mentionType.toString(),
            "utter: " + srcMention.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
            "speakerID: " + srcMention.headWord.get(CoreAnnotations.SpeakerAnnotation.class),
            "twinless:" + srcMention.twinless);
    logger.fine("Context:");
    // print the source sentence with the mention span bracketed
    String p = "";
    for (int i = 0; i < srcSentence.size(); i++) {
        if (i == srcMention.startIndex) {
            p += "[";
        }
        if (i == srcMention.endIndex) {
            p += "]";
        }
        p += srcSentence.get(i).word() + " ";
    }
    logger.fine(p);
    // print the gold mentions of the source sentence, brackets placed via
    // per-index begin/end counts
    StringBuilder golds = new StringBuilder();
    golds.append("Gold mentions in the sentence:\n");
    Counter<Integer> mBegin = new ClassicCounter<Integer>();
    Counter<Integer> mEnd = new ClassicCounter<Integer>();
    for (Mention m : goldOrderedMentionsBySentence.get(src.get(0))) {
        mBegin.incrementCount(m.startIndex);
        mEnd.incrementCount(m.endIndex);
    }
    List<CoreLabel> l = document.annotation.get(CoreAnnotations.SentencesAnnotation.class).get(src.get(0))
            .get(CoreAnnotations.TokensAnnotation.class);
    for (int i = 0; i < l.size(); i++) {
        // close brackets before open brackets at the same index
        for (int j = 0; j < mEnd.getCount(i); j++) {
            golds.append("]");
        }
        for (int j = 0; j < mBegin.getCount(i); j++) {
            golds.append("[");
        }
        golds.append(l.get(i).get(CoreAnnotations.TextAnnotation.class));
        golds.append(" ");
    }
    logger.fine(golds.toString());
    // now repeat the same reporting for the antecedent mention
    printList(logger, "\nAntecedent:" + dstMention.spanToString(), "Gender:" + dstMention.gender.toString(),
            "Number:" + dstMention.number.toString(), "Animacy:" + dstMention.animacy.toString(),
            "Person:" + dstMention.person.toString(), "NER:" + dstMention.nerString,
            "Head:" + dstMention.headString, "Type:" + dstMention.mentionType.toString(),
            "utter: " + dstMention.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
            "speakerID: " + dstMention.headWord.get(CoreAnnotations.SpeakerAnnotation.class),
            "twinless:" + dstMention.twinless);
    logger.fine("Context:");
    p = "";
    for (int i = 0; i < dstSentence.size(); i++) {
        if (i == dstMention.startIndex) {
            p += "[";
        }
        if (i == dstMention.endIndex) {
            p += "]";
        }
        p += dstSentence.get(i).word() + " ";
    }
    logger.fine(p);
    golds = new StringBuilder();
    golds.append("Gold mentions in the sentence:\n");
    mBegin = new ClassicCounter<Integer>();
    mEnd = new ClassicCounter<Integer>();
    for (Mention m : goldOrderedMentionsBySentence.get(dst.get(0))) {
        mBegin.incrementCount(m.startIndex);
        mEnd.incrementCount(m.endIndex);
    }
    l = document.annotation.get(CoreAnnotations.SentencesAnnotation.class).get(dst.get(0))
            .get(CoreAnnotations.TokensAnnotation.class);
    for (int i = 0; i < l.size(); i++) {
        for (int j = 0; j < mEnd.getCount(i); j++) {
            golds.append("]");
        }
        for (int j = 0; j < mBegin.getCount(i); j++) {
            golds.append("[");
        }
        golds.append(l.get(i).get(CoreAnnotations.TextAnnotation.class));
        golds.append(" ");
    }
    logger.fine(golds.toString());
    logger.finer("\nMention:: --------------------------------------------------------");
    // deliberate best-effort: dependency may be absent, so a failure here is swallowed
    try {
        logger.finer(srcMention.dependency.toString());
    } catch (Exception e) {
    } //throw new RuntimeException(e);}
    logger.finer("Parse:");
    logger.finer(formatPennTree(srcMention.contextParseTree));
    logger.finer("\nAntecedent:: -----------------------------------------------------");
    // deliberate best-effort, as above
    try {
        logger.finer(dstMention.dependency.toString());
    } catch (Exception e) {
    } //throw new RuntimeException(e);}
    logger.finer("Parse:");
    logger.finer(formatPennTree(dstMention.contextParseTree));
}
From source file:knu.univ.lingvo.coref.SieveCoreferenceSystem.java
License:Open Source License
/**
 * Prints the raw document to the logger for analysis, with each mention span
 * bracketed inline; every closing bracket is suffixed with "_&lt;corefChainId&gt;"
 * (gold cluster id in gold mode, predicted cluster id otherwise).
 *
 * @param document the document whose annotation and mentions are printed
 * @param gold     true to print gold mentions, false for predicted mentions
 */
public static void printRawDoc(Document document, boolean gold) throws FileNotFoundException {
    List<CoreMap> sentences = document.annotation.get(CoreAnnotations.SentencesAnnotation.class);
    List<List<Mention>> allMentions;
    if (gold) {
        allMentions = document.goldOrderedMentionsBySentence;
    } else {
        allMentions = document.predictedOrderedMentionsBySentence;
    }
    // String filename = document.annotation.get()
    StringBuilder doc = new StringBuilder();
    int previousOffset = 0;
    for (int i = 0; i < sentences.size(); i++) {
        CoreMap sentence = sentences.get(i);
        List<Mention> mentions = allMentions.get(i);
        List<CoreLabel> t = sentence.get(CoreAnnotations.TokensAnnotation.class);
        // rebuild the token array from CoreLabels (index() is 1-based)
        String[] tokens = new String[t.size()];
        for (CoreLabel c : t) {
            tokens[c.index() - 1] = c.word();
        }
        // a gap in character offsets (> 2) between sentences marks a paragraph break
        if (previousOffset + 2 < t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
            doc.append("\n");
        }
        previousOffset = t.get(t.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        // per-sentence bookkeeping: mention starts per index, and the set of
        // mentions ending at each index.
        // NOTE(review): endCounts is filled but never read in this method —
        // closing brackets come from endMentions instead; looks like dead state.
        Counter<Integer> startCounts = new ClassicCounter<Integer>();
        Counter<Integer> endCounts = new ClassicCounter<Integer>();
        Map<Integer, Set<Mention>> endMentions = Generics.newHashMap();
        for (Mention m : mentions) {
            startCounts.incrementCount(m.startIndex);
            endCounts.incrementCount(m.endIndex);
            if (!endMentions.containsKey(m.endIndex)) {
                endMentions.put(m.endIndex, Generics.<Mention>newHashSet());
            }
            endMentions.get(m.endIndex).add(m);
        }
        for (int j = 0; j < tokens.length; j++) {
            // close brackets first (mention end index is exclusive of the token at j)
            if (endMentions.containsKey(j)) {
                for (Mention m : endMentions.get(j)) {
                    int corefChainId = (gold) ? m.goldCorefClusterID : m.corefClusterID;
                    doc.append("]_").append(corefChainId);
                }
            }
            // then open brackets for mentions starting at j; avoid "[ [" spacing
            for (int k = 0; k < startCounts.getCount(j); k++) {
                if (doc.length() > 0 && doc.charAt(doc.length() - 1) != '[') {
                    doc.append(" ");
                }
                doc.append("[");
            }
            if (doc.length() > 0 && doc.charAt(doc.length() - 1) != '[') {
                doc.append(" ");
            }
            doc.append(tokens[j]);
        }
        // close any mentions ending at the sentence boundary
        if (endMentions.containsKey(tokens.length)) {
            for (Mention m : endMentions.get(tokens.length)) {
                int corefChainId = (gold) ? m.goldCorefClusterID : m.corefClusterID;
                doc.append("]_").append(corefChainId); //append("_").append(m.mentionID);
            }
        }
        doc.append("\n");
    }
    logger.fine(document.annotation.get(CoreAnnotations.DocIDAnnotation.class));
    if (gold) {
        logger.fine("New DOC: (GOLD MENTIONS) ==================================================");
    } else {
        logger.fine("New DOC: (Predicted Mentions) ==================================================");
    }
    logger.fine(doc.toString());
}