List of usage examples for org.apache.hadoop.io.DoubleWritable.get()
public double get()
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static void loadThetaNormalizer(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern, Configuration conf) throws IOException { StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); log.info("{}", path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); long count = 0; while (reader.next(key, value)) { // Sum of weights in a Label if (key.stringAt(0).equals(BayesConstants.LABEL_THETA_NORMALIZER)) { datastore.setThetaNormalizer(key.stringAt(1), value.get()); count++;/* w w w . j av a 2s.c om*/ if (count % 50000 == 0) { log.info("Read {} theta norms", count); } } } } }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static void loadSumWeight(InMemoryBayesDatastore datastore, FileSystem fs, Path pathPattern, Configuration conf) throws IOException { StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); log.info("{}", path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); // the key is _label while (reader.next(key, value)) { if (key.stringAt(0).equals(BayesConstants.TOTAL_SUM)) { // Sum of // weights for // all Features and all Labels datastore.setSigmaJSigmaK(value.get()); log.info("{}", value.get()); }//from w ww.ja v a 2 s .c om } } }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static Map<String, Double> readLabelSums(FileSystem fs, Path pathPattern, Configuration conf) throws IOException { Map<String, Double> labelSum = new HashMap<String, Double>(); StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); // the key is either _label_ or label,feature while (reader.next(key, value)) { if (key.stringAt(0).equals(BayesConstants.LABEL_SUM)) { // Sum of counts // of labels labelSum.put(key.stringAt(1), value.get()); }//from w ww .j ava2 s . co m } } return labelSum; }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static Map<String, Double> readLabelDocumentCounts(FileSystem fs, Path pathPattern, Configuration conf) throws IOException { Map<String, Double> labelDocumentCounts = new HashMap<String, Double>(); StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); // the key is either _label_ or label,feature while (reader.next(key, value)) { // Count of Documents in a Label if (key.stringAt(0).equals(BayesConstants.LABEL_COUNT)) { labelDocumentCounts.put(key.stringAt(1), value.get()); }//w w w . j av a 2s . co m } } return labelDocumentCounts; }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static double readSigmaJSigmaK(FileSystem fs, Path pathPattern, Configuration conf) throws IOException { Map<String, Double> weightSum = new HashMap<String, Double>(); StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); while (reader.next(key, value)) { if (weightSum.size() > 1) { throw new IOException("Incorrect Sum File"); } else if (key.stringAt(0).equals(BayesConstants.TOTAL_SUM)) { weightSum.put(BayesConstants.TOTAL_SUM, value.get()); }// ww w. ja va2 s .c om } } return weightSum.get(BayesConstants.TOTAL_SUM); }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
/**
 * Reads the vocabulary count from the sequence files matching {@code pathPattern}.
 *
 * <p>Records whose first key component equals
 * {@code BayesConstants.FEATURE_SET_SIZE} supply the result; if several exist,
 * the last one read wins.
 *
 * @param fs          the file system used to resolve {@code pathPattern} and open the files
 * @param pathPattern glob pattern selecting the sequence files to read
 * @param conf        the Hadoop configuration handed to each reader
 * @return the value stored under {@code BayesConstants.FEATURE_SET_SIZE}
 * @throws IOException if a file cannot be opened, read or closed, or if no
 *                     {@code BayesConstants.FEATURE_SET_SIZE} record was found
 */
public static double readVocabCount(FileSystem fs, Path pathPattern, Configuration conf) throws IOException {
  Map<String, Double> weightSum = new HashMap<String, Double>();
  StringTuple key = new StringTuple();
  DoubleWritable value = new DoubleWritable();
  FileStatus[] outputFiles = fs.globStatus(pathPattern);
  for (FileStatus fileStatus : outputFiles) {
    Path path = fileStatus.getPath();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    try {
      while (reader.next(key, value)) {
        if (weightSum.size() > 1) {
          throw new IOException("Incorrect vocabCount File");
        }
        if (key.stringAt(0).equals(BayesConstants.FEATURE_SET_SIZE)) {
          weightSum.put(BayesConstants.FEATURE_SET_SIZE, value.get());
        }
      }
    } finally {
      // Release the underlying stream even when reading fails part-way through.
      reader.close();
    }
  }
  // Unboxing a missing entry would surface as a bare NullPointerException;
  // report the real problem instead.
  Double vocabCount = weightSum.get(BayesConstants.FEATURE_SET_SIZE);
  if (vocabCount == null) {
    throw new IOException("No " + BayesConstants.FEATURE_SET_SIZE + " entry found in " + pathPattern);
  }
  return vocabCount;
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver.java
License:Apache License
/**
 * Builds a confusion matrix from the classification results stored in the
 * sequence files matching {@code pathPattern}.
 *
 * <p>For every record, the key components at index 1 and 2 are taken as the
 * correct and the classified label, and the record value (truncated to an
 * {@code int}) as the count for that pair. A later record for the same pair
 * replaces an earlier one.
 *
 * @param fs          the file system used to resolve {@code pathPattern} and open the files
 * @param pathPattern glob pattern selecting the sequence files to read
 * @param conf        the Hadoop configuration handed to each reader
 * @param params      parameters; {@code "defaultCat"} supplies the matrix's default label
 * @return the populated confusion matrix
 * @throws IOException if a file cannot be opened, read or closed
 */
public static ConfusionMatrix readResult(FileSystem fs, Path pathPattern, Configuration conf, Parameters params)
    throws IOException {
  StringTuple key = new StringTuple();
  DoubleWritable value = new DoubleWritable();
  String defaultLabel = params.get("defaultCat");
  FileStatus[] outputFiles = fs.globStatus(pathPattern);
  Map<String, Map<String, Integer>> confusionMatrix = new HashMap<String, Map<String, Integer>>();
  for (FileStatus fileStatus : outputFiles) {
    Path path = fileStatus.getPath();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    try {
      while (reader.next(key, value)) {
        String correctLabel = key.stringAt(1);
        String classifiedLabel = key.stringAt(2);
        Map<String, Integer> rowMatrix = confusionMatrix.get(correctLabel);
        if (rowMatrix == null) {
          rowMatrix = new HashMap<String, Integer>();
          confusionMatrix.put(correctLabel, rowMatrix);
        }
        // Same narrowing conversion as Double.intValue(), without the extra boxing.
        rowMatrix.put(classifiedLabel, (int) value.get());
      }
    } finally {
      // Release the underlying stream even when reading fails part-way through.
      reader.close();
    }
  }

  ConfusionMatrix matrix = new ConfusionMatrix(confusionMatrix.keySet(), defaultLabel);
  for (Map.Entry<String, Map<String, Integer>> correctLabelSet : confusionMatrix.entrySet()) {
    String correctLabel = correctLabelSet.getKey();
    for (Map.Entry<String, Integer> classifiedLabelSet : correctLabelSet.getValue().entrySet()) {
      String classifiedLabel = classifiedLabelSet.getKey();
      // Register the pair first, then overwrite its count with the stored value.
      matrix.addInstance(correctLabel, classifiedLabel);
      matrix.putCount(correctLabel, classifiedLabel, classifiedLabelSet.getValue());
    }
  }
  return matrix;
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerMapper.java
License:Apache License
/** * We need to calculate the thetaNormalization factor of each label * /*from www .jav a 2s . co m*/ * @param key * The label,feature pair * @param value * The tfIdf of the pair */ @Override public void map(StringTuple key, DoubleWritable value, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException { String label = key.stringAt(1); reporter.setStatus("Bayes Theta Normalizer Mapper: " + label); double weight = Math.log((value.get() + alphaI) / (labelWeightSum.get(label) + vocabCount)); StringTuple thetaNormalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER); thetaNormalizerTuple.add(label); output.collect(thetaNormalizerTuple, new DoubleWritable(weight)); }
From source file:org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerMapper.java
License:Apache License
/** * We need to calculate the idf of each feature in each label * /* w w w. j a va2 s.c o m*/ * @param key * The label,feature pair (can either be the freq Count or the term Document count */ @Override public void map(StringTuple key, final DoubleWritable value, final OutputCollector<StringTuple, DoubleWritable> output, final Reporter reporter) throws IOException { if (key.stringAt(0).equals(BayesConstants.FEATURE_SUM)) { // if it is from // the Sigma_j // folder labelWeightSum.forEachPair(new ObjectDoubleProcedure<String>() { @Override public boolean apply(String label, double sigmaJ) { double weight = Math.log((value.get() + alphaI) / (sigmaJSigmaK - sigmaJ + vocabCount)); reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight); StringTuple normalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER); normalizerTuple.add(label); try { output.collect(normalizerTuple, new DoubleWritable(weight)); } catch (IOException e) { throw new IllegalStateException(e); } // output Sigma_j return true; } }); } else { String label = key.stringAt(1); double dIJ = value.get(); double denominator = 0.5 * (sigmaJSigmaK / vocabCount + dIJ * this.labelWeightSum.size()); double weight = Math.log(1.0 - dIJ / denominator); reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight); StringTuple normalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER); normalizerTuple.add(label); // output -D_ij output.collect(normalizerTuple, new DoubleWritable(weight)); } }
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesTfIdfMapper.java
License:Apache License
/** * We need to calculate the Tf-Idf of each feature in each label * /* www . j av a 2s. com*/ * @param key * The label,feature pair (can either be the freq Count or the term Document count */ @Override public void map(StringTuple key, DoubleWritable value, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException { if (key.length() == 3) { if (key.stringAt(0).equals(BayesConstants.WEIGHT)) { reporter.setStatus("Bayes TfIdf Mapper: Tf: " + key); output.collect(key, value); } else if (key.stringAt(0).equals(BayesConstants.DOCUMENT_FREQUENCY)) { String label = key.stringAt(1); Double labelDocumentCount = labelDocumentCounts.get(label); double logIdf = Math.log(labelDocumentCount / value.get()); key.replaceAt(0, BayesConstants.WEIGHT); output.collect(key, new DoubleWritable(logIdf)); reporter.setStatus("Bayes TfIdf Mapper: log(Idf): " + key); } else { throw new IllegalArgumentException("Unrecognized Tuple: " + key); } } else if (key.length() == 2) { if (key.stringAt(0).equals(BayesConstants.FEATURE_COUNT)) { output.collect(VOCAB_COUNT, ONE); reporter.setStatus("Bayes TfIdf Mapper: vocabCount"); } else { throw new IllegalArgumentException("Unexpected Tuple: " + key); } } }