List of usage examples for the no-argument constructor DoubleWritable() of org.apache.hadoop.io.DoubleWritable
public DoubleWritable()
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static Map<String, Double> readLabelDocumentCounts(FileSystem fs, Path pathPattern, Configuration conf) throws IOException { Map<String, Double> labelDocumentCounts = new HashMap<String, Double>(); StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); // the key is either _label_ or label,feature while (reader.next(key, value)) { // Count of Documents in a Label if (key.stringAt(0).equals(BayesConstants.LABEL_COUNT)) { labelDocumentCounts.put(key.stringAt(1), value.get()); }//from w w w . j ava 2 s. co m } } return labelDocumentCounts; }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static double readSigmaJSigmaK(FileSystem fs, Path pathPattern, Configuration conf) throws IOException { Map<String, Double> weightSum = new HashMap<String, Double>(); StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); while (reader.next(key, value)) { if (weightSum.size() > 1) { throw new IOException("Incorrect Sum File"); } else if (key.stringAt(0).equals(BayesConstants.TOTAL_SUM)) { weightSum.put(BayesConstants.TOTAL_SUM, value.get()); }//from w w w . ja va 2 s.c o m } } return weightSum.get(BayesConstants.TOTAL_SUM); }
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static double readVocabCount(FileSystem fs, Path pathPattern, Configuration conf) throws IOException { Map<String, Double> weightSum = new HashMap<String, Double>(); StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); while (reader.next(key, value)) { if (weightSum.size() > 1) { throw new IOException("Incorrect vocabCount File"); }// w w w .j a v a2s . c om if (key.stringAt(0).equals(BayesConstants.FEATURE_SET_SIZE)) { weightSum.put(BayesConstants.FEATURE_SET_SIZE, value.get()); } } } return weightSum.get(BayesConstants.FEATURE_SET_SIZE); }
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver.java
License:Apache License
public static ConfusionMatrix readResult(FileSystem fs, Path pathPattern, Configuration conf, Parameters params) throws IOException { StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); String defaultLabel = params.get("defaultCat"); FileStatus[] outputFiles = fs.globStatus(pathPattern); Map<String, Map<String, Integer>> confusionMatrix = new HashMap<String, Map<String, Integer>>(); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); while (reader.next(key, value)) { String correctLabel = key.stringAt(1); String classifiedLabel = key.stringAt(2); Map<String, Integer> rowMatrix = confusionMatrix.get(correctLabel); if (rowMatrix == null) { rowMatrix = new HashMap<String, Integer>(); }//w w w . j av a 2s . co m Integer count = Double.valueOf(value.get()).intValue(); rowMatrix.put(classifiedLabel, count); confusionMatrix.put(correctLabel, rowMatrix); } } ConfusionMatrix matrix = new ConfusionMatrix(confusionMatrix.keySet(), defaultLabel); for (Map.Entry<String, Map<String, Integer>> correctLabelSet : confusionMatrix.entrySet()) { Map<String, Integer> rowMatrix = correctLabelSet.getValue(); for (Map.Entry<String, Integer> classifiedLabelSet : rowMatrix.entrySet()) { matrix.addInstance(correctLabelSet.getKey(), classifiedLabelSet.getKey()); matrix.putCount(correctLabelSet.getKey(), classifiedLabelSet.getKey(), classifiedLabelSet.getValue()); } } return matrix; }
From source file:org.apache.mahout.clustering.lda.LDADriver.java
License:Apache License
/**
 * Rebuilds an {@code LDAState} from the {@code part-*} sequence files found in the directory
 * named by the {@code STATE_IN_KEY} entry of {@code job}.
 *
 * <p>Each record is an {@code IntPairWritable} (topic, word) key with a {@code DoubleWritable}
 * value and is routed as follows:
 * <ul>
 *   <li>word equals {@code TOPIC_SUM_KEY}: the value is stored as the log total of that topic;</li>
 *   <li>otherwise, topic equals {@code LOG_LIKELIHOOD_KEY}: the value becomes the log-likelihood;</li>
 *   <li>otherwise: the value is stored in the topic-by-word matrix at (topic, word).</li>
 * </ul>
 *
 * @param job configuration providing the state path, topic count, word count and topic smoothing
 * @return the state assembled from the files
 * @throws IOException if listing, opening, reading or closing a file fails
 * @throws IllegalArgumentException if a value is infinite, a matrix index is negative, or a
 *         matrix cell that already holds a non-zero value is written again
 */
static LDAState createState(Configuration job) throws IOException {
  String statePath = job.get(STATE_IN_KEY);
  int numTopics = Integer.parseInt(job.get(NUM_TOPICS_KEY));
  int numWords = Integer.parseInt(job.get(NUM_WORDS_KEY));
  double topicSmoothing = Double.parseDouble(job.get(TOPIC_SMOOTHING_KEY));

  Path dir = new Path(statePath);
  FileSystem fs = dir.getFileSystem(job);

  DenseMatrix pWgT = new DenseMatrix(numTopics, numWords);
  double[] logTotals = new double[numTopics];
  double ll = 0.0;

  IntPairWritable key = new IntPairWritable();
  DoubleWritable value = new DoubleWritable();
  for (FileStatus status : fs.globStatus(new Path(dir, "part-*"))) {
    Path path = status.getPath();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
    try {
      while (reader.next(key, value)) {
        int topic = key.getFirst();
        int word = key.getSecond();
        if (word == TOPIC_SUM_KEY) {
          logTotals[topic] = value.get();
          if (Double.isInfinite(value.get())) {
            throw new IllegalArgumentException();
          }
        } else if (topic == LOG_LIKELIHOOD_KEY) {
          ll = value.get();
        } else {
          if (!((topic >= 0) && (word >= 0))) {
            throw new IllegalArgumentException(topic + " " + word);
          }
          // A non-zero cell means this (topic, word) pair was already seen.
          if (pWgT.getQuick(topic, word) != 0.0) {
            throw new IllegalArgumentException();
          }
          pWgT.setQuick(topic, word, value.get());
          if (Double.isInfinite(pWgT.getQuick(topic, word))) {
            throw new IllegalArgumentException();
          }
        }
      }
    } finally {
      // The validation above can throw; make sure the reader is still released.
      reader.close();
    }
  }

  return new LDAState(numTopics, numWords, topicSmoothing, pWgT, logTotals, ll);
}
From source file:org.apache.mahout.clustering.lda.LDADriver.java
License:Apache License
private void writeInitialState(Path statePath, int numTopics, int numWords) throws IOException { Configuration job = new Configuration(); FileSystem fs = statePath.getFileSystem(job); DoubleWritable v = new DoubleWritable(); Random random = RandomUtils.getRandom(); for (int k = 0; k < numTopics; ++k) { Path path = new Path(statePath, "part-" + k); SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, IntPairWritable.class, DoubleWritable.class); double total = 0.0; // total number of pseudo counts we made for (int w = 0; w < numWords; ++w) { IntPairWritable kw = new IntPairWritable(k, w); // A small amount of random noise, minimized by having a floor. double pseudocount = random.nextDouble() + 1.0E-8; total += pseudocount;/* w w w . ja v a 2s . c om*/ v.set(Math.log(pseudocount)); writer.append(kw, v); } IntPairWritable kTsk = new IntPairWritable(k, TOPIC_SUM_KEY); v.set(Math.log(total)); writer.append(kTsk, v); writer.close(); } }
From source file:org.apache.mahout.clustering.lda.LDADriver.java
License:Apache License
/**
 * Looks up the log-likelihood stored in the {@code part-*} sequence files under
 * {@code statePath}.
 *
 * <p>Each file is scanned until the first record whose key's first component equals
 * {@code LOG_LIKELIHOOD_KEY}; scanning then moves on to the next file. If several files contain
 * such a record, the value from the file visited last is returned.
 *
 * @param statePath directory holding the {@code part-*} files
 * @param job configuration used to resolve the file system and open the files
 * @return the log-likelihood found, or {@code 0.0} if no file contains one
 * @throws IOException if listing, opening, reading or closing a file fails
 */
private double findLL(Path statePath, Configuration job) throws IOException {
  FileSystem fs = statePath.getFileSystem(job);

  double ll = 0.0;

  IntPairWritable key = new IntPairWritable();
  DoubleWritable value = new DoubleWritable();
  for (FileStatus status : fs.globStatus(new Path(statePath, "part-*"))) {
    Path path = status.getPath();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
    try {
      while (reader.next(key, value)) {
        if (key.getFirst() == LOG_LIKELIHOOD_KEY) {
          ll = value.get();
          break; // done with this file; the outer loop continues with the next one
        }
      }
    } finally {
      // Release the reader even if next() throws.
      reader.close();
    }
  }

  return ll;
}
From source file:org.apache.mahout.clustering.lda.LDAMapper.java
License:Apache License
@Override protected void map(WritableComparable<?> key, VectorWritable wordCountsWritable, Context context) throws IOException, InterruptedException { Vector wordCounts = wordCountsWritable.get(); LDAInference.InferredDocument doc;/* w ww.j ava 2 s . c o m*/ try { doc = infer.infer(wordCounts); } catch (ArrayIndexOutOfBoundsException e1) { throw new IllegalStateException("This is probably because the --numWords argument is set too small. \n" + "\tIt needs to be >= than the number of words (terms actually) in the corpus and can be \n" + "\tlarger if some storage inefficiency can be tolerated.", e1); } double[] logTotals = new double[state.getNumTopics()]; Arrays.fill(logTotals, Double.NEGATIVE_INFINITY); // Output sufficient statistics for each word. == pseudo-log counts. DoubleWritable v = new DoubleWritable(); for (Iterator<Vector.Element> iter = wordCounts.iterateNonZero(); iter.hasNext();) { Vector.Element e = iter.next(); int w = e.index(); for (int k = 0; k < state.getNumTopics(); ++k) { v.set(doc.phi(k, w) + Math.log(e.get())); IntPairWritable kw = new IntPairWritable(k, w); // ouput (topic, word)'s logProb contribution context.write(kw, v); logTotals[k] = LDAUtil.logSum(logTotals[k], v.get()); } } // Output the totals for the statistics. This is to make // normalizing a lot easier. for (int k = 0; k < state.getNumTopics(); ++k) { IntPairWritable kw = new IntPairWritable(k, LDADriver.TOPIC_SUM_KEY); v.set(logTotals[k]); assert !Double.isNaN(v.get()); context.write(kw, v); } IntPairWritable llk = new IntPairWritable(LDADriver.LOG_LIKELIHOOD_KEY, LDADriver.LOG_LIKELIHOOD_KEY); // Output log-likelihoods. v.set(doc.getLogLikelihood()); context.write(llk, v); }
From source file:org.apache.mahout.clustering.lda.LDAWordTopicMapper.java
License:Apache License
@Override protected void map(WritableComparable<?> key, VectorWritable wordCountsWritable, Context context) throws IOException, InterruptedException { Vector wordCounts = wordCountsWritable.get(); LDAInference.InferredDocument doc;/*from w w w. j av a2s . com*/ try { doc = infer.infer(wordCounts); } catch (ArrayIndexOutOfBoundsException e1) { throw new IllegalStateException("This is probably because the --numWords argument is set too small. \n" + "\tIt needs to be >= than the number of words (terms actually) in the corpus and can be \n" + "\tlarger if some storage inefficiency can be tolerated.", e1); } double[] logTotals = new double[state.getNumTopics()]; Arrays.fill(logTotals, Double.NEGATIVE_INFINITY); // Output sufficient statistics for each word. == pseudo-log counts. DoubleWritable v = new DoubleWritable(); for (Iterator<Vector.Element> iter = wordCounts.iterateNonZero(); iter.hasNext();) { Vector.Element e = iter.next(); int w = e.index(); for (int k = 0; k < state.getNumTopics(); ++k) { v.set(doc.phi(k, w) + Math.log(e.get())); IntPairWritable kw = new IntPairWritable(k, w); // output (topic, word)'s logProb contribution context.write(kw, v); logTotals[k] = LDAUtil.logSum(logTotals[k], v.get()); } } // Output the totals for the statistics. This is to make // normalizing a lot easier. for (int k = 0; k < state.getNumTopics(); ++k) { IntPairWritable kw = new IntPairWritable(k, LDADriver.TOPIC_SUM_KEY); v.set(logTotals[k]); assert !Double.isNaN(v.get()); context.write(kw, v); } IntPairWritable llk = new IntPairWritable(LDADriver.LOG_LIKELIHOOD_KEY, LDADriver.LOG_LIKELIHOOD_KEY); // Output log-likelihoods. v.set(doc.getLogLikelihood()); context.write(llk, v); }
From source file:org.apache.mahout.ga.watchmaker.OutputUtils.java
License:Apache License
/** * Reads back the evaluations.// w w w.j a v a2 s. c o m * * @param outpath * output <code>Path</code> * @param evaluations * List of evaluations */ public static void importEvaluations(FileSystem fs, Configuration conf, Path outpath, List<Double> evaluations) throws IOException { Sorter sorter = new Sorter(fs, LongWritable.class, DoubleWritable.class, conf); // merge and sort the outputs Path[] outfiles = listOutputFiles(fs, outpath); Path output = new Path(outpath, "output.sorted"); sorter.merge(outfiles, output); // import the evaluations LongWritable key = new LongWritable(); DoubleWritable value = new DoubleWritable(); Reader reader = new Reader(fs, output, conf); try { while (reader.next(key, value)) { evaluations.add(value.get()); } } finally { reader.close(); } }