List of usage examples for org.apache.hadoop.io DoubleWritable get
public double get()
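Before the project-level examples, a minimal, self-contained sketch of the accessor on its own (the class name is illustrative, not from any of the sources below):

import org.apache.hadoop.io.DoubleWritable;

public class DoubleWritableGetDemo {
    public static void main(String[] args) {
        DoubleWritable dw = new DoubleWritable(3.5);
        double d = dw.get();          // unwrap the boxed primitive
        System.out.println(d);        // prints 3.5

        dw.set(7.25);                 // the wrapper is mutable, so it can be reused
        System.out.println(dw.get()); // prints 7.25
    }
}

Mutability is why many of the examples below allocate a single DoubleWritable and repeatedly call set(...)/get() inside their loops instead of constructing a new wrapper per record.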
From source file: org.apache.mahout.classifier.cbayes.CBayesThetaMapper.java
License: Apache License

/**
 * We need to calculate the idf of each feature in each label.
 *
 * @param key the label,feature pair (can either be the freq count or the term document count)
 */
@Override
public void map(Text key, DoubleWritable value, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    String labelFeaturePair = key.toString();
    if (labelFeaturePair.charAt(0) == ',') { // if it is from the Sigma_j folder (feature weight sum)
        String feature = labelFeaturePair.substring(1);
        double alpha_i = 1.0;
        for (Map.Entry<String, Double> stringDoubleEntry : labelWeightSum.entrySet()) {
            double inverseDenominator = 1.0 / (sigma_jSigma_k - stringDoubleEntry.getValue() + vocabCount);
            DoubleWritable weight = new DoubleWritable((value.get() + alpha_i) * inverseDenominator);
            output.collect(new Text((stringDoubleEntry.getKey() + ',' + feature).trim()), weight); // output Sigma_j
        }
    } else {
        String label = labelFeaturePair.split(",")[0];
        double inverseDenominator = 1.0 / (sigma_jSigma_k - labelWeightSum.get(label) + vocabCount);
        DoubleWritable weight = new DoubleWritable(-value.get() * inverseDenominator);
        output.collect(key, weight); // output -D_ij
    }
}
From source file: org.apache.mahout.classifier.cbayes.CBayesThetaNormalizerMapper.java
License: Apache License

/**
 * We need to calculate the idf of each feature in each label.
 *
 * @param key the label,feature pair (can either be the freq count or the term document count)
 */
@Override
public void map(Text key, DoubleWritable value, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    String labelFeaturePair = key.toString();
    if (labelFeaturePair.charAt(0) == ',') { // if it is from the Sigma_j folder
        double alpha_i = 1.0;
        for (Map.Entry<String, Double> stringDoubleEntry : labelWeightSum.entrySet()) {
            double weight = Math.log(
                    (value.get() + alpha_i) / (sigma_jSigma_k - stringDoubleEntry.getValue() + vocabCount));
            output.collect(new Text(('_' + stringDoubleEntry.getKey()).trim()), new DoubleWritable(weight)); // output Sigma_j
        }
    } else {
        String label = labelFeaturePair.split(",")[0];
        double D_ij = value.get();
        double denominator = 0.5 * ((sigma_jSigma_k / vocabCount) + (D_ij * this.labelWeightSum.size()));
        double weight = Math.log(1.0 - D_ij / denominator);
        output.collect(new Text(('_' + label).trim()), new DoubleWritable(weight)); // output -D_ij
    }
}
From source file: org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java
License: Apache License

protected static void WriteModelToDirectory(HmmModel model, Path modelPath, Configuration conf)
        throws IOException {
    int numHidden = model.getNrOfHiddenStates();
    int numObserved = model.getNrOfOutputStates();
    Matrix emissionMatrix = model.getEmissionMatrix();
    Matrix transitionMatrix = model.getTransitionMatrix();
    Vector initialProbability = model.getInitialProbabilities();

    MapWritable initialDistributionMap = new MapWritable();
    MapWritable transitionDistributionMap = new MapWritable();
    MapWritable emissionDistributionMap = new MapWritable();

    // delete the output directory
    HadoopUtil.delete(conf, modelPath);

    // create new file to store HMM
    FileSystem fs = FileSystem.get(modelPath.toUri(), conf);
    Path outFile = new Path(modelPath, "part-randomSeed");
    boolean newFile = fs.createNewFile(outFile);

    if (newFile) {
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, MapWritable.class);
        try {
            // construct one MapWritable<IntWritable, DoubleWritable> object
            // and two MapWritable<Text, MapWritable<IntWritable, DoubleWritable>> objects
            for (int i = 0; i < numHidden; i++) {
                IntWritable initialDistributionKey = new IntWritable(i);
                DoubleWritable initialDistributionValue = new DoubleWritable(initialProbability.get(i));
                log.info("BuildRandomModel Initial Distribution Map: State {} = {})",
                        initialDistributionKey.get(), initialDistributionValue.get());
                initialDistributionMap.put(initialDistributionKey, initialDistributionValue);

                Text transitionDistributionKey = new Text("TRANSIT_" + Integer.toString(i));
                MapWritable transitionDistributionValue = new MapWritable();
                for (int j = 0; j < numHidden; j++) {
                    IntWritable transitionDistributionInnerKey = new IntWritable(j);
                    DoubleWritable transitionDistributionInnerValue = new DoubleWritable(transitionMatrix.get(i, j));
                    log.info("BuildRandomModel Transition Distribution Map Inner: ({}, {}) = ({}, {})",
                            new Object[] { i, j, transitionDistributionInnerKey.get(),
                                    transitionDistributionInnerValue.get() });
                    transitionDistributionValue.put(transitionDistributionInnerKey, transitionDistributionInnerValue);
                }
                transitionDistributionMap.put(transitionDistributionKey, transitionDistributionValue);

                Text emissionDistributionKey = new Text("EMIT_" + Integer.toString(i));
                MapWritable emissionDistributionValue = new MapWritable();
                for (int j = 0; j < numObserved; j++) {
                    IntWritable emissionDistributionInnerKey = new IntWritable(j);
                    DoubleWritable emissionDistributionInnerValue = new DoubleWritable(emissionMatrix.get(i, j));
                    log.info("BuildRandomModel Emission Distribution Map Inner: ({}, {}) = ({}, {})",
                            new Object[] { i, j, emissionDistributionInnerKey.get(),
                                    emissionDistributionInnerValue.get() });
                    emissionDistributionValue.put(emissionDistributionInnerKey, emissionDistributionInnerValue);
                }
                emissionDistributionMap.put(emissionDistributionKey, emissionDistributionValue);
            }

            writer.append(new Text("INITIAL"), initialDistributionMap);
            log.info("Wrote random Initial Distribution Map to {}", outFile);

            for (MapWritable.Entry<Writable, Writable> transitionEntry : transitionDistributionMap.entrySet()) {
                log.info("Writing Transition Distribution Map Key, Value = ({}, {})",
                        transitionEntry.getKey(), transitionEntry.getValue());
                writer.append(transitionEntry.getKey(), transitionEntry.getValue());
            }
            log.info("Wrote random Transition Distribution Map to {}", outFile);

            for (MapWritable.Entry<Writable, Writable> emissionEntry : emissionDistributionMap.entrySet()) {
                log.info("Writing Emission Distribution Map Key, Value = ({}, {})",
                        emissionEntry.getKey(), emissionEntry.getValue());
                writer.append(emissionEntry.getKey(), emissionEntry.getValue());
            }
            log.info("Wrote random Emission Distribution Map to {}", outFile);
        } finally {
            Closeables.closeQuietly(writer);
        }
    }
}
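The natural counterpart, not shown in this source, is reading those records back and unwrapping the DoubleWritable entries. A minimal sketch (not from the source), assuming the same fs, outFile, and conf as above, that every record is a (Text, MapWritable) pair, and that the surrounding imports (java.util.Map and the Hadoop io classes) are in place:

SequenceFile.Reader reader = new SequenceFile.Reader(fs, outFile, conf);
try {
    Text key = new Text();
    MapWritable map = new MapWritable();
    while (reader.next(key, map)) {
        for (Map.Entry<Writable, Writable> e : map.entrySet()) {
            // The INITIAL row maps IntWritable -> DoubleWritable directly;
            // TRANSIT_i / EMIT_i rows do the same for row i of each matrix.
            double p = ((DoubleWritable) e.getValue()).get();
            log.info("{} {} = {}", new Object[] { key, e.getKey(), p });
        }
    }
} finally {
    reader.close();
}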
From source file: org.apache.mahout.clustering.lda.LDADriver.java
License: Apache License

static LDAState createState(Configuration job) throws IOException {
    String statePath = job.get(STATE_IN_KEY);
    int numTopics = Integer.parseInt(job.get(NUM_TOPICS_KEY));
    int numWords = Integer.parseInt(job.get(NUM_WORDS_KEY));
    double topicSmoothing = Double.parseDouble(job.get(TOPIC_SMOOTHING_KEY));

    Path dir = new Path(statePath);
    FileSystem fs = dir.getFileSystem(job);

    DenseMatrix pWgT = new DenseMatrix(numTopics, numWords);
    double[] logTotals = new double[numTopics];
    double ll = 0.0;

    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = status.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
        while (reader.next(key, value)) {
            int topic = key.getFirst();
            int word = key.getSecond();
            if (word == TOPIC_SUM_KEY) {
                logTotals[topic] = value.get();
                if (Double.isInfinite(value.get())) {
                    throw new IllegalArgumentException();
                }
            } else if (topic == LOG_LIKELIHOOD_KEY) {
                ll = value.get();
            } else {
                if (!((topic >= 0) && (word >= 0))) {
                    throw new IllegalArgumentException(topic + " " + word);
                }
                if (pWgT.getQuick(topic, word) != 0.0) {
                    throw new IllegalArgumentException();
                }
                pWgT.setQuick(topic, word, value.get());
                if (Double.isInfinite(pWgT.getQuick(topic, word))) {
                    throw new IllegalArgumentException();
                }
            }
        }
        reader.close();
    }
    return new LDAState(numTopics, numWords, topicSmoothing, pWgT, logTotals, ll);
}
From source file: org.apache.mahout.clustering.lda.LDADriver.java
License: Apache License

private double findLL(Path statePath, Configuration job) throws IOException {
    FileSystem fs = statePath.getFileSystem(job);
    double ll = 0.0;
    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(statePath, "part-*"))) {
        Path path = status.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
        while (reader.next(key, value)) {
            if (key.getFirst() == LOG_LIKELIHOOD_KEY) {
                ll = value.get();
                break;
            }
        }
        reader.close();
    }
    return ll;
}
From source file: org.apache.mahout.clustering.lda.LDAMapper.java
License: Apache License

@Override
protected void map(WritableComparable<?> key, VectorWritable wordCountsWritable, Context context)
        throws IOException, InterruptedException {
    Vector wordCounts = wordCountsWritable.get();
    LDAInference.InferredDocument doc;
    try {
        doc = infer.infer(wordCounts);
    } catch (ArrayIndexOutOfBoundsException e1) {
        throw new IllegalStateException("This is probably because the --numWords argument is set too small. \n"
                + "\tIt needs to be >= the number of words (terms, actually) in the corpus and can be \n"
                + "\tlarger if some storage inefficiency can be tolerated.", e1);
    }
    double[] logTotals = new double[state.getNumTopics()];
    Arrays.fill(logTotals, Double.NEGATIVE_INFINITY);

    // Output sufficient statistics for each word. == pseudo-log counts.
    DoubleWritable v = new DoubleWritable();
    for (Iterator<Vector.Element> iter = wordCounts.iterateNonZero(); iter.hasNext();) {
        Vector.Element e = iter.next();
        int w = e.index();
        for (int k = 0; k < state.getNumTopics(); ++k) {
            v.set(doc.phi(k, w) + Math.log(e.get()));
            IntPairWritable kw = new IntPairWritable(k, w);
            // output (topic, word)'s logProb contribution
            context.write(kw, v);
            logTotals[k] = LDAUtil.logSum(logTotals[k], v.get());
        }
    }

    // Output the totals for the statistics. This is to make normalizing a lot easier.
    for (int k = 0; k < state.getNumTopics(); ++k) {
        IntPairWritable kw = new IntPairWritable(k, LDADriver.TOPIC_SUM_KEY);
        v.set(logTotals[k]);
        assert !Double.isNaN(v.get());
        context.write(kw, v);
    }

    // Output log-likelihoods.
    IntPairWritable llk = new IntPairWritable(LDADriver.LOG_LIKELIHOOD_KEY, LDADriver.LOG_LIKELIHOOD_KEY);
    v.set(doc.getLogLikelihood());
    context.write(llk, v);
}
From source file: org.apache.mahout.clustering.lda.LDAReducer.java
License: Apache License

@Override
public void reduce(IntPairWritable topicWord, Iterable<DoubleWritable> values, Context context)
        throws java.io.IOException, InterruptedException {
    // sum likelihoods
    if (topicWord.getSecond() == LDADriver.LOG_LIKELIHOOD_KEY) {
        double accum = 0.0;
        for (DoubleWritable vw : values) {
            double v = vw.get();
            if (Double.isNaN(v)) {
                throw new IllegalArgumentException(topicWord.getFirst() + " " + topicWord.getSecond());
            }
            accum += v;
        }
        context.write(topicWord, new DoubleWritable(accum));
    } else {
        // log sum sufficient statistics.
        double accum = Double.NEGATIVE_INFINITY;
        for (DoubleWritable vw : values) {
            double v = vw.get();
            if (Double.isNaN(v)) {
                throw new IllegalArgumentException(topicWord.getFirst() + " " + topicWord.getSecond());
            }
            accum = LDAUtil.logSum(accum, v);
            if (Double.isNaN(accum)) {
                throw new IllegalArgumentException(topicWord.getFirst() + " " + topicWord.getSecond());
            }
        }
        context.write(topicWord, new DoubleWritable(accum));
    }
}
From source file: org.apache.mahout.clustering.lda.LDAWordTopicMapper.java
License: Apache License

@Override
protected void map(WritableComparable<?> key, VectorWritable wordCountsWritable, Context context)
        throws IOException, InterruptedException {
    Vector wordCounts = wordCountsWritable.get();
    LDAInference.InferredDocument doc;
    try {
        doc = infer.infer(wordCounts);
    } catch (ArrayIndexOutOfBoundsException e1) {
        throw new IllegalStateException("This is probably because the --numWords argument is set too small. \n"
                + "\tIt needs to be >= the number of words (terms, actually) in the corpus and can be \n"
                + "\tlarger if some storage inefficiency can be tolerated.", e1);
    }
    double[] logTotals = new double[state.getNumTopics()];
    Arrays.fill(logTotals, Double.NEGATIVE_INFINITY);

    // Output sufficient statistics for each word. == pseudo-log counts.
    DoubleWritable v = new DoubleWritable();
    for (Iterator<Vector.Element> iter = wordCounts.iterateNonZero(); iter.hasNext();) {
        Vector.Element e = iter.next();
        int w = e.index();
        for (int k = 0; k < state.getNumTopics(); ++k) {
            v.set(doc.phi(k, w) + Math.log(e.get()));
            IntPairWritable kw = new IntPairWritable(k, w);
            // output (topic, word)'s logProb contribution
            context.write(kw, v);
            logTotals[k] = LDAUtil.logSum(logTotals[k], v.get());
        }
    }

    // Output the totals for the statistics. This is to make normalizing a lot easier.
    for (int k = 0; k < state.getNumTopics(); ++k) {
        IntPairWritable kw = new IntPairWritable(k, LDADriver.TOPIC_SUM_KEY);
        v.set(logTotals[k]);
        assert !Double.isNaN(v.get());
        context.write(kw, v);
    }

    // Output log-likelihoods.
    IntPairWritable llk = new IntPairWritable(LDADriver.LOG_LIKELIHOOD_KEY, LDADriver.LOG_LIKELIHOOD_KEY);
    v.set(doc.getLogLikelihood());
    context.write(llk, v);
}
From source file: org.apache.mahout.ga.watchmaker.OutputUtils.java
License: Apache License

/**
 * Reads back the evaluations.
 *
 * @param outpath output <code>Path</code>
 * @param evaluations list of evaluations
 */
public static void importEvaluations(FileSystem fs, Configuration conf, Path outpath, List<Double> evaluations)
        throws IOException {
    Sorter sorter = new Sorter(fs, LongWritable.class, DoubleWritable.class, conf);

    // merge and sort the outputs
    Path[] outfiles = listOutputFiles(fs, outpath);
    Path output = new Path(outpath, "output.sorted");
    sorter.merge(outfiles, output);

    // import the evaluations
    LongWritable key = new LongWritable();
    DoubleWritable value = new DoubleWritable();
    Reader reader = new Reader(fs, output, conf);
    try {
        while (reader.next(key, value)) {
            evaluations.add(value.get());
        }
    } finally {
        reader.close();
    }
}
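A hypothetical call site for this helper (the output path is illustrative, not from the source):

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
List<Double> evaluations = new ArrayList<Double>();
OutputUtils.importEvaluations(fs, conf, new Path("ga/output"), evaluations);
// evaluations now holds the unwrapped double fitness values, in sorted key order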
From source file: org.apache.mahout.math.hadoop.stats.StandardDeviationCalculatorReducer.java
License: Apache License

@Override
protected void reduce(IntWritable key, Iterable<DoubleWritable> values, Context context)
        throws IOException, InterruptedException {
    double sum = 0.0;
    for (DoubleWritable value : values) {
        sum += value.get();
    }
    context.write(key, new DoubleWritable(sum));
}
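Because plain summation is associative and commutative, a reducer of this shape can usually double as a combiner. A hypothetical driver fragment (the job name is an assumption; Job.getInstance is the newer-API form):

Job job = Job.getInstance(conf, "std-dev-sums");
job.setReducerClass(StandardDeviationCalculatorReducer.class);
job.setCombinerClass(StandardDeviationCalculatorReducer.class); // safe because sum is associative
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(DoubleWritable.class);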