Example usage for the org.apache.hadoop.io.DoubleWritable constructor DoubleWritable()

Introduction

This page collects example usages of the no-argument constructor of org.apache.hadoop.io.DoubleWritable.

Prototype

public DoubleWritable()

Source Link

Usage

From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License:Apache License

/**
 * Reads the per-label document counts from every sequence file matching {@code pathPattern}.
 *
 * <p>Only records whose key's first string equals {@code BayesConstants.LABEL_COUNT} are used;
 * the key's second string becomes the map key and the record value becomes the count. Other
 * records are skipped.
 *
 * @param fs file system used to expand the glob and to open each file
 * @param pathPattern glob pattern selecting the sequence files to read
 * @param conf configuration handed to each {@code SequenceFile.Reader}
 * @return map from label to document count; empty if no matching record is found
 * @throws IOException if a file cannot be opened, read or closed
 */
public static Map<String, Double> readLabelDocumentCounts(FileSystem fs, Path pathPattern, Configuration conf)
        throws IOException {
    Map<String, Double> labelDocumentCounts = new HashMap<String, Double>();
    // The key and value holders are reused for every record of every file.
    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            // the key is either _label_ or label,feature
            while (reader.next(key, value)) {
                // Count of Documents in a Label
                if (key.stringAt(0).equals(BayesConstants.LABEL_COUNT)) {
                    labelDocumentCounts.put(key.stringAt(1), value.get());
                }
            }
        } finally {
            // Release the file handle even when reading fails part-way through.
            reader.close();
        }
    }

    return labelDocumentCounts;
}

From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License:Apache License

/**
 * Reads the value stored under {@code BayesConstants.TOTAL_SUM} from the sequence files matching
 * {@code pathPattern}.
 *
 * <p>If several records carry that key, the last one read wins.
 *
 * @param fs file system used to expand the glob and to open each file
 * @param pathPattern glob pattern selecting the sequence files to read
 * @param conf configuration handed to each {@code SequenceFile.Reader}
 * @return the value of the last {@code TOTAL_SUM} record read
 * @throws IOException if a file cannot be opened, read or closed
 * @throws NullPointerException if no {@code TOTAL_SUM} record exists (the missing map entry is
 *         unboxed to {@code double})
 */
public static double readSigmaJSigmaK(FileSystem fs, Path pathPattern, Configuration conf) throws IOException {
    Map<String, Double> weightSum = new HashMap<String, Double>();
    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            while (reader.next(key, value)) {
                // NOTE(review): only the TOTAL_SUM key is ever stored, so the map never holds more
                // than one entry and this guard looks unreachable - confirm the intended check.
                if (weightSum.size() > 1) {
                    throw new IOException("Incorrect Sum File");
                } else if (key.stringAt(0).equals(BayesConstants.TOTAL_SUM)) {
                    weightSum.put(BayesConstants.TOTAL_SUM, value.get());
                }
            }
        } finally {
            // Release the file handle even when reading fails part-way through.
            reader.close();
        }
    }

    return weightSum.get(BayesConstants.TOTAL_SUM);
}

From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java

License:Apache License

/**
 * Reads the value stored under {@code BayesConstants.FEATURE_SET_SIZE} from the sequence files
 * matching {@code pathPattern}.
 *
 * <p>If several records carry that key, the last one read wins.
 *
 * @param fs file system used to expand the glob and to open each file
 * @param pathPattern glob pattern selecting the sequence files to read
 * @param conf configuration handed to each {@code SequenceFile.Reader}
 * @return the value of the last {@code FEATURE_SET_SIZE} record read
 * @throws IOException if a file cannot be opened, read or closed
 * @throws NullPointerException if no {@code FEATURE_SET_SIZE} record exists (the missing map
 *         entry is unboxed to {@code double})
 */
public static double readVocabCount(FileSystem fs, Path pathPattern, Configuration conf) throws IOException {
    Map<String, Double> weightSum = new HashMap<String, Double>();
    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();

    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            while (reader.next(key, value)) {
                // NOTE(review): only the FEATURE_SET_SIZE key is ever stored, so the map never holds
                // more than one entry and this guard looks unreachable - confirm the intended check.
                if (weightSum.size() > 1) {
                    throw new IOException("Incorrect vocabCount File");
                }
                if (key.stringAt(0).equals(BayesConstants.FEATURE_SET_SIZE)) {
                    weightSum.put(BayesConstants.FEATURE_SET_SIZE, value.get());
                }
            }
        } finally {
            // Release the file handle even when reading fails part-way through.
            reader.close();
        }
    }

    return weightSum.get(BayesConstants.FEATURE_SET_SIZE);
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver.java

License:Apache License

/**
 * Builds a {@code ConfusionMatrix} from the classifier output stored in the sequence files
 * matching {@code pathPattern}.
 *
 * <p>For every record, the key's string at index 1 is used as the correct label, the string at
 * index 2 as the classified label, and the value (truncated to {@code int}) as the count. A later
 * record for the same label pair replaces the earlier count.
 *
 * @param fs file system used to expand the glob and to open each file
 * @param pathPattern glob pattern selecting the sequence files to read
 * @param conf configuration handed to each {@code SequenceFile.Reader}
 * @param params parameters; {@code "defaultCat"} supplies the matrix's default label
 * @return the populated confusion matrix
 * @throws IOException if a file cannot be opened, read or closed
 */
public static ConfusionMatrix readResult(FileSystem fs, Path pathPattern, Configuration conf, Parameters params)
        throws IOException {

    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();
    String defaultLabel = params.get("defaultCat");
    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    Map<String, Map<String, Integer>> confusionMatrix = new HashMap<String, Map<String, Integer>>();

    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            while (reader.next(key, value)) {
                String correctLabel = key.stringAt(1);
                String classifiedLabel = key.stringAt(2);
                Map<String, Integer> rowMatrix = confusionMatrix.get(correctLabel);
                if (rowMatrix == null) {
                    // First record for this correct label: start a new row.
                    rowMatrix = new HashMap<String, Integer>();
                    confusionMatrix.put(correctLabel, rowMatrix);
                }
                Integer count = Double.valueOf(value.get()).intValue();
                rowMatrix.put(classifiedLabel, count);
            }
        } finally {
            // Release the file handle even when reading fails part-way through.
            reader.close();
        }
    }

    ConfusionMatrix matrix = new ConfusionMatrix(confusionMatrix.keySet(), defaultLabel);
    for (Map.Entry<String, Map<String, Integer>> correctLabelSet : confusionMatrix.entrySet()) {
        Map<String, Integer> rowMatrix = correctLabelSet.getValue();
        for (Map.Entry<String, Integer> classifiedLabelSet : rowMatrix.entrySet()) {
            matrix.addInstance(correctLabelSet.getKey(), classifiedLabelSet.getKey());
            matrix.putCount(correctLabelSet.getKey(), classifiedLabelSet.getKey(),
                    classifiedLabelSet.getValue());
        }
    }
    return matrix;

}

From source file:org.apache.mahout.clustering.lda.LDADriver.java

License:Apache License

/**
 * Rebuilds an {@code LDAState} from the {@code part-*} sequence files in the directory named by
 * the {@code STATE_IN_KEY} configuration entry.
 *
 * <p>Each record is keyed by a (topic, word) pair and is interpreted as follows:
 * <ul>
 *   <li>word equal to {@code TOPIC_SUM_KEY}: the value is stored as that topic's log total;</li>
 *   <li>otherwise, topic equal to {@code LOG_LIKELIHOOD_KEY}: the value is the log-likelihood;</li>
 *   <li>otherwise: the value is stored in the topic/word matrix.</li>
 * </ul>
 *
 * @param job configuration supplying the state path, topic count, word count and topic smoothing
 * @return the state assembled from the files
 * @throws IOException if a file cannot be opened, read or closed
 * @throws IllegalArgumentException if a value is infinite, an index is negative, or a matrix cell
 *         that already holds a non-zero value is written again
 */
static LDAState createState(Configuration job) throws IOException {
    String statePath = job.get(STATE_IN_KEY);
    int numTopics = Integer.parseInt(job.get(NUM_TOPICS_KEY));
    int numWords = Integer.parseInt(job.get(NUM_WORDS_KEY));
    double topicSmoothing = Double.parseDouble(job.get(TOPIC_SMOOTHING_KEY));

    Path dir = new Path(statePath);
    FileSystem fs = dir.getFileSystem(job);

    DenseMatrix pWgT = new DenseMatrix(numTopics, numWords);
    double[] logTotals = new double[numTopics];
    double ll = 0.0;

    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = status.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
        try {
            while (reader.next(key, value)) {
                int topic = key.getFirst();
                int word = key.getSecond();
                if (word == TOPIC_SUM_KEY) {
                    logTotals[topic] = value.get();
                    if (Double.isInfinite(value.get())) {
                        throw new IllegalArgumentException();
                    }
                } else if (topic == LOG_LIKELIHOOD_KEY) {
                    ll = value.get();
                } else {
                    if (!((topic >= 0) && (word >= 0))) {
                        throw new IllegalArgumentException(topic + " " + word);
                    }
                    // A non-zero cell means this (topic, word) pair was already seen.
                    if (pWgT.getQuick(topic, word) != 0.0) {
                        throw new IllegalArgumentException();
                    }
                    pWgT.setQuick(topic, word, value.get());
                    if (Double.isInfinite(pWgT.getQuick(topic, word))) {
                        throw new IllegalArgumentException();
                    }
                }
            }
        } finally {
            // The validation above can throw mid-file; close the reader on every path.
            reader.close();
        }
    }

    return new LDAState(numTopics, numWords, topicSmoothing, pWgT, logTotals, ll);
}

From source file:org.apache.mahout.clustering.lda.LDADriver.java

License:Apache License

/**
 * Writes a random initial state: one {@code part-<k>} sequence file per topic under
 * {@code statePath}.
 *
 * <p>For each topic {@code k}, every word {@code w} gets a record keyed {@code (k, w)} holding
 * the log of a random pseudo-count. A final record keyed {@code (k, TOPIC_SUM_KEY)} holds the log
 * of the sum of those pseudo-counts.
 *
 * @param statePath directory that receives the part files
 * @param numTopics number of topics, i.e. number of files written
 * @param numWords number of word records written per topic
 * @throws IOException if a file cannot be created, written or closed
 */
private void writeInitialState(Path statePath, int numTopics, int numWords) throws IOException {
    Configuration job = new Configuration();
    FileSystem fs = statePath.getFileSystem(job);

    // One value holder is reused for every appended record.
    DoubleWritable v = new DoubleWritable();

    Random random = RandomUtils.getRandom();

    for (int k = 0; k < numTopics; ++k) {
        Path path = new Path(statePath, "part-" + k);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, IntPairWritable.class,
                DoubleWritable.class);
        try {
            double total = 0.0; // total number of pseudo counts we made
            for (int w = 0; w < numWords; ++w) {
                IntPairWritable kw = new IntPairWritable(k, w);
                // A small amount of random noise, minimized by having a floor.
                double pseudocount = random.nextDouble() + 1.0E-8;
                total += pseudocount;
                v.set(Math.log(pseudocount));
                writer.append(kw, v);
            }
            IntPairWritable kTsk = new IntPairWritable(k, TOPIC_SUM_KEY);
            v.set(Math.log(total));
            writer.append(kTsk, v);
        } finally {
            // Close the writer even when an append fails, so the handle is not leaked.
            writer.close();
        }
    }
}

From source file:org.apache.mahout.clustering.lda.LDADriver.java

License:Apache License

/**
 * Looks up the log-likelihood stored in the {@code part-*} sequence files under
 * {@code statePath}.
 *
 * <p>In each file, scanning stops at the first record whose key's first component equals
 * {@code LOG_LIKELIHOOD_KEY}. The remaining files are still scanned, so the value from the last
 * file that has such a record is the one returned.
 *
 * @param statePath directory containing the part files
 * @param job configuration used to resolve the file system and open the readers
 * @return the log-likelihood found, or {@code 0.0} if no file contains one
 * @throws IOException if a file cannot be opened, read or closed
 */
private double findLL(Path statePath, Configuration job) throws IOException {
    FileSystem fs = statePath.getFileSystem(job);

    double ll = 0.0;

    IntPairWritable key = new IntPairWritable();
    DoubleWritable value = new DoubleWritable();
    for (FileStatus status : fs.globStatus(new Path(statePath, "part-*"))) {
        Path path = status.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
        try {
            while (reader.next(key, value)) {
                if (key.getFirst() == LOG_LIKELIHOOD_KEY) {
                    ll = value.get();
                    break; // leaves this file only; the outer loop moves on to the next one
                }
            }
        } finally {
            // Close the reader even when reading fails part-way through.
            reader.close();
        }
    }

    return ll;
}

From source file:org.apache.mahout.clustering.lda.LDAMapper.java

License:Apache License

/**
 * Runs inference on one document's word-count vector and emits its statistics.
 *
 * <p>Three kinds of records are written, all sharing one reusable value holder:
 * a {@code (topic, word)} record for every non-zero word and every topic, a
 * {@code (topic, TOPIC_SUM_KEY)} record with each topic's log-sum, and one
 * {@code (LOG_LIKELIHOOD_KEY, LOG_LIKELIHOOD_KEY)} record with the document's log-likelihood.
 *
 * @param key input key; not read by this method
 * @param wordCountsWritable wrapper around the document's word-count vector
 * @param context sink for the emitted records
 * @throws IllegalStateException if inference fails with an {@code ArrayIndexOutOfBoundsException}
 */
@Override
protected void map(WritableComparable<?> key, VectorWritable wordCountsWritable, Context context)
        throws IOException, InterruptedException {
    Vector counts = wordCountsWritable.get();

    LDAInference.InferredDocument inferred;
    try {
        inferred = infer.infer(counts);
    } catch (ArrayIndexOutOfBoundsException e1) {
        throw new IllegalStateException("This is probably because the --numWords argument is set too small.  \n"
                + "\tIt needs to be >= than the number of words (terms actually) in the corpus and can be \n"
                + "\tlarger if some storage inefficiency can be tolerated.", e1);
    }

    // Per-topic running log-sums, starting from negative infinity.
    double[] topicLogSums = new double[state.getNumTopics()];
    Arrays.fill(topicLogSums, Double.NEGATIVE_INFINITY);

    // Sufficient statistics (pseudo-log counts) for every non-zero word.
    DoubleWritable out = new DoubleWritable();
    Iterator<Vector.Element> nonZeros = counts.iterateNonZero();
    while (nonZeros.hasNext()) {
        Vector.Element element = nonZeros.next();
        int wordIndex = element.index();

        for (int topic = 0; topic < state.getNumTopics(); ++topic) {
            double phi = inferred.phi(topic, wordIndex);
            double logCount = Math.log(element.get());
            out.set(phi + logCount);

            // Emit the log-probability contribution of this (topic, word) pair.
            IntPairWritable topicWordKey = new IntPairWritable(topic, wordIndex);
            context.write(topicWordKey, out);

            topicLogSums[topic] = LDAUtil.logSum(topicLogSums[topic], out.get());
        }
    }

    // Emit the per-topic totals; they make normalizing much easier.
    for (int topic = 0; topic < state.getNumTopics(); ++topic) {
        IntPairWritable topicSumKey = new IntPairWritable(topic, LDADriver.TOPIC_SUM_KEY);
        out.set(topicLogSums[topic]);
        assert !Double.isNaN(out.get());
        context.write(topicSumKey, out);
    }

    // Emit the document's log-likelihood.
    IntPairWritable likelihoodKey = new IntPairWritable(LDADriver.LOG_LIKELIHOOD_KEY,
            LDADriver.LOG_LIKELIHOOD_KEY);
    out.set(inferred.getLogLikelihood());
    context.write(likelihoodKey, out);
}

From source file:org.apache.mahout.clustering.lda.LDAWordTopicMapper.java

License:Apache License

/**
 * Infers the topic structure of one document and writes out its statistics.
 *
 * <p>Output, in order: one {@code (topic, word)} record per non-zero word and topic, one
 * {@code (topic, TOPIC_SUM_KEY)} record per topic carrying the accumulated log-sum, and a single
 * {@code (LOG_LIKELIHOOD_KEY, LOG_LIKELIHOOD_KEY)} record carrying the document log-likelihood.
 * The same {@code DoubleWritable} instance is reused for every value written.
 *
 * @param key input key; not read by this method
 * @param wordCountsWritable wrapper around the document's word-count vector
 * @param context sink for the emitted records
 * @throws IllegalStateException if inference fails with an {@code ArrayIndexOutOfBoundsException}
 */
@Override
protected void map(WritableComparable<?> key, VectorWritable wordCountsWritable, Context context)
        throws IOException, InterruptedException {
    Vector documentCounts = wordCountsWritable.get();

    LDAInference.InferredDocument inference;
    try {
        inference = infer.infer(documentCounts);
    } catch (ArrayIndexOutOfBoundsException e1) {
        throw new IllegalStateException("This is probably because the --numWords argument is set too small.  \n"
                + "\tIt needs to be >= than the number of words (terms actually) in the corpus and can be \n"
                + "\tlarger if some storage inefficiency can be tolerated.", e1);
    }

    // Log-sum accumulator per topic; negative infinity is the starting value.
    double[] sums = new double[state.getNumTopics()];
    Arrays.fill(sums, Double.NEGATIVE_INFINITY);

    // Pseudo-log counts: the sufficient statistics for each non-zero word.
    DoubleWritable holder = new DoubleWritable();
    Iterator<Vector.Element> cursor = documentCounts.iterateNonZero();
    while (cursor.hasNext()) {
        Vector.Element entry = cursor.next();
        int wordId = entry.index();

        for (int topicId = 0; topicId < state.getNumTopics(); ++topicId) {
            double topicWeight = inference.phi(topicId, wordId);
            double logWordCount = Math.log(entry.get());
            holder.set(topicWeight + logWordCount);

            // Write the (topic, word) log-probability contribution.
            IntPairWritable pairKey = new IntPairWritable(topicId, wordId);
            context.write(pairKey, holder);

            sums[topicId] = LDAUtil.logSum(sums[topicId], holder.get());
        }
    }

    // Write the per-topic totals, which simplify later normalization.
    for (int topicId = 0; topicId < state.getNumTopics(); ++topicId) {
        IntPairWritable totalKey = new IntPairWritable(topicId, LDADriver.TOPIC_SUM_KEY);
        holder.set(sums[topicId]);
        assert !Double.isNaN(holder.get());
        context.write(totalKey, holder);
    }

    // Write the log-likelihood of this document.
    IntPairWritable logLikelihoodKey = new IntPairWritable(LDADriver.LOG_LIKELIHOOD_KEY,
            LDADriver.LOG_LIKELIHOOD_KEY);
    holder.set(inference.getLogLikelihood());
    context.write(logLikelihoodKey, holder);
}

From source file:org.apache.mahout.ga.watchmaker.OutputUtils.java

License:Apache License

/**
 * Merges the job's output files into one sorted file and appends every evaluation value found in
 * it to {@code evaluations}.
 *
 * <p>The merged file is written to {@code output.sorted} under {@code outpath}.
 *
 * @param fs
 *          file system holding the output files
 * @param conf
 *          configuration used by the sorter and the reader
 * @param outpath
 *          output <code>Path</code>
 * @param evaluations
 *          list that receives the evaluations, in the order they are read
 * @throws IOException
 *           if merging or reading fails
 */
public static void importEvaluations(FileSystem fs, Configuration conf, Path outpath, List<Double> evaluations)
        throws IOException {
    Sorter merger = new Sorter(fs, LongWritable.class, DoubleWritable.class, conf);

    // Combine all part files into a single sorted file.
    Path[] parts = listOutputFiles(fs, outpath);
    Path merged = new Path(outpath, "output.sorted");
    merger.merge(parts, merged);

    // Read the merged file back, collecting each value.
    LongWritable index = new LongWritable();
    DoubleWritable evaluation = new DoubleWritable();
    Reader in = new Reader(fs, merged, conf);
    try {
        while (true) {
            if (!in.next(index, evaluation)) {
                break;
            }
            evaluations.add(evaluation.get());
        }
    } finally {
        in.close();
    }
}