List of usage examples for org.apache.hadoop.mapreduce.Job#setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
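Before the source-file examples below, here is a minimal self-contained sketch of the canonical pattern: a word-count job that reuses its sum reducer as the combiner. This is a sketch, not code from any of the projects below; it assumes Hadoop 2.x APIs and uses only the stock helper classes TokenCounterMapper and IntSumReducer, with placeholder input/output path arguments.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class WordCountWithCombiner {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCountWithCombiner.class);
        job.setMapperClass(TokenCounterMapper.class);
        // IntSumReducer is associative and commutative, so it can safely
        // double as the combiner. setCombinerClass must be called before
        // the job is submitted, otherwise it throws IllegalStateException.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // placeholder input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // placeholder output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

A combiner's input and output key/value types must both match the map output types. That is why the classes passed to setCombinerClass in the examples below (IntSumReducer, MergeVectorsCombiner, VectorSumReducer, and so on) are all written with identical input and output types.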
From source file:org.apache.mahout.graph.linkanalysis.RandomWalk.java
License:Apache License
@Override
public final int run(String[] args) throws Exception {
    addOutputOption();
    addOption("vertices", null, "a text file containing all vertices of the graph (one per line)", true);
    addOption("edges", null, "edges of the graph", true);
    addOption("numIterations", "it", "number of numIterations", String.valueOf(10));
    addOption("stayingProbability", "tp", "probability not to teleport to a random vertex", String.valueOf(0.85));
    addSpecificOptions();
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    evaluateSpecificOptions();

    int numIterations = Integer.parseInt(getOption("numIterations"));
    double stayingProbability = Double.parseDouble(getOption("stayingProbability"));

    Preconditions.checkArgument(numIterations > 0);
    Preconditions.checkArgument(stayingProbability > 0.0 && stayingProbability <= 1.0);

    Path adjacencyMatrixPath = getTempPath(AdjacencyMatrixJob.ADJACENCY_MATRIX);
    Path transitionMatrixPath = getTempPath("transitionMatrix");
    Path vertexIndexPath = getTempPath(AdjacencyMatrixJob.VERTEX_INDEX);
    Path numVerticesPath = getTempPath(AdjacencyMatrixJob.NUM_VERTICES);

    /* create the adjacency matrix */
    ToolRunner.run(getConf(), new AdjacencyMatrixJob(), new String[] { "--vertices", getOption("vertices"),
        "--edges", getOption("edges"), "--output", getTempPath().toString() });

    int numVertices = HadoopUtil.readInt(numVerticesPath, getConf());
    Preconditions.checkArgument(numVertices > 0);

    /* transpose and stochastify the adjacency matrix to create the transition matrix */
    Job createTransitionMatrix = prepareJob(adjacencyMatrixPath, transitionMatrixPath, TransposeMapper.class,
        IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
        VectorWritable.class);
    createTransitionMatrix.setCombinerClass(MergeVectorsCombiner.class);
    createTransitionMatrix.getConfiguration().set(NUM_VERTICES_PARAM, String.valueOf(numVertices));
    createTransitionMatrix.getConfiguration().set(STAYING_PROBABILITY_PARAM, String.valueOf(stayingProbability));

    boolean succeeded = createTransitionMatrix.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    DistributedRowMatrix transitionMatrix = new DistributedRowMatrix(transitionMatrixPath, getTempPath(),
        numVertices, numVertices);
    transitionMatrix.setConf(getConf());

    Vector ranking = new DenseVector(numVertices).assign(1.0 / numVertices);
    Vector dampingVector = createDampingVector(numVertices, stayingProbability);

    /* power method: iterative transition-matrix times ranking-vector multiplication */
    while (numIterations-- > 0) {
        ranking = transitionMatrix.times(ranking).plus(dampingVector);
    }

    persistVector(getConf(), getTempPath(RANK_VECTOR), ranking);

    Job vertexWithPageRank = prepareJob(vertexIndexPath, getOutputPath(), SequenceFileInputFormat.class,
        RankPerVertexMapper.class, LongWritable.class, DoubleWritable.class, TextOutputFormat.class);
    vertexWithPageRank.getConfiguration().set(RankPerVertexMapper.RANK_PATH_PARAM,
        getTempPath(RANK_VECTOR).toString());

    succeeded = vertexWithPageRank.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    return 0;
}
From source file:org.apache.mahout.graph.preprocessing.AdjacencyMatrixJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addOption("vertexIndex", "vi", "vertex index as created by GraphUtils.indexVertices()", true);
    addOption("edges", "e", "edges of the graph", true);
    addOption("numVertices", "nv", "number of vertices in the graph", true);
    addOption("stayingProbability", "sp", "probability not to teleport to another vertex", String.valueOf(1));
    addOption("substochastify", "st", "substochastify the adjacency matrix?", String.valueOf(false));
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);

    Path vertexIndex = new Path(parsedArgs.get("--vertexIndex"));
    Path edges = new Path(parsedArgs.get("--edges"));
    int numVertices = Integer.parseInt(parsedArgs.get("--numVertices"));
    double stayingProbability = Double.parseDouble(parsedArgs.get("--stayingProbability"));
    boolean stochastify = Boolean.parseBoolean(parsedArgs.get("--substochastify"));

    Preconditions.checkArgument(numVertices > 0);
    Preconditions.checkArgument(stayingProbability > 0 && stayingProbability <= 1);

    Job createTransposedAdjacencyMatrix = prepareJob(edges, getTempPath(TRANSPOSED_ADJACENCY_MATRIX),
        VectorizeEdgesMapper.class, IntWritable.class, VectorWritable.class,
        SubstochastifyingVectorSumReducer.class, IntWritable.class, VectorWritable.class);
    createTransposedAdjacencyMatrix.setCombinerClass(VectorSumReducer.class);
    Configuration createAdjacencyMatrixConf = createTransposedAdjacencyMatrix.getConfiguration();
    createAdjacencyMatrixConf.set(NUM_VERTICES_PARAM, String.valueOf(numVertices));
    createAdjacencyMatrixConf.set(VERTEX_INDEX_PARAM, vertexIndex.toString());
    createAdjacencyMatrixConf.set(STAYING_PROBABILITY_PARAM, String.valueOf(stayingProbability));
    createAdjacencyMatrixConf.set(STOCHASTIFY_PARAM, String.valueOf(stochastify));
    createTransposedAdjacencyMatrix.waitForCompletion(true);

    Job transposeTransposedAdjacencyMatrix = prepareJob(getTempPath(TRANSPOSED_ADJACENCY_MATRIX), getOutputPath(),
        TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
        IntWritable.class, VectorWritable.class);
    transposeTransposedAdjacencyMatrix.setCombinerClass(MergeVectorsCombiner.class);
    transposeTransposedAdjacencyMatrix.getConfiguration().set(NUM_VERTICES_PARAM, String.valueOf(numVertices));
    transposeTransposedAdjacencyMatrix.waitForCompletion(true);

    return 0;
}
From source file:org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
    addOption("similarityClassname", "s",
        "Name of distributed similarity class to instantiate, alternatively use "
            + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerRow", "m",
        "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
        String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
    addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?", String.valueOf(false));
    addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
    addOption("maxObservationsPerRow", null, "sample rows down to this number of entries",
        String.valueOf(DEFAULT_MAX_OBSERVATIONS_PER_ROW));
    addOption("maxObservationsPerColumn", null, "sample columns down to this number of entries",
        String.valueOf(DEFAULT_MAX_OBSERVATIONS_PER_COLUMN));
    addOption("randomSeed", null, "use this seed for sampling", false);
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns;
    if (hasOption("numberOfColumns")) {
        // Number of columns explicitly specified via CLI
        numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
    } else {
        // else get the number of columns by determining the cardinality of a vector in the input matrix
        numberOfColumns = getDimensions(getInputPath());
    }

    String similarityClassnameArg = getOption("similarityClassname");
    String similarityClassname;
    try {
        similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname();
    } catch (IllegalArgumentException iae) {
        similarityClassname = similarityClassnameArg;
    }

    // Clear the output and temp paths if the overwrite option has been set
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        // Clear the temp path
        HadoopUtil.delete(getConf(), getTempPath());
        // Clear the output path
        HadoopUtil.delete(getConf(), getOutputPath());
    }

    int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow"));
    boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity"));
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD;
    long randomSeed = hasOption("randomSeed") ? Long.parseLong(getOption("randomSeed")) : NO_FIXED_RANDOM_SEED;
    int maxObservationsPerRow = Integer.parseInt(getOption("maxObservationsPerRow"));
    int maxObservationsPerColumn = Integer.parseInt(getOption("maxObservationsPerColumn"));

    Path weightsPath = getTempPath("weights");
    Path normsPath = getTempPath("norms.bin");
    Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin");
    Path maxValuesPath = getTempPath("maxValues.bin");
    Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity");
    Path observationsPerColumnPath = getTempPath("observationsPerColumn.bin");

    AtomicInteger currentPhase = new AtomicInteger();

    Job countObservations = prepareJob(getInputPath(), getTempPath("notUsed"), CountObservationsMapper.class,
        NullWritable.class, VectorWritable.class, SumObservationsReducer.class, NullWritable.class,
        VectorWritable.class);
    countObservations.setCombinerClass(VectorSumCombiner.class);
    countObservations.getConfiguration().set(OBSERVATIONS_PER_COLUMN_PATH, observationsPerColumnPath.toString());
    countObservations.setNumReduceTasks(1);
    countObservations.waitForCompletion(true);

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job normsAndTranspose = prepareJob(getInputPath(), weightsPath, VectorNormMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
        normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
        Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
        normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
        normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
        normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
        normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
        normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
        normsAndTransposeConf.set(OBSERVATIONS_PER_COLUMN_PATH, observationsPerColumnPath.toString());
        normsAndTransposeConf.set(MAX_OBSERVATIONS_PER_ROW, String.valueOf(maxObservationsPerRow));
        normsAndTransposeConf.set(MAX_OBSERVATIONS_PER_COLUMN, String.valueOf(maxObservationsPerColumn));
        normsAndTransposeConf.set(RANDOM_SEED, String.valueOf(randomSeed));
        boolean succeeded = normsAndTranspose.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, CooccurrencesMapper.class,
            IntWritable.class, VectorWritable.class, SimilarityReducer.class, IntWritable.class,
            VectorWritable.class);
        pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
        pairwiseConf.set(NORMS_PATH, normsPath.toString());
        pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
        pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
        pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
        boolean succeeded = pairwiseSimilarity.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, getOutputPath(), UnsymmetrifyMapper.class,
            IntWritable.class, VectorWritable.class, MergeToTopKSimilaritiesReducer.class, IntWritable.class,
            VectorWritable.class);
        asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        boolean succeeded = asMatrix.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }
    return 0;
}
From source file:org.apache.mahout.math.hadoop.stats.BasicStats.java
License:Apache License
private static VarianceTotals computeVarianceTotals(Path input, Path output, Configuration baseConf)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");
    Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
        StandardDeviationCalculatorMapper.class, IntWritable.class, DoubleWritable.class,
        StandardDeviationCalculatorReducer.class, IntWritable.class, DoubleWritable.class,
        SequenceFileOutputFormat.class, conf);
    HadoopUtil.delete(conf, output);
    // the reducer just sums partial values per key, so it can double as the combiner
    job.setCombinerClass(StandardDeviationCalculatorReducer.class);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // Now extract the computed sum
    Path filesPattern = new Path(output, "part-*");
    double sumOfSquares = 0;
    double sum = 0;
    double totalCount = 0;
    for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(filesPattern,
        PathType.GLOB, null, null, true, conf)) {
        int key = ((IntWritable) record.getFirst()).get();
        if (key == StandardDeviationCalculatorMapper.SUM_OF_SQUARES.get()) {
            sumOfSquares += ((DoubleWritable) record.getSecond()).get();
        } else if (key == StandardDeviationCalculatorMapper.TOTAL_COUNT.get()) {
            totalCount += ((DoubleWritable) record.getSecond()).get();
        } else if (key == StandardDeviationCalculatorMapper.SUM.get()) {
            sum += ((DoubleWritable) record.getSecond()).get();
        }
    }

    VarianceTotals varianceTotals = new VarianceTotals();
    varianceTotals.setSum(sum);
    varianceTotals.setSumOfSquares(sumOfSquares);
    varianceTotals.setTotalCount(totalCount);

    return varianceTotals;
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.ABtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputAPaths, Path inputBtGlob, Path outputPath, int aBlockRows,
        int minSplitSize, int k, int p, int outerProdBlockHeight, int numReduceTasks, boolean broadcastBInput)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    // MultipleOutputs
    //   .addNamedOutput(oldApiJob,
    //                   QJob.OUTPUT_QHAT,
    //                   org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    //                   SplitPartitionedWritable.class,
    //                   DenseBlockWritable.class);
    //
    // MultipleOutputs
    //   .addNamedOutput(oldApiJob,
    //                   QJob.OUTPUT_RHAT,
    //                   org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    //                   SplitPartitionedWritable.class,
    //                   VectorWritable.class);

    Job job = new Job(oldApiJob);
    job.setJobName("ABt-job");
    job.setJarByClass(ABtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputAPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(ABtMapper.class);
    job.setCombinerClass(BtJob.OuterProductCombiner.class);
    job.setReducerClass(QRReducer.class);

    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, outerProdBlockHeight);
    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());

    // number of reduce tasks doesn't matter. we don't actually
    // send anything to reducers.
    job.setNumReduceTasks(numReduceTasks);

    // broadcast Bt files if required.
    if (broadcastBInput) {
        job.getConfiguration().set(PROP_BT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(inputBtGlob);
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * new api is not enabled yet in our dependencies at this time, still
                 * using deprecated one
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), conf);
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("ABt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.BtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPathA, Path inputPathQJob, Path xiPath, Path outputPath,
        int minSplitSize, int k, int p, int btBlockHeight, int numReduceTasks, boolean broadcast,
        Class<? extends Writable> labelClass, boolean outputBBtProducts)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_Q, org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
        labelClass, VectorWritable.class);

    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_BBT,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
        /*
         * MAHOUT-1067: if we are asked to output BBT products then named vector
         * names should be propagated to Q too so that UJob could pick them up
         * from there.
         */
        oldApiJob.setBoolean(PROP_NV, true);
    }
    if (xiPath != null) {
        // compute pca-related stuff as well
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SQ,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SB,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
    }

    /*
     * HACK: we use old api multiple outputs since they are not available in the
     * new api of either 0.20.2 or 0.20.203 but wrap it into a new api job so we
     * can use new api interfaces.
     */
    Job job = new Job(oldApiJob);
    job.setJobName("Bt-job");
    job.setJarByClass(BtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathA);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);

    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(QJob.PROP_P, p);
    job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
    job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);

    job.setNumReduceTasks(numReduceTasks);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
    }

    /*
     * we can broadcast Rhat files since all of them are required by each job,
     * but not Q files which correspond to splits of A (so each split of A will
     * require only a particular Q file, each time a different one).
     */
    if (broadcast) {
        job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * new api is not enabled yet in our dependencies at this time, still
                 * using deprecated one
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Bt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.TimesSquaredJob.java
License:Apache License
public static Job createTimesSquaredJob(Configuration initialConf, Vector v, int outputVectorDim,
        Path matrixInputPath, Path outputVectorPathBase, Class<? extends TimesSquaredMapper> mapClass,
        Class<? extends VectorSummingReducer> redClass) throws IOException {

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

    long now = System.nanoTime();
    Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);

    SequenceFile.Writer inputVectorPathWriter = null;
    try {
        inputVectorPathWriter = new SequenceFile.Writer(fs, initialConf, inputVectorPath, NullWritable.class,
            VectorWritable.class);
        inputVectorPathWriter.append(NullWritable.get(), new VectorWritable(v));
    } finally {
        Closeables.close(inputVectorPathWriter, false);
    }

    URI ivpURI = inputVectorPath.toUri();
    DistributedCache.setCacheFiles(new URI[] { ivpURI }, initialConf);

    Job job = HadoopUtil.prepareJob(matrixInputPath, new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME),
        SequenceFileInputFormat.class, mapClass, NullWritable.class, VectorWritable.class, redClass,
        NullWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, initialConf);
    job.setCombinerClass(redClass);
    job.setJobName("TimesSquaredJob: " + matrixInputPath);

    Configuration conf = job.getConfiguration();
    conf.set(INPUT_VECTOR, ivpURI.toString());
    conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
    conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);

    return job;
}
From source file:org.apache.mahout.math.hadoop.TransposeJob.java
License:Apache License
public static Job buildTransposeJob(Configuration initialConf, Path matrixInputPath, Path matrixOutputPath,
        int numInputRows) throws IOException {
    Job job = HadoopUtil.prepareJob(matrixInputPath, matrixOutputPath, SequenceFileInputFormat.class,
        TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
        IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, initialConf);
    job.setCombinerClass(MergeVectorsCombiner.class);
    job.getConfiguration().setInt(TransposeMapper.NEW_NUM_COLS_PARAM, numInputRows);
    job.setJobName("TransposeJob: " + matrixInputPath);
    return job;
}
From source file:org.apache.mahout.math.stats.entropy.ConditionalEntropy.java
License:Apache License
/**
 * Groups and counts by key and value.
 * SQL-like: SELECT key, value, COUNT(*) FROM x GROUP BY key, value
 */
private void groupAndCountByKeyAndValue() throws IOException, ClassNotFoundException, InterruptedException {

    Job job = prepareJob(getInputPath(), keyValueCountPath, SequenceFileInputFormat.class,
        GroupAndCountByKeyAndValueMapper.class, StringTuple.class, VarIntWritable.class, VarIntSumReducer.class,
        StringTuple.class, VarIntWritable.class, SequenceFileOutputFormat.class);
    job.setCombinerClass(VarIntSumReducer.class);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    numberItems = job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
        .getValue();
}