List of usage examples for org.apache.hadoop.mapreduce.Job#setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
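Before the source-file examples below, here is a minimal self-contained sketch of the canonical pattern: a word-count job that reuses its sum reducer as the combiner. This is a sketch, not code from any of the projects below; it assumes Hadoop 2.x APIs and uses only the stock helper classes TokenCounterMapper and IntSumReducer, with placeholder input/output path arguments.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class WordCountWithCombiner {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCountWithCombiner.class);
        job.setMapperClass(TokenCounterMapper.class);
        // IntSumReducer is associative and commutative, so it can safely
        // double as the combiner. setCombinerClass must be called before
        // the job is submitted, otherwise it throws IllegalStateException.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // placeholder input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // placeholder output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

A combiner's input and output key/value types must both match the map output types. That is why the classes passed to setCombinerClass in the examples below (IntSumReducer, MergeVectorsCombiner, VectorSumReducer, and so on) are all written with identical input and output types.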
From source file:org.apache.mahout.graph.linkanalysis.RandomWalk.java
License:Apache License
@Override
public final int run(String[] args) throws Exception {
    addOutputOption();
    addOption("vertices", null, "a text file containing all vertices of the graph (one per line)", true);
    addOption("edges", null, "edges of the graph", true);
    addOption("numIterations", "it", "number of numIterations", String.valueOf(10));
    addOption("stayingProbability", "tp", "probability not to teleport to a random vertex", String.valueOf(0.85));
    addSpecificOptions();
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    evaluateSpecificOptions();

    int numIterations = Integer.parseInt(getOption("numIterations"));
    double stayingProbability = Double.parseDouble(getOption("stayingProbability"));

    Preconditions.checkArgument(numIterations > 0);
    Preconditions.checkArgument(stayingProbability > 0.0 && stayingProbability <= 1.0);

    Path adjacencyMatrixPath = getTempPath(AdjacencyMatrixJob.ADJACENCY_MATRIX);
    Path transitionMatrixPath = getTempPath("transitionMatrix");
    Path vertexIndexPath = getTempPath(AdjacencyMatrixJob.VERTEX_INDEX);
    Path numVerticesPath = getTempPath(AdjacencyMatrixJob.NUM_VERTICES);

    /* create the adjacency matrix */
    ToolRunner.run(getConf(), new AdjacencyMatrixJob(), new String[] { "--vertices", getOption("vertices"),
        "--edges", getOption("edges"), "--output", getTempPath().toString() });

    int numVertices = HadoopUtil.readInt(numVerticesPath, getConf());
    Preconditions.checkArgument(numVertices > 0);

    /* transpose and stochastify the adjacency matrix to create the transition matrix */
    Job createTransitionMatrix = prepareJob(adjacencyMatrixPath, transitionMatrixPath, TransposeMapper.class,
        IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
        VectorWritable.class);
    createTransitionMatrix.setCombinerClass(MergeVectorsCombiner.class);
    createTransitionMatrix.getConfiguration().set(NUM_VERTICES_PARAM, String.valueOf(numVertices));
    createTransitionMatrix.getConfiguration().set(STAYING_PROBABILITY_PARAM, String.valueOf(stayingProbability));

    boolean succeeded = createTransitionMatrix.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    DistributedRowMatrix transitionMatrix = new DistributedRowMatrix(transitionMatrixPath, getTempPath(),
        numVertices, numVertices);
    transitionMatrix.setConf(getConf());

    Vector ranking = new DenseVector(numVertices).assign(1.0 / numVertices);
    Vector dampingVector = createDampingVector(numVertices, stayingProbability);

    /* power method: iterative transition-matrix times ranking-vector multiplication */
    while (numIterations-- > 0) {
        ranking = transitionMatrix.times(ranking).plus(dampingVector);
    }

    persistVector(getConf(), getTempPath(RANK_VECTOR), ranking);

    Job vertexWithPageRank = prepareJob(vertexIndexPath, getOutputPath(), SequenceFileInputFormat.class,
        RankPerVertexMapper.class, LongWritable.class, DoubleWritable.class, TextOutputFormat.class);
    vertexWithPageRank.getConfiguration().set(RankPerVertexMapper.RANK_PATH_PARAM,
        getTempPath(RANK_VECTOR).toString());

    succeeded = vertexWithPageRank.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    return 0;
}
From source file:org.apache.mahout.graph.preprocessing.AdjacencyMatrixJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addOption("vertexIndex", "vi", "vertex index as created by GraphUtils.indexVertices()", true);
    addOption("edges", "e", "edges of the graph", true);
    addOption("numVertices", "nv", "number of vertices in the graph", true);
    addOption("stayingProbability", "sp", "probability not to teleport to another vertex", String.valueOf(1));
    addOption("substochastify", "st", "substochastify the adjacency matrix?", String.valueOf(false));
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);

    Path vertexIndex = new Path(parsedArgs.get("--vertexIndex"));
    Path edges = new Path(parsedArgs.get("--edges"));
    int numVertices = Integer.parseInt(parsedArgs.get("--numVertices"));
    double stayingProbability = Double.parseDouble(parsedArgs.get("--stayingProbability"));
    boolean stochastify = Boolean.parseBoolean(parsedArgs.get("--substochastify"));

    Preconditions.checkArgument(numVertices > 0);
    Preconditions.checkArgument(stayingProbability > 0 && stayingProbability <= 1);

    Job createTransposedAdjacencyMatrix = prepareJob(edges, getTempPath(TRANSPOSED_ADJACENCY_MATRIX),
        VectorizeEdgesMapper.class, IntWritable.class, VectorWritable.class,
        SubstochastifyingVectorSumReducer.class, IntWritable.class, VectorWritable.class);
    createTransposedAdjacencyMatrix.setCombinerClass(VectorSumReducer.class);
    Configuration createAdjacencyMatrixConf = createTransposedAdjacencyMatrix.getConfiguration();
    createAdjacencyMatrixConf.set(NUM_VERTICES_PARAM, String.valueOf(numVertices));
    createAdjacencyMatrixConf.set(VERTEX_INDEX_PARAM, vertexIndex.toString());
    createAdjacencyMatrixConf.set(STAYING_PROBABILITY_PARAM, String.valueOf(stayingProbability));
    createAdjacencyMatrixConf.set(STOCHASTIFY_PARAM, String.valueOf(stochastify));
    createTransposedAdjacencyMatrix.waitForCompletion(true);

    Job transposeTransposedAdjacencyMatrix = prepareJob(getTempPath(TRANSPOSED_ADJACENCY_MATRIX), getOutputPath(),
        TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
        IntWritable.class, VectorWritable.class);
    transposeTransposedAdjacencyMatrix.setCombinerClass(MergeVectorsCombiner.class);
    transposeTransposedAdjacencyMatrix.getConfiguration().set(NUM_VERTICES_PARAM, String.valueOf(numVertices));
    transposeTransposedAdjacencyMatrix.waitForCompletion(true);

    return 0;
}
From source file:org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
    addOption("similarityClassname", "s",
        "Name of distributed similarity class to instantiate, alternatively use "
            + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerRow", "m",
        "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
        String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
    addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?", String.valueOf(false));
    addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
    addOption("maxObservationsPerRow", null, "sample rows down to this number of entries",
        String.valueOf(DEFAULT_MAX_OBSERVATIONS_PER_ROW));
    addOption("maxObservationsPerColumn", null, "sample columns down to this number of entries",
        String.valueOf(DEFAULT_MAX_OBSERVATIONS_PER_COLUMN));
    addOption("randomSeed", null, "use this seed for sampling", false);
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns;
    if (hasOption("numberOfColumns")) {
        // Number of columns explicitly specified via CLI
        numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
    } else {
        // else get the number of columns by determining the cardinality of a vector in the input matrix
        numberOfColumns = getDimensions(getInputPath());
    }

    String similarityClassnameArg = getOption("similarityClassname");
    String similarityClassname;
    try {
        similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname();
    } catch (IllegalArgumentException iae) {
        similarityClassname = similarityClassnameArg;
    }

    // Clear the output and temp paths if the overwrite option has been set
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        // Clear the temp path
        HadoopUtil.delete(getConf(), getTempPath());
        // Clear the output path
        HadoopUtil.delete(getConf(), getOutputPath());
    }

    int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow"));
    boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity"));
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD;
    long randomSeed = hasOption("randomSeed") ? Long.parseLong(getOption("randomSeed")) : NO_FIXED_RANDOM_SEED;
    int maxObservationsPerRow = Integer.parseInt(getOption("maxObservationsPerRow"));
    int maxObservationsPerColumn = Integer.parseInt(getOption("maxObservationsPerColumn"));

    Path weightsPath = getTempPath("weights");
    Path normsPath = getTempPath("norms.bin");
    Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin");
    Path maxValuesPath = getTempPath("maxValues.bin");
    Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity");
    Path observationsPerColumnPath = getTempPath("observationsPerColumn.bin");

    AtomicInteger currentPhase = new AtomicInteger();

    Job countObservations = prepareJob(getInputPath(), getTempPath("notUsed"), CountObservationsMapper.class,
        NullWritable.class, VectorWritable.class, SumObservationsReducer.class, NullWritable.class,
        VectorWritable.class);
    countObservations.setCombinerClass(VectorSumCombiner.class);
    countObservations.getConfiguration().set(OBSERVATIONS_PER_COLUMN_PATH, observationsPerColumnPath.toString());
    countObservations.setNumReduceTasks(1);
    countObservations.waitForCompletion(true);

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job normsAndTranspose = prepareJob(getInputPath(), weightsPath, VectorNormMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
        normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
        Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
        normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
        normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
        normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
        normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
        normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
        normsAndTransposeConf.set(OBSERVATIONS_PER_COLUMN_PATH, observationsPerColumnPath.toString());
        normsAndTransposeConf.set(MAX_OBSERVATIONS_PER_ROW, String.valueOf(maxObservationsPerRow));
        normsAndTransposeConf.set(MAX_OBSERVATIONS_PER_COLUMN, String.valueOf(maxObservationsPerColumn));
        normsAndTransposeConf.set(RANDOM_SEED, String.valueOf(randomSeed));
        boolean succeeded = normsAndTranspose.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, CooccurrencesMapper.class,
            IntWritable.class, VectorWritable.class, SimilarityReducer.class, IntWritable.class,
            VectorWritable.class);
        pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
        pairwiseConf.set(NORMS_PATH, normsPath.toString());
        pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
        pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
        pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
        boolean succeeded = pairwiseSimilarity.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, getOutputPath(), UnsymmetrifyMapper.class,
            IntWritable.class, VectorWritable.class, MergeToTopKSimilaritiesReducer.class, IntWritable.class,
            VectorWritable.class);
        asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        boolean succeeded = asMatrix.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }
    return 0;
}
From source file:org.apache.mahout.math.hadoop.stats.BasicStats.java
License:Apache License
private static VarianceTotals computeVarianceTotals(Path input, Path output, Configuration baseConf)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");
    Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
        StandardDeviationCalculatorMapper.class, IntWritable.class, DoubleWritable.class,
        StandardDeviationCalculatorReducer.class, IntWritable.class, DoubleWritable.class,
        SequenceFileOutputFormat.class, conf);
    HadoopUtil.delete(conf, output);
    // the reducer just sums partial values per key, so it can double as the combiner
    job.setCombinerClass(StandardDeviationCalculatorReducer.class);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // Now extract the computed sum
    Path filesPattern = new Path(output, "part-*");
    double sumOfSquares = 0;
    double sum = 0;
    double totalCount = 0;
    for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(filesPattern,
        PathType.GLOB, null, null, true, conf)) {
        int key = ((IntWritable) record.getFirst()).get();
        if (key == StandardDeviationCalculatorMapper.SUM_OF_SQUARES.get()) {
            sumOfSquares += ((DoubleWritable) record.getSecond()).get();
        } else if (key == StandardDeviationCalculatorMapper.TOTAL_COUNT.get()) {
            totalCount += ((DoubleWritable) record.getSecond()).get();
        } else if (key == StandardDeviationCalculatorMapper.SUM.get()) {
            sum += ((DoubleWritable) record.getSecond()).get();
        }
    }

    VarianceTotals varianceTotals = new VarianceTotals();
    varianceTotals.setSum(sum);
    varianceTotals.setSumOfSquares(sumOfSquares);
    varianceTotals.setTotalCount(totalCount);

    return varianceTotals;
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.ABtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputAPaths, Path inputBtGlob, Path outputPath, int aBlockRows,
        int minSplitSize, int k, int p, int outerProdBlockHeight, int numReduceTasks, boolean broadcastBInput)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    // MultipleOutputs
    //   .addNamedOutput(oldApiJob,
    //                   QJob.OUTPUT_QHAT,
    //                   org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    //                   SplitPartitionedWritable.class,
    //                   DenseBlockWritable.class);
    //
    // MultipleOutputs
    //   .addNamedOutput(oldApiJob,
    //                   QJob.OUTPUT_RHAT,
    //                   org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    //                   SplitPartitionedWritable.class,
    //                   VectorWritable.class);

    Job job = new Job(oldApiJob);
    job.setJobName("ABt-job");
    job.setJarByClass(ABtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputAPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(ABtMapper.class);
    job.setCombinerClass(BtJob.OuterProductCombiner.class);
    job.setReducerClass(QRReducer.class);

    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, outerProdBlockHeight);
    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());

    // number of reduce tasks doesn't matter. we don't actually
    // send anything to reducers.
    job.setNumReduceTasks(numReduceTasks);

    // broadcast Bt files if required.
    if (broadcastBInput) {
        job.getConfiguration().set(PROP_BT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(inputBtGlob);
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * new api is not enabled yet in our dependencies at this time, still
                 * using deprecated one
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), conf);
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("ABt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.BtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPathA, Path inputPathQJob, Path xiPath, Path outputPath,
        int minSplitSize, int k, int p, int btBlockHeight, int numReduceTasks, boolean broadcast,
        Class<? extends Writable> labelClass, boolean outputBBtProducts)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_Q, org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
        labelClass, VectorWritable.class);

    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_BBT,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
        /*
         * MAHOUT-1067: if we are asked to output BBT products then named vector
         * names should be propagated to Q too so that UJob could pick them up
         * from there.
         */
        oldApiJob.setBoolean(PROP_NV, true);
    }
    if (xiPath != null) {
        // compute pca-related stuff as well
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SQ,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SB,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
    }

    /*
     * HACK: we use old api multiple outputs since they are not available in the
     * new api of either 0.20.2 or 0.20.203 but wrap it into a new api job so we
     * can use new api interfaces.
     */
    Job job = new Job(oldApiJob);
    job.setJobName("Bt-job");
    job.setJarByClass(BtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathA);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);

    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(QJob.PROP_P, p);
    job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
    job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);

    job.setNumReduceTasks(numReduceTasks);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
    }

    /*
     * we can broadcast Rhat files since all of them are required by each job,
     * but not Q files which correspond to splits of A (so each split of A will
     * require only a particular Q file, each time a different one).
     */
    if (broadcast) {
        job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * new api is not enabled yet in our dependencies at this time, still
                 * using deprecated one
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Bt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.TimesSquaredJob.java
License:Apache License
public static Job createTimesSquaredJob(Configuration initialConf, Vector v, int outputVectorDim,
        Path matrixInputPath, Path outputVectorPathBase, Class<? extends TimesSquaredMapper> mapClass,
        Class<? extends VectorSummingReducer> redClass) throws IOException {

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

    long now = System.nanoTime();
    Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);

    SequenceFile.Writer inputVectorPathWriter = null;
    try {
        inputVectorPathWriter = new SequenceFile.Writer(fs, initialConf, inputVectorPath, NullWritable.class,
            VectorWritable.class);
        inputVectorPathWriter.append(NullWritable.get(), new VectorWritable(v));
    } finally {
        Closeables.close(inputVectorPathWriter, false);
    }

    URI ivpURI = inputVectorPath.toUri();
    DistributedCache.setCacheFiles(new URI[] { ivpURI }, initialConf);

    Job job = HadoopUtil.prepareJob(matrixInputPath, new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME),
        SequenceFileInputFormat.class, mapClass, NullWritable.class, VectorWritable.class, redClass,
        NullWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, initialConf);
    job.setCombinerClass(redClass);
    job.setJobName("TimesSquaredJob: " + matrixInputPath);

    Configuration conf = job.getConfiguration();
    conf.set(INPUT_VECTOR, ivpURI.toString());
    conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
    conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);

    return job;
}
From source file:org.apache.mahout.math.hadoop.TransposeJob.java
License:Apache License
public static Job buildTransposeJob(Configuration initialConf, Path matrixInputPath, Path matrixOutputPath,
        int numInputRows) throws IOException {
    Job job = HadoopUtil.prepareJob(matrixInputPath, matrixOutputPath, SequenceFileInputFormat.class,
        TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
        IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, initialConf);
    job.setCombinerClass(MergeVectorsCombiner.class);
    job.getConfiguration().setInt(TransposeMapper.NEW_NUM_COLS_PARAM, numInputRows);
    job.setJobName("TransposeJob: " + matrixInputPath);
    return job;
}
From source file:org.apache.mahout.math.stats.entropy.ConditionalEntropy.java
License:Apache License
/**
 * Groups and counts by key and value.
 * SQL-like: SELECT key, value, COUNT(*) FROM x GROUP BY key, value
 */
private void groupAndCountByKeyAndValue() throws IOException, ClassNotFoundException, InterruptedException {

    Job job = prepareJob(getInputPath(), keyValueCountPath, SequenceFileInputFormat.class,
        GroupAndCountByKeyAndValueMapper.class, StringTuple.class, VarIntWritable.class, VarIntSumReducer.class,
        StringTuple.class, VarIntWritable.class, SequenceFileOutputFormat.class);
    job.setCombinerClass(VarIntSumReducer.class);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    numberItems = job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
        .getValue();
}