List of usage examples for org.apache.hadoop.mapreduce.Job#getCounters()
public Counters getCounters() throws IOException
From source file:org.apache.mahout.graph.components.FindKTrussesJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();//from www . j a v a 2 s. c om addOutputOption(); addOption("k", "k", "The k parameter of the k-trusses to find."); Map<String, String> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } Path inputPath = getInputPath(); Path outputPath = getOutputPath(); Path tempDirPath = new Path(parsedArgs.get("--tempDir")); int k = Integer.parseInt(parsedArgs.get("--k")); // extract parameter AtomicInteger currentPhase = new AtomicInteger(); Configuration conf = new Configuration(); Path simplifyInputPath = inputPath; Path simplifyOutputPath = new Path(tempDirPath, String.valueOf(System.currentTimeMillis())); if (shouldRunNextPhase(parsedArgs, currentPhase)) { /* * Simplify the graph first */ SimplifyGraphJob simplifyGraphJob = new SimplifyGraphJob(); simplifyGraphJob.setConf(conf); simplifyGraphJob.run(new String[] { "--input", simplifyInputPath.toString(), "--output", simplifyOutputPath.toString(), "--tempDir", tempDirPath.toString() }); } Path currentTrussesDirPath = simplifyOutputPath; if (shouldRunNextPhase(parsedArgs, currentPhase)) { while (true) { /* * Augment the simplified graph with degrees */ // scatter the edges to each of the vertices and count degree Path augmentInputPath = currentTrussesDirPath; Path augmentOutputPath = new Path(tempDirPath, "augment" + String.valueOf(System.currentTimeMillis())); AugmentGraphWithDegreesJob augmentGraphWithDegreesJob = new AugmentGraphWithDegreesJob(); augmentGraphWithDegreesJob.setConf(conf); augmentGraphWithDegreesJob.run(new String[] { "--input", augmentInputPath.toString(), "--output", augmentOutputPath.toString(), "--tempDir", new Path(tempDirPath, String.valueOf(System.currentTimeMillis())).toString(), }); /* * Enumerate triangles in the graph */ Path enumerateInputPath = augmentOutputPath; // scatter the edges to lower degree vertex and build open triads Path enumerateOutputPath = new Path(tempDirPath, "enumerate" + 
String.valueOf(System.currentTimeMillis())); EnumerateTrianglesJob enumerateTrianglesJob = new EnumerateTrianglesJob(); enumerateTrianglesJob.setConf(conf); enumerateTrianglesJob.run(new String[] { "--input", enumerateInputPath.toString(), "--output", enumerateOutputPath.toString(), "--tempDir", new Path(tempDirPath, String.valueOf(System.currentTimeMillis())).toString(), }); /* * Drop edges with insufficient support */ Path checkSupportInputPath = enumerateOutputPath; Path checkSupportOutputPath = new Path(tempDirPath, "support" + String.valueOf(System.currentTimeMillis())); Job checkTrianglesForSupport = prepareJob(checkSupportInputPath, checkSupportOutputPath, SequenceFileInputFormat.class, SplitTrianglesToEdgesMapper.class, UndirectedEdge.class, IntWritable.class, DropUnsupportedEdgesReducer.class, UndirectedEdge.class, NullWritable.class, SequenceFileOutputFormat.class); checkTrianglesForSupport.setCombinerClass(IntSumReducer.class); checkTrianglesForSupport.getConfiguration().setInt(K, k); checkTrianglesForSupport.waitForCompletion(true); currentTrussesDirPath = checkSupportOutputPath; long droppedEdges = checkTrianglesForSupport.getCounters().findCounter(Counter.DROPPED_EDGES) .getValue(); log.info("{} edges were dropped", droppedEdges); if (droppedEdges == 0L) { break; } } } Path componentsInputPath = new Path(tempDirPath, "converted" + String.valueOf(System.currentTimeMillis())); if (shouldRunNextPhase(parsedArgs, currentPhase)) { /* * Prepare the input for FindComponents */ Job convertFromat = prepareJob(currentTrussesDirPath, componentsInputPath, SequenceFileInputFormat.class, PrepareInputMapper.class, Vertex.class, FlaggedVertex.class, Reducer.class, Vertex.class, FlaggedVertex.class, SequenceFileOutputFormat.class); convertFromat.waitForCompletion(true); } if (shouldRunNextPhase(parsedArgs, currentPhase)) { /* * Find the components of the remaining graph */ FindComponentsJob componentsJob = new FindComponentsJob(); componentsJob.setConf(conf); 
componentsJob.run(new String[] { "--input", componentsInputPath.toString(), "--output", outputPath.toString(), "--tempDir", tempDirPath.toString(), }); } return 0; }
From source file:org.apache.mahout.math.stats.entropy.ConditionalEntropy.java
License:Apache License
/**
 * Groups and counts by key and value.
 * SQL-like: SELECT key, value, COUNT(*) FROM x GROUP BY key, value
 *
 * <p>After a successful run, the job's MAP_INPUT_RECORDS counter is stored in
 * {@code numberItems}.
 *
 * @throws IllegalStateException if the job does not complete successfully
 */
private void groupAndCountByKeyAndValue() throws IOException, ClassNotFoundException, InterruptedException {
    Job countingJob = prepareJob(
        getInputPath(),
        keyValueCountPath,
        SequenceFileInputFormat.class,
        GroupAndCountByKeyAndValueMapper.class,
        StringTuple.class,
        VarIntWritable.class,
        VarIntSumReducer.class,
        StringTuple.class,
        VarIntWritable.class,
        SequenceFileOutputFormat.class);
    countingJob.setCombinerClass(VarIntSumReducer.class);

    if (!countingJob.waitForCompletion(true)) {
        throw new IllegalStateException("Job failed!");
    }

    // The number of map input records is the total number of items seen.
    numberItems = countingJob
        .getCounters()
        .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
        .getValue();
}
From source file:org.apache.mahout.math.stats.entropy.Entropy.java
License:Apache License
/** * Groups the items and counts the occur for each of them. * SQL-like: SELECT item, COUNT(*) FROM x GROUP BY item * * @throws IOException/*from ww w . j a v a 2 s .c o m*/ * @throws ClassNotFoundException * @throws InterruptedException */ private void groupAndCount() throws IOException, ClassNotFoundException, InterruptedException { Class<? extends Mapper> mapper = "key".equals(source) ? KeyCounterMapper.class : ValueCounterMapper.class; Job job = prepareJob(getInputPath(), tempPath, SequenceFileInputFormat.class, mapper, Text.class, VarIntWritable.class, VarIntSumReducer.class, Text.class, VarIntWritable.class, SequenceFileOutputFormat.class); job.setCombinerClass(VarIntSumReducer.class); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { throw new IllegalStateException("Job failed!"); } numberItems = job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS") .getValue(); }
From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java
License:Apache License
/**
 * pass1: generate collocations, ngrams
 *
 * @param input        input path of the job
 * @param output       base output path; the job writes below {@code SUBGRAM_OUTPUT_DIRECTORY} in it
 * @param baseConf     configuration that is copied before the job settings are applied
 * @param emitUnigrams value stored under {@code EMIT_UNIGRAMS}
 * @param maxNGramSize value stored under {@code CollocMapper.MAX_SHINGLE_SIZE}
 * @param reduceTasks  number of reduce tasks for the job
 * @param minSupport   value stored under {@code CollocReducer.MIN_SUPPORT}
 * @return the value of the {@code CollocMapper.Count.NGRAM_TOTAL} counter of the finished job
 * @throws IllegalStateException if the job does not complete successfully
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf,
        boolean emitUnigrams, int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    // Settings must be applied before the Job is created, since the Job is built from this copy.
    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    // Do not report an n-gram total taken from a failed job.
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
From source file:org.apache.mahout.vectorizer.collocations.llr.CollocDriver.java
License:Apache License
/** * pass1: generate collocations, ngrams/* ww w .java 2 s .c o m*/ */ private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams, int maxNGramSize, int reduceTasks, int minSupport) throws IOException, ClassNotFoundException, InterruptedException { Configuration con = new Configuration(baseConf); con.setBoolean(EMIT_UNIGRAMS, emitUnigrams); con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize); con.setInt(CollocReducer.MIN_SUPPORT, minSupport); Job job = new Job(con); job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input); job.setJarByClass(CollocDriver.class); job.setMapOutputKeyClass(GramKey.class); job.setMapOutputValueClass(Gram.class); job.setPartitionerClass(GramKeyPartitioner.class); job.setGroupingComparatorClass(GramKeyGroupComparator.class); job.setOutputKeyClass(Gram.class); job.setOutputValueClass(Gram.class); job.setCombinerClass(CollocCombiner.class); FileInputFormat.setInputPaths(job, input); Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY); FileOutputFormat.setOutputPath(job, outputPath); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(CollocMapper.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setReducerClass(CollocReducer.class); job.setNumReduceTasks(reduceTasks); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { throw new IllegalStateException("Job failed!"); } return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue(); }
From source file:org.apache.mrql.CrossProductOperation.java
License:Apache License
/** The CrossProduct physical operator (similar to block-nested loop) * @param mx left mapper/* w ww . ja v a 2 s . c om*/ * @param my right mapper * @param reduce_fnc reducer * @param acc_fnc optional accumulator function * @param zero optional the zero value for the accumulator * @param X the left source * @param Y the right source (stored in distributed cache) * @param stop_counter optional counter used in repeat operation * @return a new data source that contains the result */ public final static DataSet crossProduct(Tree mx, // left mapper Tree my, // right mapper Tree reduce_fnc, // reducer Tree acc_fnc, // optional accumulator function Tree zero, // optional the zero value for the accumulator DataSet X, // the left source DataSet Y, // the right source (stored in distributed cache) String stop_counter) // optional counter used in repeat operation throws Exception { DataSet ds = MapOperation.cMap(my, null, null, Y, "-"); conf = MapReduceEvaluator.clear_configuration(conf); String newpath = new_path(conf); conf.set("mrql.reducer", reduce_fnc.toString()); conf.set("mrql.mapper", mx.toString()); if (zero != null) { conf.set("mrql.accumulator", acc_fnc.toString()); conf.set("mrql.zero", zero.toString()); } else conf.set("mrql.zero", ""); conf.set("mrql.counter", stop_counter); setupSplits(new DataSet[] { X, Y }, conf); Job job = new Job(conf, newpath); distribute_compiled_arguments(job.getConfiguration()); job.setJarByClass(MapReducePlan.class); job.setOutputKeyClass(MRContainer.class); job.setOutputValueClass(MRContainer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); PathFilter pf = new PathFilter() { public boolean accept(Path path) { return !path.getName().startsWith("_"); } }; for (DataSource p : ds.source) { Path path = new Path(p.path); for (FileStatus s : path.getFileSystem(conf).listStatus(path, pf)) DistributedCache.addCacheFile(s.getPath().toUri(), job.getConfiguration()); } ; for (DataSource p : X.source) 
MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, crossProductMapper.class); FileOutputFormat.setOutputPath(job, new Path(newpath)); job.setNumReduceTasks(0); job.waitForCompletion(true); long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue(); return new DataSet(new BinaryDataSource(newpath, conf), c, outputRecords(job)); }
From source file:org.apache.mrql.GroupByJoinPlan.java
License:Apache License
/** the GroupByJoin operation: * an equi-join combined with a group-by implemented using hashing * @param left_join_key_fnc left join key function from a to k * @param right_join_key_fnc right join key function from b to k * @param left_groupby_fnc left group-by function from a to k1 * @param right_groupby_fnc right group-by function from b to k2 * @param accumulator_fnc accumulator function from (c,(a,b)) to c * @param zero the left zero of accumulator of type c * @param reduce_fnc reduce function from ((k1,k2),c) to d * @param X left data set of type {a} * @param Y right data set of type {b} * @param num_reducers number of reducers * @param n left dimension of the reducer grid * @param m right dimension of the reducer grid * @param stop_counter optional counter used in repeat operation * @return a DataSet that contains the result of type {d} */// ww w.ja v a2 s. co m public final static DataSet groupByJoin(Tree left_join_key_fnc, // left join key function Tree right_join_key_fnc, // right join key function Tree left_groupby_fnc, // left group-by function Tree right_groupby_fnc, // right group-by function Tree accumulator_fnc, // accumulator function Tree zero, // the left zero of accumulator Tree reduce_fnc, // reduce function DataSet X, // left data set DataSet Y, // right data set int num_reducers, // number of reducers int n, int m, // dimensions of the reducer grid String stop_counter) // optional counter used in repeat operation throws Exception { conf = MapReduceEvaluator.clear_configuration(conf); String newpath = new_path(conf); conf.set("mrql.join.key.left", left_join_key_fnc.toString()); conf.set("mrql.join.key.right", right_join_key_fnc.toString()); conf.set("mrql.groupby.left", left_groupby_fnc.toString()); conf.set("mrql.groupby.right", right_groupby_fnc.toString()); conf.setInt("mrql.m", m); conf.setInt("mrql.n", n); conf.set("mrql.accumulator", accumulator_fnc.toString()); conf.set("mrql.zero", zero.toString()); conf.set("mrql.reducer", 
reduce_fnc.toString()); conf.set("mrql.counter", stop_counter); setupSplits(new DataSet[] { X, Y }, conf); Job job = new Job(conf, newpath); distribute_compiled_arguments(job.getConfiguration()); job.setMapOutputKeyClass(GroupByJoinKey.class); job.setJarByClass(GroupByJoinPlan.class); job.setOutputKeyClass(MRContainer.class); job.setOutputValueClass(MRContainer.class); job.setPartitionerClass(GroupByJoinPartitioner.class); job.setSortComparatorClass(GroupByJoinSortComparator.class); job.setGroupingComparatorClass(GroupByJoinGroupingComparator.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(newpath)); for (DataSource p : X.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class); for (DataSource p : Y.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class); job.setReducerClass(JoinReducer.class); if (num_reducers > 0) job.setNumReduceTasks(num_reducers); job.waitForCompletion(true); long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue(); DataSource s = new BinaryDataSource(newpath, conf); s.to_be_merged = false; return new DataSet(s, c, MapReducePlan.outputRecords(job)); }
From source file:org.apache.mrql.JoinOperation.java
License:Apache License
/** The MapReduce2 physical operator (a reduce-side join) * @param mx left mapper function * @param my right mapper function * @param combine_fnc optional in-mapper combiner function * @param reduce_fnc reducer function * @param acc_fnc optional accumulator function * @param zero optional the zero value for the accumulator * @param X left data set/*w w w.j a v a 2 s.c o m*/ * @param Y right data set * @param num_reduces number of reducers * @param stop_counter optional counter used in repeat operation * @param orderp does the result need to be ordered? * @return a new data source that contains the result */ public final static DataSet mapReduce2(Tree mx, // left mapper function Tree my, // right mapper function Tree combine_fnc, // optional in-mapper combiner function Tree reduce_fnc, // reducer function Tree acc_fnc, // optional accumulator function Tree zero, // optional the zero value for the accumulator DataSet X, // left data set DataSet Y, // right data set int num_reduces, // number of reducers String stop_counter, // optional counter used in repeat operation boolean orderp) // does the result need to be ordered? 
throws Exception { conf = MapReduceEvaluator.clear_configuration(conf); String newpath = new_path(conf); conf.set("mrql.mapper.left", mx.toString()); conf.set("mrql.mapper.right", my.toString()); if (combine_fnc != null) conf.set("mrql.combiner", combine_fnc.toString()); conf.set("mrql.reducer", reduce_fnc.toString()); if (zero != null) { conf.set("mrql.accumulator", acc_fnc.toString()); conf.set("mrql.zero", zero.toString()); } else conf.set("mrql.zero", ""); conf.set("mrql.counter", stop_counter); setupSplits(new DataSet[] { X, Y }, conf); Job job = new Job(conf, newpath); distribute_compiled_arguments(job.getConfiguration()); job.setMapOutputKeyClass(JoinKey.class); job.setJarByClass(MapReducePlan.class); job.setOutputKeyClass(MRContainer.class); job.setOutputValueClass(MRContainer.class); job.setPartitionerClass(MRContainerJoinPartitioner.class); job.setSortComparatorClass(MRContainerSortComparator.class); job.setGroupingComparatorClass(MRContainerGroupingComparator.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(newpath)); for (DataSource p : X.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class); for (DataSource p : Y.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class); if (Config.trace && PlanGeneration.streamed_MapReduce2_reducer(reduce_fnc)) System.out.println("Streamed MapReduce2 reducer"); job.setReducerClass(JoinReducer.class); if (num_reduces > 0) job.setNumReduceTasks(num_reduces); job.waitForCompletion(true); long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue(); DataSource s = new BinaryDataSource(newpath, conf); s.to_be_merged = orderp; return new DataSet(s, c, outputRecords(job)); }
From source file:org.apache.mrql.MapJoinOperation.java
License:Apache License
/** The fragment-replicate join (map-side join) physical operator * @param probe_map_fnc left mapper function * @param built_map_fnc right mapper function * @param reduce_fnc reducer function * @param acc_fnc optional accumulator function * @param zero optional the zero value for the accumulator * @param probe_dataset the map source * @param built_dataset stored in distributed cache * @param stop_counter optional counter used in repeat operation * @return a new data source that contains the result *//*w w w . j a v a 2 s .c om*/ public final static DataSet mapJoin(Tree probe_map_fnc, // left mapper function Tree built_map_fnc, // right mapper function Tree reduce_fnc, // reducer function Tree acc_fnc, // optional accumulator function Tree zero, // optional the zero value for the accumulator DataSet probe_dataset, // the map source DataSet built_dataset, // stored in distributed cache String stop_counter) // optional counter used in repeat operation throws Exception { DataSet ds = MapOperation.cMap(built_map_fnc, null, null, built_dataset, "-"); conf = MapReduceEvaluator.clear_configuration(conf); String newpath = new_path(conf); conf.set("mrql.inMap.reducer", reduce_fnc.toString()); conf.set("mrql.probe_mapper", probe_map_fnc.toString()); conf.set("mrql.counter", stop_counter); if (zero != null) { conf.set("mrql.accumulator", acc_fnc.toString()); conf.set("mrql.zero", zero.toString()); } else conf.set("mrql.zero", ""); setupSplits(new DataSet[] { probe_dataset, built_dataset }, conf); Job job = new Job(conf, newpath); distribute_compiled_arguments(job.getConfiguration()); job.setJarByClass(MapReducePlan.class); job.setOutputKeyClass(MRContainer.class); job.setOutputValueClass(MRContainer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); PathFilter pf = new PathFilter() { public boolean accept(Path path) { return !path.getName().startsWith("_"); } }; for (DataSource p : ds.source) { // distribute the built dataset Path path = new Path(p.path); for 
(FileStatus s : path.getFileSystem(conf).listStatus(path, pf)) DistributedCache.addCacheFile(s.getPath().toUri(), job.getConfiguration()); } ; for (DataSource p : probe_dataset.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, mapJoinMapper.class); FileOutputFormat.setOutputPath(job, new Path(newpath)); job.setNumReduceTasks(0); job.waitForCompletion(true); long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue(); return new DataSet(new BinaryDataSource(newpath, conf), c, outputRecords(job)); }
From source file:org.apache.mrql.MapOperation.java
License:Apache License
/** The cMap physical operator * @param map_fnc mapper function//from w w w .j a v a 2 s.c om * @param acc_fnc optional accumulator function * @param zero optional the zero value for the accumulator * @param source input data source * @param stop_counter optional counter used in repeat operation * @return a new data source that contains the result */ public final static DataSet cMap(Tree map_fnc, // mapper function Tree acc_fnc, // optional accumulator function Tree zero, // optional the zero value for the accumulator DataSet source, // input data source String stop_counter) // optional counter used in repeat operation throws Exception { conf = MapReduceEvaluator.clear_configuration(conf); String newpath = new_path(conf); conf.set("mrql.mapper", map_fnc.toString()); conf.set("mrql.counter", stop_counter); if (zero != null) { conf.set("mrql.accumulator", acc_fnc.toString()); conf.set("mrql.zero", zero.toString()); } else conf.set("mrql.zero", ""); setupSplits(source, conf); Job job = new Job(conf, newpath); distribute_compiled_arguments(job.getConfiguration()); job.setJarByClass(MapReducePlan.class); job.setOutputKeyClass(MRContainer.class); job.setOutputValueClass(MRContainer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); for (DataSource p : source.source) MultipleInputs.addInputPath(job, new Path(p.path), (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, cMapMapper.class); FileOutputFormat.setOutputPath(job, new Path(newpath)); job.setNumReduceTasks(0); job.waitForCompletion(true); long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue(); return new DataSet(new BinaryDataSource(newpath, conf), c, outputRecords(job)); }