List of usage examples for org.apache.hadoop.mapred.JobConf.setCombinerClass
public void setCombinerClass(Class<? extends Reducer> theClass)
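The combiner runs on the map side (and possibly again during the merge) as a local pre-aggregation step, so the class passed to setCombinerClass must consume the mapper's output key/value types and emit those same types, and its logic must be safe to apply zero, one, or many times (associative and commutative, like a sum). A minimal sketch of the usual wiring in the old org.apache.hadoop.mapred API is shown below; the CombinerExample, Map, and Reduce class names are hypothetical placeholders, not taken from any of the source files listed on this page.

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class CombinerExample {

    // Mapper emits (word, 1) for every token in the input line.
    public static class Map extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            StringTokenizer tok = new StringTokenizer(value.toString());
            while (tok.hasMoreTokens()) {
                word.set(tok.nextToken());
                output.collect(word, ONE);
            }
        }
    }

    // Sum reducer; because summing is associative and commutative,
    // the same class can safely be registered as the combiner.
    public static class Reduce extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(CombinerExample.class);
        conf.setJobName("combiner-example");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class); // local pre-aggregation on the map side
        conf.setReducerClass(Reduce.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

Because the framework treats the combiner as an optional optimization, the job must still produce correct results if it never runs; that is why most of the examples below simply reuse their reducer class as the combiner.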
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCounts1.java
License:Apache License
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(DoubleSumReducer.class);
    conf.setReducerClass(DoubleSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* number of milliseconds before killing a not responding task */
    //conf.set("mapred.task.timeout", "600000");
    //conf.set("mapred.map.tasks.speculative.execution", "false");

    /* change to 128mb */
    //conf.set("dfs.block.size", "134217728");

    /*
     * use compression
     */
    /*
    conf.set("mapred.output.compress", "true");
    conf.set("mapred.map.output.compress", "true");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    */

    /* set the maximum number of task per node */
    int maptasks = 100;

    /* Number of map tasks to deploy on each machine. 0.5 to 2 * (cores/node) */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    /* The default number of map tasks per job. Typically set to a prime
       several times greater than number of available hosts. */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 80;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "4000");
    conf.set("mapred.job.reduce.memory.mb", "4000");
    conf.set("dfs.replication", "1");

    /*
     * reduce I/O load
     */
    conf.set("mapred.child.java.opts", "-Xmx1400M");
    conf.set("io.sort.mb", "300");
    conf.set("io.sort.factor", "30");

    JobClient.runJob(conf);
}
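DoubleSumReducer, used above as both combiner and reducer, is not shown in this listing. Assuming it simply sums the DoubleWritable weights seen for each key, which is what makes it safe to reuse as a combiner, it might look roughly like the following sketch (a hypothetical reconstruction, not the actual jobimtext source):

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical reconstruction: sums all double values observed for a key.
public class DoubleSumReducer extends MapReduceBase
        implements Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    public void reduce(Text key, Iterator<DoubleWritable> values,
            OutputCollector<Text, DoubleWritable> output, Reporter reporter) throws IOException {
        double sum = 0.0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        output.collect(key, new DoubleWritable(sum));
    }
}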
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCounts1WithFeatures.java
License:Apache License
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    /* set the new defined type to be used */
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    if (args.length > 3) {
        conf.setInt("threshold", Integer.parseInt(args[3]));
    }

    /* number of milliseconds before killing a not responding task */
    conf.set("mapred.task.timeout", "600000");

    /* change to 128mb */
    conf.set("dfs.block.size", "134217728");

    /* set the maximum number of task per node */
    int maptasks = 200;

    /*
     * Number of map tasks to deploy on each machine. 0.5 to 2 *
     * (cores/node)
     */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    /*
     * The default number of map tasks per job. Typically set to a prime
     * several times greater than number of available hosts.
     */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 20;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "4000");
    conf.set("mapred.job.reduce.memory.mb", "4000");
    conf.set("dfs.replication", "1");

    /*
     * reduce I/O load
     */
    conf.set("mapred.child.java.opts", "-Xmx1400M");
    conf.set("io.sort.mb", "300");
    conf.set("io.sort.factor", "30");

    JobClient.runJob(conf);
}
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCountsLog.java
License:Apache License
/**
 * The reducer step will sum all float values, i.e. the
 * weight for any (word1,word2) pair sharing a feature.
 */
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(FloatWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(DoubleSumReducer.class);
    conf.setReducerClass(DoubleSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* number of milliseconds before killing a not responding task */
    conf.set("mapred.task.timeout", "600000");

    /* change to 128mb */
    conf.set("dfs.block.size", "134217728");

    /* set the maximum number of task per node */
    int maptasks = 100;

    /* Number of map tasks to deploy on each machine. 0.5 to 2 * (cores/node) */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    /* The default number of map tasks per job. Typically set to a prime
       several times greater than number of available hosts. */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 100;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    JobClient.runJob(conf);
}
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.TotalWords.java
License:Apache License
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file:mapreduce.BigramCount.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    int mapTasks = 1; //Integer.parseInt(args[2]);
    int reduceTasks = 1; //Integer.parseInt(args[3]);

    sLogger.info("Tool: BigramCount");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(BigramCount.class);
    conf.setJobName("BigramCount");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    /**
     * Note that these must match the Class arguments given in the mapper
     */
    conf.setOutputKeyClass(WordPair.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);
    conf.setPartitionerClass(MyPartitioner.class);
    conf.setCombinerClass(MyReducer.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(outputDir.toUri(), conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:mapreduce2.SpeciesDriver.java
public static void main(String[] args) throws Exception {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(SpeciesDriver.class);
    conf.setJobName("Page-rank Species Graph Builder");

    final File f = new File(SpeciesDriver.class.getProtectionDomain().getCodeSource().getLocation().getPath());
    String inFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/InputFiles/species_medium.txt";
    String outFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outputFiles/Result";
    FileInputFormat.setInputPaths(conf, new Path(inFiles));
    FileOutputFormat.setOutputPath(conf, new Path(outFiles));

    //conf.setOutputKeyClass(Text.class);
    //conf.setOutputValueClass(Text.class);
    conf.setMapperClass(SpeciesGraphBuilderMapper.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    //conf.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    //conf.setOutputFormat(org.apache.hadoop.mapred.SequenceFileOutputFormat.class);

    conf.setReducerClass(SpeciesGraphBuilderReducer.class);
    //conf.setCombinerClass(SpeciesGraphBuilderReducer.class);

    //conf.setInputPath(new Path("graph1"));
    //conf.setOutputPath(new Path("graph2"));

    // take the input and output from the command line
    FileInputFormat.setInputPaths(conf, new Path(inFiles));
    FileOutputFormat.setOutputPath(conf, new Path(outFiles));

    client.setConf(conf);
    try {
        JobClient.runJob(conf);
    } catch (Exception e) {
        e.printStackTrace();
    }

    inFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outputFiles/Result/part-00000";
    for (int i = 0; i < 15; i++) {
        client = new JobClient();
        conf = new JobConf(SpeciesDriver.class);
        conf.setJobName("Species Iter");

        int count = i + 1;
        outFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outputFiles/Result" + count;

        conf.setNumReduceTasks(5);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(inFiles));
        FileOutputFormat.setOutputPath(conf, new Path(outFiles));

        conf.setMapperClass(SpeciesIterMapper2.class);
        conf.setReducerClass(SpeciesIterReducer2.class);
        conf.setCombinerClass(SpeciesIterReducer2.class);

        client.setConf(conf);
        try {
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
        }

        inFiles = outFiles;
    }

    // Viewer
    client = new JobClient();
    conf = new JobConf(SpeciesDriver.class);
    conf.setJobName("Species Viewer");

    conf.setOutputKeyClass(FloatWritable.class);
    conf.setOutputValueClass(Text.class);

    inFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outputFiles/Result15/part-00000";
    outFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outputFiles/ResultFinal";

    FileInputFormat.setInputPaths(conf, new Path(inFiles));
    FileOutputFormat.setOutputPath(conf, new Path(outFiles));

    conf.setMapperClass(SpeciesViewerMapper.class);
    conf.setReducerClass(org.apache.hadoop.mapred.lib.IdentityReducer.class);

    client.setConf(conf);
    try {
        JobClient.runJob(conf);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:mapreducejava.SpeciesDriver.java
public static void main(String[] args) throws Exception {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(SpeciesDriver.class);
    conf.setJobName("Page-rank Species Graph Builder");

    final File f = new File(SpeciesDriver.class.getProtectionDomain().getCodeSource().getLocation().getPath());
    String inFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/InputFiles/species_medium.txt";
    String outFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outputFiles/Result";
    FileInputFormat.setInputPaths(conf, new Path(inFiles));
    FileOutputFormat.setOutputPath(conf, new Path(outFiles));

    //conf.setOutputKeyClass(Text.class);
    //conf.setOutputValueClass(Text.class);
    conf.setMapperClass(SpeciesGraphBuilderMapper.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    //conf.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    //conf.setOutputFormat(org.apache.hadoop.mapred.SequenceFileOutputFormat.class);

    conf.setReducerClass(SpeciesGraphBuilderReducer.class);
    //conf.setCombinerClass(SpeciesGraphBuilderReducer.class);

    //conf.setInputPath(new Path("graph1"));
    //conf.setOutputPath(new Path("graph2"));

    // take the input and output from the command line
    FileInputFormat.setInputPaths(conf, new Path(inFiles));
    FileOutputFormat.setOutputPath(conf, new Path(outFiles));

    client.setConf(conf);
    try {
        JobClient.runJob(conf);
    } catch (Exception e) {
        e.printStackTrace();
    }

    inFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outputFiles/Result/part-00000";
    for (int i = 0; i < 25; i++) {
        client = new JobClient();
        conf = new JobConf(SpeciesDriver.class);
        conf.setJobName("Species Iter");

        int count = i + 1;
        outFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outputFiles/Result" + count;

        conf.setNumReduceTasks(5);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(inFiles));
        FileOutputFormat.setOutputPath(conf, new Path(outFiles));

        conf.setMapperClass(SpeciesIterMapper2.class);
        conf.setReducerClass(SpeciesIterReducer2.class);
        conf.setCombinerClass(SpeciesIterReducer2.class);

        client.setConf(conf);
        try {
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
        }

        inFiles = outFiles;
    }

    // Viewer
    client = new JobClient();
    conf = new JobConf(SpeciesDriver.class);
    conf.setJobName("Species Viewer");

    conf.setOutputKeyClass(FloatWritable.class);
    conf.setOutputValueClass(Text.class);

    inFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outputFiles/Result25/part-00000";
    outFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outputFiles/ResultFinal";

    FileInputFormat.setInputPaths(conf, new Path(inFiles));
    FileOutputFormat.setOutputPath(conf, new Path(outFiles));

    conf.setMapperClass(SpeciesViewerMapper.class);
    conf.setReducerClass(org.apache.hadoop.mapred.lib.IdentityReducer.class);

    client.setConf(conf);
    try {
        JobClient.runJob(conf);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:map_reduce.MapReduce_OptimizedBrandesAdditions_DO_JUNG.java
License:Open Source License
@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage:\n");
        System.exit(1);
    }

    // Job job = new Job(super.getConf());

    // READ IN ALL COMMAND LINE ARGUMENTS
    // EXAMPLE:
    // hadoop jar MapReduce_OptimizedBrandesAdditions_DO_JUNG.jar
    // -libjars collections-generic-4.01.jar,jung-graph-impl-2.0.1.jar,jung-api-2.0.1.jar
    // -Dmapred.job.map.memory.mb=4096
    // -Dmapred.job.reduce.memory.mb=4096
    // -Dmapred.child.java.opts=-Xmx3500m
    // -Dmapreduce.task.timeout=60000000
    // -Dmapreduce.job.queuename=QUEUENAME
    // input_iterbrandes_additions_nocomb_10k_1 output_iterbrandes_additions_nocomb_10k_1
    // 10 1 10000 55245 10k 10k_randedges 100 1 false times/ betweenness/

    int m = -1;

    // input path to use on hdfs
    Path inputPath = new Path(args[++m]);

    // output path to use on hdfs
    Path outputPath = new Path(args[++m]);

    // number of Mappers to split the sources: e.g., 1, 10, 100 etc.
    // rule of thumb: the larger the graph (i.e., number of roots to test), the larger should be this number.
    int numOfMaps = Integer.parseInt(args[++m]);

    // number of Reducers to collect the output
    int numOfReduce = Integer.parseInt(args[++m]);

    // Number of vertices in graph
    int N = Integer.parseInt(args[++m]);

    // Number of edges in graph
    int M = Integer.parseInt(args[++m]);

    // Graph file (edge list, tab delimited) (full path)
    String graph = args[++m];

    // File with edges to be added (tab delimited) (full path)
    // Note: this version handles only edges between existing vertices in the graph.
    String random_edges = args[++m];

    // Number of random edges added
    int re = Integer.parseInt(args[++m]);

    // Experiment iteration (in case of multiple experiments)
    int iter = Integer.parseInt(args[++m]);

    // Use combiner or not (true/false)
    Boolean comb = Boolean.valueOf(args[++m]);

    // Output path for file with stats
    String statsoutputpath = args[++m];

    // Output path for file with final betweenness values
    String betoutputpath = args[++m];

    // BEGIN INITIALIZATION
    JobConf conf = new JobConf(getConf(), MapReduce_OptimizedBrandesAdditions_DO_JUNG.class);
    FileSystem fs = FileSystem.get(conf);

    String setup = "_additions_edges" + re + "_maps" + numOfMaps + "_comb" + comb;
    conf.setJobName("OptimizedBrandesAdditionsDOJung_" + graph + setup + "_" + iter);
    conf.set("HDFS_GRAPH", graph + setup);
    conf.set("HDFS_Random_Edges", random_edges + setup);
    conf.set("output", outputPath.getName());
    conf.set("setup", setup);

    // CREATE INPUT FILES FOR MAPPERS
    int numOfTasksperMap = (int) Math.ceil(N / numOfMaps);

    // generate an input file for each map task
    for (int i = 0; i < numOfMaps - 1; i++) {
        Path file = new Path(inputPath, "part-r-" + i);
        IntWritable start = new IntWritable(i * numOfTasksperMap);
        IntWritable end = new IntWritable((i * numOfTasksperMap) + numOfTasksperMap - 1);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class,
                IntWritable.class, CompressionType.NONE);
        try {
            writer.append(start, end);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i + ": " + start + " - " + end);
    }

    // last mapper takes what is left
    Path file = new Path(inputPath, "part-r-" + (numOfMaps - 1));
    IntWritable start = new IntWritable((numOfMaps - 1) * numOfTasksperMap);
    IntWritable end = new IntWritable(N - 1);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, IntWritable.class,
            CompressionType.NONE);
    try {
        writer.append(start, end);
    } finally {
        writer.close();
    }
    System.out.println("Wrote input for Map #" + (numOfMaps - 1) + ": " + start + " - " + end);

    // COPY FILES TO MAPPERS
    System.out.println("Copying graph to cache");
    String LOCAL_GRAPH = graph;
    Path hdfsPath = new Path(graph + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_GRAPH), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    System.out.println("Copying random edges to cache");
    String LOCAL_Random_Edges = random_edges;
    hdfsPath = new Path(random_edges + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_Random_Edges), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(IterBrandesMapper.class);
    conf.setNumMapTasks(numOfMaps);

    if (comb)
        conf.setCombinerClass(IterBrandesReducer.class);

    conf.setReducerClass(IterBrandesReducer.class);
    conf.setNumReduceTasks(numOfReduce);

    // turn off speculative execution, because DFS doesn't handle multiple writers to the same file.
    conf.setSpeculativeExecution(false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // conf.set("mapred.job.name", "APS-" + outputPath.getName());

    conf.setNumTasksToExecutePerJvm(-1); // JVM reuse

    System.out.println("Starting the execution...! Pray!! \n");
    long time1 = System.nanoTime();
    RunningJob rj = JobClient.runJob(conf);
    long time2 = System.nanoTime();

    // READ OUTPUT FILES
    System.out.println("\nFinished and now reading/writing Betweenness Output...\n");

    // Assuming 1 reducer.
    Path inFile = new Path(outputPath, "part-00000");
    IntWritable id = new IntWritable();
    DoubleWritable betweenness = new DoubleWritable();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, conf);
    FileWriter fw = new FileWriter(new File(betoutputpath + graph + setup + "_betweenness_" + iter));
    try {
        int i = 0;
        for (; i < (N + M + re); i++) {
            reader.next(id, betweenness);
            fw.write(id + "\t" + betweenness + "\n");
            fw.flush();
        }
    } finally {
        reader.close();
        fw.close();
    }

    System.out.println("\nWriting times Output...\n");

    fw = new FileWriter(new File(statsoutputpath + graph + setup + "_times_" + iter));
    fw.write("Total-time:\t" + (time2 - time1) + "\n");
    fw.write("total-map\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_MAPS") + "\n");
    fw.write("total-reduce\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_REDUCES") + "\n");
    fw.write("total-cpu-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("CPU_MILLISECONDS") + "\n");
    fw.write("total-gc-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("GC_TIME_MILLIS") + "\n");
    fw.write("total-phy-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("PHYSICAL_MEMORY_BYTES") + "\n");
    fw.write("total-vir-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("VIRTUAL_MEMORY_BYTES") + "\n");
    fw.write("brandes\t" + rj.getCounters().getGroup("TimeForBrandes").getCounter("exectime_initial_brandes")
            + "\n");
    fw.write("reduce\t" + rj.getCounters().getGroup("TimeForReduce").getCounter("reduceafteralledges") + "\n");
    fw.flush();

    try {
        Iterator<Counters.Counter> counters = rj.getCounters().getGroup("TimeForRandomEdges").iterator();
        while (counters.hasNext()) {
            Counter cc = counters.next();
            fw.write(cc.getName() + "\t" + cc.getCounter() + "\n");
            fw.flush();
        }
    } finally {
        fw.close();
    }

    return 0;
}
From source file:map_reduce.MapReduce_OptimizedBrandesDeletions_DO_JUNG.java
License:Open Source License
@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage:\n");
        System.exit(1);
    }

    // Job job = new Job(super.getConf());

    // READ IN ALL COMMAND LINE ARGUMENTS
    // EXAMPLE:
    // hadoop jar MapReduce_OptimizedBrandesDeletions_DO_JUNG.jar
    // -libjars collections-generic-4.01.jar,jung-graph-impl-2.0.1.jar,jung-api-2.0.1.jar
    // -Dmapred.job.map.memory.mb=4096
    // -Dmapred.job.reduce.memory.mb=4096
    // -Dmapred.child.java.opts=-Xmx3500m
    // -Dmapreduce.task.timeout=60000000
    // -Dmapreduce.job.queuename=QUEUENAME
    // input_iterbrandes_deletions_nocomb_10k_1 output_iterbrandes_deletions_nocomb_10k_1
    // 10 1 10000 55245 10k 10k_randedges 100 1 false times/ betweenness/

    int m = -1;

    // input path to use on hdfs
    Path inputPath = new Path(args[++m]);

    // output path to use on hdfs
    Path outputPath = new Path(args[++m]);

    // number of Mappers to split the sources: e.g., 1, 10, 100 etc.
    // rule of thumb: the larger the graph (i.e., number of roots to test), the larger should be this number.
    int numOfMaps = Integer.parseInt(args[++m]);

    // number of Reducers to collect the output
    int numOfReduce = Integer.parseInt(args[++m]);

    // Number of vertices in graph
    int N = Integer.parseInt(args[++m]);

    // Number of edges in graph
    int M = Integer.parseInt(args[++m]);

    // Graph file (edge list, tab delimited) (full path)
    String graph = args[++m];

    // File with edges to be added (tab delimited) (full path)
    // Note: this version handles only edges between existing vertices in the graph.
    String random_edges = args[++m];

    // Number of random edges added
    int re = Integer.parseInt(args[++m]);

    // Experiment iteration (in case of multiple experiments)
    int iter = Integer.parseInt(args[++m]);

    // Use combiner or not (true/false)
    Boolean comb = Boolean.valueOf(args[++m]);

    // Output path for file with stats
    String statsoutputpath = args[++m];

    // Output path for file with final betweenness values
    String betoutputpath = args[++m];

    // BEGIN INITIALIZATION
    JobConf conf = new JobConf(getConf(), MapReduce_OptimizedBrandesDeletions_DO_JUNG.class);
    FileSystem fs = FileSystem.get(conf);

    String setup = "_deletions_edges" + re + "_maps" + numOfMaps + "_comb" + comb;
    conf.setJobName("OptimizedBrandesDeletionsDOJung_" + graph + setup + "_" + iter);
    conf.set("HDFS_GRAPH", graph + setup);
    conf.set("HDFS_Random_Edges", random_edges + setup);
    conf.set("output", outputPath.getName());
    conf.set("setup", setup);

    // CREATE INPUT FILES FOR MAPPERS
    int numOfTasksperMap = (int) Math.ceil(N / numOfMaps);

    // generate an input file for each map task
    for (int i = 0; i < numOfMaps - 1; i++) {
        Path file = new Path(inputPath, "part-r-" + i);
        IntWritable start = new IntWritable(i * numOfTasksperMap);
        IntWritable end = new IntWritable((i * numOfTasksperMap) + numOfTasksperMap - 1);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class,
                IntWritable.class, CompressionType.NONE);
        try {
            writer.append(start, end);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i + ": " + start + " - " + end);
    }

    // last mapper takes what is left
    Path file = new Path(inputPath, "part-r-" + (numOfMaps - 1));
    IntWritable start = new IntWritable((numOfMaps - 1) * numOfTasksperMap);
    IntWritable end = new IntWritable(N - 1);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, IntWritable.class,
            CompressionType.NONE);
    try {
        writer.append(start, end);
    } finally {
        writer.close();
    }
    System.out.println("Wrote input for Map #" + (numOfMaps - 1) + ": " + start + " - " + end);

    // COPY FILES TO MAPPERS
    System.out.println("Copying graph to cache");
    String LOCAL_GRAPH = graph;
    Path hdfsPath = new Path(graph + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_GRAPH), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    System.out.println("Copying random edges to cache");
    String LOCAL_Random_Edges = random_edges;
    hdfsPath = new Path(random_edges + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_Random_Edges), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(IterBrandesMapper.class);
    conf.setNumMapTasks(numOfMaps);

    if (comb)
        conf.setCombinerClass(IterBrandesReducer.class);

    conf.setReducerClass(IterBrandesReducer.class);
    conf.setNumReduceTasks(numOfReduce);

    // turn off speculative execution, because DFS doesn't handle multiple writers to the same file.
    conf.setSpeculativeExecution(false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // conf.set("mapred.job.name", "APS-" + outputPath.getName());

    conf.setNumTasksToExecutePerJvm(-1); // JVM reuse

    System.out.println("Starting the execution...! Pray!! \n");
    long time1 = System.nanoTime();
    RunningJob rj = JobClient.runJob(conf);
    long time2 = System.nanoTime();

    // READ OUTPUT FILES
    System.out.println("\nFinished and now reading/writing Betweenness Output...\n");

    // Assuming 1 reducer.
    Path inFile = new Path(outputPath, "part-00000");
    IntWritable id = new IntWritable();
    DoubleWritable betweenness = new DoubleWritable();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, conf);
    FileWriter fw = new FileWriter(new File(betoutputpath + graph + setup + "_betweenness_" + iter));
    try {
        int i = 0;
        for (; i < (N + (M - re)); i++) {
            reader.next(id, betweenness);
            fw.write(id + "\t" + betweenness + "\n");
            fw.flush();
        }
    } finally {
        reader.close();
        fw.close();
    }

    System.out.println("\nWriting times Output...\n");

    fw = new FileWriter(new File(statsoutputpath + graph + setup + "_times_" + iter));
    fw.write("Total-time:\t" + (time2 - time1) + "\n");
    fw.write("total-map\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_MAPS") + "\n");
    fw.write("total-reduce\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_REDUCES") + "\n");
    fw.write("total-cpu-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("CPU_MILLISECONDS") + "\n");
    fw.write("total-gc-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("GC_TIME_MILLIS") + "\n");
    fw.write("total-phy-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("PHYSICAL_MEMORY_BYTES") + "\n");
    fw.write("total-vir-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("VIRTUAL_MEMORY_BYTES") + "\n");
    fw.write("brandes\t" + rj.getCounters().getGroup("TimeForBrandes").getCounter("exectime_initial_brandes")
            + "\n");
    fw.write("reduce\t" + rj.getCounters().getGroup("TimeForReduce").getCounter("reduceafteralledges") + "\n");
    fw.flush();

    try {
        Iterator<Counters.Counter> counters = rj.getCounters().getGroup("TimeForRandomEdges").iterator();
        while (counters.hasNext()) {
            Counter cc = counters.next();
            fw.write(cc.getName() + "\t" + cc.getCounter() + "\n");
            fw.flush();
        }
    } finally {
        fw.close();
    }

    return 0;
}
From source file:mr.WordCount.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Properties properties = new Properties();
    AppProps.addApplicationTag(properties, "tutorials");
    AppProps.addApplicationTag(properties, "cluster:development");
    AppProps.setApplicationName(properties, "cascading-mapreduce-flow");

    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("casading-mapreduce-flow");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    MapReduceFlow flow = new MapReduceFlow("wordcount", conf, true);
    // JobClient.runJob(conf);
    flow.complete();
}