List of usage examples for org.apache.hadoop.mapred JobConf setSpeculativeExecution
public void setSpeculativeExecution(boolean speculativeExecution)
From source file:com.test.PiEstimatorKrb.java
License:Apache License
/** * Run a map/reduce job for estimating Pi. * * @return the estimated value of Pi//from w w w . j ava2 s . c o m */ public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf) throws IOException { //setup job conf jobConf.setJobName(PiEstimatorKrb.class.getSimpleName()); jobConf.setInputFormat(SequenceFileInputFormat.class); jobConf.setOutputKeyClass(BooleanWritable.class); jobConf.setOutputValueClass(LongWritable.class); jobConf.setOutputFormat(SequenceFileOutputFormat.class); jobConf.setMapperClass(PiMapper.class); jobConf.setNumMapTasks(numMaps); jobConf.setReducerClass(PiReducer.class); jobConf.setNumReduceTasks(1); // turn off speculative execution, because DFS doesn't handle // multiple writers to the same file. jobConf.setSpeculativeExecution(false); //setup input/output directories final Path inDir = new Path(TMP_DIR, "in"); final Path outDir = new Path(TMP_DIR, "out"); FileInputFormat.setInputPaths(jobConf, inDir); FileOutputFormat.setOutputPath(jobConf, outDir); final FileSystem fs = FileSystem.get(jobConf); if (fs.exists(TMP_DIR)) { throw new IOException( "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists. Please remove it first."); } if (!fs.mkdirs(inDir)) { throw new IOException("Cannot create input directory " + inDir); } try { //generate an input file for each map task for (int i = 0; i < numMaps; ++i) { final Path file = new Path(inDir, "part" + i); final LongWritable offset = new LongWritable(i * numPoints); final LongWritable size = new LongWritable(numPoints); final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, file, LongWritable.class, LongWritable.class, CompressionType.NONE); try { writer.append(offset, size); } finally { writer.close(); } sLogger.info("Wrote input for Map #" + i); } //start a map/reduce job sLogger.info("Starting Job"); final long startTime = System.currentTimeMillis(); if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { jobConf.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); } JobClient.runJob(jobConf); final double duration = (System.currentTimeMillis() - startTime) / 1000.0; sLogger.info("Job Finished in " + duration + " seconds"); //read outputs Path inFile = new Path(outDir, "reduce-out"); LongWritable numInside = new LongWritable(); LongWritable numOutside = new LongWritable(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf); try { reader.next(numInside, numOutside); } finally { reader.close(); } //compute estimated value return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get())) .divide(BigDecimal.valueOf(numMaps)).divide(BigDecimal.valueOf(numPoints)); } finally { fs.delete(TMP_DIR, true); } }
From source file:edu.umd.cloud9.pagerank.PartitionGraph.java
License:Apache License
public int run(String[] args) throws IOException { if (args.length != 5) { printUsage();//from w ww . j a v a 2 s.c o m return -1; } String inPath = args[0]; String outPath = args[1]; int numParts = Integer.parseInt(args[2]); boolean useRange = Integer.parseInt(args[3]) != 0; int nodeCount = Integer.parseInt(args[4]); sLogger.info("Tool name: PartitionGraph"); sLogger.info(" - inputDir: " + inPath); sLogger.info(" - outputDir: " + outPath); sLogger.info(" - numPartitions: " + numParts); sLogger.info(" - useRange?: " + useRange); sLogger.info(" - nodeCnt: " + nodeCount); JobConf conf = new JobConf(PartitionGraph.class); conf.setJobName("Partition Graph " + numParts); conf.setNumReduceTasks(numParts); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("NodeCount", nodeCount); FileInputFormat.setInputPaths(conf, new Path(inPath)); FileOutputFormat.setOutputPath(conf, new Path(outPath)); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(PageRankNode.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PageRankNode.class); conf.setMapperClass(MapClass.class); conf.setReducerClass(ReduceClass.class); conf.setSpeculativeExecution(false); if (useRange) { conf.setPartitionerClass(RangePartitioner.class); } FileSystem.get(conf).delete(new Path(outPath), true); JobClient.runJob(conf); return 0; }
From source file:edu.umd.cloud9.pagerank.RunPageRankBasic.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner, boolean useRange) throws IOException { JobConf conf = new JobConf(RunPageRankBasic.class); String in = path + "/iter" + sFormat.format(i); String out = path + "/iter" + sFormat.format(j) + "t"; String outm = out + "-mass"; // we need to actually count the number of part files to get the number // of partitions (because the directory might contain _log) int numPartitions = 0; for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) { if (s.getPath().getName().contains("part-")) numPartitions++;/* www.ja va2 s . co m*/ } sLogger.info("PageRank: iteration " + j + ": Phase1"); sLogger.info(" - input: " + in); sLogger.info(" - output: " + out); sLogger.info(" - nodeCnt: " + n); sLogger.info(" - useCombiner: " + useCombiner); sLogger.info(" - useInmapCombiner: " + useInmapCombiner); sLogger.info(" - useRange: " + useRange); sLogger.info("computed number of partitions: " + numPartitions); int numMapTasks = numPartitions; int numReduceTasks = numPartitions; conf.setJobName("PageRank:Basic:iteration" + j + ":Phase1"); conf.setInt("NodeCount", n); conf.setNumMapTasks(numMapTasks); conf.setNumReduceTasks(numReduceTasks); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.set("PageRankMassPath", outm); FileInputFormat.setInputPaths(conf, new Path(in)); FileOutputFormat.setOutputPath(conf, new Path(out)); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(PageRankNode.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PageRankNode.class); if (useInmapCombiner) { conf.setMapperClass(MapWithInMapperCombiningClass.class); } else { conf.setMapperClass(MapClass.class); } if (useCombiner) { conf.setCombinerClass(CombineClass.class); } if (useRange) { conf.setPartitionerClass(RangePartitioner.class); } conf.setReducerClass(ReduceClass.class); conf.setSpeculativeExecution(false); FileSystem.get(conf).delete(new Path(out), true); FileSystem.get(conf).delete(new Path(outm), true); JobClient.runJob(conf); float mass = Float.NEGATIVE_INFINITY; FileSystem fs = FileSystem.get(conf); for (FileStatus f : fs.listStatus(new Path(outm))) { FSDataInputStream fin = fs.open(f.getPath()); mass = sumLogProbs(mass, fin.readFloat()); fin.close(); } return mass; }
From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner, boolean useRange) throws IOException { JobConf conf = new JobConf(RunPageRankBasic.class); String in = path + "/iter" + sFormat.format(i); String out = path + "/iter" + sFormat.format(j) + "t"; String outm = out + "-mass"; FileSystem fs = FileSystem.get(conf); // we need to actually count the number of part files to get the number // of partitions (because the directory might contain _log) int numPartitions = 0; for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) { if (s.getPath().getName().contains("part-")) numPartitions++;/*from w ww . j a v a 2 s .c o m*/ } conf.setInt("NodeCount", n); Partitioner p = null; if (useRange) { p = new RangePartitioner<IntWritable, Writable>(); p.configure(conf); } else { p = new HashPartitioner<WritableComparable, Writable>(); } // this is really annoying: the mapping between the partition numbers on // disk (i.e., part-XXXX) and what partition the file contains (i.e., // key.hash % #reducer) is arbitrary... so this means that we need to // open up each partition, peek inside to find out. IntWritable key = new IntWritable(); PageRankNode value = new PageRankNode(); FileStatus[] status = fs.listStatus(new Path(in)); StringBuilder sb = new StringBuilder(); for (FileStatus f : status) { if (f.getPath().getName().contains("_logs")) continue; SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf); reader.next(key, value); int np = p.getPartition(key, value, numPartitions); reader.close(); sLogger.info(f.getPath() + "\t" + np); sb.append(np + "=" + f.getPath() + "\t"); } sLogger.info(sb.toString().trim()); sLogger.info("PageRankSchimmy: iteration " + j + ": Phase1"); sLogger.info(" - input: " + in); sLogger.info(" - output: " + out); sLogger.info(" - nodeCnt: " + n); sLogger.info(" - useCombiner: " + useCombiner); sLogger.info(" - useInmapCombiner: " + useInmapCombiner); sLogger.info(" - numPartitions: " + numPartitions); sLogger.info(" - useRange: " + useRange); sLogger.info("computed number of partitions: " + numPartitions); int numMapTasks = numPartitions; int numReduceTasks = numPartitions; conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase1"); conf.setNumMapTasks(numMapTasks); conf.setNumReduceTasks(numReduceTasks); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.set("PageRankMassPath", outm); conf.set("BasePath", in); conf.set("PartitionMapping", sb.toString().trim()); FileInputFormat.setInputPaths(conf, new Path(in)); FileOutputFormat.setOutputPath(conf, new Path(out)); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(FloatWritable.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PageRankNode.class); if (useInmapCombiner) { conf.setMapperClass(MapWithInMapperCombiningClass.class); } else { conf.setMapperClass(MapClass.class); } if (useCombiner) { conf.setCombinerClass(CombineClass.class); } if (useRange) { conf.setPartitionerClass(RangePartitioner.class); } conf.setReducerClass(ReduceClass.class); conf.setSpeculativeExecution(false); FileSystem.get(conf).delete(new Path(out), true); FileSystem.get(conf).delete(new Path(outm), true); JobClient.runJob(conf); float mass = Float.NEGATIVE_INFINITY; for (FileStatus f : fs.listStatus(new Path(outm))) { FSDataInputStream fin = fs.open(f.getPath()); mass = sumLogProbs(mass, fin.readFloat()); fin.close(); } return mass; }
From source file:ivory.preprocess.BuildTermDocVectors.java
License:Apache License
private void writeDoclengthsData(int collectionDocCount) throws IOException { JobConf conf = new JobConf(getConf(), GetTermCount.class); String indexPath = conf.get("Ivory.IndexPath"); String collectionName = conf.get("Ivory.CollectionName"); int docnoOffset = conf.getInt("Ivory.DocnoOffset", 0); FileSystem fs = FileSystem.get(conf); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path dlFile = env.getDoclengthsData(); Path inputPath = env.getDoclengthsDirectory(); sLogger.info("Writing doc length data to " + dlFile + "..."); conf.setJobName("DocLengthTable:" + collectionName); conf.setInt("Ivory.CollectionDocumentCount", collectionDocCount); conf.set("InputPath", inputPath.toString()); conf.set("DocLengthDataFile", dlFile.toString()); conf.set("mapred.child.java.opts", "-Xmx4096m"); conf.setNumMapTasks(1);//w w w .ja v a2s . co m conf.setNumReduceTasks(0); conf.setSpeculativeExecution(false); conf.setInputFormat(NullInputFormat.class); conf.setOutputFormat(NullOutputFormat.class); conf.setMapperClass(DocLengthDataWriterMapper.class); RunningJob job = JobClient.runJob(conf); env.writeDocnoOffset(docnoOffset); Counters counters = job.getCounters(); long collectionSumOfDocLengths = (long) counters.findCounter(DocLengths.SumOfDocLengths).getCounter(); env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount); }
From source file:map_reduce.MapReduce_OptimizedBrandesAdditions_DO_JUNG.java
License:Open Source License
@SuppressWarnings("deprecation") @Override/*from w w w .j a v a2 s. c o m*/ public int run(String[] args) throws Exception { if (args.length < 1) { System.err.println("Usage:\n"); System.exit(1); } // Job job = new Job(super.getConf()); // READ IN ALL COMMAND LINE ARGUMENTS // EXAMPLE: // hadoop jar MapReduce_OptimizedBrandesAdditions_DO_JUNG.jar // -libjars collections-generic-4.01.jar,jung-graph-impl-2.0.1.jar,jung-api-2.0.1.jar // -Dmapred.job.map.memory.mb=4096 // -Dmapred.job.reduce.memory.mb=4096 // -Dmapred.child.java.opts=-Xmx3500m // -Dmapreduce.task.timeout=60000000 // -Dmapreduce.job.queuename=QUEUENAME // input_iterbrandes_additions_nocomb_10k_1 output_iterbrandes_additions_nocomb_10k_1 // 10 1 10000 55245 10k 10k_randedges 100 1 false times/ betweenness/ int m = -1; // input path to use on hdfs Path inputPath = new Path(args[++m]); // output path to use on hdfs Path outputPath = new Path(args[++m]); // number of Mappers to split the sources: e.g., 1, 10, 100 etc. // rule of thumb: the larger the graph (i.e., number of roots to test), the larger should be this number. int numOfMaps = Integer.parseInt(args[++m]); // number of Reducers to collect the output int numOfReduce = Integer.parseInt(args[++m]); // Number of vertices in graph int N = Integer.parseInt(args[++m]); // Number of edges in graph int M = Integer.parseInt(args[++m]); // Graph file (edge list, tab delimited) (full path) String graph = args[++m]; // File with edges to be added (tab delimited) (full path) // Note: this version handles only edges between existing vertices in the graph. String random_edges = args[++m]; // Number of random edges added int re = Integer.parseInt(args[++m]); // Experiment iteration (in case of multiple experiments) int iter = Integer.parseInt(args[++m]); // Use combiner or not (true/false) Boolean comb = Boolean.valueOf(args[++m]); // Output path for file with stats String statsoutputpath = args[++m]; // Output path for file with final betweenness values String betoutputpath = args[++m]; // BEGIN INITIALIZATION JobConf conf = new JobConf(getConf(), MapReduce_OptimizedBrandesAdditions_DO_JUNG.class); FileSystem fs = FileSystem.get(conf); String setup = "_additions_edges" + re + "_maps" + numOfMaps + "_comb" + comb; conf.setJobName("OptimizedBrandesAdditionsDOJung_" + graph + setup + "_" + iter); conf.set("HDFS_GRAPH", graph + setup); conf.set("HDFS_Random_Edges", random_edges + setup); conf.set("output", outputPath.getName()); conf.set("setup", setup); // CREATE INPUT FILES FOR MAPPERS int numOfTasksperMap = (int) Math.ceil(N / numOfMaps); //generate an input file for each map task for (int i = 0; i < numOfMaps - 1; i++) { Path file = new Path(inputPath, "part-r-" + i); IntWritable start = new IntWritable(i * numOfTasksperMap); IntWritable end = new IntWritable((i * numOfTasksperMap) + numOfTasksperMap - 1); SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, IntWritable.class, CompressionType.NONE); try { writer.append(start, end); } finally { writer.close(); } System.out.println("Wrote input for Map #" + i + ": " + start + " - " + end); } // last mapper takes what is left Path file = new Path(inputPath, "part-r-" + (numOfMaps - 1)); IntWritable start = new IntWritable((numOfMaps - 1) * numOfTasksperMap); IntWritable end = new IntWritable(N - 1); SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, IntWritable.class, CompressionType.NONE); try { writer.append(start, end); } finally { writer.close(); } System.out.println("Wrote input for Map #" + (numOfMaps - 1) + ": " + start + " - " + end); // COPY FILES TO MAPPERS System.out.println("Copying graph to cache"); String LOCAL_GRAPH = graph; Path hdfsPath = new Path(graph + setup); // upload the file to hdfs. Overwrite any existing copy. fs.copyFromLocalFile(false, true, new Path(LOCAL_GRAPH), hdfsPath); DistributedCache.addCacheFile(hdfsPath.toUri(), conf); System.out.println("Copying random edges to cache"); String LOCAL_Random_Edges = random_edges; hdfsPath = new Path(random_edges + setup); // upload the file to hdfs. Overwrite any existing copy. fs.copyFromLocalFile(false, true, new Path(LOCAL_Random_Edges), hdfsPath); DistributedCache.addCacheFile(hdfsPath.toUri(), conf); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(DoubleWritable.class); conf.setMapperClass(IterBrandesMapper.class); conf.setNumMapTasks(numOfMaps); if (comb) conf.setCombinerClass(IterBrandesReducer.class); conf.setReducerClass(IterBrandesReducer.class); conf.setNumReduceTasks(numOfReduce); // turn off speculative execution, because DFS doesn't handle multiple writers to the same file. conf.setSpeculativeExecution(false); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, outputPath); // conf.set("mapred.job.name", "APS-" + outputPath.getName()); conf.setNumTasksToExecutePerJvm(-1); // JVM reuse System.out.println("Starting the execution...! Pray!! \n"); long time1 = System.nanoTime(); RunningJob rj = JobClient.runJob(conf); long time2 = System.nanoTime(); // READ OUTPUT FILES System.out.println("\nFinished and now reading/writing Betweenness Output...\n"); // Assuming 1 reducer. Path inFile = new Path(outputPath, "part-00000"); IntWritable id = new IntWritable(); DoubleWritable betweenness = new DoubleWritable(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, conf); FileWriter fw = new FileWriter(new File(betoutputpath + graph + setup + "_betweenness_" + iter)); try { int i = 0; for (; i < (N + M + re); i++) { reader.next(id, betweenness); fw.write(id + "\t" + betweenness + "\n"); fw.flush(); } } finally { reader.close(); fw.close(); } System.out.println("\nWriting times Output...\n"); fw = new FileWriter(new File(statsoutputpath + graph + setup + "_times_" + iter)); fw.write("Total-time:\t" + (time2 - time1) + "\n"); fw.write("total-map\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter") .getCounter("SLOTS_MILLIS_MAPS") + "\n"); fw.write("total-reduce\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter") .getCounter("SLOTS_MILLIS_REDUCES") + "\n"); fw.write("total-cpu-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter") .getCounter("CPU_MILLISECONDS") + "\n"); fw.write("total-gc-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter").getCounter("GC_TIME_MILLIS") + "\n"); fw.write("total-phy-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter") .getCounter("PHYSICAL_MEMORY_BYTES") + "\n"); fw.write("total-vir-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter") .getCounter("VIRTUAL_MEMORY_BYTES") + "\n"); fw.write("brandes\t" + rj.getCounters().getGroup("TimeForBrandes").getCounter("exectime_initial_brandes") + "\n"); fw.write("reduce\t" + rj.getCounters().getGroup("TimeForReduce").getCounter("reduceafteralledges") + "\n"); fw.flush(); try { Iterator<Counters.Counter> counters = rj.getCounters().getGroup("TimeForRandomEdges").iterator(); while (counters.hasNext()) { Counter cc = counters.next(); fw.write(cc.getName() + "\t" + cc.getCounter() + "\n"); fw.flush(); } } finally { fw.close(); } return 0; }
From source file:map_reduce.MapReduce_OptimizedBrandesDeletions_DO_JUNG.java
License:Open Source License
@SuppressWarnings("deprecation") @Override/*from w w w. j ava 2 s . c om*/ public int run(String[] args) throws Exception { if (args.length < 1) { System.err.println("Usage:\n"); System.exit(1); } // Job job = new Job(super.getConf()); // READ IN ALL COMMAND LINE ARGUMENTS // EXAMPLE: // hadoop jar MapReduce_OptimizedBrandesDeletions_DO_JUNG.jar // -libjars collections-generic-4.01.jar,jung-graph-impl-2.0.1.jar,jung-api-2.0.1.jar // -Dmapred.job.map.memory.mb=4096 // -Dmapred.job.reduce.memory.mb=4096 // -Dmapred.child.java.opts=-Xmx3500m // -Dmapreduce.task.timeout=60000000 // -Dmapreduce.job.queuename=QUEUENAME // input_iterbrandes_deletions_nocomb_10k_1 output_iterbrandes_deletions_nocomb_10k_1 // 10 1 10000 55245 10k 10k_randedges 100 1 false times/ betweenness/ int m = -1; // input path to use on hdfs Path inputPath = new Path(args[++m]); // output path to use on hdfs Path outputPath = new Path(args[++m]); // number of Mappers to split the sources: e.g., 1, 10, 100 etc. // rule of thumb: the larger the graph (i.e., number of roots to test), the larger should be this number. int numOfMaps = Integer.parseInt(args[++m]); // number of Reducers to collect the output int numOfReduce = Integer.parseInt(args[++m]); // Number of vertices in graph int N = Integer.parseInt(args[++m]); // Number of edges in graph int M = Integer.parseInt(args[++m]); // Graph file (edge list, tab delimited) (full path) String graph = args[++m]; // File with edges to be added (tab delimited) (full path) // Note: this version handles only edges between existing vertices in the graph. String random_edges = args[++m]; // Number of random edges added int re = Integer.parseInt(args[++m]); // Experiment iteration (in case of multiple experiments) int iter = Integer.parseInt(args[++m]); // Use combiner or not (true/false) Boolean comb = Boolean.valueOf(args[++m]); // Output path for file with stats String statsoutputpath = args[++m]; // Output path for file with final betweenness values String betoutputpath = args[++m]; // BEGIN INITIALIZATION JobConf conf = new JobConf(getConf(), MapReduce_OptimizedBrandesDeletions_DO_JUNG.class); FileSystem fs = FileSystem.get(conf); String setup = "_deletions_edges" + re + "_maps" + numOfMaps + "_comb" + comb; conf.setJobName("OptimizedBrandesDeletionsDOJung_" + graph + setup + "_" + iter); conf.set("HDFS_GRAPH", graph + setup); conf.set("HDFS_Random_Edges", random_edges + setup); conf.set("output", outputPath.getName()); conf.set("setup", setup); // CREATE INPUT FILES FOR MAPPERS int numOfTasksperMap = (int) Math.ceil(N / numOfMaps); //generate an input file for each map task for (int i = 0; i < numOfMaps - 1; i++) { Path file = new Path(inputPath, "part-r-" + i); IntWritable start = new IntWritable(i * numOfTasksperMap); IntWritable end = new IntWritable((i * numOfTasksperMap) + numOfTasksperMap - 1); SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, IntWritable.class, CompressionType.NONE); try { writer.append(start, end); } finally { writer.close(); } System.out.println("Wrote input for Map #" + i + ": " + start + " - " + end); } // last mapper takes what is left Path file = new Path(inputPath, "part-r-" + (numOfMaps - 1)); IntWritable start = new IntWritable((numOfMaps - 1) * numOfTasksperMap); IntWritable end = new IntWritable(N - 1); SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, IntWritable.class, CompressionType.NONE); try { writer.append(start, end); } finally { writer.close(); } System.out.println("Wrote input for Map #" + (numOfMaps - 1) + ": " + start + " - " + end); // COPY FILES TO MAPPERS System.out.println("Copying graph to cache"); String LOCAL_GRAPH = graph; Path hdfsPath = new Path(graph + setup); // upload the file to hdfs. Overwrite any existing copy. fs.copyFromLocalFile(false, true, new Path(LOCAL_GRAPH), hdfsPath); DistributedCache.addCacheFile(hdfsPath.toUri(), conf); System.out.println("Copying random edges to cache"); String LOCAL_Random_Edges = random_edges; hdfsPath = new Path(random_edges + setup); // upload the file to hdfs. Overwrite any existing copy. fs.copyFromLocalFile(false, true, new Path(LOCAL_Random_Edges), hdfsPath); DistributedCache.addCacheFile(hdfsPath.toUri(), conf); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(DoubleWritable.class); conf.setMapperClass(IterBrandesMapper.class); conf.setNumMapTasks(numOfMaps); if (comb) conf.setCombinerClass(IterBrandesReducer.class); conf.setReducerClass(IterBrandesReducer.class); conf.setNumReduceTasks(numOfReduce); // turn off speculative execution, because DFS doesn't handle multiple writers to the same file. conf.setSpeculativeExecution(false); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, outputPath); // conf.set("mapred.job.name", "APS-" + outputPath.getName()); conf.setNumTasksToExecutePerJvm(-1); // JVM reuse System.out.println("Starting the execution...! Pray!! \n"); long time1 = System.nanoTime(); RunningJob rj = JobClient.runJob(conf); long time2 = System.nanoTime(); // READ OUTPUT FILES System.out.println("\nFinished and now reading/writing Betweenness Output...\n"); // Assuming 1 reducer. Path inFile = new Path(outputPath, "part-00000"); IntWritable id = new IntWritable(); DoubleWritable betweenness = new DoubleWritable(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, conf); FileWriter fw = new FileWriter(new File(betoutputpath + graph + setup + "_betweenness_" + iter)); try { int i = 0; for (; i < (N + (M - re)); i++) { reader.next(id, betweenness); fw.write(id + "\t" + betweenness + "\n"); fw.flush(); } } finally { reader.close(); fw.close(); } System.out.println("\nWriting times Output...\n"); fw = new FileWriter(new File(statsoutputpath + graph + setup + "_times_" + iter)); fw.write("Total-time:\t" + (time2 - time1) + "\n"); fw.write("total-map\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter") .getCounter("SLOTS_MILLIS_MAPS") + "\n"); fw.write("total-reduce\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter") .getCounter("SLOTS_MILLIS_REDUCES") + "\n"); fw.write("total-cpu-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter") .getCounter("CPU_MILLISECONDS") + "\n"); fw.write("total-gc-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter").getCounter("GC_TIME_MILLIS") + "\n"); fw.write("total-phy-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter") .getCounter("PHYSICAL_MEMORY_BYTES") + "\n"); fw.write("total-vir-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter") .getCounter("VIRTUAL_MEMORY_BYTES") + "\n"); fw.write("brandes\t" + rj.getCounters().getGroup("TimeForBrandes").getCounter("exectime_initial_brandes") + "\n"); fw.write("reduce\t" + rj.getCounters().getGroup("TimeForReduce").getCounter("reduceafteralledges") + "\n"); fw.flush(); try { Iterator<Counters.Counter> counters = rj.getCounters().getGroup("TimeForRandomEdges").iterator(); while (counters.hasNext()) { Counter cc = counters.next(); fw.write(cc.getName() + "\t" + cc.getCounter() + "\n"); fw.flush(); } } finally { fw.close(); } return 0; }
From source file:net.peacesoft.nutch.crawl.ReFetcher.java
License:Apache License
public void fetch(Path segment, int threads) throws IOException { checkConfiguration();// ww w .j a v a 2s. co m SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: starting at " + sdf.format(start)); LOG.info("Fetcher: segment: " + segment); } // set the actual time for the timelimit relative // to the beginning of the whole job and not of a specific task // otherwise it keeps trying again if a task fails long timelimit = getConf().getLong("ReFetcher.timelimit.mins", -1); if (timelimit != -1) { timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000); LOG.info("Fetcher Timelimit set for : " + timelimit); getConf().setLong("fetcher.timelimit", timelimit); } // Set the time limit after which the throughput threshold feature is enabled timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10); timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000); getConf().setLong("fetcher.throughput.threshold.check.after", timelimit); int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1); if (maxOutlinkDepth > 0) { LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth)); int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4); int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2); int totalOutlinksToFollow = 0; for (int i = 0; i < maxOutlinkDepth; i++) { totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks); } LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow)); } JobConf job = new NutchJob(getConf()); job.setJobName("fetch " + segment); job.setInt("fetcher.threads.fetch", threads); job.set(Nutch.SEGMENT_NAME_KEY, segment.getName()); // for politeness, don't permit parallel execution of a single task job.setSpeculativeExecution(false); FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); job.setInputFormat(ReFetcher.InputFormat.class); job.setMapRunnerClass(ReFetcher.class); FileOutputFormat.setOutputPath(job, segment); job.setOutputFormat(FetcherOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NutchWritable.class); JobClient.runJob(job); long end = System.currentTimeMillis(); LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
From source file:net.sf.katta.indexing.IndexerJob.java
License:Apache License
public void startIndexer(String path, String finalDestination, int numOfShards) throws IOException { // create job conf with class pointing into job jar. JobConf jobConf = new JobConf(IndexerJob.class); jobConf.setJobName("indexer"); jobConf.setMapRunnerClass(Indexer.class); // alternative use a text file and a TextInputFormat jobConf.setInputFormat(SequenceFileInputFormat.class); Path input = new Path(path); FileInputFormat.setInputPaths(jobConf, input); // we just set the output path to make hadoop happy. FileOutputFormat.setOutputPath(jobConf, new Path(finalDestination)); // setting the folder where lucene indexes will be copied when finished. jobConf.set("finalDestination", finalDestination); // important to switch spec exec off. // We dont want to have something duplicated. jobConf.setSpeculativeExecution(false); // The num of map tasks is equal to the num of input splits. // The num of input splits by default is equal to the num of hdf blocks // for the input file(s). To get the right num of shards we need to // calculate the best input split size. FileSystem fs = FileSystem.get(input.toUri(), jobConf); FileStatus[] status = fs.globStatus(input); long size = 0; for (FileStatus fileStatus : status) { size += fileStatus.getLen();//from w w w . j av a 2 s .c o m } long optimalSplisize = size / numOfShards; jobConf.set("mapred.min.split.size", "" + optimalSplisize); // give more mem to lucene tasks. jobConf.set("mapred.child.java.opts", "-Xmx2G"); jobConf.setNumMapTasks(1); jobConf.setNumReduceTasks(0); JobClient.runJob(jobConf); }
From source file:org.apache.ambari.servicemonitor.jobs.FileUsingJobRunner.java
License:Apache License
public int run(String[] args) throws Exception { // Configuration processed by ToolRunner Configuration conf = getConf(); CommandLine commandLine = getCommandLine(); // Create a JobConf using the processed conf JobConf jobConf = new JobConf(conf, FileUsingJobRunner.class); //tune the config if (jobConf.get(JobKeys.RANGEINPUTFORMAT_ROWS) == null) { jobConf.setInt(JobKeys.RANGEINPUTFORMAT_ROWS, 1); }/*from w w w . j a v a2s . c o m*/ // Process custom command-line options String name = OptionHelper.getStringOption(commandLine, "n", "File Using Job"); if (commandLine.hasOption('x')) { //delete the output directory String destDir = jobConf.get(JobKeys.MAPRED_OUTPUT_DIR); FileSystem fs = FileSystem.get(jobConf); fs.delete(new Path(destDir), true); } // Specify various job-specific parameters jobConf.setMapperClass(FileUsingMapper.class); jobConf.setReducerClass(FileUsingReducer.class); jobConf.setMapOutputKeyClass(IntWritable.class); jobConf.setMapOutputValueClass(IntWritable.class); jobConf.setOutputFormat(TextOutputFormat.class); jobConf.setInputFormat(RangeInputFormat.class); //jobConf.setPartitionerClass(SleepJob.class); jobConf.setSpeculativeExecution(false); jobConf.setJobName(name); jobConf.setJarByClass(this.getClass()); FileInputFormat.addInputPath(jobConf, new Path("ignored")); // Submit the job, then poll for progress until the job is complete RunningJob runningJob = JobClient.runJob(jobConf); runningJob.waitForCompletion(); return runningJob.isSuccessful() ? 0 : 1; }