List of usage examples for org.apache.hadoop.mapred.JobConf#setOutputKeyClass
public void setOutputKeyClass(Class<?> theClass)
From source file:edu.umd.cloud9.webgraph.BuildWebGraph.java
License:Apache License
/**
 * Configures the "ConstructWebGraph" MapReduce job and submits it, unless the output path
 * already exists.
 *
 * <p>Task counts and paths are read from the {@code Cloud9.Mappers}, {@code Cloud9.Reducers},
 * {@code Cloud9.InputPath} and {@code Cloud9.OutputPath} configuration keys.
 *
 * @return always 0, whether the job was run or skipped
 * @throws Exception propagated from the file system and job client calls
 */
public int runTool() throws Exception {
  JobConf job = new JobConf(getConf(), BuildWebGraph.class);
  FileSystem fileSystem = FileSystem.get(job);

  // Tool parameters handed over through the configuration.
  int mapTasks = job.getInt("Cloud9.Mappers", 1);
  int reduceTasks = job.getInt("Cloud9.Reducers", 200);
  String source = job.get("Cloud9.InputPath");
  String target = job.get("Cloud9.OutputPath");

  job.setJobName("ConstructWebGraph");

  // Memory and timeout settings, given under both the old and the new property names.
  job.set("mapred.child.java.opts", "-Xmx2048m");
  job.setInt("mapred.task.timeout", 60000000);
  job.set("mapreduce.map.memory.mb", "2048");
  job.set("mapreduce.map.java.opts", "-Xmx2048m");
  job.set("mapreduce.reduce.memory.mb", "2048");
  job.set("mapreduce.reduce.java.opts", "-Xmx2048m");
  job.set("mapreduce.task.timeout", "60000000");

  job.setNumMapTasks(mapTasks);
  job.setNumReduceTasks(reduceTasks);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(ArrayListWritable.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(ArrayListWritable.class);

  // Sequence files in, block-compressed sequence files out.
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setCompressOutput(job, true);
  SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

  SequenceFileInputFormat.setInputPaths(job, source);
  Path targetDir = new Path(target);
  FileOutputFormat.setOutputPath(job, targetDir);

  LOG.info("BuildWebGraph");
  LOG.info(" - input path: " + source);
  LOG.info(" - output path: " + target);

  if (fileSystem.exists(targetDir)) {
    LOG.info(target + " already exists! Skipping this step...");
    return 0;
  }

  JobClient.runJob(job);
  return 0;
}
From source file:edu.umd.cloud9.webgraph.ClueExtractLinks.java
License:Apache License
public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), ClueExtractLinks.class); FileSystem fs = FileSystem.get(conf); int numMappers = conf.getInt("Cloud9.Mappers", 1); int numReducers = conf.getInt("Cloud9.Reducers", 200); String inputPath = conf.get("Cloud9.InputPath"); String outputPath = conf.get("Cloud9.OutputPath"); String mappingFile = conf.get("Cloud9.DocnoMappingFile"); if (!fs.exists(new Path(mappingFile))) { throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!"); }/* ww w. ja va2 s . c o m*/ DistributedCache.addCacheFile(new URI(mappingFile), conf); conf.setJobName("ClueExtractLinks"); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("mapred.task.timeout", 60000000); conf.set("mapreduce.map.memory.mb", "2048"); conf.set("mapreduce.map.java.opts", "-Xmx2048m"); conf.set("mapreduce.reduce.memory.mb", "2048"); conf.set("mapreduce.reduce.java.opts", "-Xmx2048m"); conf.set("mapreduce.task.timeout", "60000000"); conf.setNumMapTasks(numMappers); conf.setNumReduceTasks(numReducers); // TODO: to read!! conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(ArrayListWritable.class); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); SequenceFileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); LOG.info("ClueExtractLinks"); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - mapping file: " + mappingFile); LOG.info(" - include internal links? 
" + conf.getBoolean("Cloud9.IncludeInternalLinks", false)); if (!fs.exists(new Path(outputPath))) { JobClient.runJob(conf); } else { LOG.info(outputPath + " already exists! Skipping this step..."); } return 0; }
From source file:edu.umd.cloud9.webgraph.CollectHostnames.java
License:Apache License
public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), CollectHostnames.class); FileSystem fs = FileSystem.get(conf); int numMappers = conf.getInt("Cloud9.Mappers", 1); int numReducers = conf.getInt("Cloud9.Reducers", 200); String inputPath = conf.get("Cloud9.InputPath"); String outputPath = conf.get("Cloud9.OutputPath"); conf.setJobName("CollectHostnames"); conf.set("mapred.child.java.opts", "-Xmx4096m"); conf.setInt("mapred.task.timeout", 60000000); conf.setNumMapTasks(numMappers);// ww w. j a va 2s .c o m conf.setNumReduceTasks(numReducers); conf.setMapperClass(Map.class); conf.setPartitionerClass(Partition.class); conf.setReducerClass(Reduce.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(ArrayListWritable.class); conf.setMapOutputKeyClass(PairOfIntString.class); conf.setMapOutputValueClass(IntWritable.class); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); SequenceFileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); sLogger.info("PropagateHostname"); sLogger.info(" - input path: " + inputPath); sLogger.info(" - output path: " + outputPath); if (!fs.exists(new Path(outputPath))) { JobClient.runJob(conf); } else { sLogger.info(outputPath + " already exists! Skipping this step..."); } return 0; }
From source file:edu.umd.cloud9.webgraph.ComputeWeight.java
License:Apache License
/**
 * Configures the "ComputeWeights" MapReduce job and submits it, unless the output path
 * already exists.
 *
 * <p>Task counts and paths are read from the {@code Cloud9.Mappers}, {@code Cloud9.Reducers},
 * {@code Cloud9.InputPath} and {@code Cloud9.OutputPath} configuration keys.
 *
 * @return always 0, whether the job was run or skipped
 * @throws Exception propagated from the file system and job client calls
 */
public int runTool() throws Exception {
  JobConf job = new JobConf(getConf(), ComputeWeight.class);
  FileSystem fileSystem = FileSystem.get(job);

  // Tool parameters handed over through the configuration.
  int mapTasks = job.getInt("Cloud9.Mappers", 1);
  int reduceTasks = job.getInt("Cloud9.Reducers", 200);
  String source = job.get("Cloud9.InputPath");
  String target = job.get("Cloud9.OutputPath");

  job.setJobName("ComputeWeights");
  job.set("mapred.child.java.opts", "-Xmx4096m");
  job.setInt("mapred.task.timeout", 60000000);

  job.setNumMapTasks(mapTasks);
  job.setNumReduceTasks(reduceTasks);

  job.setMapperClass(Map.class);
  job.setPartitionerClass(Partition.class);
  job.setReducerClass(Reduce.class);

  // Map output keys differ from the final output keys, so both pairs are declared.
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(ArrayListWritable.class);
  job.setMapOutputKeyClass(PairOfInts.class);
  job.setMapOutputValueClass(ArrayListWritable.class);

  // Sequence files in, block-compressed sequence files out.
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setCompressOutput(job, true);
  SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

  SequenceFileInputFormat.setInputPaths(job, source);
  Path targetDir = new Path(target);
  FileOutputFormat.setOutputPath(job, targetDir);

  LOG.info("ComputeWeight");
  LOG.info(" - input path: " + source);
  LOG.info(" - output path: " + target);

  if (fileSystem.exists(targetDir)) {
    LOG.info(target + " already exists! Skipping this step...");
    return 0;
  }

  JobClient.runJob(job);
  return 0;
}
From source file:edu.umd.cloud9.webgraph.driver.BuildAnchorTextForwardIndex.java
License:Apache License
/** * Runs this tool.// w w w . j a v a 2 s . c o m */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } JobConf conf = new JobConf(getConf()); FileSystem fs = FileSystem.get(conf); String collectionPath = args[0]; String outputPath = args[1]; String indexFile = args[2]; LOG.info("Tool name: BuildAnchorTextForwardIndex"); LOG.info(" - collection path: " + collectionPath); LOG.info(" - output path: " + outputPath); LOG.info(" - index file: " + indexFile); LOG.info("Note: This tool only works on block-compressed SequenceFiles!"); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setJobName("BuildAnchorTextForwardIndex"); conf.setNumMapTasks(100); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, new Path(collectionPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(NoSplitSequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(IdentityReducer.class); // delete the output directory if it exists already fs.delete(new Path(outputPath), true); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int blocks = (int) counters.findCounter(Blocks.Total).getCounter(); LOG.info("number of blocks: " + blocks); LOG.info("Writing index file..."); LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000"))); FSDataOutputStream out = fs.create(new Path(indexFile), true); out.writeUTF(IndexableAnchorTextForwardIndex.class.getName()); out.writeUTF(collectionPath); out.writeInt(blocks); int cnt = 0; Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); int docno = Integer.parseInt(arr[0]); int offset = Integer.parseInt(arr[1]); short fileno = Short.parseShort(arr[2]); out.writeInt(docno); out.writeInt(offset); 
out.writeShort(fileno); cnt++; if (cnt % 1000 == 0) { LOG.info(cnt + " blocks written"); } } reader.close(); out.close(); if (cnt != blocks) { throw new RuntimeException("Error: mismatch in block count!"); } return 0; }
From source file:edu.umd.cloud9.webgraph.driver.BuildIndexableAnchorCollection.java
License:Apache License
/** * Runs this tool./*from w w w . j av a 2s.c o m*/ */ public int run(String[] args) throws Exception { if (args.length < 5) { printUsage(); return -1; } JobConf conf = new JobConf(getConf()); FileSystem fs = FileSystem.get(conf); String collectionPath = DriverUtil.argValue(args, DriverUtil.CL_INPUT); String outputPath = DriverUtil.argValue(args, DriverUtil.CL_OUTPUT); String docnoMappingClass = DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING_CLASS); String docnoMapping = DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING); int numReducers = Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_NUMBER_OF_REDUCERS)); if (DriverUtil.argExists(args, DriverUtil.CL_MAX_LENGTH)) { conf.setInt("Cloud9.maxContentLength", Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_MAX_LENGTH))); } conf.set("Cloud9.DocnoMappingClass", docnoMappingClass); LOG.info("Tool name: BuildAnchorTextForwardIndex"); LOG.info(" - collection path: " + collectionPath); LOG.info(" - output path: " + outputPath); LOG.info(" - docno-mapping class: " + docnoMappingClass); LOG.info(" - docno-mapping file: " + docnoMapping); if (args.length == 6) { LOG.info(" - maximum content length: " + conf.getInt("Cloud9.maxContentLength", 0)); } conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setJobName("BuildIndexableAnchorCollection"); conf.setJarByClass(BuildIndexableAnchorCollection.class); conf.setNumMapTasks(100); conf.setNumReduceTasks(numReducers); DistributedCache.addCacheFile(new URI(docnoMapping), conf); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); SequenceFileInputFormat.setInputPaths(conf, new Path(collectionPath)); SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setOutputKeyClass(IntWritable.class); 
conf.setOutputValueClass(IndexableAnchorText.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(IdentityReducer.class); // delete the output directory if it exists already fs.delete(new Path(outputPath), true); RunningJob job = JobClient.runJob(conf); return 0; }
From source file:edu.umd.cloud9.webgraph.driver.GenerateTabDelimitedWebGraph.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length < 4) { printUsage();// ww w . j a v a2s . c om return -1; } JobConf conf = new JobConf(getConf(), GenerateTabDelimitedWebGraph.class); FileSystem fs = FileSystem.get(conf); String inPath = DriverUtil.argValue(args, "-webgraph") + "/" + DriverUtil.OUTPUT_WEBGRAPH; String outPath = DriverUtil.argValue(args, "-output"); Path inputPath = new Path(inPath); Path outputPath = new Path(outPath); if (fs.exists(outputPath)) { fs.delete(outputPath); } conf.setJobName("TabDelimWebGraph"); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.set("mapreduce.map.memory.mb", "2048"); conf.set("mapreduce.map.java.opts", "-Xmx2048m"); conf.set("mapreduce.reduce.memory.mb", "2048"); conf.set("mapreduce.reduce.java.opts", "-Xmx2048m"); conf.set("mapreduce.task.timeout", "60000000"); conf.setNumMapTasks(1); conf.setNumReduceTasks(0); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(MyMapper.class); JobClient.runJob(conf); return 0; }
From source file:edu.umd.cloud9.webgraph.driver.SortWebGraph.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length != 4) { printUsage();// w w w.j a v a2 s . com return -1; } JobConf conf = new JobConf(getConf(), SortWebGraph.class); FileSystem fs = FileSystem.get(conf); String inputPath = args[0]; String outputPath = args[1]; int numberOfDocuments = Integer.parseInt(args[2]); int numMappers = 1; int numReducers = Integer.parseInt(args[3]); conf.setJobName("SortWebGraph"); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("mapred.task.timeout", 60000000); conf.set("mapreduce.map.memory.mb", "2048"); conf.set("mapreduce.map.java.opts", "-Xmx2048m"); conf.set("mapreduce.reduce.memory.mb", "2048"); conf.set("mapreduce.reduce.java.opts", "-Xmx2048m"); conf.set("mapreduce.task.timeout", "60000000"); if (numberOfDocuments == 0) { numberOfDocuments = DEFAULT_NUMBER_OF_DOCUMENTS; } conf.setInt("Cloud9.NumberOfDocuments", numberOfDocuments); conf.setNumMapTasks(numMappers); conf.setNumReduceTasks(numReducers); conf.setMapperClass(IdentityMapper.class); conf.setPartitionerClass(Partition.class); conf.setReducerClass(IdentityReducer.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(ArrayListWritable.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(ArrayListWritable.class); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); SequenceFileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); LOG.info("SortAnchorText"); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of documents: " + conf.getInt("Cloud9.NumberOfDocuments", DEFAULT_NUMBER_OF_DOCUMENTS)); fs.delete(new Path(outputPath)); JobClient.runJob(conf); return 0; }
From source file:edu.umd.cloud9.webgraph.ExtractLinks.java
License:Apache License
public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), ExtractLinks.class); FileSystem fs = FileSystem.get(conf); int numMappers = conf.getInt("Cloud9.Mappers", 1); int numReducers = conf.getInt("Cloud9.Reducers", 200); String inputPath = conf.get("Cloud9.InputPath"); String outputPath = conf.get("Cloud9.OutputPath"); String mappingFile = conf.get("Cloud9.DocnoMappingFile"); if (!fs.exists(new Path(mappingFile))) throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!"); DistributedCache.addCacheFile(new URI(mappingFile), conf); conf.setJobName("ExtractLinks"); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("mapred.task.timeout", 60000000); conf.setNumMapTasks(numMappers);//from w ww . j a v a 2s.co m conf.setNumReduceTasks(numReducers); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(ArrayListWritable.class); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); SequenceFileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); LOG.info("ExtractLinks"); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - mapping file: " + mappingFile); LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false)); if (!fs.exists(new Path(outputPath))) { JobClient.runJob(conf); } else { LOG.info(outputPath + " already exists! Skipping this step..."); } return 0; }
From source file:edu.umn.cs.spatialHadoop.operations.ClosestPairHadoop.java
License:Open Source License
/**
 * Runs two consecutive MapReduce jobs ("ClosestPair" and "Second Round") over the given file.
 *
 * <p>The first job reads {@code file} and writes text output to a path named
 * {@code <file name>.closest_pair_<random number>}, chosen so that it does not exist yet.
 * The second job reads that output and writes text output to the same name with
 * {@code _result} appended. The method returns nothing and does not read the result back;
 * callers have to pick it up from the output path.
 *
 * @param file input file of the first round
 * @param params operation parameters used as the base configuration of both jobs
 * @throws IOException propagated from the file system and job client calls
 */
public static <S extends Shape> void cloesetPair(Path file, OperationsParams params) throws IOException {
  JobConf job = new JobConf(params, ClosestPairHadoop.class);

  // Pick a random output path name that is not in use yet.
  Path outputPath;
  FileSystem outFs = FileSystem.get(job);
  do {
    outputPath = new Path(file.getName() + ".closest_pair_" + (int) (Math.random() * 1000000));
  } while (outFs.exists(outputPath));
  // The loop above only exits on a non-existing path, so this is normally a no-op.
  outFs.delete(outputPath, true);

  job.setJobName("ClosestPair");
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(Point.class);

  job.setMapperClass(Map0.class);
  job.setReducerClass(Reduce0.class);
  ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
  // Request five map tasks per map slot reported by the cluster.
  job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

  job.setInputFormat(ShapeArrayInputFormat.class);
  ShapeInputFormat.setInputPaths(job, file);

  job.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(job, outputPath);

  // Submit the job
  JobClient.runJob(job);

  //////////////////////////////////////////////////////////////////////////

  System.out.println("Begin second round!");
  // 2nd Round
  job = new JobConf(params, ClosestPairHadoop.class);
  job.setJobName("Second Round");
  // NOTE(review): this round sets the output key class and the map output value class,
  // whereas round one sets the map output key/value classes - confirm this is intended.
  job.setOutputKeyClass(NullWritable.class);
  job.setMapOutputValueClass(Point.class);

  job.setMapperClass(Map1.class);
  job.setReducerClass(Reduce1.class);
  clusterStatus = new JobClient(job).getClusterStatus();
  job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

  job.setInputFormat(ShapeArrayInputFormat.class);
  // The previous output is the current input
  ShapeInputFormat.setInputPaths(job, outputPath);

  Path newPath = new Path(outputPath.getName() + "_result");
  job.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(job, newPath);

  JobClient.runJob(job);
}