List of usage examples for org.apache.hadoop.mapred JobConf setOutputFormat
public void setOutputFormat(Class<? extends OutputFormat> theClass)
From source file:edu.umd.cloud9.webgraph.ExtractLinks.java
License:Apache License
/**
 * Configures and, unless the output already exists, runs the ExtractLinks
 * MapReduce job.
 *
 * <p>Settings are read from the tool configuration: {@code Cloud9.Mappers}
 * (default 1), {@code Cloud9.Reducers} (default 200), {@code Cloud9.InputPath},
 * {@code Cloud9.OutputPath} and {@code Cloud9.DocnoMappingFile}. The mapping
 * file is registered in the distributed cache.
 *
 * @return always 0
 * @throws RuntimeException if the docno mapping file does not exist
 * @throws Exception if configuring or running the job fails
 */
public int runTool() throws Exception {
    JobConf job = new JobConf(getConf(), ExtractLinks.class);
    FileSystem fileSystem = FileSystem.get(job);

    int mapTasks = job.getInt("Cloud9.Mappers", 1);
    int reduceTasks = job.getInt("Cloud9.Reducers", 200);

    String inputPath = job.get("Cloud9.InputPath");
    String outputPath = job.get("Cloud9.OutputPath");
    String mappingFile = job.get("Cloud9.DocnoMappingFile");

    // Fail fast: the mapping file must exist before it goes into the cache.
    Path mappingPath = new Path(mappingFile);
    if (!fileSystem.exists(mappingPath)) {
        throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(mappingFile), job);

    // General job settings.
    job.setJobName("ExtractLinks");
    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("mapred.task.timeout", 60000000);
    job.setNumMapTasks(mapTasks);
    job.setNumReduceTasks(reduceTasks);

    // The reducer doubles as the combiner.
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ArrayListWritable.class);

    // Sequence files in, block-compressed sequence files out.
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(job, inputPath);
    Path outputDir = new Path(outputPath);
    FileOutputFormat.setOutputPath(job, outputDir);

    LOG.info("ExtractLinks");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - mapping file: " + mappingFile);
    LOG.info(" - include internal links? " + job.getBoolean("Cloud9.IncludeInternalLinks", false));

    // An existing output directory means this step is skipped.
    if (fileSystem.exists(outputDir)) {
        LOG.info(outputPath + " already exists! Skipping this step...");
        return 0;
    }

    JobClient.runJob(job);
    return 0;
}
From source file:edu.umn.cs.spatialHadoop.nasa.DistributedAggregateSpatioTemporalIndexer.java
License:Open Source License
/** * Build a bunch of AggregateQuadTrees using a Map-Reduce job * //from w w w .ja v a 2 s . c om * @param inputPathsDictionaryPath * @param params * @throws IOException */ public static void aggregateQuadTreeMapReduce(Path inputPathsDictionaryPath, OperationsParams params) throws IOException { // configure a map-reduce job JobConf job = new JobConf(params, DistributedAggregateSpatioTemporalIndexer.class); Path outputPath; String outputPathPrefix = "aggQuadTree_"; FileSystem outFs = FileSystem.get(job); do { outputPath = new Path(outputPathPrefix + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); job.setJobName("AggregateQuadTree"); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setMapperClass(AggregateQuadTreeMaper.class); job.set(HDFSIndexPath, hdfsIndexPath.toString()); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setInputFormat(TextInputFormat.class); job.setOutputFormat(TextOutputFormat.class); TextInputFormat.setInputPaths(job, inputPathsDictionaryPath); TextOutputFormat.setOutputPath(job, outputPath); if (job.getBoolean("local", false)) { // Enforce local execution if explicitly set by user or for small // files job.set("mapred.job.tracker", "local"); // Use multithreading too job.setInt(LocalJobRunner.LOCAL_MAX_MAPS, 16); } job.setNumReduceTasks(0); // Submit the job JobClient.runJob(job); outFs.delete(outputPath, true); }
From source file:edu.umn.cs.spatialHadoop.operations.Aggregate.java
License:Open Source License
/** * Counts the exact number of lines in a file by issuing a MapReduce job * that does the thing//from w w w . j a va2s .c o m * @param files * @param params * @return * @throws IOException */ public static MinMax aggregateMapReduce(Path[] files, OperationsParams params) throws IOException { Shape plotRange = params.getShape("rect"); JobConf job = new JobConf(params, Aggregate.class); Path outputPath; FileSystem outFs = FileSystem.get(job); do { outputPath = new Path("agg_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); job.setJobName("Aggregate"); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(MinMax.class); job.setMapperClass(Map.class); job.setCombinerClass(Reduce.class); job.setReducerClass(Reduce.class); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setInputFormat(ShapeInputFormat.class); job.setClass("shape", NASAPoint.class, Shape.class); if (plotRange != null) { job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class); } job.setOutputFormat(TextOutputFormat.class); ShapeInputFormat.setInputPaths(job, files); TextOutputFormat.setOutputPath(job, outputPath); // Submit the job JobClient.runJob(job); // Read job result FileStatus[] results = outFs.listStatus(outputPath); MinMax minMax = new MinMax(); for (FileStatus status : results) { if (status.getLen() > 0 && status.getPath().getName().startsWith("part-")) { BufferedReader reader = new BufferedReader(new InputStreamReader(outFs.open(status.getPath()))); String line; MinMax value = new MinMax(); while ((line = reader.readLine()) != null) { value.fromText(new Text(line)); minMax.expand(value); } reader.close(); } } outFs.delete(outputPath, true); return minMax; }
From source file:edu.umn.cs.spatialHadoop.operations.CatUnion.java
License:Open Source License
/** * Calculates the union of a set of shapes categorized by some user defined * category./*from w ww .j av a2 s . c o m*/ * @param shapeFile - Input file that contains shapes * @param categoryFile - Category file that contains the category of each * shape.Shapes not appearing in this file are not generated in output. * @param output - An output file that contains each category and the union * of all shapes in it. Each line contains the category, then a comma, * then the union represented as text. * @throws IOException */ public static void unionMapReduce(Path shapeFile, Path categoryFile, Path output, OperationsParams params) throws IOException { JobConf job = new JobConf(params, CatUnion.class); job.setJobName("Union"); // Check output file existence FileSystem outFs = output.getFileSystem(job); if (outFs.exists(output)) { if (params.getBoolean("overwrite", false)) { outFs.delete(output, true); } else { throw new RuntimeException("Output path already exists and -overwrite flag is not set"); } } // Set map and reduce ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10)); job.setMapperClass(UnionMapper.class); job.setCombinerClass(UnionReducer.class); job.setReducerClass(UnionReducer.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(Text.class); // Set input and output job.setInputFormat(ShapeLineInputFormat.class); TextInputFormat.addInputPath(job, shapeFile); DistributedCache.addCacheFile(categoryFile.toUri(), job); job.setOutputFormat(TextOutputFormat.class); TextOutputFormat.setOutputPath(job, output); // Start job JobClient.runJob(job); }
From source file:edu.umn.cs.spatialHadoop.operations.ClosestPairHadoop.java
License:Open Source License
/**
 * Runs the two-round closest-pair computation as two consecutive MapReduce
 * jobs.
 *
 * <p>Round one ({@code Map0}/{@code Reduce0}) reads {@code file} and writes to
 * a randomly named path {@code <file name>.closest_pair_<random>} on the
 * default file system. Round two ({@code Map1}/{@code Reduce1}) reads that
 * intermediate output and writes to {@code <intermediate name>_result}.
 * Neither path is deleted by this method.
 *
 * @param file input file
 * @param params operation parameters used as the base configuration of both
 *        jobs
 * @throws IOException if the file system or either job fails
 */
public static <S extends Shape> void cloesetPair(Path file, OperationsParams params) throws IOException {
    JobConf job = new JobConf(params, ClosestPairHadoop.class);

    // Pick an intermediate output path that does not exist yet. The loop only
    // exits once the path is known to be absent, so no delete is needed here.
    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(file.getName() + ".closest_pair_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    // ---- Round 1 ----
    job.setJobName("ClosestPair");
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Point.class);

    job.setMapperClass(Map0.class);
    job.setReducerClass(Reduce0.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeArrayInputFormat.class);
    ShapeInputFormat.setInputPaths(job, file);

    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);

    // ---- Round 2 ----
    System.out.println("Begin second round!");
    job = new JobConf(params, ClosestPairHadoop.class);
    job.setJobName("Second Round");
    // NOTE(review): this round sets the final output key class but not the
    // map output key class, unlike round one — confirm that is intended.
    job.setOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Point.class);

    job.setMapperClass(Map1.class);
    job.setReducerClass(Reduce1.class);
    clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeArrayInputFormat.class);
    // The previous output is the current input
    ShapeInputFormat.setInputPaths(job, outputPath);

    Path newPath = new Path(outputPath.getName() + "_result");
    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, newPath);

    JobClient.runJob(job);
}
From source file:edu.umn.cs.spatialHadoop.operations.Contains.java
License:Open Source License
public static <S extends Shape> long contains(Path[] inFiles, Path userOutputPath, OperationsParams params) throws IOException, InterruptedException { JobConf job = new JobConf(params, Contains.class); LOG.info("Contains journey starts ...."); FileSystem inFs = inFiles[0].getFileSystem(job); Path outputPath = userOutputPath; if (outputPath == null) { FileSystem outFs = FileSystem.get(job); do {// ww w . j a v a 2 s. c o m outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); } FileSystem outFs = outputPath.getFileSystem(job); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setJobName("Within"); job.setMapperClass(ContainsMap.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IndexedText.class); job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks())); job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize())); job.setReducerClass(ContainsReduce.class); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); job.setInputFormat(ShapeLineInputFormat.class); if (job.getBoolean("output", true)) job.setOutputFormat(TextOutputFormat.class); else job.setOutputFormat(NullOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, inFiles); // Calculate and set the dimensions of the grid to use in the map phase long total_size = 0; Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE); for (Path file : inFiles) { FileSystem fs = file.getFileSystem(params); Rectangle file_mbr = FileMBR.fileMBR(file, params); mbr.expand(file_mbr); total_size += FileUtil.getPathSize(fs, file); } // If the largest file is globally indexed, use its partitions total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f); int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20); int 
num_cells = (int) Math.max(1, total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath)); LOG.info("Number of cells is configured to be " + num_cells); OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive); OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce); OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly); GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2); gridInfo.calculateCellDimensions(num_cells); OperationsParams.setShape(job, PartitionGrid, gridInfo); TextOutputFormat.setOutputPath(job, outputPath); if (OperationsParams.isLocal(job, inFiles)) { // Enforce local execution if explicitly set by user or for small files job.set("mapred.job.tracker", "local"); } // Start the job RunningJob runningJob = JobClient.runJob(job); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); return resultCount; }
From source file:edu.umn.cs.spatialHadoop.operations.ConvexHull.java
License:Open Source License
public static void convexHullMapReduce(Path inFile, Path userOutPath, OperationsParams params) throws IOException { JobConf job = new JobConf(params, ConvexHull.class); Path outPath = userOutPath;//from ww w. ja v a 2 s . c om FileSystem outFs = (userOutPath == null ? inFile : userOutPath).getFileSystem(job); Shape shape = params.getShape("shape"); if (outPath == null) { do { outPath = new Path(inFile.toUri().getPath() + ".convex_hull_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outPath)); } else { if (outFs.exists(outPath)) { if (params.getBoolean("overwrite", false)) { outFs.delete(outPath, true); } else { throw new RuntimeException("Output path already exists and -overwrite flag is not set"); } } } job.setJobName("ConvexHull"); job.setClass(SpatialSite.FilterClass, ConvexHullFilter.class, BlockFilter.class); job.setMapperClass(IdentityMapper.class); job.setCombinerClass(ConvexHullReducer.class); job.setReducerClass(ConvexHullReducer.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(shape.getClass()); job.setInputFormat(ShapeInputFormat.class); ShapeInputFormat.addInputPath(job, inFile); job.setOutputFormat(GridOutputFormat2.class); GridOutputFormat2.setOutputPath(job, outPath); JobClient.runJob(job); // If outputPath not set by user, automatically delete it if (userOutPath == null) outFs.delete(outPath, true); }
From source file:edu.umn.cs.spatialHadoop.operations.Crosses.java
License:Open Source License
public static <S extends Shape> long crosses(Path[] inFiles, Path userOutputPath, OperationsParams params) throws IOException, InterruptedException { JobConf job = new JobConf(params, Crosses.class); LOG.info("Crosses journey starts ...."); FileSystem inFs = inFiles[0].getFileSystem(job); Path outputPath = userOutputPath; if (outputPath == null) { FileSystem outFs = FileSystem.get(job); do {/* w ww .j a v a 2 s . co m*/ outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); } FileSystem outFs = outputPath.getFileSystem(job); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setJobName("Crosses"); job.setMapperClass(CrossesMap.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IndexedText.class); job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks())); job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize())); job.setReducerClass(CrossesReduce.class); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); job.setInputFormat(ShapeLineInputFormat.class); if (job.getBoolean("output", true)) job.setOutputFormat(TextOutputFormat.class); else job.setOutputFormat(NullOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, inFiles); // Calculate and set the dimensions of the grid to use in the map phase long total_size = 0; Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE); for (Path file : inFiles) { FileSystem fs = file.getFileSystem(params); Rectangle file_mbr = FileMBR.fileMBR(file, params); mbr.expand(file_mbr); total_size += FileUtil.getPathSize(fs, file); } // If the largest file is globally indexed, use its partitions total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f); int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20); int 
num_cells = (int) Math.max(1, total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath)); LOG.info("Number of cells is configured to be " + num_cells); OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive); OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce); OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly); GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2); gridInfo.calculateCellDimensions(num_cells); OperationsParams.setShape(job, PartitionGrid, gridInfo); TextOutputFormat.setOutputPath(job, outputPath); if (OperationsParams.isLocal(job, inFiles)) { // Enforce local execution if explicitly set by user or for small files job.set("mapred.job.tracker", "local"); } // Start the job RunningJob runningJob = JobClient.runJob(job); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); return resultCount; }
From source file:edu.umn.cs.spatialHadoop.operations.Disjoint.java
License:Open Source License
public static <S extends Shape> long disjoint(Path[] inFiles, Path userOutputPath, OperationsParams params) throws IOException, InterruptedException { JobConf job = new JobConf(params, Disjoint.class); LOG.info("Touches journey starts ...."); FileSystem inFs = inFiles[0].getFileSystem(job); Path outputPath = userOutputPath; if (outputPath == null) { FileSystem outFs = FileSystem.get(job); do {//from w w w. j a v a 2 s. c o m outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); } FileSystem outFs = outputPath.getFileSystem(job); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setJobName("Disjoint"); job.setMapperClass(DisjointMap.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IndexedText.class); job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks())); job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize())); job.setReducerClass(DisjointReduce.class); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); job.setInputFormat(ShapeLineInputFormat.class); if (job.getBoolean("output", true)) job.setOutputFormat(TextOutputFormat.class); else job.setOutputFormat(NullOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, inFiles); // Calculate and set the dimensions of the grid to use in the map phase long total_size = 0; Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE); for (Path file : inFiles) { FileSystem fs = file.getFileSystem(params); Rectangle file_mbr = FileMBR.fileMBR(file, params); mbr.expand(file_mbr); total_size += FileUtil.getPathSize(fs, file); } // If the largest file is globally indexed, use its partitions total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f); int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20); int 
num_cells = (int) Math.max(1, total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath)); LOG.info("Number of cells is configured to be " + num_cells); OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive); OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce); OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly); GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2); gridInfo.calculateCellDimensions(num_cells); OperationsParams.setShape(job, PartitionGrid, gridInfo); TextOutputFormat.setOutputPath(job, outputPath); if (OperationsParams.isLocal(job, inFiles)) { // Enforce local execution if explicitly set by user or for small files job.set("mapred.job.tracker", "local"); } // Start the job RunningJob runningJob = JobClient.runJob(job); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); return resultCount; }
From source file:edu.umn.cs.spatialHadoop.operations.DistributedCopy.java
License:Open Source License
private static void distributedCopy(Path inputPath, Path outputPath, OperationsParams params) throws IOException { JobConf job = new JobConf(params, DistributedCopy.class); job.setJobName("distcp3"); // Set input//from w w w. j av a2s. com job.setInputFormat(BlockInputFormat.class); BlockInputFormat.addInputPath(job, inputPath); // Set output job.setOutputFormat(BlockOutputFormat.class); BlockOutputFormat.setOutputPath(job, outputPath); job.setOutputCommitter(BlockOutputCommitter.class); // Set number of mappers/reducers ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setNumReduceTasks(0); // Run the job JobClient.runJob(job); }