List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass)
From source file:edu.umn.cs.spatialHadoop.operations.Contains.java
License:Open Source License
public static <S extends Shape> long contains(Path[] inFiles, Path userOutputPath, OperationsParams params) throws IOException, InterruptedException { JobConf job = new JobConf(params, Contains.class); LOG.info("Contains journey starts ...."); FileSystem inFs = inFiles[0].getFileSystem(job); Path outputPath = userOutputPath; if (outputPath == null) { FileSystem outFs = FileSystem.get(job); do {//w ww.j a v a 2s . co m outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); } FileSystem outFs = outputPath.getFileSystem(job); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setJobName("Within"); job.setMapperClass(ContainsMap.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IndexedText.class); job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks())); job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize())); job.setReducerClass(ContainsReduce.class); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); job.setInputFormat(ShapeLineInputFormat.class); if (job.getBoolean("output", true)) job.setOutputFormat(TextOutputFormat.class); else job.setOutputFormat(NullOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, inFiles); // Calculate and set the dimensions of the grid to use in the map phase long total_size = 0; Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE); for (Path file : inFiles) { FileSystem fs = file.getFileSystem(params); Rectangle file_mbr = FileMBR.fileMBR(file, params); mbr.expand(file_mbr); total_size += FileUtil.getPathSize(fs, file); } // If the largest file is globally indexed, use its partitions total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f); int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20); int num_cells = (int) Math.max(1, total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath)); LOG.info("Number of cells is configured to be " + num_cells); OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive); OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce); OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly); GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2); gridInfo.calculateCellDimensions(num_cells); OperationsParams.setShape(job, PartitionGrid, gridInfo); TextOutputFormat.setOutputPath(job, outputPath); if (OperationsParams.isLocal(job, inFiles)) { // Enforce local execution if explicitly set by user or for small files job.set("mapred.job.tracker", "local"); } // Start the job RunningJob runningJob = JobClient.runJob(job); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); return resultCount; }
From source file:edu.umn.cs.spatialHadoop.operations.Crosses.java
License:Open Source License
public static <S extends Shape> long crosses(Path[] inFiles, Path userOutputPath, OperationsParams params) throws IOException, InterruptedException { JobConf job = new JobConf(params, Crosses.class); LOG.info("Crosses journey starts ...."); FileSystem inFs = inFiles[0].getFileSystem(job); Path outputPath = userOutputPath; if (outputPath == null) { FileSystem outFs = FileSystem.get(job); do {// w ww . ja v a2s .c o m outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); } FileSystem outFs = outputPath.getFileSystem(job); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setJobName("Crosses"); job.setMapperClass(CrossesMap.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IndexedText.class); job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks())); job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize())); job.setReducerClass(CrossesReduce.class); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); job.setInputFormat(ShapeLineInputFormat.class); if (job.getBoolean("output", true)) job.setOutputFormat(TextOutputFormat.class); else job.setOutputFormat(NullOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, inFiles); // Calculate and set the dimensions of the grid to use in the map phase long total_size = 0; Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE); for (Path file : inFiles) { FileSystem fs = file.getFileSystem(params); Rectangle file_mbr = FileMBR.fileMBR(file, params); mbr.expand(file_mbr); total_size += FileUtil.getPathSize(fs, file); } // If the largest file is globally indexed, use its partitions total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f); int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20); int num_cells = (int) Math.max(1, total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath)); LOG.info("Number of cells is configured to be " + num_cells); OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive); OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce); OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly); GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2); gridInfo.calculateCellDimensions(num_cells); OperationsParams.setShape(job, PartitionGrid, gridInfo); TextOutputFormat.setOutputPath(job, outputPath); if (OperationsParams.isLocal(job, inFiles)) { // Enforce local execution if explicitly set by user or for small files job.set("mapred.job.tracker", "local"); } // Start the job RunningJob runningJob = JobClient.runJob(job); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); return resultCount; }
From source file:edu.umn.cs.spatialHadoop.operations.Disjoint.java
License:Open Source License
public static <S extends Shape> long disjoint(Path[] inFiles, Path userOutputPath, OperationsParams params) throws IOException, InterruptedException { JobConf job = new JobConf(params, Disjoint.class); LOG.info("Touches journey starts ...."); FileSystem inFs = inFiles[0].getFileSystem(job); Path outputPath = userOutputPath; if (outputPath == null) { FileSystem outFs = FileSystem.get(job); do {/*from www .ja v a2s . co m*/ outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); } FileSystem outFs = outputPath.getFileSystem(job); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setJobName("Disjoint"); job.setMapperClass(DisjointMap.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IndexedText.class); job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks())); job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize())); job.setReducerClass(DisjointReduce.class); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); job.setInputFormat(ShapeLineInputFormat.class); if (job.getBoolean("output", true)) job.setOutputFormat(TextOutputFormat.class); else job.setOutputFormat(NullOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, inFiles); // Calculate and set the dimensions of the grid to use in the map phase long total_size = 0; Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE); for (Path file : inFiles) { FileSystem fs = file.getFileSystem(params); Rectangle file_mbr = FileMBR.fileMBR(file, params); mbr.expand(file_mbr); total_size += FileUtil.getPathSize(fs, file); } // If the largest file is globally indexed, use its partitions total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f); int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20); int num_cells = (int) Math.max(1, total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath)); LOG.info("Number of cells is configured to be " + num_cells); OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive); OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce); OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly); GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2); gridInfo.calculateCellDimensions(num_cells); OperationsParams.setShape(job, PartitionGrid, gridInfo); TextOutputFormat.setOutputPath(job, outputPath); if (OperationsParams.isLocal(job, inFiles)) { // Enforce local execution if explicitly set by user or for small files job.set("mapred.job.tracker", "local"); } // Start the job RunningJob runningJob = JobClient.runJob(job); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); return resultCount; }
From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java
License:Open Source License
/** * Performs a redistribute join between the given files using the * redistribute join algorithm. Currently, we only support a pair of files. * @param inFiles//from ww w .j a v a 2 s . c om * @param userOutputPath * @param params * @return * @throws IOException */ public static <S extends Shape> long joinStep(Path[] inFiles, Path userOutputPath, OperationsParams params) throws IOException { long t1 = System.currentTimeMillis(); JobConf job = new JobConf(params, DistributedJoin.class); FileSystem fs[] = new FileSystem[inFiles.length]; for (int i_file = 0; i_file < inFiles.length; i_file++) fs[i_file] = inFiles[i_file].getFileSystem(job); Path outputPath = userOutputPath; if (outputPath == null) { do { outputPath = new Path(inFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000)); } while (fs[0].exists(outputPath)); } job.setJobName("DistributedJoin"); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); GlobalIndex<Partition> gindex1 = SpatialSite.getGlobalIndex(fs[0], inFiles[0]); GlobalIndex<Partition> gindex2 = SpatialSite.getGlobalIndex(fs[1], inFiles[1]); OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly); LOG.info("Joining " + inFiles[0] + " X " + inFiles[1]); if (SpatialSite.isRTree(fs[0], inFiles[0]) && SpatialSite.isRTree(fs[1], inFiles[1])) { job.setInputFormat(DJInputFormatRTree.class); } else { if (isOneShotReadMode) { // Ensure all objects are read in one shot job.setInt(SpatialSite.MaxBytesInOneRead, -1); job.setInt(SpatialSite.MaxShapesInOneRead, -1); } else { job.setInt(SpatialSite.MaxBytesInOneRead, maxBytesInOneRead); job.setInt(SpatialSite.MaxShapesInOneRead, maxShapesInOneRead); } job.setInputFormat(DJInputFormatArray.class); } // Set input paths and map function if (inFiles[0].equals(inFiles[1])) { // Self join job.setInputFormat(ShapeArrayInputFormat.class); // Remove the spatial filter to ensure all partitions are loaded FileInputFormat.setInputPaths(job, inFiles[0]); if (gindex1 != null && gindex1.isReplicated()) job.setMapperClass(RedistributeJoinMap.class); else job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class); } else { // Binary version of spatial join (two different input files) job.setClass(SpatialSite.FilterClass, SpatialJoinFilter.class, BlockFilter.class); FileInputFormat.setInputPaths(job, inFiles); if ((gindex1 != null && gindex1.isReplicated()) || (gindex2 != null && gindex2.isReplicated())) { // Need the map function with duplicate avoidance step. job.setMapperClass(RedistributeJoinMap.class); } else { // No replication in both indexes, use map function with no dup // avoidance job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class); } } Shape shape = params.getShape("shape"); job.setMapOutputKeyClass(shape.getClass()); job.setMapOutputValueClass(shape.getClass()); job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks())); job.setNumReduceTasks(0); // No reduce needed for this task if (job.getBoolean("output", true)) job.setOutputFormat(TextOutputFormat.class); else job.setOutputFormat(NullOutputFormat.class); TextOutputFormat.setOutputPath(job, outputPath); if (!params.getBoolean("background", false)) { LOG.info("Submit job in sync mode"); RunningJob runningJob = JobClient.runJob(job); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); // Output number of running map tasks Counter mapTaskCountCounter = counters.findCounter(JobInProgress.Counter.TOTAL_LAUNCHED_MAPS); System.out.println("Number of map tasks " + mapTaskCountCounter.getValue()); // Delete output directory if not explicitly set by user if (userOutputPath == null) fs[0].delete(outputPath, true); long t2 = System.currentTimeMillis(); System.out.println("Join time " + (t2 - t1) + " millis"); return resultCount; } else { JobClient jc = new JobClient(job); LOG.info("Submit job in async mode"); lastRunningJob = jc.submitJob(job); LOG.info("Job " + lastRunningJob + " submitted successfully"); return -1; } }
From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java
License:Open Source License
/** * Spatially joins two datasets by repartitioning the smaller dataset based * on the larger one, then apply one-to-one joining for each partition * /*from w ww .ja v a 2 s .co m*/ * @author Ibrahim Sabek * @param inputFiles * Input datasets to be spatially joined * @param fileToRepartition * Index of which file will be repartitioned * @param outputFile * Output file contains the joining results * @param params * Job configurations * @return * @throws IOException */ protected static long repartitionJoinStep(final Path[] inputFiles, int fileToRepartition, Path outputFile, OperationsParams params) throws IOException { boolean overwrite = params.getBoolean("overwrite", false); Shape stockShape = params.getShape("shape"); // Do the repartition step long t1 = System.currentTimeMillis(); JobConf repartitionJoinJob = new JobConf(params, DistributedJoin.class); repartitionJoinJob.setJobName("RepartitionJoin"); FileSystem fs = inputFiles[fileToRepartition].getFileSystem(params); Path outputPath = outputFile; if (outputPath == null) { do { outputPath = new Path(inputFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000)); } while (fs.exists(outputPath)); } LOG.info("Repartition - Joining " + inputFiles[0] + " X " + inputFiles[1]); // Get the cells to use for repartitioning GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inputFiles[1 - fileToRepartition]); OperationsParams.setRepartitionJoinIndexPath(repartitionJoinJob, RepartitionJoinIndexPath, inputFiles[1 - fileToRepartition]); OperationsParams.setInactiveModeFlag(repartitionJoinJob, InactiveMode, isReduceInactive); OperationsParams.setJoiningThresholdPerOnce(repartitionJoinJob, JoiningThresholdPerOnce, joiningThresholdPerOnce); OperationsParams.setFilterOnlyModeFlag(repartitionJoinJob, isFilterOnlyMode, isFilterOnly); CellInfo[] cellsInfo = SpatialSite.cellsOf(fs, inputFiles[1 - fileToRepartition]); // Repartition the file to match the other file boolean isReplicated = gindex.isReplicated(); boolean isCompact = gindex.isCompact(); String sindex; if (isReplicated && !isCompact) sindex = "grid"; else if (isReplicated && isCompact) sindex = "r+tree"; else if (!isReplicated && isCompact) sindex = "rtree"; else throw new RuntimeException("Unknown index at: " + inputFiles[1 - fileToRepartition]); params.set("sindex", sindex); // Decide which map function to use based on the type of global index if (sindex.equals("rtree") || sindex.equals("str")) { // Repartition without replication repartitionJoinJob.setMapperClass(RepartitionMapNoReplication.class); } else { // Repartition with replication (grid and r+tree) repartitionJoinJob.setMapperClass(RepartitionMap.class); } repartitionJoinJob.setMapOutputKeyClass(IntWritable.class); repartitionJoinJob.setMapOutputValueClass(stockShape.getClass()); ShapeInputFormat.setInputPaths(repartitionJoinJob, inputFiles[fileToRepartition]); repartitionJoinJob.setInputFormat(ShapeInputFormat.class); ClusterStatus clusterStatus = new JobClient(repartitionJoinJob).getClusterStatus(); repartitionJoinJob.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks())); SpatialSite.setCells(repartitionJoinJob, cellsInfo); repartitionJoinJob.setBoolean(SpatialSite.OVERWRITE, overwrite); // set reduce function repartitionJoinJob.setReducerClass(RepartitionJoinReduce.class); repartitionJoinJob.setNumReduceTasks( Math.max(1, Math.min(cellsInfo.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10))); repartitionJoinJob.setOutputFormat(TextOutputFormat.class); TextOutputFormat.setOutputPath(repartitionJoinJob, outputPath); RunningJob runningJob = JobClient.runJob(repartitionJoinJob); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); // Output number of running map tasks Counter mapTaskCountCounter = counters.findCounter(JobInProgress.Counter.TOTAL_LAUNCHED_MAPS); System.out.println("Number of map tasks " + mapTaskCountCounter.getValue()); // Delete output directory if not explicitly set by user if (outputFile == null) fs.delete(outputPath, true); long t2 = System.currentTimeMillis(); System.out.println("Repartitioning and Joining time " + (t2 - t1) + " millis"); return resultCount; }
From source file:edu.umn.cs.spatialHadoop.operations.Equals.java
License:Open Source License
public static <S extends Shape> long equals(Path[] inFiles, Path userOutputPath, OperationsParams params) throws IOException, InterruptedException { JobConf job = new JobConf(params, Equals.class); LOG.info("Equals journey starts ...."); FileSystem inFs = inFiles[0].getFileSystem(job); Path outputPath = userOutputPath; if (outputPath == null) { FileSystem outFs = FileSystem.get(job); do {/*from w ww .ja v a 2 s.c o m*/ outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); } FileSystem outFs = outputPath.getFileSystem(job); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setJobName("Equals"); job.setMapperClass(EqualsMap.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IndexedText.class); job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks())); job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize())); job.setReducerClass(EqualsReduce.class); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); job.setInputFormat(ShapeLineInputFormat.class); if (job.getBoolean("output", true)) job.setOutputFormat(TextOutputFormat.class); else job.setOutputFormat(NullOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, inFiles); // Calculate and set the dimensions of the grid to use in the map phase long total_size = 0; Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE); for (Path file : inFiles) { FileSystem fs = file.getFileSystem(params); Rectangle file_mbr = FileMBR.fileMBR(file, params); mbr.expand(file_mbr); total_size += FileUtil.getPathSize(fs, file); } // If the largest file is globally indexed, use its partitions total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f); int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20); int num_cells = (int) Math.max(1, total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath)); LOG.info("Number of cells is configured to be " + num_cells); OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive); OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce); OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly); GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2); gridInfo.calculateCellDimensions(num_cells); OperationsParams.setShape(job, PartitionGrid, gridInfo); TextOutputFormat.setOutputPath(job, outputPath); if (OperationsParams.isLocal(job, inFiles)) { // Enforce local execution if explicitly set by user or for small files job.set("mapred.job.tracker", "local"); } // Start the job RunningJob runningJob = JobClient.runJob(job); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); return resultCount; }
From source file:edu.umn.cs.spatialHadoop.operations.FileMBR.java
License:Open Source License
/** * Computes the MBR of the input file using an aggregate MapReduce job. * //w w w . j a v a2s. c om * @param inFile - Path to input file * @param params - Additional operation parameters * @return * @throws IOException * @throws InterruptedException */ private static <S extends Shape> Partition fileMBRMapReduce(Path[] inFiles, OperationsParams params) throws IOException, InterruptedException { JobConf job = new JobConf(params, FileMBR.class); Path outputPath; FileSystem outFs = FileSystem.get(job); do { outputPath = new Path(inFiles[0].getName() + ".mbr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); job.setJobName("FileMBR"); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Partition.class); job.setMapperClass(FileMBRMapper.class); job.setReducerClass(Reduce.class); job.setCombinerClass(Combine.class); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setInputFormat(ShapeLineInputFormat.class); job.setOutputFormat(TextOutputFormat.class); ShapeInputFormat.setInputPaths(job, inFiles); TextOutputFormat.setOutputPath(job, outputPath); job.setOutputCommitter(MBROutputCommitter.class); // Submit the job if (OperationsParams.isLocal(job, inFiles)) { // Enforce local execution if explicitly set by user or for small files job.set("mapred.job.tracker", "local"); // Use multithreading too job.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors()); } if (params.getBoolean("background", false)) { JobClient jc = new JobClient(job); lastSubmittedJob = jc.submitJob(job); return null; } else { lastSubmittedJob = JobClient.runJob(job); Counters counters = lastSubmittedJob.getCounters(); Counter outputSizeCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES); sizeOfLastProcessedFile = outputSizeCounter.getCounter(); FileStatus[] outFiles = outFs.listStatus(outputPath, SpatialSite.NonHiddenFileFilter); Partition mbr = new Partition(); mbr.set(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE); OperationsParams localMBRParams = new OperationsParams(params); localMBRParams.setBoolean("local", true); // Enforce local execution localMBRParams.setClass("shape", Partition.class, Shape.class); for (FileStatus outFile : outFiles) { if (outFile.isDir()) continue; ShapeRecordReader<Partition> reader = new ShapeRecordReader<Partition>(localMBRParams, new FileSplit(outFile.getPath(), 0, outFile.getLen(), new String[0])); Rectangle key = reader.createKey(); Partition p = reader.createValue(); while (reader.next(key, p)) { mbr.expand(p); } reader.close(); } outFs.delete(outputPath, true); return mbr; } }
From source file:edu.umn.cs.spatialHadoop.operations.Indexer.java
License:Open Source License
private static RunningJob indexMapReduce(Path inPath, Path outPath, OperationsParams params) throws IOException, InterruptedException { JobConf job = new JobConf(params, Indexer.class); job.setJobName("Indexer"); // Set input file MBR if not already set Rectangle inputMBR = (Rectangle) params.getShape("mbr"); if (inputMBR == null) inputMBR = FileMBR.fileMBR(inPath, params); OperationsParams.setShape(job, "mbr", inputMBR); // Set input and output job.setInputFormat(ShapeIterInputFormat.class); ShapeIterInputFormat.setInputPaths(job, inPath); job.setOutputFormat(IndexOutputFormat.class); GridOutputFormat.setOutputPath(job, outPath); // Set the correct partitioner according to index type String index = job.get("sindex"); if (index == null) throw new RuntimeException("Index type is not set"); long t1 = System.currentTimeMillis(); Partitioner partitioner = createPartitioner(inPath, outPath, job, index); Partitioner.setPartitioner(job, partitioner); long t2 = System.currentTimeMillis(); System.out.println("Total time for space subdivision in millis: " + (t2 - t1)); // Set mapper and reducer Shape shape = params.getShape("shape"); job.setMapperClass(IndexMethods.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(shape.getClass()); job.setReducerClass(IndexMethods.class); job.setOutputCommitter(IndexerOutputCommitter.class); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks())); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); // Use multithreading in case the job is running locally job.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors()); // Start the job if (params.getBoolean("background", false)) { // Run in background JobClient jc = new JobClient(job); return jc.submitJob(job); } else {/* ww w. j a v a 2s . c o m*/ // Run and block until it is finished return JobClient.runJob(job); } }
From source file:edu.umn.cs.spatialHadoop.operations.Intersects.java
License:Open Source License
public static <S extends Shape> long intersects(Path[] inFiles, Path userOutputPath, OperationsParams params) throws IOException, InterruptedException { JobConf job = new JobConf(params, Intersects.class); LOG.info("Intersects journey starts ...."); FileSystem inFs = inFiles[0].getFileSystem(job); Path outputPath = userOutputPath; if (outputPath == null) { FileSystem outFs = FileSystem.get(job); do {/*from www . ja v a 2s . c o m*/ outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); } FileSystem outFs = outputPath.getFileSystem(job); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setJobName("Intersects"); job.setMapperClass(IntersectsMap.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IndexedText.class); job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks())); job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize())); job.setReducerClass(IntersectsReduce.class); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); job.setInputFormat(ShapeLineInputFormat.class); if (job.getBoolean("output", true)) job.setOutputFormat(TextOutputFormat.class); else job.setOutputFormat(NullOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, inFiles); // Calculate and set the dimensions of the grid to use in the map phase long total_size = 0; Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE); for (Path file : inFiles) { FileSystem fs = file.getFileSystem(params); Rectangle file_mbr = FileMBR.fileMBR(file, params); mbr.expand(file_mbr); total_size += FileUtil.getPathSize(fs, file); } // If the largest file is globally indexed, use its partitions total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f); int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20); int num_cells = (int) Math.max(1, total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath)); LOG.info("Number of cells is configured to be " + num_cells); OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive); OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce); OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly); GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2); gridInfo.calculateCellDimensions(num_cells); OperationsParams.setShape(job, PartitionGrid, gridInfo); TextOutputFormat.setOutputPath(job, outputPath); if (OperationsParams.isLocal(job, inFiles)) { // Enforce local execution if explicitly set by user or for small files job.set("mapred.job.tracker", "local"); } // Start the job RunningJob runningJob = JobClient.runJob(job); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); return resultCount; }
From source file:edu.umn.cs.spatialHadoop.operations.Overlaps.java
License:Open Source License
public static <S extends Shape> long overlaps(Path[] inFiles, Path userOutputPath, OperationsParams params) throws IOException, InterruptedException { JobConf job = new JobConf(params, Overlaps.class); LOG.info("Overlaps journey starts ...."); FileSystem inFs = inFiles[0].getFileSystem(job); Path outputPath = userOutputPath; if (outputPath == null) { FileSystem outFs = FileSystem.get(job); do {/* w w w .j a va2 s.c o m*/ outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); } FileSystem outFs = outputPath.getFileSystem(job); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setJobName("Overlaps"); job.setMapperClass(OverlapMap.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IndexedText.class); job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks())); job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize())); job.setReducerClass(OverlapReduce.class); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); job.setInputFormat(ShapeLineInputFormat.class); if (job.getBoolean("output", true)) job.setOutputFormat(TextOutputFormat.class); else job.setOutputFormat(NullOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, inFiles); // Calculate and set the dimensions of the grid to use in the map phase long total_size = 0; Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE); for (Path file : inFiles) { FileSystem fs = file.getFileSystem(params); Rectangle file_mbr = FileMBR.fileMBR(file, params); mbr.expand(file_mbr); total_size += FileUtil.getPathSize(fs, file); } // If the largest file is globally indexed, use its partitions total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f); int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20); int num_cells = (int) Math.max(1, total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath)); LOG.info("Number of cells is configured to be " + num_cells); OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive); OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce); OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly); GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2); gridInfo.calculateCellDimensions(num_cells); OperationsParams.setShape(job, PartitionGrid, gridInfo); TextOutputFormat.setOutputPath(job, outputPath); if (OperationsParams.isLocal(job, inFiles)) { // Enforce local execution if explicitly set by user or for small files job.set("mapred.job.tracker", "local"); } // Start the job RunningJob runningJob = JobClient.runJob(job); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); return resultCount; }