List of usage examples for org.apache.hadoop.mapred JobConf setClass
public void setClass(String name, Class<?> theClass, Class<?> xface)
Sets the value of the name property to the name of a theClass implementing the given interface xface.
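A minimal round trip showing what setClass stores and how the matching getClass retrieves it. The property name my.codec.class is made up for illustration; setClass verifies the interface constraint and throws a RuntimeException if it does not hold.

import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.JobConf;

public class SetClassRoundTrip {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // setClass checks that GzipCodec implements CompressionCodec, then
        // stores its fully qualified class name under the property
        conf.setClass("my.codec.class", GzipCodec.class, CompressionCodec.class);
        // getClass reverses the lookup: load the stored name, checked against the interface;
        // the null default means "property was never set"
        Class<? extends CompressionCodec> codec =
                conf.getClass("my.codec.class", null, CompressionCodec.class);
        System.out.println(codec); // class org.apache.hadoop.io.compress.GzipCodec
    }
}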
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
/**
 * Sample a ratio of the file through a MapReduce job
 * @param fs
 * @param files
 * @param ratio
 * @param threshold - Maximum number of elements to be sampled
 * @param output
 * @param inObj
 * @return
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleMapReduceWithRatio(
        FileSystem fs, Path[] files, double ratio, long threshold, long seed,
        final ResultCollector<O> output, T inObj, O outObj) throws IOException {
    JobConf job = new JobConf(FileMBR.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(files[0].toUri().getPath() + ".sample_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("Sample");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setClass(InClass, inObj.getClass(), TextSerializable.class);
    job.setClass(OutClass, outObj.getClass(), TextSerializable.class);

    job.setMapperClass(Map.class);
    job.setLong(RANDOM_SEED, seed);
    job.setFloat(SAMPLE_RATIO, (float) ratio);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(0);

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeLineInputFormat.setInputPaths(job, files);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    RunningJob run_job = JobClient.runJob(job);

    Counters counters = run_job.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    Counter inputBytesCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES);
    Sampler.sizeOfLastProcessedFile = inputBytesCounter.getValue();

    // Ratio of records to return from output based on the threshold
    // Note that any number greater than or equal to one will cause all
    // elements to be returned
    final double selectRatio = (double) threshold / resultCount;

    // Read job result
    int result_size = 0;
    if (output != null) {
        Text line = new Text();
        FileStatus[] results = outFs.listStatus(outputPath);

        for (FileStatus fileStatus : results) {
            if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
                LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
                try {
                    while (lineReader.readLine(line) > 0) {
                        if (Math.random() < selectRatio) {
                            if (output != null) {
                                outObj.fromText(line);
                                output.collect(outObj);
                            }
                            result_size++;
                        }
                    }
                } catch (RuntimeException e) {
                    e.printStackTrace();
                }
                lineReader.close();
            }
        }
    }

    outFs.delete(outputPath, true);
    return result_size;
}
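The two setClass calls above stash the input and output record types so that the map tasks can re-create them. A minimal sketch of that task-side readback, assuming the InClass and OutClass constants resolve to the property strings shown below (both the names and the TextSerializable package path are assumptions, not the project's verbatim code):

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;
import com.ricemap.spateDB.core.TextSerializable; // package path assumed

public class SamplerReadbackSketch {
    static final String IN_CLASS = "Sampler.InClass";   // hypothetical property names;
    static final String OUT_CLASS = "Sampler.OutClass"; // the real constants live in Sampler

    public static TextSerializable[] createInOutObjects(JobConf job) {
        // getClass reverses setClass: load the stored class name, checked against the interface
        Class<? extends TextSerializable> inClass = job.getClass(IN_CLASS, null, TextSerializable.class);
        Class<? extends TextSerializable> outClass = job.getClass(OUT_CLASS, null, TextSerializable.class);
        // ReflectionUtils instantiates via the no-arg constructor and injects the configuration
        return new TextSerializable[] { ReflectionUtils.newInstance(inClass, job),
                ReflectionUtils.newInstance(outClass, job) };
    }
}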
From source file:com.scaleoutsoftware.soss.hserver.NamedMapInputFormatMapred.java
License:Apache License
/**
 * Sets {@link com.scaleoutsoftware.soss.client.map.NamedMap} as an input source for the job.
 *
 * @param configuration job to modify
 * @param map name of the map to be used as a job input
 * @param <K> the type of the key
 * @param <V> the type of the value
 */
public static <K, V> void setNamedMap(JobConf configuration, NamedMap<K, V> map) {
    configuration.setInt(inputAppIdProperty, map.getMapId());
    CustomSerializer<K> keySerializer = map.getKeySerializer();
    CustomSerializer<V> valueSerializer = map.getValueSerializer();

    configuration.setClass(inputNamedMapKeySerializerProperty, keySerializer.getClass(), Object.class);
    configuration.setClass(inputNamedMapValueSerializerProperty, valueSerializer.getClass(), Object.class);
    configuration.setInt(SERIALIZATION_MODE, map.getSerializationMode().ordinal());

    if (keySerializer.getObjectClass() != null) {
        configuration.setClass(inputNamedMapKeyProperty, keySerializer.getObjectClass(), Object.class);
    }
    if (valueSerializer.getObjectClass() != null) {
        configuration.setClass(inputNamedMapValueProperty, valueSerializer.getObjectClass(), Object.class);
    }
}
From source file:com.scaleoutsoftware.soss.hserver.NamedMapOutputFormatMapred.java
License:Apache License
/**
 * Sets the {@link NamedMap} to direct output to.
 *
 * @param configuration job to modify
 * @param map named map to be used for output
 */
public static void setNamedMap(JobConf configuration, NamedMap map) {
    configuration.setStrings(outputNamedMapProperty, map.getMapName());
    CustomSerializer keySerializer = map.getKeySerializer();
    CustomSerializer valueSerializer = map.getValueSerializer();

    configuration.setClass(outputNamedMapKeySerializerProperty, keySerializer.getClass(), Object.class);
    configuration.setClass(outputNamedMapValueSerializerProperty, valueSerializer.getClass(), Object.class);

    if (keySerializer.getObjectClass() != null) {
        configuration.setClass(outputNamedMapKeyProperty, keySerializer.getObjectClass(), Object.class);
    }
    if (valueSerializer.getObjectClass() != null) {
        configuration.setClass(outputNamedMapValueProperty, valueSerializer.getObjectClass(), Object.class);
    }
}
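Both ScaleOut examples guard the optional key/value classes with a null check: Configuration.setClass reads theClass.getName(), so passing null would throw a NullPointerException. On the read side, a null default lets the consumer detect that the optional property was never set. A minimal sketch of that lookup, with a hypothetical property name since the real constants are private to these classes:

import org.apache.hadoop.mapred.JobConf;

public class OptionalClassLookup {
    // Hypothetical name; the ScaleOut classes keep their property keys private
    static final String KEY_CLASS_PROPERTY = "namedmap.output.key.class";

    static Class<?> keyClassOrNull(JobConf conf) {
        // Mirrors the guarded setClass above: null means "never configured"
        return conf.getClass(KEY_CLASS_PROPERTY, null, Object.class);
    }
}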
From source file:edu.umn.cs.spatialHadoop.operations.Aggregate.java
License:Open Source License
/**
 * Computes the minimum and maximum aggregate values over the input files by
 * issuing a MapReduce job that scans all records.
 * @param files
 * @param params
 * @return
 * @throws IOException
 */
public static MinMax aggregateMapReduce(Path[] files, OperationsParams params) throws IOException {
    Shape plotRange = params.getShape("rect");
    JobConf job = new JobConf(params, Aggregate.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path("agg_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("Aggregate");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(MinMax.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeInputFormat.class);
    job.setClass("shape", NASAPoint.class, Shape.class);
    if (plotRange != null) {
        job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
    }
    job.setOutputFormat(TextOutputFormat.class);

    ShapeInputFormat.setInputPaths(job, files);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);

    // Read job result
    FileStatus[] results = outFs.listStatus(outputPath);
    MinMax minMax = new MinMax();
    for (FileStatus status : results) {
        if (status.getLen() > 0 && status.getPath().getName().startsWith("part-")) {
            BufferedReader reader = new BufferedReader(new InputStreamReader(outFs.open(status.getPath())));
            String line;
            MinMax value = new MinMax();
            while ((line = reader.readLine()) != null) {
                value.fromText(new Text(line));
                minMax.expand(value);
            }
            reader.close();
        }
    }

    outFs.delete(outputPath, true);
    return minMax;
}
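The SpatialSite.FilterClass property set here recurs in the ConvexHull, DistributedJoin, PyramidPlot, and Skyline examples below: each job registers a BlockFilter implementation that the input format later instantiates to prune partitions. A minimal sketch of that instantiation, assuming the property constant and package paths shown (the real SpatialHadoop input formats may differ in detail):

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;
import edu.umn.cs.spatialHadoop.core.SpatialSite;    // package paths assumed
import edu.umn.cs.spatialHadoop.mapred.BlockFilter;  // for this SpatialHadoop version

public class BlockFilterLookup {
    static BlockFilter configuredFilter(JobConf job) {
        // Null default: no registered filter means no partition pruning
        Class<? extends BlockFilter> filterClass =
                job.getClass(SpatialSite.FilterClass, null, BlockFilter.class);
        return filterClass == null ? null : ReflectionUtils.newInstance(filterClass, job);
    }
}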
From source file:edu.umn.cs.spatialHadoop.operations.ConvexHull.java
License:Open Source License
public static void convexHullMapReduce(Path inFile, Path userOutPath, OperationsParams params)
        throws IOException {
    JobConf job = new JobConf(params, ConvexHull.class);
    Path outPath = userOutPath;
    FileSystem outFs = (userOutPath == null ? inFile : userOutPath).getFileSystem(job);
    Shape shape = params.getShape("shape");

    if (outPath == null) {
        do {
            outPath = new Path(inFile.toUri().getPath() + ".convex_hull_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outPath));
    } else {
        if (outFs.exists(outPath)) {
            if (params.getBoolean("overwrite", false)) {
                outFs.delete(outPath, true);
            } else {
                throw new RuntimeException("Output path already exists and -overwrite flag is not set");
            }
        }
    }

    job.setJobName("ConvexHull");
    job.setClass(SpatialSite.FilterClass, ConvexHullFilter.class, BlockFilter.class);
    job.setMapperClass(IdentityMapper.class);
    job.setCombinerClass(ConvexHullReducer.class);
    job.setReducerClass(ConvexHullReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(shape.getClass());
    job.setInputFormat(ShapeInputFormat.class);
    ShapeInputFormat.addInputPath(job, inFile);
    job.setOutputFormat(GridOutputFormat2.class);
    GridOutputFormat2.setOutputPath(job, outPath);

    JobClient.runJob(job);

    // If outputPath not set by user, automatically delete it
    if (userOutPath == null)
        outFs.delete(outPath, true);
}
From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java
License:Open Source License
/**
 * Performs a redistribute join between the given files using the
 * redistribute join algorithm. Currently, we only support a pair of files.
 * @param inFiles
 * @param userOutputPath
 * @param params
 * @return
 * @throws IOException
 */
public static <S extends Shape> long joinStep(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException {
    long t1 = System.currentTimeMillis();

    JobConf job = new JobConf(params, DistributedJoin.class);

    FileSystem fs[] = new FileSystem[inFiles.length];
    for (int i_file = 0; i_file < inFiles.length; i_file++)
        fs[i_file] = inFiles[i_file].getFileSystem(job);

    Path outputPath = userOutputPath;
    if (outputPath == null) {
        do {
            outputPath = new Path(inFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (fs[0].exists(outputPath));
    }

    job.setJobName("DistributedJoin");
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    GlobalIndex<Partition> gindex1 = SpatialSite.getGlobalIndex(fs[0], inFiles[0]);
    GlobalIndex<Partition> gindex2 = SpatialSite.getGlobalIndex(fs[1], inFiles[1]);

    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    LOG.info("Joining " + inFiles[0] + " X " + inFiles[1]);

    if (SpatialSite.isRTree(fs[0], inFiles[0]) && SpatialSite.isRTree(fs[1], inFiles[1])) {
        job.setInputFormat(DJInputFormatRTree.class);
    } else {
        if (isOneShotReadMode) {
            // Ensure all objects are read in one shot
            job.setInt(SpatialSite.MaxBytesInOneRead, -1);
            job.setInt(SpatialSite.MaxShapesInOneRead, -1);
        } else {
            job.setInt(SpatialSite.MaxBytesInOneRead, maxBytesInOneRead);
            job.setInt(SpatialSite.MaxShapesInOneRead, maxShapesInOneRead);
        }
        job.setInputFormat(DJInputFormatArray.class);
    }

    // Set input paths and map function
    if (inFiles[0].equals(inFiles[1])) {
        // Self join
        job.setInputFormat(ShapeArrayInputFormat.class);
        // Remove the spatial filter to ensure all partitions are loaded
        FileInputFormat.setInputPaths(job, inFiles[0]);
        if (gindex1 != null && gindex1.isReplicated())
            job.setMapperClass(RedistributeJoinMap.class);
        else
            job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class);
    } else {
        // Binary version of spatial join (two different input files)
        job.setClass(SpatialSite.FilterClass, SpatialJoinFilter.class, BlockFilter.class);
        FileInputFormat.setInputPaths(job, inFiles);
        if ((gindex1 != null && gindex1.isReplicated()) || (gindex2 != null && gindex2.isReplicated())) {
            // Need the map function with duplicate avoidance step.
            job.setMapperClass(RedistributeJoinMap.class);
        } else {
            // No replication in both indexes, use map function with no dup avoidance
            job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class);
        }
    }

    Shape shape = params.getShape("shape");
    job.setMapOutputKeyClass(shape.getClass());
    job.setMapOutputValueClass(shape.getClass());

    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setNumReduceTasks(0); // No reduce needed for this task

    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (!params.getBoolean("background", false)) {
        LOG.info("Submit job in sync mode");
        RunningJob runningJob = JobClient.runJob(job);
        Counters counters = runningJob.getCounters();
        Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
        final long resultCount = outputRecordCounter.getValue();

        // Output number of running map tasks
        Counter mapTaskCountCounter = counters.findCounter(JobInProgress.Counter.TOTAL_LAUNCHED_MAPS);
        System.out.println("Number of map tasks " + mapTaskCountCounter.getValue());

        // Delete output directory if not explicitly set by user
        if (userOutputPath == null)
            fs[0].delete(outputPath, true);
        long t2 = System.currentTimeMillis();
        System.out.println("Join time " + (t2 - t1) + " millis");

        return resultCount;
    } else {
        JobClient jc = new JobClient(job);
        LOG.info("Submit job in async mode");
        lastRunningJob = jc.submitJob(job);
        LOG.info("Job " + lastRunningJob + " submitted successfully");
        return -1;
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.PyramidPlot.java
License:Apache License
/**
 * Plot a file to a set of images in different zoom levels using a MapReduce
 * program.
 * @param <S> type of shapes stored in file
 * @param inFile - Path to the input file(s)
 * @param outFile - Path to the output file (image)
 * @param shape - A sample object to be used for parsing input file
 * @param tileWidth - Width of each tile
 * @param tileHeight - Height of each tile
 * @param vflip - Set to <code>true</code> to flip the whole image vertically
 * @param color - Color used to draw single shapes
 * @param numLevels - Number of zoom levels to plot
 * @throws IOException
 */
private static <S extends Shape> RunningJob plotMapReduce(Path inFile, Path outFile, OperationsParams params)
        throws IOException {
    Color color = params.getColor("color", Color.BLACK);

    String hdfDataset = (String) params.get("dataset");
    Shape shape = hdfDataset != null ? new NASARectangle() : params.getShape("shape");
    Shape plotRange = params.getShape("rect");

    boolean background = params.is("background");

    JobConf job = new JobConf(params, PyramidPlot.class);
    job.setJobName("PlotPyramid");

    String partition = job.get("partition", "space").toLowerCase();
    if (partition.equals("space")) {
        job.setMapperClass(SpacePartitionMap.class);
        job.setReducerClass(SpacePartitionReduce.class);
        job.setMapOutputKeyClass(TileIndex.class);
        job.setMapOutputValueClass(shape.getClass());
        job.setInputFormat(ShapeInputFormat.class);
    } else {
        job.setMapperClass(DataPartitionMap.class);
        job.setReducerClass(DataPartitionReduce.class);
        job.setMapOutputKeyClass(TileIndex.class);
        job.setMapOutputValueClass(ImageWritable.class);
        job.setInputFormat(ShapeArrayInputFormat.class);
    }

    job.setInt("color", color.getRGB());
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    if (shape instanceof Point && job.getBoolean("sample", false)) {
        // Enable adaptive sampling
        int imageWidthRoot = job.getInt("tilewidth", 256);
        int imageHeightRoot = job.getInt("tileheight", 256);
        long recordCount = FileMBR.fileMBR(inFile, params).recordCount;
        float sampleRatio = params.getFloat(GeometricPlot.AdaptiveSampleFactor, 1.0f) * imageWidthRoot
                * imageHeightRoot / recordCount;
        job.setFloat(GeometricPlot.AdaptiveSampleRatio, sampleRatio);
    }

    Rectangle fileMBR;
    if (hdfDataset != null) {
        // Input is HDF
        job.set(HDFRecordReader.DatasetName, hdfDataset);
        job.setBoolean(HDFRecordReader.SkipFillValue, true);
        job.setClass("shape", NASARectangle.class, Shape.class);
        // Determine the range of values by opening one of the HDF files
        Aggregate.MinMax minMax = Aggregate.aggregate(new Path[] { inFile }, params);
        job.setInt(MinValue, minMax.minValue);
        job.setInt(MaxValue, minMax.maxValue);
        //fileMBR = new Rectangle(-180, -90, 180, 90);
        fileMBR = plotRange != null ? plotRange.getMBR() : new Rectangle(-180, -140, 180, 169);
        //job.setClass(HDFRecordReader.ProjectorClass, MercatorProjector.class,
        //    GeoProjector.class);
    } else {
        fileMBR = FileMBR.fileMBR(inFile, params);
    }

    boolean keepAspectRatio = params.is("keep-ratio", true);
    if (keepAspectRatio) {
        // Expand input file to a rectangle for compatibility with the pyramid structure
        if (fileMBR.getWidth() > fileMBR.getHeight()) {
            fileMBR.y1 -= (fileMBR.getWidth() - fileMBR.getHeight()) / 2;
            fileMBR.y2 = fileMBR.y1 + fileMBR.getWidth();
        } else {
            fileMBR.x1 -= (fileMBR.getHeight() - fileMBR.getWidth()) / 2;
            fileMBR.x2 = fileMBR.x1 + fileMBR.getHeight();
        }
    }

    SpatialSite.setRectangle(job, InputMBR, fileMBR);

    // Set input and output
    ShapeInputFormat.addInputPath(job, inFile);
    if (plotRange != null) {
        job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
    }

    job.setOutputFormat(PyramidOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outFile);
    job.setOutputCommitter(PlotPyramidOutputCommitter.class);

    if (background) {
        JobClient jc = new JobClient(job);
        return lastSubmittedJob = jc.submitJob(job);
    } else {
        return lastSubmittedJob = JobClient.runJob(job);
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.Skyline.java
License:Open Source License
private static void skylineMapReduce(Path inFile, Path userOutPath, OperationsParams params)
        throws IOException {
    JobConf job = new JobConf(params, Skyline.class);
    Path outPath = userOutPath;
    FileSystem outFs = (userOutPath == null ? inFile : userOutPath).getFileSystem(job);
    Shape shape = params.getShape("shape");

    if (outPath == null) {
        do {
            outPath = new Path(inFile.toUri().getPath() + ".skyline_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outPath));
    }

    job.setJobName("Skyline");
    job.setClass(SpatialSite.FilterClass, SkylineFilter.class, BlockFilter.class);
    job.setMapperClass(IdentityMapper.class);
    job.setCombinerClass(SkylineReducer.class);
    job.setReducerClass(SkylineReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(shape.getClass());
    job.setInputFormat(ShapeIterInputFormat.class);
    ShapeInputFormat.addInputPath(job, inFile);
    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outPath);

    JobClient.runJob(job);

    // If outputPath not set by user, automatically delete it
    if (userOutPath == null)
        outFs.delete(outPath, true);
}
From source file:it.crs4.pydoop.pipes.Submitter.java
License:Apache License
private static void setupPipesJob(JobConf conf) throws IOException {
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        conf.setMapRunnerClass(PipesMapRunner.class);
        // Save the user's partitioner and hook in ours.
        setJavaPartitioner(conf, conf.getPartitionerClass());
        conf.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        conf.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            conf.setOutputFormat(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, conf.getInputFormat().getClass(), InputFormat.class);
        conf.setInputFormat(PipesNonJavaInputFormat.class);
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        throw new IllegalArgumentException("No application program defined.");
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce task
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        IOException ie = new IOException("Problem parsing execable URI " + exec);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}
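The setClass call in the record-reader branch illustrates a save-and-hook pattern: the user's original InputFormat is stashed under Submitter.INPUT_FORMAT before PipesNonJavaInputFormat takes its place. A sketch of how the wrapper could recover the saved class, assuming Submitter.INPUT_FORMAT is accessible from the calling code (the real PipesNonJavaInputFormat may differ in detail):

import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class SavedInputFormatSketch {
    @SuppressWarnings("unchecked")
    static InputFormat<?, ?> savedInputFormat(JobConf conf) {
        // Fall back to TextInputFormat if nothing was saved
        Class<? extends InputFormat> clazz =
                conf.getClass(Submitter.INPUT_FORMAT, TextInputFormat.class, InputFormat.class);
        return ReflectionUtils.newInstance(clazz, conf);
    }
}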
From source file:org.apache.avro.mapred.AvroJob.java
License:Apache License
/** Configure a job's data model implementation class. */
public static void setDataModelClass(JobConf job, Class<? extends GenericData> modelClass) {
    job.setClass(CONF_DATA_MODEL, modelClass, GenericData.class);
}
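A plausible counterpart on the read side recovers the model class with GenericData as the default and instantiates it. This is a sketch under the assumption that CONF_DATA_MODEL is the same property used above and that plain GenericData is the intended fallback; it is not quoted from AvroJob:

import org.apache.avro.generic.GenericData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;

// Sketch: recover the model class stored by setDataModelClass, falling back
// to plain GenericData when the property was never set.
static GenericData createDataModel(Configuration conf) {
    Class<? extends GenericData> modelClass =
            conf.getClass(CONF_DATA_MODEL, GenericData.class, GenericData.class);
    return ReflectionUtils.newInstance(modelClass, conf);
}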