Usage examples for org.apache.hadoop.mapred.JobConf.setInputFormat
public void setInputFormat(Class<? extends InputFormat> theClass)
From source file:com.ostor.dedup.hadoop.DedupStorHadoopCreateSegmentsMapReduce.java
License:Open Source License
public static void main(String[] args) throws Exception { System.out.println("NOTE: Setting up logs from conf file - " + DedupStor.DEFAULT_LOG4J_FILE); PropertyConfigurator.configure(DedupStor.DEFAULT_LOG4J_FILE); JobConf conf = new JobConf(DedupStorHadoopCreateSegmentsMapReduce.class); conf.setJobName("dedup-create-segments"); conf.setMapOutputKeyClass(DedupHashWritable.class); conf.setMapOutputValueClass(DedupObjectSegmentCompleteWritable.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(DedupObjectSegmentWritable.class); conf.setMapperClass(DedupStorHadoopCreateSegmentsMapper.class); conf.setReducerClass(DedupStorHadoopCreateSegmentsReducer.class); conf.setInputFormat(DedupObjectInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); logger.info("Set input dir - " + args[0]); logger.info("Set output dir - " + args[1]); Path inputPath = new Path(args[0]); Path segmentStorPath = new Path(args[1], DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_SEGMENTS_LOC_SUFFIX); Path objectMapPath = new Path(args[1], DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_OBJECTS_TMP_PATH); conf.set(DedupStorHadoopUtils.HADOOP_CONF_SEGMENTS_STOR_PATH_KEY, segmentStorPath.toString()); conf.set(DedupStorHadoopUtils.HADOOP_CONF_OBJECTS_TMP_PATH_KEY, objectMapPath.toString()); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, objectMapPath); JobClient.runJob(conf);//from w w w . java 2 s. co m }
From source file:com.pinterest.hdfsbackup.distcp.DistCp.java
License:Apache License
private static JobConf createJobConf(Configuration conf) { JobConf jobconf = new JobConf(conf, DistCp.class); jobconf.setJobName(NAME);/*from ww w . j a v a2s . com*/ // turn off speculative execution, because DFS doesn't handle // multiple writers to the same file. jobconf.setMapSpeculativeExecution(false); jobconf.setInputFormat(CopyInputFormat.class); jobconf.setOutputKeyClass(Text.class); jobconf.setOutputValueClass(Text.class); jobconf.setMapperClass(CopyFilesMapper.class); jobconf.setNumReduceTasks(0); return jobconf; }
From source file:com.rapleaf.hank.hadoop.HadoopDomainBuilder.java
License:Apache License
public static final JobConf createJobConfiguration(String inputPath, Class<? extends InputFormat> inputFormatClass, Class<? extends Mapper> mapperClass, int versionNumber, DomainBuilderProperties properties) { JobConf conf = new JobConf(); // Input specification conf.setInputFormat(inputFormatClass); FileInputFormat.setInputPaths(conf, inputPath); // Mapper class and key/value classes conf.setMapperClass(mapperClass);//from www .j av a 2 s . com conf.setMapOutputKeyClass(KeyAndPartitionWritableComparable.class); conf.setMapOutputValueClass(ValueWritable.class); // Reducer class and key/value classes conf.setReducerClass(DomainBuilderReducer.class); conf.setOutputKeyClass(KeyAndPartitionWritable.class); conf.setOutputValueClass(ValueWritable.class); // Output format conf.setOutputFormat(properties.getOutputFormatClass()); // Output path (set to tmp output path) FileOutputFormat.setOutputPath(conf, new Path(properties.getTmpOutputPath(versionNumber))); // Partitioner conf.setPartitionerClass(DomainBuilderPartitioner.class); // Output Committer conf.setOutputCommitter(DomainBuilderOutputCommitter.class); // Hank specific configuration properties.setJobConfProperties(conf, versionNumber); return conf; }
From source file:com.ricemap.spateDB.operations.FileMBR.java
License:Apache License
/** * Counts the exact number of lines in a file by issuing a MapReduce job * that does the thing/* w w w .j ava 2 s . c om*/ * @param conf * @param fs * @param file * @return * @throws IOException */ public static <S extends Shape> Prism fileMBRMapReduce(FileSystem fs, Path file, S stockShape, boolean background) throws IOException { // Quickly get file MBR if it is globally indexed GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(fs, file); if (globalIndex != null) { // Return the MBR of the global index. // Compute file size by adding up sizes of all files assuming they are // not compressed long totalLength = 0; for (Partition p : globalIndex) { Path filePath = new Path(file, p.filename); if (fs.exists(filePath)) totalLength += fs.getFileStatus(filePath).getLen(); } sizeOfLastProcessedFile = totalLength; return globalIndex.getMBR(); } JobConf job = new JobConf(FileMBR.class); Path outputPath; FileSystem outFs = FileSystem.get(job); do { outputPath = new Path(file.toUri().getPath() + ".mbr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); job.setJobName("FileMBR"); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Prism.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setCombinerClass(Reduce.class); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setInputFormat(ShapeInputFormat.class); SpatialSite.setShapeClass(job, stockShape.getClass()); job.setOutputFormat(TextOutputFormat.class); ShapeInputFormat.setInputPaths(job, file); TextOutputFormat.setOutputPath(job, outputPath); job.setOutputCommitter(MBROutputCommitter.class); // Submit the job if (background) { JobClient jc = new JobClient(job); lastSubmittedJob = jc.submitJob(job); return null; } else { lastSubmittedJob = JobClient.runJob(job); Counters counters = lastSubmittedJob.getCounters(); Counter inputBytesCounter = 
counters.findCounter(Task.Counter.MAP_INPUT_BYTES); FileMBR.sizeOfLastProcessedFile = inputBytesCounter.getValue(); // Read job result FileStatus[] results = outFs.listStatus(outputPath); Prism mbr = new Prism(); for (FileStatus fileStatus : results) { if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) { LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath())); Text text = new Text(); if (lineReader.readLine(text) > 0) { mbr.fromText(text); } lineReader.close(); } } outFs.delete(outputPath, true); return mbr; } }
From source file:com.ricemap.spateDB.operations.LineRandomizer.java
License:Apache License
/** * Counts the exact number of lines in a file by issuing a MapReduce job * that does the thing//from w w w . j av a2 s.c o m * @param conf * @param infs * @param infile * @return * @throws IOException */ public static void randomizerMapReduce(Path infile, Path outfile, boolean overwrite) throws IOException { JobConf job = new JobConf(LineRandomizer.class); FileSystem outfs = outfile.getFileSystem(job); if (overwrite) outfs.delete(outfile, true); job.setJobName("Randomizer"); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(Text.class); job.setMapperClass(Map.class); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setReducerClass(Reduce.class); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); FileSystem infs = infile.getFileSystem(job); int numOfPartitions = (int) Math .ceil((double) infs.getFileStatus(infile).getLen() / infs.getDefaultBlockSize(outfile)); job.setInt(NumOfPartitions, numOfPartitions); job.setInputFormat(TextInputFormat.class); TextInputFormat.setInputPaths(job, infile); job.setOutputFormat(TextOutputFormat.class); TextOutputFormat.setOutputPath(job, outfile); // Submit the job JobClient.runJob(job); }
From source file:com.ricemap.spateDB.operations.Plot.java
License:Apache License
public static <S extends Shape> void plotMapReduce(Path inFile, Path outFile, Shape shape, int width, int height, Color color, boolean showBorders, boolean showBlockCount, boolean showRecordCount, boolean background) throws IOException { JobConf job = new JobConf(Plot.class); job.setJobName("Plot"); job.setMapperClass(PlotMap.class); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setReducerClass(PlotReduce.class); job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks())); job.setMapOutputKeyClass(Prism.class); SpatialSite.setShapeClass(job, shape.getClass()); job.setMapOutputValueClass(shape.getClass()); FileSystem inFs = inFile.getFileSystem(job); Prism fileMbr = FileMBR.fileMBRMapReduce(inFs, inFile, shape, false); FileStatus inFileStatus = inFs.getFileStatus(inFile); CellInfo[] cellInfos;/*from www .ja v a 2 s . c o m*/ GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(inFs, inFile); if (gindex == null) { // A heap file. 
The map function should partition the file GridInfo gridInfo = new GridInfo(fileMbr.t1, fileMbr.x1, fileMbr.y1, fileMbr.t2, fileMbr.x2, fileMbr.y2); gridInfo.calculateCellDimensions(inFileStatus.getLen(), inFileStatus.getBlockSize()); cellInfos = gridInfo.getAllCells(); // Doesn't make sense to show any partition information in a heap file showBorders = showBlockCount = showRecordCount = false; } else { cellInfos = SpatialSite.cellsOf(inFs, inFile); } // Set cell information in the job configuration to be used by the mapper SpatialSite.setCells(job, cellInfos); // Adjust width and height to maintain aspect ratio if ((fileMbr.x2 - fileMbr.x1) / (fileMbr.y2 - fileMbr.y1) > (double) width / height) { // Fix width and change height height = (int) ((fileMbr.y2 - fileMbr.y1) * width / (fileMbr.x2 - fileMbr.x1)); } else { width = (int) ((fileMbr.x2 - fileMbr.x1) * height / (fileMbr.y2 - fileMbr.y1)); } LOG.info("Creating an image of size " + width + "x" + height); ImageOutputFormat.setFileMBR(job, fileMbr); ImageOutputFormat.setImageWidth(job, width); ImageOutputFormat.setImageHeight(job, height); job.setBoolean(ShowBorders, showBorders); job.setBoolean(ShowBlockCount, showBlockCount); job.setBoolean(ShowRecordCount, showRecordCount); job.setInt(StrokeColor, color.getRGB()); // Set input and output job.setInputFormat(ShapeInputFormat.class); ShapeInputFormat.addInputPath(job, inFile); // Set output committer which will stitch images together after all reducers // finish job.setOutputCommitter(PlotOutputCommitter.class); job.setOutputFormat(ImageOutputFormat.class); TextOutputFormat.setOutputPath(job, outFile); if (background) { JobClient jc = new JobClient(job); lastSubmittedJob = jc.submitJob(job); } else { lastSubmittedJob = JobClient.runJob(job); } }
From source file:com.ricemap.spateDB.operations.RangeQuery.java
License:Apache License
/** * Performs a range query using MapReduce * // w w w . ja v a 2 s .co m * @param fs * @param inputFile * @param queryRange * @param shape * @param output * @return * @throws IOException */ public static long rangeQueryMapReduce(FileSystem fs, Path inputFile, Path userOutputPath, Shape queryShape, Shape shape, boolean overwrite, boolean background, QueryInput query) throws IOException { JobConf job = new JobConf(FileMBR.class); FileSystem outFs = inputFile.getFileSystem(job); Path outputPath = userOutputPath; if (outputPath == null) { do { outputPath = new Path( inputFile.toUri().getPath() + ".rangequery_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); } else { if (outFs.exists(outputPath)) { if (overwrite) { outFs.delete(outputPath, true); } else { throw new RuntimeException("Output path already exists and -overwrite flag is not set"); } } } job.setJobName("RangeQuery"); job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class); RangeFilter.setQueryRange(job, queryShape); // Set query range for // filter ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setNumReduceTasks(3); // Decide which map function to use depending on how blocks are indexed // And also which input format to use if (SpatialSite.isRTree(fs, inputFile)) { // RTree indexed file LOG.info("Searching an RTree indexed file"); job.setInputFormat(RTreeInputFormat.class); } else { // A file with no local index LOG.info("Searching a non local-indexed file"); job.setInputFormat(ShapeInputFormat.class); } GlobalIndex<Partition> gIndex = SpatialSite.getGlobalIndex(fs, inputFile); // if (gIndex != null && gIndex.isReplicated()){ // job.setMapperClass(RangeQueryMap.class); Class<?> OutputKey = NullWritable.class; try { Class<?> c = shape.getClass(); Field f = c.getDeclaredField(query.field); f.setAccessible(true); if (f.getType().equals(Integer.TYPE)) { OutputKey = IntWritable.class; } 
else if (f.getType().equals(Double.TYPE)) { OutputKey = DoubleWritable.class; } else if (f.getType().equals(Long.TYPE)) { OutputKey = LongWritable.class; } } catch (SecurityException e) { e.printStackTrace(); } catch (NoSuchFieldException e) { // TODO Auto-generated catch block e.printStackTrace(); } job.setMapOutputKeyClass(OutputKey); switch (query.type) { case Distinct: job.setMapperClass(DistinctQueryMap.class); job.setReducerClass(DistinctQueryReduce.class); job.setMapOutputValueClass(NullWritable.class); break; case Distribution: job.setMapperClass(DistributionQueryMap.class); job.setReducerClass(DistributionQueryReduce.class); job.setMapOutputValueClass(IntWritable.class); break; default: break; } // } // else // job.setMapperClass(RangeQueryMapNoDupAvoidance.class); // Set query range for the map function job.set(QUERY_SHAPE_CLASS, queryShape.getClass().getName()); job.set(QUERY_SHAPE, queryShape.toText(new Text()).toString()); job.set(QUERY_FIELD, query.field); // Set shape class for the SpatialInputFormat SpatialSite.setShapeClass(job, shape.getClass()); job.setOutputFormat(TextOutputFormat.class); ShapeInputFormat.setInputPaths(job, inputFile); TextOutputFormat.setOutputPath(job, outputPath); // Submit the job if (!background) { RunningJob runningJob = JobClient.runJob(job); Counters counters = runningJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); // If outputPath not set by user, automatically delete it if (userOutputPath == null) outFs.delete(outputPath, true); return resultCount; } else { JobClient jc = new JobClient(job); lastRunningJob = jc.submitJob(job); return -1; } }
From source file:com.ricemap.spateDB.operations.RecordCount.java
License:Apache License
/** * Counts the exact number of lines in a file by issuing a MapReduce job * that does the thing/*from w ww . j a va2s .c o m*/ * @param conf * @param fs * @param file * @return * @throws IOException */ public static long recordCountMapReduce(FileSystem fs, Path file) throws IOException { JobConf job = new JobConf(RecordCount.class); Path outputPath = new Path(file.toUri().getPath() + ".linecount"); FileSystem outFs = outputPath.getFileSystem(job); outFs.delete(outputPath, true); job.setJobName("LineCount"); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(LongWritable.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setCombinerClass(Reduce.class); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setNumReduceTasks(1); job.setInputFormat(ShapeLineInputFormat.class); job.setOutputFormat(TextOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, file); TextOutputFormat.setOutputPath(job, outputPath); // Submit the job JobClient.runJob(job); // Read job result long lineCount = 0; FileStatus[] results = outFs.listStatus(outputPath); for (FileStatus fileStatus : results) { if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) { LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath())); Text text = new Text(); if (lineReader.readLine(text) > 0) { lineCount = Long.parseLong(text.toString()); } lineReader.close(); } } outFs.delete(outputPath, true); return lineCount; }
From source file:com.ricemap.spateDB.operations.Repartition.java
License:Apache License
/** * Repartitions an input file according to the given list of cells. * @param inFile//from w w w . j a v a2s . c o m * @param outPath * @param cellInfos * @param pack * @param rtree * @param overwrite * @throws IOException */ public static void repartitionMapReduce(Path inFile, Path outPath, Shape stockShape, long blockSize, CellInfo[] cellInfos, String sindex, boolean overwrite, boolean columnar) throws IOException { JobConf job = new JobConf(Repartition.class); job.setJobName("Repartition"); FileSystem outFs = outPath.getFileSystem(job); // Overwrite output file if (outFs.exists(outPath)) { if (overwrite) outFs.delete(outPath, true); else throw new RuntimeException( "Output file '" + outPath + "' already exists and overwrite flag is not set"); } // Decide which map function to use depending on the type of global index if (sindex.equals("rtree")) { // Repartition without replication job.setMapperClass(RepartitionMapNoReplication.class); } else { // Repartition with replication (grid and r+tree) job.setMapperClass(RepartitionMap.class); } job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(stockShape.getClass()); ShapeInputFormat.setInputPaths(job, inFile); job.setInputFormat(ShapeInputFormat.class); boolean pack = sindex.equals("r+tree"); boolean expand = sindex.equals("rtree"); job.setBoolean(SpatialSite.PACK_CELLS, pack); job.setBoolean(SpatialSite.EXPAND_CELLS, expand); job.setStrings(SpatialSite.STORAGE_MODE, columnar ? 
"columnar" : "normal"); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks())); // Set default parameters for reading input file SpatialSite.setShapeClass(job, stockShape.getClass()); FileOutputFormat.setOutputPath(job, outPath); if (sindex.equals("grid")) { job.setOutputFormat(GridOutputFormat.class); } else if (sindex.equals("rtree") || sindex.equals("r+tree")) { // For now, the two types of local index are the same job.setOutputFormat(RTreeGridOutputFormat.class); } else { throw new RuntimeException("Unsupported spatial index: " + sindex); } // Copy block size from source file if it's globally indexed FileSystem inFs = inFile.getFileSystem(job); if (blockSize == 0) { GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile); if (globalIndex != null) { blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename)) .getBlockSize(); LOG.info("Automatically setting block size to " + blockSize); } } if (blockSize != 0) job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blockSize); SpatialSite.setCells(job, cellInfos); job.setBoolean(SpatialSite.OVERWRITE, overwrite); // Set reduce function job.setReducerClass(RepartitionReduce.class); job.setNumReduceTasks( Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10))); // Set output committer that combines output files together job.setOutputCommitter(RepartitionOutputCommitter.class); JobClient.runJob(job); }
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
/** * Sample a ratio of the file through a MapReduce job * @param fs/*from w w w . j a va2 s . c om*/ * @param files * @param ratio * @param threshold - Maximum number of elements to be sampled * @param output * @param inObj * @return * @throws IOException */ public static <T extends TextSerializable, O extends TextSerializable> int sampleMapReduceWithRatio( FileSystem fs, Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output, T inObj, O outObj) throws IOException { JobConf job = new JobConf(FileMBR.class); Path outputPath; FileSystem outFs = FileSystem.get(job); do { outputPath = new Path(files[0].toUri().getPath() + ".sample_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); job.setJobName("Sample"); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Text.class); job.setClass(InClass, inObj.getClass(), TextSerializable.class); job.setClass(OutClass, outObj.getClass(), TextSerializable.class); job.setMapperClass(Map.class); job.setLong(RANDOM_SEED, seed); job.setFloat(SAMPLE_RATIO, (float) ratio); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setNumReduceTasks(0); job.setInputFormat(ShapeLineInputFormat.class); job.setOutputFormat(TextOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, files); TextOutputFormat.setOutputPath(job, outputPath); // Submit the job RunningJob run_job = JobClient.runJob(job); Counters counters = run_job.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); Counter inputBytesCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES); Sampler.sizeOfLastProcessedFile = inputBytesCounter.getValue(); // Ratio of records to return from output based on the threshold // Note that any number greater than or equal to one will cause all // elements to be returned final double 
selectRatio = (double) threshold / resultCount; // Read job result int result_size = 0; if (output != null) { Text line = new Text(); FileStatus[] results = outFs.listStatus(outputPath); for (FileStatus fileStatus : results) { if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) { LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath())); try { while (lineReader.readLine(line) > 0) { if (Math.random() < selectRatio) { if (output != null) { outObj.fromText(line); output.collect(outObj); } result_size++; } } } catch (RuntimeException e) { e.printStackTrace(); } lineReader.close(); } } } outFs.delete(outputPath, true); return result_size; }