List of usage examples for org.apache.hadoop.mapred JobConf setOutputValueClass
public void setOutputValueClass(Class<?> theClass)
From source file:com.benchmark.mapred.RandomTextWriter.java
License:Apache License
/** * This is the main routine for launching a distributed random write job. * It runs 10 maps/node and each node writes 1 gig of data to a DFS file. * The reduce doesn't do anything./*from w w w. ja va 2s . c o m*/ * * @throws IOException */ public int run(String[] args) throws Exception { if (args.length == 0) { return printUsage(); } JobConf job = new JobConf(getConf()); job.setJarByClass(RandomTextWriter.class); job.setJobName("random-text-writer"); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormat(RandomWriter.RandomInputFormat.class); job.setMapperClass(Map.class); JobClient client = new JobClient(job); ClusterStatus cluster = client.getClusterStatus(); int numMapsPerHost = job.getInt("test.randomtextwrite.maps_per_host", 10); long numBytesToWritePerMap = job.getLong("test.randomtextwrite.bytes_per_map", 1 * 1024 * 1024 * 1024); if (numBytesToWritePerMap == 0) { System.err.println("Cannot have test.randomtextwrite.bytes_per_map set to 0"); return -2; } long totalBytesToWrite = job.getLong("test.randomtextwrite.total_bytes", numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers()); int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap); if (numMaps == 0 && totalBytesToWrite > 0) { numMaps = 1; job.setLong("test.randomtextwrite.bytes_per_map", totalBytesToWrite); } Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class; List<String> otherArgs = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-outFormat".equals(args[i])) { outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class); } else { otherArgs.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); // exits } } job.setOutputFormat(outputFormatClass); FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0))); job.setNumMapTasks(numMaps); System.out.println("Running " + numMaps + " maps."); // reducer NONE job.setNumReduceTasks(0); Date startTime = new Date(); System.out.println("Job started: " + startTime); JobClient.runJob(job); Date endTime = new Date(); System.out.println("Job ended: " + endTime); System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds."); return 0; }
From source file:com.benchmark.mapred.RandomWriter.java
License:Apache License
/** * This is the main routine for launching a distributed random write job. * It runs 10 maps/node and each node writes 1 gig of data to a DFS file. * The reduce doesn't do anything.//from ww w .j a va 2 s . c o m * * @throws IOException */ public int run(String[] args) throws Exception { if (args.length == 0) { System.out.println("Usage: writer <out-dir>"); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path outDir = new Path(args[0]); JobConf job = new JobConf(getConf()); job.setJarByClass(RandomWriter.class); job.setJobName("random-writer"); FileOutputFormat.setOutputPath(job, outDir); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(BytesWritable.class); job.setInputFormat(RandomInputFormat.class); job.setMapperClass(Map.class); job.setReducerClass(IdentityReducer.class); job.setOutputFormat(SequenceFileOutputFormat.class); JobClient client = new JobClient(job); ClusterStatus cluster = client.getClusterStatus(); int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10); long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024); if (numBytesToWritePerMap == 0) { System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0"); return -2; } long totalBytesToWrite = job.getLong("test.randomwrite.total_bytes", numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers()); int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap); if (numMaps == 0 && totalBytesToWrite > 0) { numMaps = 1; job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite); } job.setNumMapTasks(numMaps); System.out.println("Running " + numMaps + " maps."); // reducer NONE job.setNumReduceTasks(0); Date startTime = new Date(); System.out.println("Job started: " + startTime); JobClient.runJob(job); Date endTime = new Date(); System.out.println("Job ended: " + endTime); System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds."); return 0; }
From source file:com.benchmark.mapred.Sort.java
License:Apache License
/** * The main driver for sort program./*from w w w . ja v a 2 s .c om*/ * Invoke this method to submit the map/reduce job. * @throws IOException When there is communication problems with the * job tracker. */ public int run(String[] args) throws Exception { JobConf jobConf = new JobConf(getConf(), Sort.class); jobConf.setJobName("sorter"); jobConf.setMapperClass(IdentityMapper.class); jobConf.setReducerClass(IdentityReducer.class); JobClient client = new JobClient(jobConf); ClusterStatus cluster = client.getClusterStatus(); int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9); String sort_reduces = jobConf.get("test.sort.reduces_per_host"); if (sort_reduces != null) { num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces); } Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class; Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class; Class<? extends WritableComparable> outputKeyClass = BytesWritable.class; Class<? extends Writable> outputValueClass = BytesWritable.class; List<String> otherArgs = new ArrayList<String>(); InputSampler.Sampler<K, V> sampler = null; for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { jobConf.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("-r".equals(args[i])) { num_reduces = Integer.parseInt(args[++i]); } else if ("-inFormat".equals(args[i])) { inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class); } else if ("-outFormat".equals(args[i])) { outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class); } else if ("-outKey".equals(args[i])) { outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class); } else if ("-outValue".equals(args[i])) { outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class); } else if ("-totalOrder".equals(args[i])) { double pcnt = Double.parseDouble(args[++i]); int numSamples = Integer.parseInt(args[++i]); int maxSplits = Integer.parseInt(args[++i]); if (0 >= maxSplits) maxSplits = Integer.MAX_VALUE; sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits); } else { otherArgs.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); // exits } } // Set user-supplied (possibly default) job configs jobConf.setNumReduceTasks(num_reduces); jobConf.setInputFormat(inputFormatClass); jobConf.setOutputFormat(outputFormatClass); jobConf.setOutputKeyClass(outputKeyClass); jobConf.setOutputValueClass(outputValueClass); // Make sure there are exactly 2 parameters left. if (otherArgs.size() != 2) { System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2."); return printUsage(); } FileInputFormat.setInputPaths(jobConf, otherArgs.get(0)); FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1))); if (sampler != null) { System.out.println("Sampling input to effect total-order sort..."); jobConf.setPartitionerClass(TotalOrderPartitioner.class); Path inputDir = FileInputFormat.getInputPaths(jobConf)[0]; inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf)); Path partitionFile = new Path(inputDir, "_sortPartitioning"); TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile); InputSampler.<K, V>writePartitionFile(jobConf, sampler); URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning"); DistributedCache.addCacheFile(partitionUri, jobConf); DistributedCache.createSymlink(jobConf); } System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from " + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf) + " with " + num_reduces + " reduces."); Date startTime = new Date(); System.out.println("Job started: " + startTime); jobResult = JobClient.runJob(jobConf); Date end_time = new Date(); System.out.println("Job ended: " + end_time); System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds."); return 0; }
From source file:com.benchmark.mapred.terasort.TeraGen.java
License:Apache License
/** * @param args the cli arguments//from w ww . jav a2s.c o m */ public int run(String[] args) throws IOException { JobConf job = (JobConf) getConf(); setNumberOfRows(job, Long.parseLong(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJobName("TeraGen"); job.setJarByClass(TeraGen.class); job.setMapperClass(SortGenMapper.class); job.setNumReduceTasks(0); job.setNumMapTasks(50); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormat(RangeInputFormat.class); job.setOutputFormat(TeraOutputFormat.class); JobClient.runJob(job); return 0; }
From source file:com.benchmark.mapred.terasort.TeraSort.java
License:Apache License
public int run(String[] args) throws Exception { LOG.info("starting"); JobConf job = (JobConf) getConf(); Path inputDir = new Path(args[0]); if (args.length != 3) { System.out.println("ERROR: Wrong number of parameters: " + args.length + " instead of 3."); }// w w w .ja v a 2s. c o m inputDir = inputDir.makeQualified(inputDir.getFileSystem(job)); Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME); URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME); TeraInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJobName("TeraSort"); job.setJarByClass(TeraSort.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormat(TeraInputFormat.class); job.setOutputFormat(TeraOutputFormat.class); job.setPartitionerClass(TotalOrderPartitioner.class); job.setNumReduceTasks(Integer.parseInt(args[2])); TeraInputFormat.writePartitionFile(job, partitionFile); DistributedCache.addCacheFile(partitionUri, job); DistributedCache.createSymlink(job); job.setInt("dfs.replication", 1); TeraOutputFormat.setFinalSync(job, true); Date startIteration = new Date(); JobClient.runJob(job); Date endIteration = new Date(); System.out.println( "The iteration took " + (endIteration.getTime() - startIteration.getTime()) / 1000 + " seconds."); LOG.info("done"); return 0; }
From source file:com.benchmark.mapred.terasort.TeraValidate.java
License:Apache License
public int run(String[] args) throws Exception { JobConf job = (JobConf) getConf(); TeraInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJobName("TeraValidate"); job.setJarByClass(TeraValidate.class); job.setMapperClass(ValidateMapper.class); job.setReducerClass(ValidateReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // force a single reducer job.setNumReduceTasks(1);/*w w w.j a va 2 s . c om*/ // force a single split job.setLong("mapred.min.split.size", Long.MAX_VALUE); job.setInputFormat(TeraInputFormat.class); JobClient.runJob(job); return 0; }
From source file:com.bigdata.diane.MiniTestDFSIO.java
License:Apache License
@SuppressWarnings("deprecation") private static void runIOTest(Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass, Path outputDir, Configuration fsConfig) throws IOException { JobConf job = new JobConf(fsConfig, MiniTestDFSIO.class); FileInputFormat.setInputPaths(job, CONTROL_DIR); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(mapperClass);/*from w w w .j av a2s. c o m*/ job.setReducerClass(AccumulatingReducer.class); FileOutputFormat.setOutputPath(job, outputDir); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setNumReduceTasks(1); JobClient.runJob(job); }
From source file:com.bixolabs.cascading.avro.AvroScheme.java
License:Apache License
@SuppressWarnings({ "deprecation" }) @Override/*w ww .ja v a2 s .c om*/ public void sinkInit(Tap tap, JobConf conf) { conf.set(AvroJob.OUTPUT_SCHEMA, getSchema().toString()); conf.setOutputFormat(AvroOutputFormat.class); // Since we're outputting to Avro, we need to set up output values. // TODO KKr - why don't we need to set the OutputValueClass? // TODO KKr - why do we need to set the OutputKeyComparatorClass? conf.setOutputKeyClass(NullWritable.class); conf.setOutputValueClass(AvroWrapper.class); conf.setOutputKeyComparatorClass(AvroKeyComparator.class); // conf.setMapOutputKeyClass(AvroKey.class); // conf.setMapOutputValueClass(AvroValue.class); // add AvroSerialization to io.serializations // Collection<String> serializations = conf.getStringCollection("io.serializations"); // if (!serializations.contains(AvroSerialization.class.getName())) { // serializations.add(AvroSerialization.class.getName()); // conf.setStrings("io.serializations", serializations.toArray(new String[0])); // } // Class<? extends Mapper> mapClass = conf.getMapperClass(); // Class<? extends Reducer> reduceClass = conf.getReducerClass(); // AvroJob.setOutputSchema(conf, getSchema()); // conf.setMapperClass(mapClass); // conf.setReducerClass(reduceClass); LOGGER.info(String.format("Initializing Avro scheme for sink tap - scheme fields: %s", _schemeFields)); }
From source file:com.chriscx.mapred.Driver.java
public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), Driver.class); conf.setJobName("wordcount"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { if ("-skip".equals(args[i])) { DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf); conf.setBoolean("wordcount.skip.patterns", true); } else {// w w w.j av a 2 s .c om other_args.add(args[i]); } } FileInputFormat.setInputPaths(conf, new Path(other_args.get(0))); FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1))); JobClient.runJob(conf); return 0; }
From source file:com.chriscx.matching.Driver.java
public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), com.chriscx.mapred.Driver.class); conf.setJobName("Matching"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { if ("-skip".equals(args[i])) { DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf); conf.setBoolean("wordcount.skip.patterns", true); } else {//from ww w .j av a 2 s .c o m other_args.add(args[i]); } } FileInputFormat.setInputPaths(conf, new Path(other_args.get(0))); FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1))); JobClient.runJob(conf); return 0; }