List of usage examples for org.apache.hadoop.mapred JobConf setBoolean
public void setBoolean(String name, boolean value)
Sets the value of the name property to a boolean.
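Before the project examples below, a minimal sketch of the call in isolation (the property name "my.example.flag" is a placeholder, not a standard Hadoop key):

    JobConf conf = new JobConf();

    // Store a boolean under the given property name.
    conf.setBoolean("my.example.flag", true);

    // Read it back; the second argument is the default returned when the
    // property is unset or does not parse as a boolean.
    boolean flag = conf.getBoolean("my.example.flag", false);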
From source file: cascading.scheme.TextLine.java
License: Open Source License

    @Override
    public void sinkInit(Tap tap, JobConf conf) throws IOException {
        if (tap.getQualifiedPath(conf).toString().endsWith(".zip"))
            throw new IllegalStateException("cannot write zip files: " + FileOutputFormat.getOutputPath(conf));

        if (getSinkCompression() == Compress.DISABLE)
            conf.setBoolean("mapred.output.compress", false);
        else if (getSinkCompression() == Compress.ENABLE)
            conf.setBoolean("mapred.output.compress", true);

        conf.setOutputKeyClass(Text.class); // be explicit
        conf.setOutputValueClass(Text.class); // be explicit
        conf.setOutputFormat(TextOutputFormat.class);
    }
From source file: cascading.tap.hadoop.HadoopMR1TapPlatformTest.java
License: Open Source License

    @Test
    public void testCombinedHfs() throws Exception {
        getPlatform().copyFromLocal(inputFileLower);
        getPlatform().copyFromLocal(inputFileUpper);

        Hfs sourceLower = new Hfs(new TextLine(new Fields("offset", "line")), InputData.inputFileLower);
        Hfs sourceUpper = new Hfs(new TextLine(new Fields("offset", "line")), InputData.inputFileUpper);

        // create a CombinedHfs instance on these files
        Tap source = new MultiSourceTap<Hfs, JobConf, RecordReader>(sourceLower, sourceUpper);

        FlowProcess<JobConf> process = getPlatform().getFlowProcess();
        JobConf conf = process.getConfigCopy();

        // set the combine flag
        conf.setBoolean(HfsProps.COMBINE_INPUT_FILES, true);

        conf.set("cascading.flow.platform", "hadoop"); // only supported on mr based platforms

        // test the input format and the split
        source.sourceConfInit(process, conf);

        InputFormat inputFormat = conf.getInputFormat();

        assertEquals(Hfs.CombinedInputFormat.class, inputFormat.getClass());

        InputSplit[] splits = inputFormat.getSplits(conf, 1);

        assertEquals(1, splits.length);

        validateLength(source.openForRead(process), 10);
    }
From source file: com.alexholmes.hadooputils.sort.Sort.java
License: Apache License

    /**
     * The driver for the sort MapReduce job.
     *
     * @param jobConf           sort configuration
     * @param numMapTasks       number of map tasks
     * @param numReduceTasks    number of reduce tasks
     * @param sampler           sampler, if required
     * @param codecClass        the compression codec for compressing final outputs
     * @param mapCodecClass     the compression codec for compressing intermediary map outputs
     * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
     *                          for the job output files
     * @param inputDirAsString  input directory in CSV-form
     * @param outputDirAsString output directory
     * @return true if the job completed successfully
     * @throws IOException        if something went wrong
     * @throws URISyntaxException if a URI wasn't correctly formed
     */
    public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
            final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
            final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
            final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

        jobConf.setJarByClass(Sort.class);
        jobConf.setJobName("sorter");

        JobClient client = new JobClient(jobConf);
        ClusterStatus cluster = client.getClusterStatus();

        if (numMapTasks != null) {
            jobConf.setNumMapTasks(numMapTasks);
        }
        if (numReduceTasks != null) {
            jobConf.setNumReduceTasks(numReduceTasks);
        } else {
            int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
            String sortReduces = jobConf.get("test.sort.reduces_per_host");
            if (sortReduces != null) {
                numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
            }

            // Set user-supplied (possibly default) job configs
            jobConf.setNumReduceTasks(numReduces);
        }

        jobConf.setMapperClass(IdentityMapper.class);
        jobConf.setReducerClass(SortReduce.class);

        jobConf.setInputFormat(SortInputFormat.class);

        jobConf.setMapOutputKeyClass(Text.class);
        jobConf.setMapOutputValueClass(Text.class);
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(Text.class);

        if (mapCodecClass != null) {
            jobConf.setMapOutputCompressorClass(mapCodecClass);
        }

        if (codecClass != null) {
            jobConf.setBoolean("mapred.output.compress", true);
            jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
        }

        FileInputFormat.setInputPaths(jobConf, inputDirAsString);
        FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

        if (sampler != null) {
            System.out.println("Sampling input to effect total-order sort...");
            jobConf.setPartitionerClass(TotalOrderPartitioner.class);
            Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

            FileSystem fileSystem = FileSystem.get(jobConf);

            if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
                inputDir = inputDir.getParent();
            }
            inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
            Path partitionFile = new Path(inputDir, "_sortPartitioning");
            TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
            InputSampler.writePartitionFile(jobConf, sampler);
            URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
            DistributedCache.addCacheFile(partitionUri, jobConf);
            DistributedCache.createSymlink(jobConf);
        }

        System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
                + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
                + " with " + jobConf.getNumReduceTasks() + " reduces.");
        Date startTime = new Date();
        System.out.println("Job started: " + startTime);
        jobResult = JobClient.runJob(jobConf);
        Date endTime = new Date();
        System.out.println("Job ended: " + endTime);
        System.out.println("The job took "
                + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

        if (jobResult.isSuccessful()) {
            if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
                new LzoIndexer(jobConf).index(new Path(outputDirAsString));
            }
            return true;
        }
        return false;
    }
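The two compression settings above write the raw MR1 keys directly; the old-API org.apache.hadoop.mapred.FileOutputFormat also exposes helpers that set the same underlying properties, so an equivalent alternative, sketched here, would be:

    // Equivalent to jobConf.setBoolean("mapred.output.compress", true)
    FileOutputFormat.setCompressOutput(jobConf, true);

    // Equivalent to setting "mapred.output.compression.codec"
    FileOutputFormat.setOutputCompressorClass(jobConf, codecClass);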
From source file: com.alexholmes.hadooputils.sort.SortInputFormat.java
License: Apache License

    @Override
    public void configure(final JobConf conf) {
        super.configure(conf);

        // by default the DeprecatedLzoTextInputFormat.listStatus will ignore
        // files that don't end in ".lzo". since we want to work with any file
        // we turn this feature off
        conf.setBoolean(LzoInputFormatCommon.IGNORE_NONLZO_KEY, false);
    }
From source file: com.benchmark.mapred.terasort.TeraOutputFormat.java
License: Apache License

    /**
     * Set the requirement for a final sync before the stream is closed.
     */
    public static void setFinalSync(JobConf conf, boolean newValue) {
        conf.setBoolean(FINAL_SYNC_ATTRIBUTE, newValue);
    }
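A setter like this is normally paired with a getter that reads the flag back on the task side; a plausible companion (the default value here is an assumption, not taken from the source) would be:

    /** Does the user want a final sync at close? */
    public static boolean getFinalSync(JobConf conf) {
        return conf.getBoolean(FINAL_SYNC_ATTRIBUTE, false); // assumed default
    }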
From source file: com.chriscx.mapred.Driver.java

    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf(), Driver.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        List<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            if ("-skip".equals(args[i])) {
                DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
                conf.setBoolean("wordcount.skip.patterns", true);
            } else {
                other_args.add(args[i]);
            }
        }

        FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
        FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

        JobClient.runJob(conf);
        return 0;
    }
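This driver mirrors the classic WordCount v2 tutorial, in which the mapper reads the flag back in configure(); a sketch of that assumed consuming side:

    public void configure(JobConf job) {
        // false is the default when -skip was never passed on the command line
        if (job.getBoolean("wordcount.skip.patterns", false)) {
            try {
                Path[] patternsFiles = DistributedCache.getLocalCacheFiles(job);
                // ... parse each patterns file into the set of tokens to skip
            } catch (IOException ioe) {
                System.err.println("Caught exception while getting cached files: " + ioe);
            }
        }
    }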
From source file: com.chriscx.matching.Driver.java

    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf(), com.chriscx.mapred.Driver.class);
        conf.setJobName("Matching");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        List<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            if ("-skip".equals(args[i])) {
                DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
                conf.setBoolean("wordcount.skip.patterns", true);
            } else {
                other_args.add(args[i]);
            }
        }

        FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
        FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

        JobClient.runJob(conf);
        return 0;
    }
From source file: com.cloudera.ByteCount.java
License: Apache License

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(new Configuration());

        // Trim off the hadoop-specific args
        String[] remArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        // Pull in properties
        Options options = new Options();

        Option property = OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator()
                .withDescription("use value for given property").create("D");
        options.addOption(property);

        Option skipChecksums = new Option("skipChecksums", "skip checksums");
        options.addOption(skipChecksums);

        Option profile = new Option("profile", "profile tasks");
        options.addOption(profile);

        CommandLineParser parser = new BasicParser();
        CommandLine line = parser.parse(options, remArgs);

        Properties properties = line.getOptionProperties("D");
        for (Entry<Object, Object> prop : properties.entrySet()) {
            conf.set(prop.getKey().toString(), prop.getValue().toString());
            System.out.println("Set config key " + prop.getKey() + " to " + prop.getValue());
        }

        if (line.hasOption("skipChecksums")) {
            conf.setBoolean("bytecount.skipChecksums", true);
            System.out.println("Skipping checksums");
        }

        if (line.hasOption("profile")) {
            conf.setBoolean("mapred.task.profile", true);
            conf.set("mapred.task.profile.params",
                    "-agentlib:hprof=cpu=samples,depth=100,interval=1ms,lineno=y,thread=y,file=%s");
            conf.set(MRJobConfig.NUM_MAP_PROFILES, "0");
            conf.set("mapred.task.profile.maps", "1");
            System.out.println("Profiling map tasks");
        }

        // Get the positional arguments out
        remArgs = line.getArgs();
        if (remArgs.length != 2) {
            System.err.println("Usage: ByteCount <inputBase> <outputBase>");
            System.exit(1);
        }
        String inputBase = remArgs[0];
        String outputBase = remArgs[1];

        Job job = Job.getInstance(conf);
        job.setInputFormatClass(ByteBufferInputFormat.class);

        job.setMapOutputKeyClass(ByteWritable.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setMapperClass(ByteCountMapper.class);
        job.setReducerClass(ByteCountReducer.class);
        job.setCombinerClass(ByteCountReducer.class);

        job.setOutputKeyClass(ByteWritable.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(inputBase));
        FileOutputFormat.setOutputPath(job, new Path(outputBase));

        job.setJarByClass(ByteCount.class);

        boolean success = job.waitForCompletion(true);

        Counters counters = job.getCounters();
        System.out.println("\tRead counters");
        printCounter(counters, READ_COUNTER.BYTES_READ);
        printCounter(counters, READ_COUNTER.LOCAL_BYTES_READ);
        printCounter(counters, READ_COUNTER.SCR_BYTES_READ);
        printCounter(counters, READ_COUNTER.ZCR_BYTES_READ);

        System.exit(success ? 0 : 1);
    }
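"bytecount.skipChecksums" is an application-defined key rather than a standard Hadoop property, so the job side presumably reads it back with a matching getBoolean; a hypothetical sketch:

    // Hypothetical consuming side (not shown in the source): default to
    // verifying checksums unless the flag was explicitly set above.
    boolean skipChecksums = conf.getBoolean("bytecount.skipChecksums", false);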
From source file: com.cloudera.recordservice.avro.mapred.ColorCount.java
License: Apache License

    /**
     * Run the MR1 color count with generic records, and return a map of favorite colors to
     * the number of users.
     */
    public static java.util.Map<String, Integer> countColors() throws IOException {
        String output = TestUtil.getTempDirectory();
        Path outputPath = new Path(output);

        JobConf conf = new JobConf(ColorCount.class);
        conf.setJobName("MR1 Color Count With Generic Records");
        conf.setInt("mapreduce.job.reduces", 1);

        conf.setBoolean(com.cloudera.recordservice.avro.AvroJob.USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, true);
        com.cloudera.recordservice.avro.AvroJob.setInputFormat(conf, org.apache.avro.mapred.AvroInputFormat.class);

        RecordServiceConfig.setInputTable(conf, "rs", "users");
        FileOutputFormat.setOutputPath(conf, outputPath);

        AvroJob.setMapperClass(conf, Map.class);
        AvroJob.setReducerClass(conf, Reduce.class);
        AvroJob.setOutputSchema(conf,
                Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.INT)));

        JobClient.runJob(conf);

        // Read the result and return it. Since we set the number of reducers to 1,
        // there is always just one file containing the value.
        SeekableInput input = new FsInput(new Path(output + "/part-00000.avro"), conf);
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
        java.util.Map<String, Integer> colorMap = new HashMap<String, Integer>();
        for (GenericRecord datum : fileReader) {
            colorMap.put(datum.get(0).toString(), Integer.parseInt(datum.get(1).toString()));
        }
        return colorMap;
    }
From source file: com.cloudera.recordservice.examples.mapreduce.MapredColorCount.java
License: Apache License

    @Override
    public int run(String[] args) throws Exception {
        org.apache.log4j.BasicConfigurator.configure();

        if (args.length != 2) {
            System.err.println("Usage: MapredColorCount <input path> <output path>");
            return -1;
        }

        JobConf conf = new JobConf(getConf(), MapredColorCount.class);
        conf.setJobName("colorcount With Generic Records");

        // RECORDSERVICE:
        // By using the recordservice AvroJob utility, we can configure at run time to
        // switch between using the recordservice or not.
        // In this example, we'll set the conf to true to enable the RecordService.
        conf.setBoolean(com.cloudera.recordservice.avro.AvroJob.USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, true);
        com.cloudera.recordservice.avro.AvroJob.setInputFormat(conf, org.apache.avro.mapred.AvroInputFormat.class);

        // RECORDSERVICE:
        // To read from a table instead of a path, comment out setInputPaths and instead use:
        RecordServiceConfig.setInputTable(conf, "rs", "users");
        //FileInputFormat.setInputPaths(conf, new Path(args[0]));

        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        AvroJob.setMapperClass(conf, ColorCountMapper.class);
        AvroJob.setReducerClass(conf, ColorCountReducer.class);

        // Note that AvroJob.setOutputSchema sets relevant config options such as output
        // format, map output classes, and output key class.
        // No need to setInputSchema when using Generic Records.
        AvroJob.setOutputSchema(conf,
                Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.INT)));

        JobClient.runJob(conf);
        return 0;
    }