List of usage examples for the org.apache.hadoop.mapred.JobConf constructor JobConf(Configuration, Class)
public JobConf(Configuration conf, Class exampleClass)
From source file:com.github.gaoyangthu.demo.mapred.Grep.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length < 3) { System.out.println("Grep <inDir> <outDir> <regex> [<group>]"); ToolRunner.printGenericCommandUsage(System.out); return -1; }/*w ww . j av a 2s. c om*/ Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); JobConf grepJob = new JobConf(getConf(), Grep.class); try { grepJob.setJobName("grep-search"); FileInputFormat.setInputPaths(grepJob, args[0]); grepJob.setMapperClass(RegexMapper.class); grepJob.set("mapred.mapper.regex", args[2]); if (args.length == 4) grepJob.set("mapred.mapper.regex.group", args[3]); grepJob.setCombinerClass(LongSumReducer.class); grepJob.setReducerClass(LongSumReducer.class); FileOutputFormat.setOutputPath(grepJob, tempDir); grepJob.setOutputFormat(SequenceFileOutputFormat.class); grepJob.setOutputKeyClass(Text.class); grepJob.setOutputValueClass(LongWritable.class); JobClient.runJob(grepJob); JobConf sortJob = new JobConf(Grep.class); sortJob.setJobName("grep-sort"); FileInputFormat.setInputPaths(sortJob, tempDir); sortJob.setInputFormat(SequenceFileInputFormat.class); sortJob.setMapperClass(InverseMapper.class); sortJob.setNumReduceTasks(1); // write a single file FileOutputFormat.setOutputPath(sortJob, new Path(args[1])); sortJob.setOutputKeyComparatorClass // sort by decreasing freq (LongWritable.DecreasingComparator.class); JobClient.runJob(sortJob); } finally { FileSystem.get(grepJob).delete(tempDir, true); } return 0; }
From source file:com.hadoopilluminated.examples.Grep.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length < 3) { System.out.println("Grep <inDir> <outDir> <regex> [<group>]"); ToolRunner.printGenericCommandUsage(System.out); return -1; }/* w ww .ja v a 2 s . com*/ Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); JobConf grepJob = new JobConf(getConf(), Grep.class); try { grepJob.setJobName("grep-search"); FileInputFormat.setInputPaths(grepJob, args[0]); grepJob.setMapperClass(RegexMapper.class); grepJob.set("mapred.mapper.regex", args[2]); if (args.length == 4) { grepJob.set("mapred.mapper.regex.group", args[3]); } grepJob.setCombinerClass(LongSumReducer.class); grepJob.setReducerClass(LongSumReducer.class); FileOutputFormat.setOutputPath(grepJob, tempDir); grepJob.setOutputFormat(SequenceFileOutputFormat.class); grepJob.setOutputKeyClass(Text.class); grepJob.setOutputValueClass(LongWritable.class); JobClient.runJob(grepJob); JobConf sortJob = new JobConf(getConf(), Grep.class); sortJob.setJobName("grep-sort"); FileInputFormat.setInputPaths(sortJob, tempDir); sortJob.setInputFormat(SequenceFileInputFormat.class); sortJob.setMapperClass(InverseMapper.class); sortJob.setNumReduceTasks(1); // write a single file FileOutputFormat.setOutputPath(sortJob, new Path(args[1])); sortJob.setOutputKeyComparatorClass // sort by decreasing freq (LongWritable.DecreasingComparator.class); JobClient.runJob(sortJob); } finally { FileSystem.get(grepJob).delete(tempDir, true); } return 0; }
From source file:com.hadoopilluminated.examples.Join.java
License:Apache License
/** * The main driver for sort program. Invoke this method to submit the * map/reduce job.//from w w w . j av a 2s .c om * * @throws IOException When there is communication problems with the job * tracker. */ @Override public int run(String[] args) throws Exception { JobConf jobConf = new JobConf(getConf(), Sort.class); jobConf.setJobName("join"); jobConf.setMapperClass(IdentityMapper.class); jobConf.setReducerClass(IdentityReducer.class); JobClient client = new JobClient(jobConf); ClusterStatus cluster = client.getClusterStatus(); int num_maps = cluster.getTaskTrackers() * jobConf.getInt("test.sort.maps_per_host", 10); int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9); String sort_reduces = jobConf.get("test.sort.reduces_per_host"); if (sort_reduces != null) { num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces); } Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class; Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class; Class<? extends WritableComparable> outputKeyClass = BytesWritable.class; Class<? 
extends Writable> outputValueClass = TupleWritable.class; String op = "inner"; List<String> otherArgs = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { num_maps = Integer.parseInt(args[++i]); } else if ("-r".equals(args[i])) { num_reduces = Integer.parseInt(args[++i]); } else if ("-inFormat".equals(args[i])) { inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class); } else if ("-outFormat".equals(args[i])) { outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class); } else if ("-outKey".equals(args[i])) { outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class); } else if ("-outValue".equals(args[i])) { outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class); } else if ("-joinOp".equals(args[i])) { op = args[++i]; } else { otherArgs.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); // exits } } // Set user-supplied (possibly default) job configs jobConf.setNumMapTasks(num_maps); jobConf.setNumReduceTasks(num_reduces); if (otherArgs.size() < 2) { System.out.println("ERROR: Wrong number of parameters: "); return printUsage(); } FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.remove(otherArgs.size() - 1))); List<Path> plist = new ArrayList<Path>(otherArgs.size()); for (String s : otherArgs) { plist.add(new Path(s)); } jobConf.setInputFormat(CompositeInputFormat.class); jobConf.set("mapred.join.expr", CompositeInputFormat.compose(op, inputFormatClass, plist.toArray(new Path[0]))); jobConf.setOutputFormat(outputFormatClass); jobConf.setOutputKeyClass(outputKeyClass); jobConf.setOutputValueClass(outputValueClass); Date startTime = new Date(); System.out.println("Job started: " + startTime); 
JobClient.runJob(jobConf); Date end_time = new Date(); System.out.println("Job ended: " + end_time); System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds."); return 0; }
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException { job = new JobConf(getConf(), Crush.class); /*/*from w ww . jav a2 s . c o m*/ * Turn off speculative execution because that's just wasting network io. */ job.setMapSpeculativeExecution(false); job.setReduceSpeculativeExecution(false); /* * Turn off pre-emption because we don't want to kill a task after two hours of network io. */ job.set("mapred.fairscheduler.preemption", "false"); tmpDir = new Path("tmp/crush-" + UUID.randomUUID()); outDir = new Path(tmpDir, "out"); double threshold = 0.75; List<String> regexes = asList(".+"); List<String> replacements = asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}"); List<String> inFormats = asList(SequenceFileInputFormat.class.getName()); List<String> outFormats = asList(SequenceFileOutputFormat.class.getName()); String crushTimestamp; Options options = buildOptions(); CommandLine cli = new GnuParser().parse(options, args); if (cli.hasOption("?")) { BufferedReader reader = new BufferedReader( new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt"))); try { String line; while (null != (line = reader.readLine())) { System.out.println(line); } } finally { reader.close(); } return false; } if (cli.hasOption("verbose")) { console = Verbosity.VERBOSE; } else if (cli.hasOption("info")) { console = Verbosity.INFO; } else { console = Verbosity.NONE; } if (cli.hasOption("ignore-regex")) { ignoredFiles = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher(""); } excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs"); String[] nonOptions = cli.getArgs(); if (2 == nonOptions.length) { /* * Stand alone mode accepts two arguments. 
*/ mode = Mode.STAND_ALONE; srcDir = new Path(nonOptions[0]); dest = new Path(nonOptions[1]); if (cli.hasOption("input-format")) { inFormats = asList(cli.getOptionValue("input-format")); } if (cli.hasOption("output-format")) { outFormats = asList(cli.getOptionValue("output-format")); } replacements = asList(dest.getName()); crushTimestamp = Long.toString(currentTimeMillis()); } else { /* * The previous version expected three or four arguments. The third one specified the number of tasks to use, which is an * integral number, just like the third argument in the new version, which is a timestamp. We tell the two apart by looking * at the value of the argument. A timestamp is going to be a huge, 14-digit number while the number of tasks should be much * smaller. */ if ((args.length == 4 || args.length == 3) && args.length == nonOptions.length && args[2].length() != 14) { int maxTasks = Integer.parseInt(args[2]); if (maxTasks <= 0 || maxTasks > 4000) { throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks); } job.setInt("mapred.reduce.tasks", maxTasks); maxFileBlocks = Integer.MAX_VALUE; crushTimestamp = Long.toString(currentTimeMillis()); srcDir = new Path(args[0]); dest = new Path(args[1]); mode = Mode.CLONE; if (args.length == 4) { if (args[3].equals("TEXT")) { /* * These are the defaults except with text input and output formats. */ inFormats = asList(TextInputFormat.class.getName()); outFormats = asList(TextOutputFormat.class.getName()); } else if (!args[3].equals("SEQUENCE")) { throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]); } } } else { /* * V2 style arguments. 
*/ if (cli.hasOption("threshold")) { threshold = Double.parseDouble(cli.getOptionValue("threshold")); if (0 >= threshold || 1 < threshold || Double.isInfinite(threshold) || Double.isNaN(threshold)) { throw new IllegalArgumentException("Block size threshold must be in (0, 1]: " + threshold); } } if (cli.hasOption("max-file-blocks")) { int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks")); if (0 > maxFileBlocksOption) { throw new IllegalArgumentException( "Maximum file size in blocks must be positive: " + maxFileBlocksOption); } maxFileBlocks = maxFileBlocksOption; } else { maxFileBlocks = 8; } if (cli.hasOption("regex")) { regexes = asList(cli.getOptionValues("regex")); } if (cli.hasOption("replacement")) { replacements = asList(cli.getOptionValues("replacement")); } if (cli.hasOption("input-format")) { inFormats = asList(cli.getOptionValues("input-format")); } if (cli.hasOption("output-format")) { outFormats = asList(cli.getOptionValues("output-format")); } if (3 != nonOptions.length) { throw new IllegalArgumentException( "Could not find source directory, out directory, and job timestamp"); } srcDir = new Path(nonOptions[0]); dest = new Path(nonOptions[1]); crushTimestamp = nonOptions[2]; if (cli.hasOption("clone")) { mode = Mode.CLONE; } else { mode = Mode.MAP_REDUCE; } if (!crushTimestamp.matches("\\d{14}")) { throw new IllegalArgumentException( "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp); } } dfsBlockSize = parseDfsBlockSize(job); maxEligibleSize = (long) (dfsBlockSize * threshold); } /* * Add the crush specs and compression options to the configuration. 
*/ job.set("crush.timestamp", crushTimestamp); if (ignoredFiles != null) { job.set("crush.ignore-regex", ignoredFiles.pattern().pattern()); } if (regexes.size() != replacements.size() || replacements.size() != inFormats.size() || inFormats.size() != outFormats.size()) { throw new IllegalArgumentException( "Must be an equal number of regex, replacement, in-format, and out-format options"); } job.setInt("crush.num.specs", regexes.size()); matchers = new ArrayList<Matcher>(regexes.size()); for (int i = 0; i < regexes.size(); i++) { job.set(format("crush.%d.regex", i), regexes.get(i)); matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy")); job.set(format("crush.%d.regex.replacement", i), replacements.get(i)); String inFmt = inFormats.get(i); if ("sequence".equals(inFmt)) { inFmt = SequenceFileInputFormat.class.getName(); } else if ("text".equals(inFmt)) { inFmt = TextInputFormat.class.getName(); } else { try { if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) { throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt); } } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt); } } job.set(format("crush.%d.input.format", i), inFmt); String outFmt = outFormats.get(i); if ("sequence".equals(outFmt)) { outFmt = SequenceFileOutputFormat.class.getName(); } else if ("text".equals(outFmt)) { outFmt = TextOutputFormat.class.getName(); } else { try { if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) { throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt); } } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt); } } job.set(format("crush.%d.output.format", i), outFmt); } String codec = cli.getOptionValue("compress"); if (null == codec) { codec = DefaultCodec.class.getName(); } else if ("none".equals(codec)) { codec = null; } else if ("gzip".equals(codec)) { codec = GzipCodec.class.getName(); } else { try { 
if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) { throw new IllegalArgumentException("Not a CompressionCodec: " + codec); } } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Not a CompressionCodec: " + codec); } } if (null == codec) { job.setBoolean("mapred.output.compress", false); } else { job.setBoolean("mapred.output.compress", true); job.set("mapred.output.compression.type", "BLOCK"); job.set("mapred.output.compression.codec", codec); try { CompressionCodec instance = (CompressionCodec) Class.forName(codec).newInstance(); codecExtension = instance.getDefaultExtension(); } catch (Exception e) { throw new AssertionError(); } } return true; }
From source file:com.hp.hplc.mr.driver.WordCount.java
License:Apache License
/** * The main driver for word count map/reduce program. * Invoke this method to submit the map/reduce job. * @throws IOException When there is communication problems with the * job tracker.//from w w w.ja va 2 s . co m */ public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), WordCount.class); conf.setJobName("wordcount"); // the keys are words (strings) conf.setOutputKeyClass(Text.class); // the values are counts (ints) conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(MapClass.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { conf.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("-r".equals(args[i])) { conf.setNumReduceTasks(Integer.parseInt(args[++i])); System.out.println("# of reduces: " + conf.getNumReduceTasks()); } else { other_args.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } // Make sure there are exactly 2 parameters left. if (other_args.size() != 2) { System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2."); return printUsage(); } FileInputFormat.setInputPaths(conf, other_args.get(0)); FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1))); JobClient.runJob(conf); return 0; }