List of usage examples for the org.apache.hadoop.mapreduce.Job constructor
Job(JobConf conf) throws IOException
From source file:cmd.sampler.java
License:Apache License
/**
 * Driver for InputSampler from the command line.
 * Configures a Job instance and calls {@link #writePartitionFile}.
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-inFormat".equals(args[i])) {
                job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
            } else if ("-keyClass".equals(args[i])) {
                job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
            } else if ("-splitSample".equals(args[i])) {
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new SplitSampler<K, V>(numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    if (job.getNumReduceTasks() <= 1) {
        System.err.println("Sampler requires more than one reducer");
        return printUsage();
    }
    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }
    if (null == sampler) {
        sampler = new SplitSampler<K, V>(1000, 10);
    }
    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
        FileInputFormat.addInputPath(job, new Path(s));
    }
    InputSampler.<K, V>writePartitionFile(job, sampler);
    return 0;
}
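Note that the Job(Configuration) constructor used throughout these examples is deprecated on Hadoop 2.x and later. A minimal sketch of the equivalent construction with the Job.getInstance factory (the class and job name below are purely illustrative, not part of the original driver):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class JobCreationSketch {
    public static Job newSamplerJob(Configuration conf) throws IOException {
        // Job.getInstance copies the Configuration, just as new Job(conf) does.
        Job job = Job.getInstance(conf);
        job.setJobName("input-sampler"); // illustrative name only
        return job;
    }
}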
From source file:cn.edu.hfut.dmic.webcollector.crawldb.DBReader.java
public static void main(String[] args) throws Exception {
    Path crawlPath = new Path("task2");
    Path currentPath = new Path(crawlPath, "crawldb/current");
    Path output = new Path("output");
    Configuration config = CrawlerConfiguration.create();
    FileSystem fs = FileSystem.get(config);
    if (fs.exists(output)) {
        fs.delete(output);
    }
    Job job = new Job(config);
    job.setJobName("dbreader " + crawlPath.toString());
    job.setMapperClass(DBReaderMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, currentPath);
    FileOutputFormat.setOutputPath(job, output);
    job.waitForCompletion(true);
}
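The mapper wired in above (DBReaderMapper) is part of WebCollector and is not shown here. The following is only a hypothetical sketch of the shape such a mapper could take, assuming the crawldb/current SequenceFile holds <Text, CrawlDatum> pairs, which matches the output types of the Generator and Merge jobs below:

public static class DBReaderMapperSketch extends Mapper<Text, CrawlDatum, Text, Text> {
    @Override
    protected void map(Text url, CrawlDatum datum, Context context)
            throws IOException, InterruptedException {
        // Emit the URL together with a human-readable dump of its crawl status.
        context.write(url, new Text(datum.toString()));
    }
}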
From source file:cn.edu.hfut.dmic.webcollector.crawldb.Generator.java
public static String generate(Path crawlPath, Configuration conf) throws Exception {
    SegmentUtil.initSegments(crawlPath, conf);
    String segmentName = SegmentUtil.createSegment(crawlPath, conf);
    Path currentPath = new Path(crawlPath, "crawldb/current");
    Path generatePath = new Path(crawlPath, "segments/" + segmentName + "/generate");
    Job job = new Job(conf);
    job.setJobName("generate " + crawlPath.toString());
    job.setJarByClass(Generator.class);
    job.setReducerClass(GeneratorReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    FileInputFormat.addInputPath(job, currentPath);
    FileOutputFormat.setOutputPath(job, generatePath);
    job.waitForCompletion(true);
    long count = job.getCounters().findCounter("generator", "count").getValue();
    System.out.println("total generate:" + count);
    if (count == 0) {
        return null;
    } else {
        return segmentName;
    }
}
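The counter read after waitForCompletion is plain Hadoop counter usage. GeneratorReducer itself is not shown on this page, but a reducer increments such a counter roughly like this (hypothetical sketch, not the real GeneratorReducer):

public static class CountingReducerSketch extends Reducer<Text, CrawlDatum, Text, CrawlDatum> {
    @Override
    protected void reduce(Text key, Iterable<CrawlDatum> values, Context context)
            throws IOException, InterruptedException {
        for (CrawlDatum datum : values) {
            context.write(key, datum);
            // Matches the value read back via job.getCounters().findCounter("generator", "count").
            context.getCounter("generator", "count").increment(1);
        }
    }
}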
From source file:cn.edu.hfut.dmic.webcollector.crawldb.Merge.java
public static void merge(Path crawlPath, Path[] mergePaths, Configuration conf, String jobName)
        throws Exception {
    Job job = new Job(conf);
    job.setJobName(jobName + " " + crawlPath.toString());
    job.setJarByClass(Merge.class);
    // job.getConfiguration().set("mapred", "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path crawldbPath = new Path(crawlPath, "crawldb");
    Path newdb = new Path(crawldbPath, "new");
    Path currentdb = new Path(crawldbPath, "current");
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }
    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }
    for (Path mergePath : mergePaths) {
        FileInputFormat.addInputPath(job, mergePath);
    }
    FileOutputFormat.setOutputPath(job, newdb);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.waitForCompletion(true);
}
From source file:cn.edu.hfut.dmic.webcollector.fetcher.Fetcher.java
public static void fetch(Path crawlPath, String segmentName, Configuration conf) throws Exception {
    Path segmentPath = new Path(crawlPath, "segments/" + segmentName);
    Path generatePath = new Path(segmentPath, "generate");
    Job job = new Job(conf);
    job.setJobName("fetch " + crawlPath.toString());
    job.setJarByClass(Fetcher.class);
    job.setReducerClass(FetcherReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(FetcherOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    FileInputFormat.addInputPath(job, generatePath);
    FileOutputFormat.setOutputPath(job, segmentPath);
    job.waitForCompletion(true);
}
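Taken together, generate, fetch, and merge form one crawl round. A hedged usage sketch of how the three static helpers above could be chained (the segment subdirectory name passed to merge is an assumption, not taken from WebCollector):

Configuration conf = CrawlerConfiguration.create();
Path crawlPath = new Path("task2");
String segmentName = Generator.generate(crawlPath, conf);
if (segmentName != null) {
    Fetcher.fetch(crawlPath, segmentName, conf);
    // The exact layout written by FetcherOutputFormat is not shown on this page; "fetch" is a guess.
    Path fetchedDatums = new Path(crawlPath, "segments/" + segmentName + "/fetch");
    Merge.merge(crawlPath, new Path[] { fetchedDatums }, conf, "merge");
}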
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java
public static Job createJob(Configuration conf, Path crawldb) throws IOException {
    Job job = new Job(conf);
    // job.setJarByClass(Merge.class);
    job.getConfiguration().set("mapred",
            "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");
    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }
    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }
    FileOutputFormat.setOutputPath(job, newdb);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job;
}
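Unlike the merge method shown earlier, this factory returns the configured Job without submitting it, so the caller adds the paths it wants folded in and then runs it. A hedged usage sketch (the extra segment input path is hypothetical):

Configuration conf = CrawlerConfiguration.create();
Job mergeJob = Merge.createJob(conf, new Path("crawldb"));
FileInputFormat.addInputPath(mergeJob, new Path("segments/20140101/fetch")); // hypothetical segment path
boolean succeeded = mergeJob.waitForCompletion(true);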
From source file:co.cask.cdap.hive.datasets.DatasetInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    try (DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf)) {
        try {
            datasetAccessor.initialize();
        } catch (Exception e) {
            throw new IOException("Could not get dataset", e);
        }
        try (RecordScannable recordScannable = datasetAccessor.getDataset()) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
            List<Split> dsSplits = recordScannable.getSplits();
            InputSplit[] inputSplits = new InputSplit[dsSplits.size()];
            for (int i = 0; i < dsSplits.size(); i++) {
                inputSplits[i] = new DatasetInputSplit(dsSplits.get(i), tablePaths[0]);
            }
            return inputSplits;
        }
    }
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.DelegatingInputFormat.java
License:Apache License
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { List<InputSplit> splits = new ArrayList<>(); Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration()); for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) { String inputName = mapperInputEntry.getKey(); MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue(); String mapperClassName = mapperInput.getMapperClassName(); Job jobCopy = new Job(job.getConfiguration()); Configuration confCopy = jobCopy.getConfiguration(); // set configuration specific for this input onto the jobCopy ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy); Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName()); Preconditions.checkNotNull(inputFormatClass, "Class could not be found: ", mapperInput.getInputFormatClassName()); InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a TaggedInputSplit. List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy); for (InputSplit split : formatSplits) { splits.add(new TaggedInputSplit(inputName, split, confCopy, mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName)); }//w w w . java 2 s. c om } return splits; }
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.MultiInputFormat.java
License:Apache License
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { List<InputSplit> splits = new ArrayList<>(); Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration()); for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) { String inputName = mapperInputEntry.getKey(); MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue(); String mapperClassName = mapperInput.getMapperClassName(); Job jobCopy = new Job(job.getConfiguration()); Configuration confCopy = jobCopy.getConfiguration(); // set configuration specific for this input onto the jobCopy ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy); Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName()); Preconditions.checkNotNull(inputFormatClass, "Class could not be found: ", mapperInput.getInputFormatClassName()); InputFormat<K, V> inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy); //some input format need a jobId to getSplits jobCopy.setJobID(new JobID(inputName, inputName.hashCode())); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a MultiInputTaggedSplit. List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy); for (InputSplit split : formatSplits) { splits.add(new MultiInputTaggedSplit(split, confCopy, inputName, mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName)); }/* w ww. j a v a 2s . co m*/ } return splits; }
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitionerWriterWrapper.java
License:Apache License
private TaskAttemptContext getTaskAttemptContext(TaskAttemptContext context, String newOutputName)
        throws IOException {
    Job job = new Job(context.getConfiguration());
    DynamicPartitioningOutputFormat.setOutputName(job, newOutputName);
    // CDAP-4806 We must set this parameter in addition to calling FileOutputFormat#setOutputName,
    // because AvroKeyOutputFormat/AvroKeyValueOutputFormat use a different parameter for the
    // output name than FileOutputFormat.
    if (isAvroOutputFormat(getFileOutputFormat(context))) {
        job.getConfiguration().set("avro.mo.config.namedOutput", newOutputName);
    }
    Path jobOutputPath = DynamicPartitioningOutputFormat
            .createJobSpecificPath(FileOutputFormat.getOutputPath(job), context);
    DynamicPartitioningOutputFormat.setOutputPath(job, jobOutputPath);
    return new TaskAttemptContextImpl(job.getConfiguration(), context.getTaskAttemptID());
}
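A hedged sketch of how such a per-partition context is typically consumed inside the same class (generic Hadoop usage, not code from CDAP; the partition output name is hypothetical):

TaskAttemptContext partitionContext = getTaskAttemptContext(context, "partition_2017_01_01");
// The delegate FileOutputFormat then opens a writer under the new output name.
RecordWriter<K, V> recordWriter = getFileOutputFormat(context).getRecordWriter(partitionContext);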