List of usage examples for org.apache.hadoop.mapreduce Job setMapperClass
public void setMapperClass(Class<? extends Mapper> cls) throws IllegalStateException
From source file:com.ifeng.sorter.LogSortDriver.java
License:Apache License
/**
 * Configures and runs the log-sort MapReduce job.
 *
 * @param args exactly two entries: the input path followed by the output path
 * @return 0 when the job reports success, 1 when it reports failure, -1 when
 *         the argument count is wrong
 * @throws Exception if job configuration or execution fails
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    log.info("Input: {} , Output: {}", args[0], args[1]);

    Job job = Job.getInstance(super.getConf());
    // Register the job jar explicitly; without it the tasks launched on a
    // cluster cannot load the mapper and reducer classes.
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(LogSortMapper.class);
    job.setReducerClass(LogSortReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.ifeng.vdn.iparea.parser.IPAreaDriver.java
License:Apache License
/**
 * Sets up the IP-area job (mapper, reducer, Text/Text output) and blocks
 * until it finishes.
 *
 * @param args args[0] is the input path, args[1] the output path
 * @return 0 if the job succeeded, otherwise 1
 * @throws Exception if job configuration or execution fails
 */
@Override
public int run(String[] args) throws Exception {
    final Job ipAreaJob = Job.getInstance(getConf());
    ipAreaJob.setJarByClass(getClass());

    final Path source = new Path(args[0]);
    FileInputFormat.addInputPath(ipAreaJob, source);

    final Path destination = new Path(args[1]);
    FileOutputFormat.setOutputPath(ipAreaJob, destination);

    ipAreaJob.setMapperClass(IPAreaMapper.class);
    ipAreaJob.setReducerClass(IPAreaReducer.class);

    ipAreaJob.setOutputKeyClass(Text.class);
    ipAreaJob.setOutputValueClass(Text.class);

    final boolean succeeded = ipAreaJob.waitForCompletion(true);
    if (succeeded) {
        return 0;
    }
    return 1;
}
From source file:com.ifeng.vdn.iparea.parser.IPAreaLocalDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf()); job.setJarByClass(getClass());//from ww w . j a v a 2 s . co m FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(IPAreaMapper.class); job.setReducerClass(IPAreaReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.ifeng.vdn.loggroup.mapper.VideologGroupDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; }// www . ja va2s. c o m Job job = Job.getInstance(super.getConf()); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(VideoLogGroupMapper.class); job.setReducerClass(VideologGroupReducer.class); job.setCombinerClass(VideologGroupReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.ifeng.vdn.logparser.mapper.VideoLogDriver.java
License:Apache License
@Override public int run(String[] paths) throws Exception { Job job = Job.getInstance(super.getConf()); job.setJarByClass(getClass());// w w w .ja v a 2 s.co m FileInputFormat.addInputPath(job, new Path(paths[0])); FileOutputFormat.setOutputPath(job, new Path(paths[1])); job.setMapperClass(VideoLogMapper.class); job.setReducerClass(VideoLogReducer.class); job.setCombinerClass(VideoLogReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.ifeng.vdn.parser.VideoLogParseDriver.java
License:Apache License
/**
 * Builds the video-log parse job, using the reducer class as the combiner as
 * well, and waits for the job to complete.
 *
 * @param args args[0] is the input path, args[1] the output path
 * @return 0 if the job succeeded, otherwise 1
 * @throws Exception if job configuration or execution fails
 */
@Override
public int run(String[] args) throws Exception {
    final Job parseJob = Job.getInstance(getConf());
    parseJob.setJarByClass(getClass());

    final Path rawLogs = new Path(args[0]);
    FileInputFormat.addInputPath(parseJob, rawLogs);

    final Path parsedOutput = new Path(args[1]);
    FileOutputFormat.setOutputPath(parseJob, parsedOutput);

    parseJob.setMapperClass(VideoLogParseMapper.class);
    parseJob.setReducerClass(VideoLogParseReducer.class);
    parseJob.setCombinerClass(VideoLogParseReducer.class);

    parseJob.setOutputKeyClass(Text.class);
    parseJob.setOutputValueClass(Text.class);

    final boolean succeeded = parseJob.waitForCompletion(true);
    if (succeeded) {
        return 0;
    }
    return 1;
}
From source file:com.ifeng.vdn.parser.VideoLogParseLocalDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf()); job.setJarByClass(getClass());//from w ww . j a va2 s .c o m FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(VideoLogParseMapper.class); job.setReducerClass(VideoLogParseReducer.class); job.setCombinerClass(VideoLogParseReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.ifeng.vdn.videolog.sort.SortGroupResultPreprocessor.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; }// ww w .j av a 2 s . com Job job = Job.getInstance(getConf()); job.setMapperClass(SortGroupResultMapper.class); job.setNumReduceTasks(0); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); // Sort data by total number: job.setOutputFormatClass(SequenceFileOutputFormat.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.iflytek.spider.crawl.CrawlDb.java
License:Apache License
public static Job createJob(Configuration config, Path crawlDb) throws IOException { Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); Job job = AvroJob.getAvroJob(config); job.setJobName("crawldb " + crawlDb); Path current = new Path(crawlDb, CURRENT_NAME); if (FileSystem.get(config).exists(current)) { FileInputFormat.addInputPath(job, current); }//from w w w.j a v a 2 s . c om job.setInputFormatClass(AvroPairInputFormat.class); job.setMapperClass(CrawlDbFilter.class); job.setReducerClass(CrawlDbReducer.class); FileOutputFormat.setOutputPath(job, newCrawlDb); job.setOutputFormatClass(AvroMapOutputFormat.class); job.setOutputKeyClass(String.class); job.setOutputValueClass(CrawlDatum.class); return job; }
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
/** * Generate fetchlists in one or more segments. Whether to filter URLs or not * is read from the crawl.generate.filter property in the configuration files. * If the property is not found, the URLs are filtered. Same for the * normalisation./*from w ww. ja va 2 s . co m*/ * * @param dbDir * Crawl database directory * @param segments * Segments directory * @param numLists * Number of reduce tasks * @param curTime * Current time in milliseconds * * @return Path to generated segment or null if no entries were selected * * @throws IOException * When an I/O error occurs * @throws ClassNotFoundException * @throws InterruptedException */ public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force) throws IOException, InterruptedException, ClassNotFoundException { //getConf().set("mapred.temp.dir", "d:/tmp"); Path tempDir = new Path( getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis()); Path lock = new Path(dbDir, CrawlDb.LOCK_NAME); FileSystem fs = FileSystem.get(getConf()); LockUtil.createLockFile(fs, lock, force); LOG.info("Generator: Selecting best-scoring urls due for fetch."); LOG.info("Generator: starting"); Job job = AvroJob.getAvroJob(getConf()); if (numLists == -1) { // for politeness make numLists = job.getNumReduceTasks(); // a partition per fetch task } if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) { // override LOG.info("Generator: jobtracker is 'local', generating exactly one partition."); numLists = 1; } LOG.info("Generator: with " + numLists + " partition."); job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime); // record real generation time long generateTime = System.currentTimeMillis(); job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime); FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormatClass(AvroPairInputFormat.class); job.setMapperClass(SelectorMapper.class); 
job.setReducerClass(SelectorReducer.class); FileOutputFormat.setOutputPath(job, tempDir); //job.setOutputFormatClass(AvroPairOutputFormat.class); job.setOutputFormatClass(GeneratorOutputFormat.class); job.setOutputKeyClass(Float.class); job.setOutputValueClass(SelectorEntry.class); // AvroMultipleOutputs.addNamedOutput(job, "seq", // AvroPairOutputFormat.class, Float.class, SelectorEntry.class); try { job.waitForCompletion(true); } catch (IOException e) { e.printStackTrace(); return null; } // read the subdirectories generated in the temp // output and turn them into segments List<Path> generatedSegments = new ArrayList<Path>(); FileStatus[] status = fs.listStatus(tempDir); try { for (FileStatus stat : status) { Path subfetchlist = stat.getPath(); if (!subfetchlist.getName().startsWith("fetchlist-")) continue; // start a new partition job for this segment Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists); fs.createNewFile(new Path(newSeg, "generatored")); generatedSegments.add(newSeg); } } catch (Exception e) { LOG.warn("Generator: exception while partitioning segments, exiting ..."); fs.delete(tempDir, true); return null; } if (generatedSegments.size() == 0) { LOG.warn("Generator: 0 records selected for fetching, exiting ..."); LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); return null; } if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) { // update the db from tempDir Path tempDir2 = new Path( getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis()); job = AvroJob.getAvroJob(getConf()); job.setJobName("generate: updatedb " + dbDir); job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime); for (Path segmpaths : generatedSegments) { Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME); FileInputFormat.addInputPath(job, subGenDir); } FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormatClass(AvroPairInputFormat.class); 
job.setMapperClass(CrawlDbUpdateMapper.class); // job.setReducerClass(CrawlDbUpdater.class); job.setOutputFormatClass(AvroMapOutputFormat.class); job.setOutputKeyClass(String.class); job.setOutputValueClass(CrawlDatum.class); FileOutputFormat.setOutputPath(job, tempDir2); try { job.waitForCompletion(true); CrawlDb.install(job, dbDir); } catch (IOException e) { LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); fs.delete(tempDir2, true); throw e; } fs.delete(tempDir2, true); } LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); if (LOG.isInfoEnabled()) { LOG.info("Generator: done."); } Path[] patharray = new Path[generatedSegments.size()]; return generatedSegments.toArray(patharray); }