List of usage examples for org.apache.hadoop.mapred JobConf setSpeculativeExecution
public void setSpeculativeExecution(boolean speculativeExecution)
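For context, here is a minimal standalone sketch of the typical call pattern (the driver class name and the argument-supplied input/output paths are hypothetical, and the job relies on Hadoop's default identity mapper and reducer). Speculative execution launches duplicate attempts of slow tasks, so it is usually disabled when a task has external side effects that must not run twice in parallel — which is what every example below does except the last one.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

// Hypothetical driver class; input and output paths come from the program arguments.
public class NoSpeculationJob {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(NoSpeculationJob.class);
        job.setJobName("identity job without speculative execution");

        // Turn off speculative attempts for both map and reduce tasks,
        // so no duplicate attempt of the same task ever runs in parallel.
        job.setSpeculativeExecution(false);
        // Map-only and reduce-only switches also exist on JobConf:
        // job.setMapSpeculativeExecution(false);
        // job.setReduceSpeculativeExecution(false);

        // Default TextInputFormat with the identity mapper and reducer:
        // keys are byte offsets (LongWritable), values are lines (Text).
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}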
From source file:org.apache.nutch.fetcher.Fetcher.java
License:Apache License
public void fetch(Path segment, int threads) throws IOException {
    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(Fetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.fetcher.Fetcher2.java
License:Apache License
public void fetch(Path segment, int threads, boolean parsing) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting");
        LOG.info("Fetcher: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
    job.setBoolean("fetcher.parse", parsing);

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    job.setInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(Fetcher2.class);

    job.setOutputPath(segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FetcherOutput.class);

    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: done");
    }
}
From source file:org.apache.nutch.fetcher.NIOFetcher.java
License:Apache License
public void fetch(Path segment) throws IOException {
    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("NIOFetcher: starting at " + sdf.format(start));
        LOG.info("NIOFetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("NIOFetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("NIOFetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(FetcherInputFormat.class);

    job.setMapRunnerClass(NIOFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("NIOFetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.fetcher.OldFetcher.java
License:Apache License
public void fetch(Path segment, int threads) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("OldFetcher: starting at " + sdf.format(start));
        LOG.info("OldFetcher: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(OldFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("OldFetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.indexer.DeleteDuplicates.java
License:Apache License
public void dedup(Path[] indexDirs) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: starting");
    }

    Path outDir1 = new Path("dedup-urls-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(getConf());

    for (int i = 0; i < indexDirs.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
        }
        job.addInputPath(indexDirs[i]);
    }
    job.setJobName("dedup 1: urls by time");

    job.setInputFormat(InputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IndexDoc.class);

    job.setReducerClass(UrlsReducer.class);
    job.setOutputPath(outDir1);

    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    Path outDir2 = new Path("dedup-hash-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    job = new NutchJob(getConf());
    job.setJobName("dedup 2: content by hash");

    job.addInputPath(outDir1);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(MD5Hash.class);
    job.setMapOutputValueClass(IndexDoc.class);
    job.setPartitionerClass(HashPartitioner.class);
    job.setSpeculativeExecution(false);

    job.setReducerClass(HashReducer.class);
    job.setOutputPath(outDir2);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    // remove outDir1 - no longer needed
    fs.delete(outDir1);

    job = new NutchJob(getConf());
    job.setJobName("dedup 3: delete from index(es)");

    job.addInputPath(outDir2);
    job.setInputFormat(SequenceFileInputFormat.class);
    //job.setInputKeyClass(Text.class);
    //job.setInputValueClass(IndexDoc.class);
    job.setInt("io.file.buffer.size", 4096);
    job.setMapperClass(DeleteDuplicates.class);
    job.setReducerClass(DeleteDuplicates.class);

    job.setOutputFormat(DeleteDuplicates.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    JobClient.runJob(job);

    fs.delete(outDir2);

    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: done");
    }
}
From source file:org.apache.nutch.selenium.fetcher.SeleniumFetcher.java
License:Apache License
public void fetch(Path segment, int threads, String zippedDriverPath) throws IOException, URISyntaxException {
    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    // push the zipped_webdriver binaries onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zippedDriverPath), job);
    job.set("webdriver.binaries.path", zippedDriverPath);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(SeleniumFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.oozie.action.hadoop.LauncherMapperHelper.java
License:Apache License
public static void setupLauncherInfo(JobConf launcherConf, String jobId, String actionId, Path actionDir,
        String recoveryId, Configuration actionConf, String prepareXML) throws IOException, HadoopAccessorException {

    launcherConf.setMapperClass(LauncherMapper.class);
    launcherConf.setSpeculativeExecution(false);
    launcherConf.setNumMapTasks(1);
    launcherConf.setNumReduceTasks(0);

    launcherConf.set(LauncherMapper.OOZIE_JOB_ID, jobId);
    launcherConf.set(LauncherMapper.OOZIE_ACTION_ID, actionId);
    launcherConf.set(LauncherMapper.OOZIE_ACTION_DIR_PATH, actionDir.toString());
    launcherConf.set(LauncherMapper.OOZIE_ACTION_RECOVERY_ID, recoveryId);
    launcherConf.set(LauncherMapper.ACTION_PREPARE_XML, prepareXML);

    actionConf.set(LauncherMapper.OOZIE_JOB_ID, jobId);
    actionConf.set(LauncherMapper.OOZIE_ACTION_ID, actionId);

    if (Services.get().getConf().getBoolean("oozie.hadoop-2.0.2-alpha.workaround.for.distributed.cache", false)) {
        List<String> purgedEntries = new ArrayList<String>();
        Collection<String> entries = actionConf.getStringCollection("mapreduce.job.cache.files");
        for (String entry : entries) {
            if (entry.contains("#")) {
                purgedEntries.add(entry);
            }
        }
        actionConf.setStrings("mapreduce.job.cache.files",
                purgedEntries.toArray(new String[purgedEntries.size()]));
        launcherConf.setBoolean("oozie.hadoop-2.0.2-alpha.workaround.for.distributed.cache", true);
    }

    FileSystem fs = Services.get().get(HadoopAccessorService.class)
            .createFileSystem(launcherConf.get("user.name"), actionDir.toUri(), launcherConf);
    fs.mkdirs(actionDir);

    OutputStream os = fs.create(new Path(actionDir, LauncherMapper.ACTION_CONF_XML));
    try {
        actionConf.writeXml(os);
    } finally {
        IOUtils.closeSafely(os);
    }

    launcherConf.setInputFormat(OozieLauncherInputFormat.class);
    launcherConf.set("mapred.output.dir", new Path(actionDir, "output").toString());
}
From source file:org.gbif.ocurrence.index.solr.ConfTester.java
License:Apache License
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), ConfTester.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(ConfTester.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(ConfTester.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(ConfTester.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}
From source file:org.slc.sli.aggregation.mapreduce.map.ConfigurableMapReduceJob.java
License:Apache License
@SuppressWarnings("rawtypes")
protected static JobConf finalizeConfig(JobConf jobConf, ConfigSections s) throws IOException {

    Class<? extends Mapper> mapperClass = JobConfiguration.mapper.getMapClass(mapper);
    Class<? extends Reducer> reducerClass = JobConfiguration.function.getReduceClass(reduceFunction);
    Map<String, String> idFields = s.getMapper().getMapIdFields();

    // validate we have enough to continue
    boolean valid = true;
    if (mapperClass == null) {
        log.severe("Invalid map/reduce configuration detected : no mapper class specified.");
        valid = false;
    }
    if (idFields == null) {
        idFields = new HashMap<String, String>();
        log.severe("Invalid map/reduce configuration detected : no map id fields specified.");
        valid = false;
    }
    if (mapCollection == null) {
        log.severe("Invalid map/reduce configuration detected : no map collection specified.");
        valid = false;
    }
    if (mapQuery == null) {
        log.severe("Invalid map/reduce configuration detected : no map query specified.");
        valid = false;
    }
    if (mapFields == null) {
        log.severe("Invalid map/reduce configuration detected : no map input fields specified.");
        valid = false;
    }
    if (reducerClass == null) {
        log.severe("Invalid map/reduce configuration detected : no reducer class specified.");
        valid = false;
    }
    if (reduceCollection == null) {
        log.severe("Invalid map/reduce configuration detected : no reduce collection specified.");
        valid = false;
    }
    if (reduceField == null) {
        log.severe("Invalid map/reduce configuration detected : no reduce field specified.");
        valid = false;
    }
    if (!valid) {
        throw new IllegalArgumentException("Invalid mapper specified. Check log for details.");
    }

    jobConf.set("mapred.output.dir", String.format("%s-%s-%d", s.getMapper().getMapper(),
            s.getMetadata().getFunction(), System.currentTimeMillis()));
    jobConf.setJobName(s.getMetadata().getDescription() == null ? "M/R Job" : s.getMetadata().getDescription());

    // enable speculative execution. Multiple mapper tasks are created for the same split.
    // First one to finish wins; the remaining tasks are terminated.
    jobConf.setSpeculativeExecution(true);
    jobConf.setUseNewMapper(true);
    jobConf.setUseNewReducer(true);

    /**
     * TODO -- decide if this is required.
     *
     * String id = conf.get("@ID@");
     * String tenantId = conf.get("@TENANT_ID@");
     * for (Map.Entry<String, Object> entry : query.entrySet()) {
     *     Object value = entry.getValue();
     *     if (value instanceof String) {
     *         String s = (String) value;
     *         if (s.indexOf("@ID@") >= 0 && id != null) {
     *             s = s.replace("@ID@", id);
     *             query.put(entry.getKey(), s);
     *         }
     *         if (s.indexOf("@TENANT_ID@") >= 0 && tenantId != null) {
     *             s = s.replace("@TENANT_ID@", tenantId);
     *             query.put(entry.getKey(), s);
     *         }
     *     }
     * }
     * if (updateField.indexOf("@ID@") >= 0 && id != null) {
     *     updateField = updateField.replace("@ID@", id);
     * }
     * if (updateField.indexOf("@TENANT_ID@") >= 0 && tenantId != null) {
     *     updateField = updateField.replace("@TENANT_ID@", tenantId);
     * }
     */

    MongoConfigUtil.setQuery(jobConf, new BasicDBObject(mapQuery));

    Map<String, Object> fullFields = new HashMap<String, Object>();
    for (String f : idFields.values()) {
        fullFields.put(f, 1);
    }
    fullFields.putAll(mapFields);

    MongoConfigUtil.setFields(jobConf, new BasicDBObject(fullFields));
    MongoConfigUtil.setInputKey(jobConf, idFields.get("id"));
    MongoConfigUtil.setInputURI(jobConf, "mongodb://" + MONGO_HOST + "/" + mapCollection);
    MongoConfigUtil.setMapperOutputKey(jobConf, TenantAndIdEmittableKey.class);
    MongoConfigUtil.setMapperOutputValue(jobConf, BSONWritable.class);
    MongoConfigUtil.setOutputKey(jobConf, TenantAndIdEmittableKey.class);
    MongoConfigUtil.setOutputValue(jobConf, BSONWritable.class);

    // TODO - this probably should be configurable
    MongoConfigUtil.setReadSplitsFromSecondary(jobConf, true);
    MongoConfigUtil.setSplitSize(jobConf, 32);

    jobConf.setClass("mapred.input.key.class", TenantAndIdEmittableKey.class, EmittableKey.class);
    jobConf.setClass("mapred.input.value.class", BSONWritable.class, Object.class);
    jobConf.setClass("mapred.output.key.class", TenantAndIdEmittableKey.class, EmittableKey.class);
    jobConf.setClass("mapred.output.value.class", BSONWritable.class, Object.class);
    jobConf.setClass("mapreduce.inputformat.class", MongoTenantAndIdInputFormat.class, MongoInputFormat.class);
    jobConf.setClass("mapreduce.outputformat.class", MongoAggFormatter.class, MongoOutputFormat.class);

    MongoConfigUtil.setInputFormat(jobConf, MongoTenantAndIdInputFormat.class);
    MongoConfigUtil.setOutputFormat(jobConf, MongoAggFormatter.class);

    /**
     * Configure how hadoop calculates splits.
     *
     * We enable input splits to avoid having the entire job executed on a single hadoop node.
     *
     * We enable shard chunk splitting to allow mongo to specify how to split the input.
     *
     * We disable read splits from shards because we want hadoop connecting to mongos, not
     * mongod directly. This avoids incorrect results in situations where data is in the process
     * of migration at the same time hadoop is trying to read it.
     *
     * TODO - determine if we also need to set the input split key pattern. This depends
     * on how well data is distributed by _id. Setting the key pattern gives finer grained
     * control over how splits are calculated.
     */
    MongoConfigUtil.setCreateInputSplits(jobConf, true);
    MongoConfigUtil.setShardChunkSplittingEnabled(jobConf, true);
    MongoConfigUtil.setReadSplitsFromShards(jobConf, false);

    MongoConfigUtil.setOutputURI(jobConf, "mongodb://" + MONGO_HOST + "/" + reduceCollection);

    jobConf.setJarByClass(JobConfiguration.class);

    MongoConfigUtil.setMapper(jobConf, mapperClass);
    jobConf.setClass(JobContext.MAP_CLASS_ATTR, mapperClass, Mapper.class);

    MongoConfigUtil.setReducer(jobConf, reducerClass);
    jobConf.setClass(JobContext.REDUCE_CLASS_ATTR, reducerClass, Reducer.class);

    // Set this relatively high to keep the total map execution time low.
    // Formula: 1.75 * (# nodes * max tasks)
    // TODO : replace this hardcoded value with one calculated from configuration information.
    jobConf.setNumReduceTasks(52);

    // Add the configuration itself to the JobConf.
    JobConfiguration.toHadoopConfiguration(s, jobConf);

    return jobConf;
}