List of usage examples for org.apache.hadoop.mapred JobConf setInt
public void setInt(String name, int value)
Sets the value of the name property to an int value.
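For reference, a minimal, self-contained sketch of how setInt pairs with getInt on a JobConf before the real usage examples below; the property name my.app.fetch.threads and its values are illustrative, not taken from any of the source files listed here.

import org.apache.hadoop.mapred.JobConf;

public class SetIntExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Store an int-valued property on the job configuration.
        conf.setInt("my.app.fetch.threads", 10);

        // Any code holding the configuration can read it back,
        // supplying a default in case the property was never set.
        int threads = conf.getInt("my.app.fetch.threads", 1);
        System.out.println("fetch threads = " + threads);
    }
}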
From source file:org.apache.nutch.crawl.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());
    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
        LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:org.apache.nutch.fetcher.Fetcher.java
License:Apache License
public void fetch(Path segment, int threads) throws IOException {
    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(Fetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.fetcher.Fetcher2.java
License:Apache License
public void fetch(Path segment, int threads, boolean parsing) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting");
        LOG.info("Fetcher: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
    job.setBoolean("fetcher.parse", parsing);

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    job.setInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(Fetcher2.class);

    job.setOutputPath(segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FetcherOutput.class);

    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: done");
    }
}
From source file:org.apache.nutch.fetcher.OldFetcher.java
License:Apache License
public void fetch(Path segment, int threads) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("OldFetcher: starting at " + sdf.format(start));
        LOG.info("OldFetcher: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(OldFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("OldFetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.indexer.DeleteDuplicates.java
License:Apache License
public void dedup(Path[] indexDirs) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: starting");
    }

    Path outDir1 = new Path("dedup-urls-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf job = new NutchJob(getConf());

    for (int i = 0; i < indexDirs.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
        }
        job.addInputPath(indexDirs[i]);
    }
    job.setJobName("dedup 1: urls by time");

    job.setInputFormat(InputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IndexDoc.class);

    job.setReducerClass(UrlsReducer.class);
    job.setOutputPath(outDir1);

    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    Path outDir2 = new Path("dedup-hash-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    job = new NutchJob(getConf());
    job.setJobName("dedup 2: content by hash");

    job.addInputPath(outDir1);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(MD5Hash.class);
    job.setMapOutputValueClass(IndexDoc.class);
    job.setPartitionerClass(HashPartitioner.class);
    job.setSpeculativeExecution(false);

    job.setReducerClass(HashReducer.class);
    job.setOutputPath(outDir2);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    // remove outDir1 - no longer needed
    fs.delete(outDir1);

    job = new NutchJob(getConf());
    job.setJobName("dedup 3: delete from index(es)");

    job.addInputPath(outDir2);
    job.setInputFormat(SequenceFileInputFormat.class);
    //job.setInputKeyClass(Text.class);
    //job.setInputValueClass(IndexDoc.class);
    job.setInt("io.file.buffer.size", 4096);
    job.setMapperClass(DeleteDuplicates.class);
    job.setReducerClass(DeleteDuplicates.class);

    job.setOutputFormat(DeleteDuplicates.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    JobClient.runJob(job);

    fs.delete(outDir2);

    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: done");
    }
}
From source file:org.apache.nutch.selenium.fetcher.SeleniumFetcher.java
License:Apache License
public void fetch(Path segment, int threads, String zippedDriverPath) throws IOException, URISyntaxException {
    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    // push the zipped_webdriver binaries onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zippedDriverPath), job);
    job.set("webdriver.binaries.path", zippedDriverPath);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(SeleniumFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.oozie.action.hadoop.TestLauncher.java
License:Apache License
private RunningJob _test(String... arg) throws Exception {
    Path actionDir = getFsTestCaseDir();

    File jar = IOUtils.createJar(new File(getTestCaseDir()), "launcher.jar", LauncherMapper.class,
            LauncherMainException.class, LauncherSecurityManager.class, LauncherException.class,
            LauncherMainTester.class);

    FileSystem fs = getFileSystem();

    Path launcherJar = new Path(actionDir, "launcher.jar");
    fs.copyFromLocalFile(new Path(jar.toString()), launcherJar);

    JobConf jobConf = Services.get().get(HadoopAccessorService.class)
            .createJobConf(new URI(getNameNodeUri()).getAuthority());
    // jobConf.setJar(jar.getAbsolutePath());
    jobConf.set("user.name", getTestUser());
    jobConf.setInt("mapred.map.tasks", 1);
    jobConf.setInt("mapred.map.max.attempts", 1);
    jobConf.setInt("mapred.reduce.max.attempts", 1);

    jobConf.set("mapreduce.framework.name", "yarn");
    jobConf.set("mapred.job.tracker", getJobTrackerUri());
    jobConf.set("fs.default.name", getNameNodeUri());

    LauncherMapperHelper.setupMainClass(jobConf, LauncherMainTester.class.getName());
    LauncherMapperHelper.setupMainArguments(jobConf, arg);

    Configuration actionConf = new XConfiguration();
    LauncherMapperHelper.setupLauncherInfo(jobConf, "1", "1@a", actionDir, "1@a-0", actionConf, "");
    LauncherMapperHelper.setupYarnRestartHandling(jobConf, jobConf, "1@a", System.currentTimeMillis());

    assertEquals("1", actionConf.get("oozie.job.id"));
    assertEquals("1@a", actionConf.get("oozie.action.id"));

    DistributedCache.addFileToClassPath(new Path(launcherJar.toUri().getPath()), jobConf);

    JobClient jobClient = createJobClient();
    final RunningJob runningJob = jobClient.submitJob(jobConf);

    System.out.println("Action Dir: " + actionDir);
    System.out.println("LauncherMapper ID: " + runningJob.getJobID().toString());

    waitFor(180 * 1000, new Predicate() {
        public boolean evaluate() throws Exception {
            return runningJob.isComplete();
        }
    });

    assertTrue(jobConf.get("oozie.action.prepare.xml").equals(""));
    return runningJob;
}
From source file:org.apache.phoenix.hive.mapreduce.PhoenixInputFormat.java
License:Apache License
private void setScanCacheSize(JobConf jobConf) {
    int scanCacheSize = jobConf.getInt(PhoenixStorageHandlerConstants.HBASE_SCAN_CACHE, -1);
    if (scanCacheSize > 0) {
        jobConf.setInt(HConstants.HBASE_CLIENT_SCANNER_CACHING, scanCacheSize);
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("Generating splits with scanCacheSize : " + scanCacheSize);
    }
}
From source file:org.apache.sysml.runtime.controlprogram.parfor.DataPartitionerRemoteMR.java
License:Apache License
@Override
protected void partitionMatrix(MatrixObject in, String fnameNew, InputInfo ii, OutputInfo oi, long rlen,
        long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-DPMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(DataPartitionerRemoteMR.class);
    if (_pfid >= 0) //use in parfor
        job.setJobName(jobname + _pfid);
    else //use for partition instruction
        job.setJobName("Partition-MR");

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        //force writing to disk (typically not required since partitioning only applied if dataset exceeds CP size)
        in.exportData(); //written to disk iff dirty

        Path path = new Path(in.getFileName());

        /////
        //configure the MR job
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, _format, _n, fnameNew,
                _keepIndexes);

        //set mappers, reducers, combiners
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(DataPartitionerRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            //binary cell intermediates for reduced IO
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableBlock.class);

            //check Alignment
            if ((_format == PDataPartitionFormat.ROW_BLOCK_WISE_N && rlen > _n && _n % brlen != 0)
                    || (_format == PDataPartitionFormat.COLUMN_BLOCK_WISE_N && clen > _n && _n % bclen != 0)) {
                throw new DMLRuntimeException(
                        "Data partitioning format " + _format + " requires aligned blocks.");
            }
        }

        //set input format
        job.setInputFormat(ii.inputFormatClass);

        //set the input path and output path
        FileInputFormat.setInputPaths(job, path);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        //FileOutputFormat.setOutputPath(job, pathNew);
        job.setOutputFormat(NullOutputFormat.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = -1;
        switch (_format) {
        case ROW_WISE:
            reducerGroups = rlen;
            break;
        case COLUMN_WISE:
            reducerGroups = clen;
            break;
        case ROW_BLOCK_WISE:
            reducerGroups = (rlen / brlen) + ((rlen % brlen == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE:
            reducerGroups = (clen / bclen) + ((clen % bclen == 0) ? 0 : 1);
            break;
        case ROW_BLOCK_WISE_N:
            reducerGroups = (rlen / _n) + ((rlen % _n == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE_N:
            reducerGroups = (clen / _n) + ((clen % _n == 0) ? 0 : 1);
            break;
        default:
            //do nothing
        }
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS, "true");
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS_CODEC, "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, _replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //set the max number of retries per map task
        // disabled job-level configuration to respect cluster configuration
        // note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job
        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS && _pfid >= 0) {
        long t1 = System.nanoTime(); //only for parfor
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}
From source file:org.apache.sysml.runtime.controlprogram.parfor.RemoteDPParForMR.java
License:Apache License
public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program,
        String resultFile, MatrixObject input, PartitionFormat dpf, OutputInfo oi, boolean tSparseCol, //config params
        boolean enableCPCaching, int numReducers, int replication) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-DPEMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteDPParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the reducers
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //setup input matrix
        Path path = new Path(input.getFileName());
        long rlen = input.getNumRows();
        long clen = input.getNumColumns();
        int brlen = (int) input.getNumRowsPerBlock();
        int bclen = (int) input.getNumColumnsPerBlock();
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo,
                oi, dpf._dpf, dpf._N, input.getFileName(), itervar, matrixvar, tSparseCol);
        job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass);
        FileInputFormat.setInputPaths(job, path);

        //set mapper and reducers classes
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(RemoteDPParWorkerReducer.class);

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema

        //parfor partitioning outputs (intermediates)
        job.setMapOutputKeyClass(LongWritable.class);
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            job.setMapOutputValueClass(PairWritableBlock.class);
        else if (oi == OutputInfo.BinaryCellOutputInfo)
            job.setMapOutputValueClass(PairWritableCell.class);
        else
            throw new DMLRuntimeException("Unsupported intermrediate output info: " + oi);
        //parfor exec output
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers
        job.setNumReduceTasks(numReducers);

        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //disable JVM reuse
        job.setNumTasksToExecutePerJvm(1); //-1 for unlimited

        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

        //set the max number of retries per map task
        //note: currently disabled to use cluster config
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics
                    .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics
                    .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics
                    .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics
                    .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics
                    .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics
                    .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}