List of usage examples for org.apache.hadoop.mapred JobConf getInt
public int getInt(String name, int defaultValue)
Get the value of the name property as an int.
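For orientation before the source examples, here is a minimal, hedged sketch of the call itself; the property name and default value are illustrative placeholders, not taken from any particular project below:

import org.apache.hadoop.mapred.JobConf;

public class JobConfGetIntExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Returns the configured value of "io.file.buffer.size" as an int,
        // or the supplied default (4096) if the property is not set.
        int bufferSize = conf.getInt("io.file.buffer.size", 4096);
        System.out.println("io.file.buffer.size = " + bufferSize);
    }
}

Every example below uses getInt the same way: read a tuning knob from the job configuration and fall back to a hard-coded default.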
From source file:org.apache.nutch.parse.ParseOutputFormat.java
License:Apache License
public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress)
        throws IOException {
    if (job.getBoolean("parse.filter.urls", true)) {
        filters = new URLFilters(job);
    }
    if (job.getBoolean("parse.normalize.urls", true)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
    }
    this.scfilters = new ScoringFilters(job);

    final int interval = job.getInt("db.fetch.interval.default", 2592000);
    final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
    int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
    final boolean isParsing = job.getBoolean("fetcher.parse", true);
    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);
    Path out = FileOutputFormat.getOutputPath(job);

    Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
    Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
    Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);

    final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "").split(" *, *");

    final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class,
            CompressionType.RECORD, progress);

    final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(), Text.class, ParseData.class,
            compType, progress);

    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job, crawl, Text.class, CrawlDatum.class,
            compType, progress);

    return new RecordWriter<Text, Parse>() {

        public void write(Text key, Parse parse) throws IOException {

            String fromUrl = key.toString();
            String fromHost = null;
            textOut.append(key, new ParseText(parse.getText()));

            ParseData parseData = parse.getData();
            // recover the signature prepared by Fetcher or ParseSegment
            String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
            if (sig != null) {
                byte[] signature = StringUtil.fromHexString(sig);
                if (signature != null) {
                    // append a CrawlDatum with a signature
                    CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
                    d.setSignature(signature);
                    crawlOut.append(key, d);
                }
            }

            // see if the parse metadata contain things that we'd like
            // to pass to the metadata of the crawlDB entry
            CrawlDatum parseMDCrawlDatum = null;
            for (String mdname : parseMDtoCrawlDB) {
                String mdvalue = parse.getData().getParseMeta().get(mdname);
                if (mdvalue != null) {
                    if (parseMDCrawlDatum == null)
                        parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, 0);
                    parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue));
                }
            }
            if (parseMDCrawlDatum != null)
                crawlOut.append(key, parseMDCrawlDatum);

            try {
                ParseStatus pstatus = parseData.getStatus();
                if (pstatus != null && pstatus.isSuccess()
                        && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                    String newUrl = pstatus.getMessage();
                    int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                    try {
                        if (normalizers != null) {
                            newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                        }
                    } catch (MalformedURLException mfue) {
                        newUrl = null;
                    }

                    if (filters != null) {
                        if (newUrl != null)
                            newUrl = filters.filter(newUrl);
                    }

                    String url = key.toString();
                    if (newUrl != null && !newUrl.equals(url)) {
                        String reprUrl = URLUtil.chooseRepr(url, newUrl,
                                refreshTime < Fetcher.PERM_REFRESH_TIME);
                        CrawlDatum newDatum = new CrawlDatum();
                        newDatum.setStatus(CrawlDatum.STATUS_LINKED);
                        if (reprUrl != null && !reprUrl.equals(newUrl)) {
                            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
                        }
                        crawlOut.append(new Text(newUrl), newDatum);
                    }
                }
            } catch (URLFilterException e) {
                // ignore
            }

            // collect outlinks for subsequent db update
            Outlink[] links = parseData.getOutlinks();
            int outlinksToStore = Math.min(maxOutlinks, links.length);
            if (ignoreExternalLinks) {
                try {
                    fromHost = new URL(fromUrl).getHost().toLowerCase();
                } catch (MalformedURLException e) {
                    fromHost = null;
                }
            } else {
                fromHost = null;
            }

            int validCount = 0;
            CrawlDatum adjust = null;
            List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore);
            List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
            for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                String toUrl = links[i].getToUrl();

                // Only normalize and filter if fetcher.parse = false
                if (!isParsing) {
                    toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks,
                            filters, normalizers);
                    if (toUrl == null) {
                        continue;
                    }
                }

                CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
                Text targetUrl = new Text(toUrl);
                try {
                    scfilters.initialScore(targetUrl, target);
                } catch (ScoringFilterException e) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
                    target.setScore(0.0f);
                }

                targets.add(new SimpleEntry(targetUrl, target));

                // Overwrite URL in Outlink object with normalized URL (NUTCH-1174)
                links[i].setUrl(toUrl);
                outlinkList.add(links[i]);
                validCount++;
            }

            try {
                // compute score contributions and adjustment to the original score
                adjust = scfilters.distributeScoreToOutlinks((Text) key, parseData, targets, null,
                        links.length);
            } catch (ScoringFilterException e) {
                LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
            }
            for (Entry<Text, CrawlDatum> target : targets) {
                crawlOut.append(target.getKey(), target.getValue());
            }
            if (adjust != null)
                crawlOut.append(key, adjust);

            Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
            parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks,
                    parseData.getContentMeta(), parseData.getParseMeta());
            dataOut.append(key, parseData);
            if (!parse.isCanonical()) {
                CrawlDatum datum = new CrawlDatum();
                datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
                String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
                try {
                    datum.setFetchTime(Long.parseLong(timeString));
                } catch (Exception e) {
                    LOG.warn("Can't read fetch time for: " + key);
                    datum.setFetchTime(System.currentTimeMillis());
                }
                crawlOut.append(key, datum);
            }
        }

        public void close(Reporter reporter) throws IOException {
            textOut.close();
            dataOut.close();
            crawlOut.close();
        }

    };
}
From source file:org.apache.phoenix.hive.mapreduce.PhoenixInputFormat.java
License:Apache License
private void setScanCacheSize(JobConf jobConf) {
    int scanCacheSize = jobConf.getInt(PhoenixStorageHandlerConstants.HBASE_SCAN_CACHE, -1);
    if (scanCacheSize > 0) {
        jobConf.setInt(HConstants.HBASE_CLIENT_SCANNER_CACHING, scanCacheSize);
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("Generating splits with scanCacheSize : " + scanCacheSize);
    }
}
From source file:org.apache.phoenix.hive.PhoenixStorageHandler.java
License:Apache License
@Override
public Estimation estimate(JobConf job, TableScanOperator ts, long remaining) throws HiveException {
    String hiveTableName = ts.getConf().getTableMetadata().getTableName();
    int reducerCount = job.getInt(hiveTableName + PhoenixStorageHandlerConstants.PHOENIX_REDUCER_NUMBER, 1);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Estimating input size for table: " + hiveTableName + " with reducer count " + reducerCount
                + ". Remaining : " + remaining);
    }

    long bytesPerReducer = job.getLong(HiveConf.ConfVars.BYTESPERREDUCER.varname,
            Long.parseLong(HiveConf.ConfVars.BYTESPERREDUCER.getDefaultValue()));
    long totalLength = reducerCount * bytesPerReducer;

    return new Estimation(0, totalLength);
}
From source file:org.apache.pig.impl.util.avro.AvroRecordWriter.java
License:Apache License
static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer, JobConf job)
        throws UnsupportedEncodingException {
    if (FileOutputFormat.getCompressOutput(job)) {
        int level = job.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC) ? CodecFactory.deflateCodec(level)
                : CodecFactory.fromString(codecName);
        writer.setCodec(factory);
    }

    // Do max as core-default.xml has io.file.buffer.size as 4K
    writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY,
            Math.max(job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL)));

    // copy metadata from job
    for (Map.Entry<String, String> e : job) {
        if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()), e.getValue());
        if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
                    URLDecoder.decode(e.getValue(), "ISO-8859-1").getBytes("ISO-8859-1"));
    }
}
From source file:org.apache.sysml.runtime.io.WriterBinaryBlock.java
License:Apache License
@SuppressWarnings("deprecation") protected final void writeBinaryBlockMatrixToSequenceFile(Path path, JobConf job, FileSystem fs, MatrixBlock src, int brlen, int bclen, int rl, int ru) throws DMLRuntimeException, IOException { boolean sparse = src.isInSparseFormat(); int rlen = src.getNumRows(); int clen = src.getNumColumns(); // 1) create sequence file writer, with right replication factor // (config via MRConfigurationNames.DFS_REPLICATION not possible since sequence file internally calls fs.getDefaultReplication()) SequenceFile.Writer writer = null; if (_replication > 0) //if replication specified (otherwise default) {/* w ww . j av a 2 s .co m*/ //copy of SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class), except for replication writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class, job.getInt(MRConfigurationNames.IO_FILE_BUFFER_SIZE, 4096), (short) _replication, fs.getDefaultBlockSize(), null, new SequenceFile.Metadata()); } else { writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class); } try { // 2) bound check for src block if (src.getNumRows() > rlen || src.getNumColumns() > clen) { throw new IOException("Matrix block [1:" + src.getNumRows() + ",1:" + src.getNumColumns() + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "]."); } //3) reblock and write MatrixIndexes indexes = new MatrixIndexes(); if (rlen <= brlen && clen <= bclen && rl == 0) //opt for single block { //directly write single block indexes.setIndexes(1, 1); writer.append(indexes, src); } else //general case { //initialize blocks for reuse (at most 4 different blocks required) MatrixBlock[] blocks = createMatrixBlocksForReuse(rlen, clen, brlen, bclen, sparse, src.getNonZeros()); //create and write subblocks of matrix for (int blockRow = rl / brlen; blockRow < (int) Math.ceil(ru / (double) brlen); blockRow++) for (int blockCol = 0; blockCol < (int) Math .ceil(src.getNumColumns() / (double) bclen); blockCol++) { int maxRow = (blockRow * brlen + brlen < src.getNumRows()) ? brlen : src.getNumRows() - blockRow * brlen; int maxCol = (blockCol * bclen + bclen < src.getNumColumns()) ? bclen : src.getNumColumns() - blockCol * bclen; int row_offset = blockRow * brlen; int col_offset = blockCol * bclen; //get reuse matrix block MatrixBlock block = getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen); //copy submatrix to block src.sliceOperations(row_offset, row_offset + maxRow - 1, col_offset, col_offset + maxCol - 1, block); //append block to sequence file indexes.setIndexes(blockRow + 1, blockCol + 1); writer.append(indexes, block); //reset block for later reuse block.reset(); } } } finally { IOUtilFunctions.closeSilently(writer); } }
From source file:org.apache.sysml.runtime.io.WriterBinaryBlock.java
License:Apache License
@SuppressWarnings("deprecation") protected final void writeDiagBinaryBlockMatrixToHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock src, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException { boolean sparse = src.isInSparseFormat(); // 1) create sequence file writer, with right replication factor // (config via MRConfigurationNames.DFS_REPLICATION not possible since sequence file internally calls fs.getDefaultReplication()) SequenceFile.Writer writer = null; if (_replication > 0) //if replication specified (otherwise default) {/*www . ja v a 2 s.c om*/ //copy of SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class), except for replication writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class, job.getInt(MRConfigurationNames.IO_FILE_BUFFER_SIZE, 4096), (short) _replication, fs.getDefaultBlockSize(), null, new SequenceFile.Metadata()); } else { writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class); } try { // 2) bound check for src block if (src.getNumRows() > rlen || src.getNumColumns() > clen) { throw new IOException("Matrix block [1:" + src.getNumRows() + ",1:" + src.getNumColumns() + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "]."); } //3) reblock and write MatrixIndexes indexes = new MatrixIndexes(); if (rlen <= brlen && clen <= bclen) //opt for single block { //directly write single block indexes.setIndexes(1, 1); writer.append(indexes, src); } else //general case { //initialize blocks for reuse (at most 4 different blocks required) MatrixBlock[] blocks = createMatrixBlocksForReuse(rlen, clen, brlen, bclen, sparse, src.getNonZeros()); MatrixBlock emptyBlock = new MatrixBlock(); //create and write subblocks of matrix for (int blockRow = 0; blockRow < (int) Math.ceil(src.getNumRows() / (double) brlen); blockRow++) for (int blockCol = 0; blockCol < (int) Math .ceil(src.getNumColumns() / (double) bclen); blockCol++) { int maxRow = (blockRow * brlen + brlen < src.getNumRows()) ? brlen : src.getNumRows() - blockRow * brlen; int maxCol = (blockCol * bclen + bclen < src.getNumColumns()) ? bclen : src.getNumColumns() - blockCol * bclen; MatrixBlock block = null; if (blockRow == blockCol) //block on diagonal { int row_offset = blockRow * brlen; int col_offset = blockCol * bclen; //get reuse matrix block block = getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen); //copy submatrix to block src.sliceOperations(row_offset, row_offset + maxRow - 1, col_offset, col_offset + maxCol - 1, block); } else //empty block (not on diagonal) { block = emptyBlock; block.reset(maxRow, maxCol); } //append block to sequence file indexes.setIndexes(blockRow + 1, blockCol + 1); writer.append(indexes, block); //reset block for later reuse if (blockRow != blockCol) block.reset(); } } } finally { IOUtilFunctions.closeSilently(writer); } }
From source file:org.apache.sysml.runtime.matrix.data.UnPaddedOutputFormat.java
License:Apache License
@Override
public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
        throws IOException {
    Path file = FileOutputFormat.getTaskOutputPath(job, name);
    FileSystem fs = file.getFileSystem(job);
    FSDataOutputStream fileOut = fs.create(file, true,
            job.getInt(MRConfigurationNames.IO_FILE_BUFFER_SIZE, 4096), progress);
    return new UnpaddedRecordWriter<>(fileOut);
}
From source file:org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.java
License:Apache License
public static final int getMiscMemRequired(JobConf job) {
    return job.getInt(MRConfigurationNames.IO_FILE_BUFFER_SIZE, 4096);
}
From source file:org.apache.tez.mapreduce.client.YARNRunner.java
License:Apache License
@Override
public JobStatus submitJob(JobID jobId, String jobSubmitDir, Credentials ts)
        throws IOException, InterruptedException {
    // HACK! TEZ-604. Get rid of this once Hive moves all of it's tasks over to Tez native.
    maybeKillSession();

    ApplicationId appId = resMgrDelegate.getApplicationId();

    FileSystem fs = FileSystem.get(conf);
    // Loads the job.xml written by the user.
    JobConf jobConf = new JobConf(new TezConfiguration(conf));

    // Extract individual raw MR configs.
    Configuration[] stageConfs = MultiStageMRConfToTezTranslator.getStageConfs(jobConf);

    // Transform all confs to use Tez keys
    for (int i = 0; i < stageConfs.length; i++) {
        MRHelpers.translateMRConfToTez(stageConfs[i]);
    }

    // create inputs to tezClient.submit()

    // FIXME set up job resources
    Map<String, LocalResource> jobLocalResources = createJobLocalResources(stageConfs[0], jobSubmitDir);

    // FIXME createDAG should take the tezConf as a parameter, instead of using
    // MR keys.
    DAG dag = createDAG(fs, jobId, stageConfs, jobSubmitDir, ts, jobLocalResources);

    List<String> vargs = new LinkedList<String>();
    // admin command opts and user command opts
    String mrAppMasterAdminOptions = conf.get(MRJobConfig.MR_AM_ADMIN_COMMAND_OPTS,
            MRJobConfig.DEFAULT_MR_AM_ADMIN_COMMAND_OPTS);
    warnForJavaLibPath(mrAppMasterAdminOptions, "app master", MRJobConfig.MR_AM_ADMIN_COMMAND_OPTS,
            MRJobConfig.MR_AM_ADMIN_USER_ENV);
    vargs.add(mrAppMasterAdminOptions);

    // Add AM user command opts
    String mrAppMasterUserOptions = conf.get(MRJobConfig.MR_AM_COMMAND_OPTS,
            MRJobConfig.DEFAULT_MR_AM_COMMAND_OPTS);
    warnForJavaLibPath(mrAppMasterUserOptions, "app master", MRJobConfig.MR_AM_COMMAND_OPTS,
            MRJobConfig.MR_AM_ENV);
    vargs.add(mrAppMasterUserOptions);

    StringBuilder javaOpts = new StringBuilder();
    for (String varg : vargs) {
        javaOpts.append(varg).append(" ");
    }

    // Setup the CLASSPATH in environment
    // i.e. add { Hadoop jars, job jar, CWD } to classpath.
    Map<String, String> environment = new HashMap<String, String>();

    // Setup the environment variables for AM
    MRHelpers.updateEnvBasedOnMRAMEnv(conf, environment);
    StringBuilder envStrBuilder = new StringBuilder();
    boolean first = true;
    for (Entry<String, String> entry : environment.entrySet()) {
        if (!first) {
            envStrBuilder.append(",");
        } else {
            first = false;
        }
        envStrBuilder.append(entry.getKey()).append("=").append(entry.getValue());
    }
    String envStr = envStrBuilder.toString();

    TezConfiguration dagAMConf = getDAGAMConfFromMRConf();
    dagAMConf.set(TezConfiguration.TEZ_AM_LAUNCH_CMD_OPTS, javaOpts.toString());
    if (envStr.length() > 0) {
        dagAMConf.set(TezConfiguration.TEZ_AM_LAUNCH_ENV, envStr);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Setting MR AM env to : " + envStr);
        }
    }

    // Submit to ResourceManager
    try {
        dagAMConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, jobSubmitDir);

        // Set Tez parameters based on MR parameters.
        String queueName = jobConf.get(JobContext.QUEUE_NAME, YarnConfiguration.DEFAULT_QUEUE_NAME);
        dagAMConf.set(TezConfiguration.TEZ_QUEUE_NAME, queueName);

        int amMemMB = jobConf.getInt(MRJobConfig.MR_AM_VMEM_MB, MRJobConfig.DEFAULT_MR_AM_VMEM_MB);
        int amCores = jobConf.getInt(MRJobConfig.MR_AM_CPU_VCORES, MRJobConfig.DEFAULT_MR_AM_CPU_VCORES);
        dagAMConf.setInt(TezConfiguration.TEZ_AM_RESOURCE_MEMORY_MB, amMemMB);
        dagAMConf.setInt(TezConfiguration.TEZ_AM_RESOURCE_CPU_VCORES, amCores);

        dagAMConf.setInt(TezConfiguration.TEZ_AM_MAX_APP_ATTEMPTS,
                jobConf.getInt(MRJobConfig.MR_AM_MAX_ATTEMPTS, MRJobConfig.DEFAULT_MR_AM_MAX_ATTEMPTS));

        tezClient = new MRTezClient("MapReduce", dagAMConf, false, jobLocalResources, ts);
        tezClient.start();
        tezClient.submitDAGApplication(appId, dag);
        tezClient.stop();
    } catch (TezException e) {
        throw new IOException(e);
    }

    return getJobStatus(jobId);
}
From source file:org.apache.tez.mapreduce.hadoop.TestDeprecatedKeys.java
License:Apache License
@Test(timeout = 5000)
public void verifyReduceKeyTranslation() {
    JobConf jobConf = new JobConf();

    jobConf.setFloat(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT, 0.4f);
    jobConf.setLong(MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES, 20000l);
    jobConf.setInt(MRJobConfig.IO_SORT_FACTOR, 2000);
    jobConf.setFloat(MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT, 0.55f);
    jobConf.setFloat(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD, 0.60f);
    jobConf.setFloat(MRJobConfig.SHUFFLE_MERGE_PERCENT, 0.22f);
    jobConf.setBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, true);
    jobConf.setFloat(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT, 0.33f);

    MRHelpers.translateMRConfToTez(jobConf);

    assertEquals(0.4f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 0f),
            0.01f);
    assertEquals(20000l, jobConf.getLong(Constants.TEZ_RUNTIME_TASK_MEMORY, 0));
    assertEquals(2000, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, 0));
    assertEquals(0.55f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT, 0),
            0.01f);
    assertEquals(0.60f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMTOMEM_SEGMENTS, 0),
            0.01f);
    assertEquals(0.22f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT, 0), 0.01f);
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM, false));
    assertEquals(0.33f,
            jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT, 0), 0.01f);
}