List of usage examples for org.apache.hadoop.mapred JobConf getBoolean
public boolean getBoolean(String name, boolean defaultValue)
Gets the value of the name property as a boolean. If the property is not set, or if its value is not a valid boolean, defaultValue is returned.
Parameters: name - the property name; defaultValue - the default value.
Returns: the property value as a boolean, or defaultValue.
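Before the project examples below, a minimal sketch of how getBoolean is typically paired with setBoolean on a JobConf. The property name "example.feature.enabled" is hypothetical, used only for illustration.

import org.apache.hadoop.mapred.JobConf;

public class GetBooleanSketch {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // Hypothetical property name, set here only to demonstrate the round trip.
        job.setBoolean("example.feature.enabled", true);

        // Returns the configured value, or the supplied default (false) if the
        // property is missing or cannot be parsed as a boolean.
        boolean enabled = job.getBoolean("example.feature.enabled", false);
        System.out.println("feature enabled = " + enabled);
    }
}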
From source file: org.archive.jbs.solr.SolrOutputFormat.java
License:Apache License
public RecordWriter<Text, Text> getRecordWriter(final FileSystem fs, final JobConf job, final String name,
        final Progressable progress) throws IOException {
    String serverUrl = job.get("jbs.solr.url", "http://localhost:8983/solr");
    int docBufSize = job.getInt("jbs.solr.bufSize", 10);

    SolrDocumentWriter solrDocWriter = new SolrDocumentWriter(new URL(serverUrl), docBufSize);

    // FIXME: Temporary collection hack
    solrDocWriter.collectionHack = job.get("jbs.solr.collectionHack", null);

    TypeNormalizer normalizer = new TypeNormalizer();
    Map<String, String> aliases = normalizer.parseAliases(job.get("jbs.typeNormalizer.aliases", ""));

    if (job.getBoolean("jbs.typeNormalizer.useDefaults", true)) {
        Map<String, String> defaults = normalizer.getDefaultAliases();
        defaults.putAll(aliases);
        aliases = defaults;
    }
    normalizer.setAliases(aliases);

    TypeFilter typeFilter = new TypeFilter();
    Set<String> allowedTypes = typeFilter.parse(job.get("jbs.typeFilter.allowed", ""));

    if (job.getBoolean("jbs.typeFilter.useDefaults", true)) {
        Set<String> defaults = typeFilter.getDefaultAllowed();
        defaults.addAll(allowedTypes);
        allowedTypes = defaults;
    }
    typeFilter.setAllowed(allowedTypes);
    typeFilter.setTypeNormalizer(normalizer);

    solrDocWriter.setFilter("reqFields", new RequiredFieldsFilter());
    solrDocWriter.setFilter("type", typeFilter);
    solrDocWriter.setFilter("robots", new RobotsFilter());
    solrDocWriter.setTypeNormalizer(normalizer);
    solrDocWriter.setIDNHelper(buildIDNHelper(job));

    return new SolrRecordWriter(solrDocWriter);
}
From source file: org.archive.jbs.solr.SolrOutputFormat.java
License:Apache License
/**
 * Build an IDNHelper object using configuration information in the JobConf.
 */
protected IDNHelper buildIDNHelper(JobConf job) throws IOException {
    IDNHelper helper = new IDNHelper();

    if (job.getBoolean("jbs.idnHelper.useDefaults", true)) {
        InputStream is = this.getClass().getClassLoader().getResourceAsStream("effective_tld_names.dat");
        if (is == null) {
            throw new RuntimeException("Cannot load default tld rules: effective_tld_names.dat");
        }
        Reader reader = new InputStreamReader(is, "utf-8");
        helper.addRules(reader);
    }

    String moreRules = job.get("jbs.idnHelper.moreRules", "");
    if (moreRules.length() > 0) {
        helper.addRules(new StringReader(moreRules));
    }

    return helper;
}
From source file: org.archive.nutchwax.PageRankDb.java
License:Apache License
public void configure(JobConf job) {
    ignoreInternalLinks = job.getBoolean("db.ignore.internal.links", true);
    if (job.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
        urlFilters = new URLFilters(job);
    }
    if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) {
        urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
    }
}
From source file: org.cloudata.core.parallel.hadoop.TableRowRecordReader.java
License:Apache License
public TableRowRecordReader(JobConf jobConf, CloudataConf conf, TableSplit tableSplit, Reporter reporter)
        throws IOException {
    this.jobConf = jobConf;
    this.reporter = reporter;
    this.skipError = jobConf.getBoolean(AbstractTabletInputFormat.SKIP_ERROR, false);

    RowFilter rowFilter = tableSplit.getRowFilter();
    InputTableInfo inputTableInfo = tableSplit.getInputTableInfo();

    this.startRowKey = rowFilter.getStartRowKey();
    this.endRowKey = rowFilter.getEndRowKey();

    LOG.info("TableSplit: " + inputTableInfo.getTableName() + ":" + startRowKey + " ~ " + endRowKey);
    if (reporter != null) {
        reporter.setStatus(inputTableInfo.getTableName() + ":" + startRowKey + " ~ " + endRowKey);
    }

    CTable ctable = CTable.openTable(conf, inputTableInfo.getTableName());
    try {
        this.scanner = ScannerFactory.openScanner(ctable, rowFilter, TableScanner.SCANNER_OPEN_TIMEOUT);
    } catch (IOException e) {
        if (this.scanner != null) {
            this.scanner.close();
        }
        end = true;
        LOG.error("Error while scanner open:startRowKey=" + startRowKey + ", endRowKey=" + endRowKey, e);
        if (skipError) {
            reporter.setStatus("Error while scanner open:startRowKey=" + startRowKey + ", endRowKey=" + endRowKey
                    + "," + e.getMessage());
            reporter.incrCounter("cloudata", "error", 1);
            return;
        } else {
            throw e;
        }
    }

    // If Tablet isn't first tablet, skip first row.
    if (!startRowKey.equals(tableSplit.getJobStartRowKey())) {
        try {
            Row row = scanner.nextRow();
            if (row == null) {
                end = true;
                return;
            }
        } catch (IOException e) {
            end = true;
            LOG.error("Error while scanner.nextRow():startRowKey=" + startRowKey + ", endRowKey=" + endRowKey, e);
            if (skipError) {
                reporter.setStatus("Error while scanner.nextRow():startRowKey=" + startRowKey + ", endRowKey="
                        + endRowKey + "," + e.getMessage());
                reporter.incrCounter("cloudata", "error", 1);
                return;
            } else {
                throw e;
            }
        }
    }
}
From source file: org.cloudata.core.parallel.hadoop.TableScanCellReader.java
License:Apache License
public TableScanCellReader(JobConf jobConf, CloudataConf conf, TableSplit tableSplit, Reporter reporter)
        throws IOException {
    this.jobConf = jobConf;
    this.reporter = reporter;
    this.skipError = jobConf.getBoolean(AbstractTabletInputFormat.SKIP_ERROR, false);

    RowFilter rowFilter = tableSplit.getRowFilter();
    InputTableInfo inputTableInfo = tableSplit.getInputTableInfo();

    Row.Key startRowKey = rowFilter.getStartRowKey();

    reporter.setStatus(inputTableInfo.getTableName() + ":" + rowFilter.getStartRowKey() + " ~ "
            + rowFilter.getEndRowKey());

    CTable ctable = CTable.openTable(conf, inputTableInfo.getTableName());

    long startTime = System.currentTimeMillis();
    try {
        scanner = ScannerFactory.openScanner(ctable, rowFilter, TableScanner.SCANNER_OPEN_TIMEOUT);
    } catch (IOException e) {
        if (this.scanner != null) {
            this.scanner.close();
        }
        end = true;
        LOG.error("Error while scanner open:startRowKey=" + startRowKey + "," + e.getMessage());
        if (skipError) {
            reporter.setStatus(
                    "Error while scanner open:startRowKey=" + startRowKey + ", endRowKey=" + e.getMessage());
            reporter.incrCounter("cloudata", "error", 1);
            return;
        } else {
            throw e;
        }
    }

    try {
        // If this isn't the first tablet, skip the first row.
        scanCell = scanner.next();
        if (scanCell == null) {
            end = true;
            scanner.close();
            scanner = null;
            return;
        }
        if (!startRowKey.equals(Row.Key.MIN_KEY)) {
            Row.Key firstRow = scanCell.getRowKey();
            while (firstRow.equals(scanCell.getRowKey())) {
                scanCell = scanner.next();
                if (scanCell == null) {
                    end = true;
                    scanner.close();
                    scanner = null;
                    break;
                }
            }
        }
        totalScanTime += (System.currentTimeMillis() - startTime);
    } catch (IOException e) {
        if (scanner != null) {
            scanner.close();
            scanner = null;
        }
        LOG.error("Error while skip first row:startRowKey=" + startRowKey + "," + e.getMessage());
        if (skipError) {
            reporter.setStatus(
                    "Error while skip first row:startRowKey=" + startRowKey + ", endRowKey=" + e.getMessage());
            reporter.incrCounter("cloudata", "error", 1);
            return;
        } else {
            throw e;
        }
    }
}
From source file: org.cloudata.util.matrix.MatrixMutiplyMap.java
License:Apache License
public void configure(JobConf job) {
    CloudataConf conf = new CloudataConf();

    boolean sparse = job.getBoolean(MatrixInputFormat.MATRIX_TARGET_SPARSE, false);
    String targetTableName = job.get(MatrixInputFormat.MATRIX_TARGET_TABLE);
    String targetColumnName = job.get(MatrixInputFormat.MATRIX_TARGET_COLUMN);

    try {
        if (sparse) {
            targetMatrix = new SparseMatrix(conf, targetTableName, targetColumnName);
        } else {
            targetMatrix = new Matrix(conf, targetTableName, targetColumnName);
        }
    } catch (IOException e) {
        err = e;
    }
}
From source file: org.cloudata.util.matrix.MatrixMutiplyReduce.java
License:Apache License
public void configure(JobConf job) {
    CloudataConf conf = new CloudataConf();

    boolean sparse = job.getBoolean(MatrixInputFormat.MATRIX_RESULT_SPARSE, false);
    String resultTableName = job.get(MatrixInputFormat.MATRIX_RESULT_TABLE);
    String resultColumnName = job.get(MatrixInputFormat.MATRIX_RESULT_COLUMN);

    try {
        if (sparse) {
            resultMatrix = new SparseMatrix(conf, resultTableName, resultColumnName);
        } else {
            resultMatrix = new Matrix(conf, resultTableName, resultColumnName);
        }
        resultMatrix.initUploader();
    } catch (IOException e) {
        err = e;
    }
}
From source file: org.cloudata.util.upload.UploadMap.java
License:Apache License
@Override
public void configure(JobConf job) {
    try {
        String tableName = job.get(AbstractTabletInputFormat.OUTPUT_TABLE);

        CloudataConf nconf = new CloudataConf(job);
        ctable = CTable.openTable(nconf, tableName);
        if (ctable == null) {
            throw new IOException("No table:" + tableName);
        }

        delim = job.get("uploadJob.delim", "\t");
        columns = job.get("uploadJob.columns").split(",");

        String[] fieldNumStr = job.get("uploadJob.fieldNums").split(",");
        fieldNums = new int[fieldNumStr.length];
        for (int i = 0; i < fieldNumStr.length; i++) {
            fieldNums[i] = Integer.parseInt(fieldNumStr[i]);
        }

        keyValuePair = job.getBoolean("uploadJob.keyValuePair", false);
    } catch (Exception e) {
        err = new IOException(e.getMessage());
        err.initCause(e);
    }
}
From source file: org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.NonSuperSubdomainCollectorStep.java
License:Open Source License
@Override public void configure(JobConf job) { if (job.getBoolean("mapred.task.is.map", false)) { Path superDomainIdFile = new Path(job.get(SUPER_DOMAIN_FILE_PATH)); try {/*from w w w . j ava 2 s .c o m*/ superDomainIdSet = SuperDomainList.loadSuperDomainIdList(job, superDomainIdFile); } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw new RuntimeException(e); } subDomainFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS); } }
From source file: org.dkpro.bigdata.hadoop.UIMAMapReduceBase.java
License:Open Source License
@Override
public void configure(JobConf job) {
    try {
        this.job = job;
        this.inputName = job.get("mapred.input.dir");
        this.taskId = job.get("mapred.task.id");
        this.mapOutputValueClass = job.getMapOutputValueClass();
        this.outputValueClass = job.getOutputValueClass();
        this.samplingPropability = job.getInt("dkpro.map.samplingratio", 100);

        final EngineFactory engineFactory = (EngineFactory) Class
                .forName(job.get("dkpro.uima.factory", DkproHadoopDriver.class.getName())).newInstance();
        engineFactory.configure(job);

        final AnalysisEngineDescription engineDescription = getEngineDescription(engineFactory, job);

        // replace the $dir variable within the configuration.
        this.fs = FileSystem.get(job);
        this.localFS = FileSystem.getLocal(job);
        if (job.getBoolean("dkpro.output.onedirpertask", true)) {
            this.working_dir = new Path("uima_output_" + job.get("mapred.task.id"));
        } else {
            this.working_dir = new Path("uima_output");
        }
        final Path outputPath = FileOutputFormat.getOutputPath(job);
        this.results_dir = this.fs.startLocalOutput(outputPath, job.getLocalPath(this.working_dir.getName()));
        this.localFS.mkdirs(this.results_dir);

        final String[] resources = job.get("dkpro.resources", "").split(",");
        sLogger.info("Writing local data to: " + this.results_dir);
        this.resourceURIs = new TreeMap<String, URL>();
        for (final String resource : resources) {
            final URL r = job.getResource(resource);
            if (r != null && !resource.isEmpty()) {
                this.resourceURIs.put(resource, r);
            }
        }

        Map<String, String> variableValues = new HashMap<String, String>();
        variableValues.put("\\$dir", this.results_dir.toString());
        variableValues.put("\\$input", this.inputName);
        variableValues.put("\\$taskid", this.taskId);

        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(job);
        if (cacheFiles != null) {
            for (Path cacheFile : cacheFiles) {
                variableValues.put("^\\$cache/" + cacheFile.getName(), cacheFile.toUri().getPath());
            }
        }
        for (final Entry<String, URL> resource : this.resourceURIs.entrySet()) {
            variableValues.put("\\$" + resource, resource.getValue().toString());
        }
        AnalysisEngineUtil.replaceVariables(engineDescription, variableValues);

        this.engine = createEngine(engineDescription);
    } catch (final Exception e) {
        sLogger.fatal("Error while configuring pipeline", e);
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}