List of usage examples for org.apache.hadoop.mapred JobConf getBoolean
public boolean getBoolean(String name, boolean defaultValue)
Returns the value of the name property as a boolean. If no such property is specified, or if the value is not a valid boolean, then defaultValue is returned.
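Typical usage pattern (a minimal sketch, not taken from the examples below): the driver sets a boolean property on the JobConf, and a task reads it back with getBoolean in configure(). The property name "example.feature.enabled" and the class names here are hypothetical, chosen only for illustration.

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class FeatureFlagMapperBase extends MapReduceBase {
  // Hypothetical flag; falls back to false when the property is missing
  // or its value is not a valid boolean.
  private boolean featureEnabled;

  @Override
  public void configure(JobConf job) {
    featureEnabled = job.getBoolean("example.feature.enabled", false);
  }
}

// Driver side (hypothetical):
// JobConf job = new JobConf(MyDriver.class);
// job.setBoolean("example.feature.enabled", true);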
. From source file:org.apache.ignite.internal.processors.hadoop.v2.HadoopV2JobResourceManager.java
License:Apache License
/**
 * Set working directory in local file system.
 *
 * @param dir Working directory.
 * @throws IOException If fails.
 */
private void setLocalFSWorkingDirectory(File dir) throws IOException {
  JobConf cfg = ctx.getJobConf();

  Thread.currentThread().setContextClassLoader(cfg.getClassLoader());

  try {
    cfg.set(HadoopFileSystemsUtils.LOC_FS_WORK_DIR_PROP, dir.getAbsolutePath());

    if (!cfg.getBoolean("fs.file.impl.disable.cache", false))
      FileSystem.getLocal(cfg).setWorkingDirectory(new Path(dir.getAbsolutePath()));
  } finally {
    Thread.currentThread().setContextClassLoader(null);
  }
}
From source file:org.apache.mahout.avro.text.mapred.WikipediaAvroDocumentMapper.java
License:Apache License
@Override
public void configure(JobConf job) {
  try {
    if (inputCategories == null) {
      Set<String> newCategories = new HashSet<String>();
      DefaultStringifier<Set<String>> setStringifier =
          new DefaultStringifier<Set<String>>(job, GenericsUtil.getClass(newCategories));
      String categoriesStr = setStringifier.toString(newCategories);
      categoriesStr = job.get("wikipedia.categories", categoriesStr);
      inputCategories = setStringifier.fromString(categoriesStr);
    }
    exactMatchOnly = job.getBoolean("exact.match.only", false);
    all = job.getBoolean("all.files", true);
  } catch (IOException ex) {
    throw new IllegalStateException(ex);
  }
  log.info("Configure: Input Categories size: " + inputCategories.size() + " All: " + all
      + " Exact Match: " + exactMatchOnly);
}
From source file:org.apache.nutch.crawl.CrawlDb.java
License:Apache License
public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed,
    boolean force) throws IOException {
  FileSystem fs = FileSystem.get(getConf());
  Path lock = new Path(crawlDb, LOCK_NAME);
  LockUtil.createLockFile(fs, lock, force);
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();

  JobConf job = CrawlDb.createJob(getConf(), crawlDb);
  job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
  job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
  job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);

  boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false);

  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb update: starting at " + sdf.format(start));
    LOG.info("CrawlDb update: db: " + crawlDb);
    LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
    LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
    LOG.info("CrawlDb update: URL normalizing: " + normalize);
    LOG.info("CrawlDb update: URL filtering: " + filter);
    LOG.info("CrawlDb update: 404 purging: " + url404Purging);
  }

  for (int i = 0; i < segments.length; i++) {
    Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
    Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
    if (fs.exists(fetch) && fs.exists(parse)) {
      FileInputFormat.addInputPath(job, fetch);
      FileInputFormat.addInputPath(job, parse);
    } else {
      LOG.info(" - skipping invalid segment " + segments[i]);
    }
  }

  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb update: Merging segment data into db.");
  }
  try {
    JobClient.runJob(job);
  } catch (IOException e) {
    LockUtil.removeLockFile(fs, lock);
    Path outPath = FileOutputFormat.getOutputPath(job);
    if (fs.exists(outPath))
      fs.delete(outPath, true);
    throw e;
  }

  CrawlDb.install(job, crawlDb);
  long end = System.currentTimeMillis();
  LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.crawl.CrawlDbFilter.java
License:Apache License
public void configure(JobConf job) {
  urlFiltering = job.getBoolean(URL_FILTERING, false);
  urlNormalizers = job.getBoolean(URL_NORMALIZING, false);
  url404Purging = job.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);

  if (urlFiltering) {
    filters = new URLFilters(job);
  }
  if (urlNormalizers) {
    scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
    normalizers = new URLNormalizers(job, scope);
  }
}
From source file:org.apache.nutch.crawl.CrawlDbReducer.java
License:Apache License
public void configure(JobConf job) {
  retryMax = job.getInt("db.fetch.retry.max", 3);
  scfilters = new ScoringFilters(job);
  additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
  int oldMaxInterval = job.getInt("db.max.fetch.interval", 0);
  maxInterval = job.getInt("db.fetch.interval.max", 0);
  if (oldMaxInterval > 0 && maxInterval == 0)
    maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
  schedule = FetchScheduleFactory.getFetchSchedule(job);
  int maxLinks = job.getInt("db.update.max.inlinks", 10000);
  linked = new InlinkPriorityQueue(maxLinks);
}
From source file:org.apache.nutch.crawl.LinkDb.java
License:Apache License
public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force)
    throws IOException {
  JobConf job = LinkDb.createJob(getConf(), linkDb, normalize, filter);
  Path lock = new Path(linkDb, LOCK_NAME);
  FileSystem fs = FileSystem.get(getConf());
  LockUtil.createLockFile(fs, lock, force);
  Path currentLinkDb = new Path(linkDb, CURRENT_NAME);
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();

  if (LOG.isInfoEnabled()) {
    LOG.info("LinkDb: starting at " + sdf.format(start));
    LOG.info("LinkDb: linkdb: " + linkDb);
    LOG.info("LinkDb: URL normalize: " + normalize);
    LOG.info("LinkDb: URL filter: " + filter);
    if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) {
      LOG.info("LinkDb: internal links will be ignored.");
    }
  }

  for (int i = 0; i < segments.length; i++) {
    if (LOG.isInfoEnabled()) {
      LOG.info("LinkDb: adding segment: " + segments[i]);
    }
    FileInputFormat.addInputPath(job, new Path(segments[i], ParseData.DIR_NAME));
  }

  try {
    JobClient.runJob(job);
  } catch (IOException e) {
    LockUtil.removeLockFile(fs, lock);
    throw e;
  }

  if (fs.exists(currentLinkDb)) {
    if (LOG.isInfoEnabled()) {
      LOG.info("LinkDb: merging with existing linkdb: " + linkDb);
    }
    // try to merge
    Path newLinkDb = FileOutputFormat.getOutputPath(job);
    job = LinkDbMerger.createMergeJob(getConf(), linkDb, normalize, filter);
    FileInputFormat.addInputPath(job, currentLinkDb);
    FileInputFormat.addInputPath(job, newLinkDb);
    try {
      JobClient.runJob(job);
    } catch (IOException e) {
      LockUtil.removeLockFile(fs, lock);
      fs.delete(newLinkDb, true);
      throw e;
    }
    fs.delete(newLinkDb, true);
  }
  LinkDb.install(job, linkDb);

  long end = System.currentTimeMillis();
  LOG.info("LinkDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.crawl.LinkDbFilter.java
License:Apache License
public void configure(JobConf job) {
  filter = job.getBoolean(URL_FILTERING, false);
  normalize = job.getBoolean(URL_NORMALIZING, false);
  if (filter) {
    filters = new URLFilters(job);
  }
  if (normalize) {
    scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_LINKDB);
    normalizers = new URLNormalizers(job, scope);
  }
}
From source file:org.apache.nutch.parse.ParseOutputFormat.java
License:Apache License
public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress)
    throws IOException {

  if (job.getBoolean("parse.filter.urls", true)) {
    filters = new URLFilters(job);
  }
  if (job.getBoolean("parse.normalize.urls", true)) {
    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
  }
  this.scfilters = new ScoringFilters(job);

  final int interval = job.getInt("db.fetch.interval.default", 2592000);
  final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
  int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
  final boolean isParsing = job.getBoolean("fetcher.parse", true);
  final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
  final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);

  Path out = FileOutputFormat.getOutputPath(job);
  Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
  Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
  Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);

  final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "").split(" *, *");

  final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class,
      CompressionType.RECORD, progress);
  final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(), Text.class, ParseData.class,
      compType, progress);
  final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job, crawl, Text.class, CrawlDatum.class,
      compType, progress);

  return new RecordWriter<Text, Parse>() {

    public void write(Text key, Parse parse) throws IOException {

      String fromUrl = key.toString();
      String fromHost = null;
      textOut.append(key, new ParseText(parse.getText()));

      ParseData parseData = parse.getData();
      // recover the signature prepared by Fetcher or ParseSegment
      String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
      if (sig != null) {
        byte[] signature = StringUtil.fromHexString(sig);
        if (signature != null) {
          // append a CrawlDatum with a signature
          CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
          d.setSignature(signature);
          crawlOut.append(key, d);
        }
      }

      // see if the parse metadata contain things that we'd like
      // to pass to the metadata of the crawlDB entry
      CrawlDatum parseMDCrawlDatum = null;
      for (String mdname : parseMDtoCrawlDB) {
        String mdvalue = parse.getData().getParseMeta().get(mdname);
        if (mdvalue != null) {
          if (parseMDCrawlDatum == null)
            parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, 0);
          parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue));
        }
      }
      if (parseMDCrawlDatum != null)
        crawlOut.append(key, parseMDCrawlDatum);

      try {
        ParseStatus pstatus = parseData.getStatus();
        if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
          String newUrl = pstatus.getMessage();
          int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
          try {
            if (normalizers != null) {
              newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
            }
          } catch (MalformedURLException mfue) {
            newUrl = null;
          }
          if (filters != null) {
            if (newUrl != null)
              newUrl = filters.filter(newUrl);
          }
          String url = key.toString();
          if (newUrl != null && !newUrl.equals(url)) {
            String reprUrl = URLUtil.chooseRepr(url, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME);
            CrawlDatum newDatum = new CrawlDatum();
            newDatum.setStatus(CrawlDatum.STATUS_LINKED);
            if (reprUrl != null && !reprUrl.equals(newUrl)) {
              newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
            }
            crawlOut.append(new Text(newUrl), newDatum);
          }
        }
      } catch (URLFilterException e) {
        // ignore
      }

      // collect outlinks for subsequent db update
      Outlink[] links = parseData.getOutlinks();
      int outlinksToStore = Math.min(maxOutlinks, links.length);
      if (ignoreExternalLinks) {
        try {
          fromHost = new URL(fromUrl).getHost().toLowerCase();
        } catch (MalformedURLException e) {
          fromHost = null;
        }
      } else {
        fromHost = null;
      }

      int validCount = 0;
      CrawlDatum adjust = null;
      List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore);
      List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
      for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
        String toUrl = links[i].getToUrl();

        // Only normalize and filter if fetcher.parse = false
        if (!isParsing) {
          toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks, filters,
              normalizers);
          if (toUrl == null) {
            continue;
          }
        }

        CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
        Text targetUrl = new Text(toUrl);
        try {
          scfilters.initialScore(targetUrl, target);
        } catch (ScoringFilterException e) {
          LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
          target.setScore(0.0f);
        }

        targets.add(new SimpleEntry(targetUrl, target));

        // Overwrite URL in Outlink object with normalized URL (NUTCH-1174)
        links[i].setUrl(toUrl);
        outlinkList.add(links[i]);
        validCount++;
      }

      try {
        // compute score contributions and adjustment to the original score
        adjust = scfilters.distributeScoreToOutlinks((Text) key, parseData, targets, null, links.length);
      } catch (ScoringFilterException e) {
        LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
      }
      for (Entry<Text, CrawlDatum> target : targets) {
        crawlOut.append(target.getKey(), target.getValue());
      }
      if (adjust != null)
        crawlOut.append(key, adjust);

      Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
      parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks,
          parseData.getContentMeta(), parseData.getParseMeta());
      dataOut.append(key, parseData);
      if (!parse.isCanonical()) {
        CrawlDatum datum = new CrawlDatum();
        datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
        String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
        try {
          datum.setFetchTime(Long.parseLong(timeString));
        } catch (Exception e) {
          LOG.warn("Can't read fetch time for: " + key);
          datum.setFetchTime(System.currentTimeMillis());
        }
        crawlOut.append(key, datum);
      }
    }

    public void close(Reporter reporter) throws IOException {
      textOut.close();
      dataOut.close();
      crawlOut.close();
    }

  };
}
From source file:org.apache.nutch.parse.ParseSegment.java
License:Apache License
public void configure(JobConf job) {
  setConf(job);
  this.scfilters = new ScoringFilters(job);
  skipTruncated = job.getBoolean(SKIP_TRUNCATED, true);
}
From source file:org.apache.nutch.tools.compat.CrawlDbConverter.java
License:Apache License
public void configure(JobConf job) {
  setConf(job);
  withMetadata = job.getBoolean(CONVERT_META_KEY, false);
  newKey = new Text();
}