Usage examples for org.apache.hadoop.mapred.SequenceFileOutputFormat.getOutputCompressionType
public static CompressionType getOutputCompressionType(JobConf conf)
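getOutputCompressionType reads the SequenceFile compression type from the job configuration (the mapred.output.compression.type property), defaulting to RECORD when nothing has been set. A minimal sketch of the typical set-then-read round trip; the class name is illustrative, while setOutputCompressionType is the companion setter in the same class:

import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

public class CompressionTypeDemo {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Request block-level compression for SequenceFile output.
        SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

        // Read the configured type back; with the old mapred API this
        // defaults to CompressionType.RECORD when unset.
        CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(conf);
        System.out.println("Output compression type: " + compType); // BLOCK
    }
}

Both Nutch examples below follow the same pattern: call the getter once and pass the resulting CompressionType to every writer the record writer opens, so a single job-wide setting controls compression for all outputs.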
From source file: org.apache.nutch.fetcher.FetcherOutputFormat.java
License: Apache License
public RecordWriter<Text, NutchWritable> getRecordWriter(final FileSystem fs, final JobConf job,
        final String name, final Progressable progress) throws IOException {

    Path out = FileOutputFormat.getOutputPath(job);
    final Path fetch = new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME), name);
    final Path content = new Path(new Path(out, Content.DIR_NAME), name);

    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);

    final MapFile.Writer fetchOut = new MapFile.Writer(job, fs, fetch.toString(), Text.class,
            CrawlDatum.class, compType, progress);

    return new RecordWriter<Text, NutchWritable>() {
        private MapFile.Writer contentOut;
        private RecordWriter<Text, Parse> parseOut;

        {
            if (Fetcher.isStoringContent(job)) {
                contentOut = new MapFile.Writer(job, fs, content.toString(), Text.class,
                        Content.class, compType, progress);
            }
            if (Fetcher.isParsing(job)) {
                parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name, progress);
            }
        }

        public void write(Text key, NutchWritable value) throws IOException {
            Writable w = value.get();
            if (w instanceof CrawlDatum)
                fetchOut.append(key, w);
            else if (w instanceof Content)
                contentOut.append(key, w);
            else if (w instanceof Parse)
                parseOut.write(key, (Parse) w);
        }

        public void close(Reporter reporter) throws IOException {
            fetchOut.close();
            if (contentOut != null) {
                contentOut.close();
            }
            if (parseOut != null) {
                parseOut.close(reporter);
            }
        }
    };
}
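As a complement, a hedged sketch of reading one of these MapFile outputs back; the segment path is purely illustrative. The reader recovers the compression type from the file itself, so nothing from getOutputCompressionType is needed on the read side:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class FetchDirReader {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Illustrative path: one part of the crawl_fetch MapFile written above.
        MapFile.Reader reader = new MapFile.Reader(fs, "segment/crawl_fetch/part-00000", conf);
        try {
            Text key = new Text();
            CrawlDatum value = new CrawlDatum();
            // The compression type is stored in the file header, so the
            // reader decompresses transparently regardless of the setting.
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        } finally {
            reader.close();
        }
    }
}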
From source file: org.apache.nutch.parse.ParseOutputFormat.java
License: Apache License
public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable progress) throws IOException {

    if (job.getBoolean("parse.filter.urls", true)) {
        filters = new URLFilters(job);
    }
    if (job.getBoolean("parse.normalize.urls", true)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
    }

    this.scfilters = new ScoringFilters(job);
    final int interval = job.getInt("db.fetch.interval.default", 2592000);
    final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
    int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
    final boolean isParsing = job.getBoolean("fetcher.parse", true);
    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;

    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);

    Path out = FileOutputFormat.getOutputPath(job);
    Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
    Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
    Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);

    final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "").split(" *, *");

    final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(), Text.class,
            ParseText.class, CompressionType.RECORD, progress);

    final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(), Text.class,
            ParseData.class, compType, progress);

    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job, crawl, Text.class,
            CrawlDatum.class, compType, progress);

    return new RecordWriter<Text, Parse>() {

        public void write(Text key, Parse parse) throws IOException {

            String fromUrl = key.toString();
            String fromHost = null;
            textOut.append(key, new ParseText(parse.getText()));

            ParseData parseData = parse.getData();
            // recover the signature prepared by Fetcher or ParseSegment
            String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
            if (sig != null) {
                byte[] signature = StringUtil.fromHexString(sig);
                if (signature != null) {
                    // append a CrawlDatum with a signature
                    CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
                    d.setSignature(signature);
                    crawlOut.append(key, d);
                }
            }

            // see if the parse metadata contain things that we'd like
            // to pass to the metadata of the crawlDB entry
            CrawlDatum parseMDCrawlDatum = null;
            for (String mdname : parseMDtoCrawlDB) {
                String mdvalue = parse.getData().getParseMeta().get(mdname);
                if (mdvalue != null) {
                    if (parseMDCrawlDatum == null)
                        parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, 0);
                    parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue));
                }
            }
            if (parseMDCrawlDatum != null)
                crawlOut.append(key, parseMDCrawlDatum);

            try {
                ParseStatus pstatus = parseData.getStatus();
                if (pstatus != null && pstatus.isSuccess()
                        && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                    String newUrl = pstatus.getMessage();
                    int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                    try {
                        if (normalizers != null) {
                            newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                        }
                    } catch (MalformedURLException mfue) {
                        newUrl = null;
                    }

                    if (filters != null) {
                        if (newUrl != null)
                            newUrl = filters.filter(newUrl);
                    }

                    String url = key.toString();
                    if (newUrl != null && !newUrl.equals(url)) {
                        String reprUrl = URLUtil.chooseRepr(url, newUrl,
                                refreshTime < Fetcher.PERM_REFRESH_TIME);
                        CrawlDatum newDatum = new CrawlDatum();
                        newDatum.setStatus(CrawlDatum.STATUS_LINKED);
                        if (reprUrl != null && !reprUrl.equals(newUrl)) {
                            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                                    new Text(reprUrl));
                        }
                        crawlOut.append(new Text(newUrl), newDatum);
                    }
                }
            } catch (URLFilterException e) {
                // ignore
            }

            // collect outlinks for subsequent db update
            Outlink[] links = parseData.getOutlinks();
            int outlinksToStore = Math.min(maxOutlinks, links.length);
            if (ignoreExternalLinks) {
                try {
                    fromHost = new URL(fromUrl).getHost().toLowerCase();
                } catch (MalformedURLException e) {
                    fromHost = null;
                }
            } else {
                fromHost = null;
            }

            int validCount = 0;
            CrawlDatum adjust = null;
            List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore);
            List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
            for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                String toUrl = links[i].getToUrl();

                // Only normalize and filter if fetcher.parse = false
                if (!isParsing) {
                    toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost,
                            ignoreExternalLinks, filters, normalizers);
                    if (toUrl == null) {
                        continue;
                    }
                }

                CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
                Text targetUrl = new Text(toUrl);
                try {
                    scfilters.initialScore(targetUrl, target);
                } catch (ScoringFilterException e) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: "
                            + e.getMessage());
                    target.setScore(0.0f);
                }

                targets.add(new SimpleEntry(targetUrl, target));

                // Overwrite URL in Outlink object with normalized URL (NUTCH-1174)
                links[i].setUrl(toUrl);
                outlinkList.add(links[i]);
                validCount++;
            }

            try {
                // compute score contributions and adjustment to the original score
                adjust = scfilters.distributeScoreToOutlinks((Text) key, parseData, targets,
                        null, links.length);
            } catch (ScoringFilterException e) {
                LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
            }
            for (Entry<Text, CrawlDatum> target : targets) {
                crawlOut.append(target.getKey(), target.getValue());
            }
            if (adjust != null)
                crawlOut.append(key, adjust);

            Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
            parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks,
                    parseData.getContentMeta(), parseData.getParseMeta());
            dataOut.append(key, parseData);

            if (!parse.isCanonical()) {
                CrawlDatum datum = new CrawlDatum();
                datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
                String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
                try {
                    datum.setFetchTime(Long.parseLong(timeString));
                } catch (Exception e) {
                    LOG.warn("Can't read fetch time for: " + key);
                    datum.setFetchTime(System.currentTimeMillis());
                }
                crawlOut.append(key, datum);
            }
        }

        public void close(Reporter reporter) throws IOException {
            textOut.close();
            dataOut.close();
            crawlOut.close();
        }

    };
}
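Note the asymmetry in this example: textOut hard-codes CompressionType.RECORD (presumably because parse text is accessed randomly, where per-record compression keeps lookups cheap), while dataOut and crawlOut honor the job-wide type returned by getOutputCompressionType. As a counterpart to the MapFile sketch earlier, a hedged sketch of scanning the crawl_parse SequenceFile written by crawlOut; the path is illustrative:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class CrawlParseReader {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Illustrative path: one part of the crawl_parse output written above.
        Path path = new Path("segment/crawl_parse/part-00000");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            Text key = new Text();
            CrawlDatum value = new CrawlDatum();
            // As with MapFile, the compression type recorded in the file
            // header is applied automatically on read.
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        } finally {
            reader.close();
        }
    }
}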