Usage examples for org.apache.hadoop.mapred.SequenceFileOutputFormat.getOutputCompressionType
public static CompressionType getOutputCompressionType(JobConf conf)
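getOutputCompressionType reads the SequenceFile compression type from the job configuration (the mapred.output.compression.type property), defaulting to RECORD when nothing has been set. A minimal sketch of the typical set-then-read round trip; the class name is illustrative, while setOutputCompressionType is the companion setter in the same class:

import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

public class CompressionTypeDemo {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Request block-level compression for SequenceFile output.
        SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

        // Read the configured type back; with the old mapred API this
        // defaults to CompressionType.RECORD when unset.
        CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(conf);
        System.out.println("Output compression type: " + compType); // BLOCK
    }
}

Both Nutch examples below follow the same pattern: call the getter once and pass the resulting CompressionType to every writer the record writer opens, so a single job-wide setting controls compression for all outputs.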
From source file: org.apache.nutch.fetcher.FetcherOutputFormat.java
License: Apache License
public RecordWriter<Text, NutchWritable> getRecordWriter(final FileSystem fs, final JobConf job,
        final String name, final Progressable progress) throws IOException {

    Path out = FileOutputFormat.getOutputPath(job);
    final Path fetch = new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME), name);
    final Path content = new Path(new Path(out, Content.DIR_NAME), name);

    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);

    final MapFile.Writer fetchOut = new MapFile.Writer(job, fs, fetch.toString(), Text.class,
            CrawlDatum.class, compType, progress);

    return new RecordWriter<Text, NutchWritable>() {
        private MapFile.Writer contentOut;
        private RecordWriter<Text, Parse> parseOut;

        {
            if (Fetcher.isStoringContent(job)) {
                contentOut = new MapFile.Writer(job, fs, content.toString(), Text.class,
                        Content.class, compType, progress);
            }
            if (Fetcher.isParsing(job)) {
                parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name, progress);
            }
        }

        public void write(Text key, NutchWritable value) throws IOException {
            Writable w = value.get();
            if (w instanceof CrawlDatum)
                fetchOut.append(key, w);
            else if (w instanceof Content)
                contentOut.append(key, w);
            else if (w instanceof Parse)
                parseOut.write(key, (Parse) w);
        }

        public void close(Reporter reporter) throws IOException {
            fetchOut.close();
            if (contentOut != null) {
                contentOut.close();
            }
            if (parseOut != null) {
                parseOut.close(reporter);
            }
        }
    };
}
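As a complement, a hedged sketch of reading one of these MapFile outputs back; the segment path is purely illustrative. The reader recovers the compression type from the file itself, so nothing from getOutputCompressionType is needed on the read side:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class FetchDirReader {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Illustrative path: one part of the crawl_fetch MapFile written above.
        MapFile.Reader reader = new MapFile.Reader(fs, "segment/crawl_fetch/part-00000", conf);
        try {
            Text key = new Text();
            CrawlDatum value = new CrawlDatum();
            // The compression type is stored in the file header, so the
            // reader decompresses transparently regardless of the setting.
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        } finally {
            reader.close();
        }
    }
}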
From source file: org.apache.nutch.parse.ParseOutputFormat.java
License: Apache License
public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable progress) throws IOException {

    if (job.getBoolean("parse.filter.urls", true)) {
        filters = new URLFilters(job);
    }
    if (job.getBoolean("parse.normalize.urls", true)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
    }

    this.scfilters = new ScoringFilters(job);
    final int interval = job.getInt("db.fetch.interval.default", 2592000);
    final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
    int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
    final boolean isParsing = job.getBoolean("fetcher.parse", true);
    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;

    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);

    Path out = FileOutputFormat.getOutputPath(job);
    Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
    Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
    Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);

    final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "").split(" *, *");

    final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(), Text.class,
            ParseText.class, CompressionType.RECORD, progress);

    final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(), Text.class,
            ParseData.class, compType, progress);

    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job, crawl, Text.class,
            CrawlDatum.class, compType, progress);

    return new RecordWriter<Text, Parse>() {

        public void write(Text key, Parse parse) throws IOException {

            String fromUrl = key.toString();
            String fromHost = null;
            textOut.append(key, new ParseText(parse.getText()));

            ParseData parseData = parse.getData();
            // recover the signature prepared by Fetcher or ParseSegment
            String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
            if (sig != null) {
                byte[] signature = StringUtil.fromHexString(sig);
                if (signature != null) {
                    // append a CrawlDatum with a signature
                    CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
                    d.setSignature(signature);
                    crawlOut.append(key, d);
                }
            }

            // see if the parse metadata contain things that we'd like
            // to pass to the metadata of the crawlDB entry
            CrawlDatum parseMDCrawlDatum = null;
            for (String mdname : parseMDtoCrawlDB) {
                String mdvalue = parse.getData().getParseMeta().get(mdname);
                if (mdvalue != null) {
                    if (parseMDCrawlDatum == null)
                        parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, 0);
                    parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue));
                }
            }
            if (parseMDCrawlDatum != null)
                crawlOut.append(key, parseMDCrawlDatum);

            try {
                ParseStatus pstatus = parseData.getStatus();
                if (pstatus != null && pstatus.isSuccess()
                        && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                    String newUrl = pstatus.getMessage();
                    int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                    try {
                        if (normalizers != null) {
                            newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                        }
                    } catch (MalformedURLException mfue) {
                        newUrl = null;
                    }

                    if (filters != null) {
                        if (newUrl != null)
                            newUrl = filters.filter(newUrl);
                    }

                    String url = key.toString();
                    if (newUrl != null && !newUrl.equals(url)) {
                        String reprUrl = URLUtil.chooseRepr(url, newUrl,
                                refreshTime < Fetcher.PERM_REFRESH_TIME);
                        CrawlDatum newDatum = new CrawlDatum();
                        newDatum.setStatus(CrawlDatum.STATUS_LINKED);
                        if (reprUrl != null && !reprUrl.equals(newUrl)) {
                            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                                    new Text(reprUrl));
                        }
                        crawlOut.append(new Text(newUrl), newDatum);
                    }
                }
            } catch (URLFilterException e) {
                // ignore
            }

            // collect outlinks for subsequent db update
            Outlink[] links = parseData.getOutlinks();
            int outlinksToStore = Math.min(maxOutlinks, links.length);
            if (ignoreExternalLinks) {
                try {
                    fromHost = new URL(fromUrl).getHost().toLowerCase();
                } catch (MalformedURLException e) {
                    fromHost = null;
                }
            } else {
                fromHost = null;
            }

            int validCount = 0;
            CrawlDatum adjust = null;
            List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore);
            List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
            for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                String toUrl = links[i].getToUrl();

                // Only normalize and filter if fetcher.parse = false
                if (!isParsing) {
                    toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost,
                            ignoreExternalLinks, filters, normalizers);
                    if (toUrl == null) {
                        continue;
                    }
                }

                CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
                Text targetUrl = new Text(toUrl);
                try {
                    scfilters.initialScore(targetUrl, target);
                } catch (ScoringFilterException e) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: "
                            + e.getMessage());
                    target.setScore(0.0f);
                }

                targets.add(new SimpleEntry(targetUrl, target));

                // Overwrite URL in Outlink object with normalized URL (NUTCH-1174)
                links[i].setUrl(toUrl);
                outlinkList.add(links[i]);
                validCount++;
            }

            try {
                // compute score contributions and adjustment to the original score
                adjust = scfilters.distributeScoreToOutlinks((Text) key, parseData, targets,
                        null, links.length);
            } catch (ScoringFilterException e) {
                LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
            }
            for (Entry<Text, CrawlDatum> target : targets) {
                crawlOut.append(target.getKey(), target.getValue());
            }
            if (adjust != null)
                crawlOut.append(key, adjust);

            Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
            parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks,
                    parseData.getContentMeta(), parseData.getParseMeta());
            dataOut.append(key, parseData);

            if (!parse.isCanonical()) {
                CrawlDatum datum = new CrawlDatum();
                datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
                String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
                try {
                    datum.setFetchTime(Long.parseLong(timeString));
                } catch (Exception e) {
                    LOG.warn("Can't read fetch time for: " + key);
                    datum.setFetchTime(System.currentTimeMillis());
                }
                crawlOut.append(key, datum);
            }
        }

        public void close(Reporter reporter) throws IOException {
            textOut.close();
            dataOut.close();
            crawlOut.close();
        }

    };
}
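Note the asymmetry in this example: textOut hard-codes CompressionType.RECORD (presumably because parse text is accessed randomly, where per-record compression keeps lookups cheap), while dataOut and crawlOut honor the job-wide type returned by getOutputCompressionType. As a counterpart to the MapFile sketch earlier, a hedged sketch of scanning the crawl_parse SequenceFile written by crawlOut; the path is illustrative:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class CrawlParseReader {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Illustrative path: one part of the crawl_parse output written above.
        Path path = new Path("segment/crawl_parse/part-00000");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            Text key = new Text();
            CrawlDatum value = new CrawlDatum();
            // As with MapFile, the compression type recorded in the file
            // header is applied automatically on read.
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        } finally {
            reader.close();
        }
    }
}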