Example usage for org.apache.hadoop.io MapFile.Writer MapFile.Writer

Introduction

On this page you can find example usages of the org.apache.hadoop.io.MapFile.Writer constructor.

Prototype

@Deprecated
public Writer(Configuration conf, FileSystem fs, String dirName, WritableComparator comparator,
        Class valClass, SequenceFile.CompressionType compress, Progressable progress) throws IOException 

Document

Create the named map using the named key comparator.
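
A minimal sketch of calling this deprecated constructor directly. The directory name, key/value classes, and the no-op Progressable below are illustrative assumptions, not part of the documented API:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.util.Progressable;

public class MapFileWriterExample {
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // The comparator fixes both the key class and the sort order of the map.
        WritableComparator comparator = WritableComparator.get(IntWritable.class);

        // No-op progress callback; sufficient outside a MapReduce task.
        Progressable progress = () -> { };

        MapFile.Writer writer = new MapFile.Writer(conf, fs, "/tmp/example.map",
                comparator, Text.class, SequenceFile.CompressionType.NONE, progress);
        try {
            // Keys must be appended in ascending comparator order.
            writer.append(new IntWritable(1), new Text("one"));
            writer.append(new IntWritable(2), new Text("two"));
        } finally {
            writer.close();
        }
    }
}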

Usage

From source file: com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java

License: Open Source License

private void writeMapFile() throws Exception {
    Path path = Testfile.MAPFILE.filepath();

    Text key = new Text();
    Text value = new Text();

    // Respect the filesystem's minimum block size if it exceeds the requested one.
    long fsMinBlockSize = conf.getLong("dfs.namenode.fs-limits.min-block-size", 0);
    long testBlockSize = Math.max(fsMinBlockSize, (long) blockSize);

    MapFile.Writer writer = new MapFile.Writer(conf, path, MapFile.Writer.keyClass(key.getClass()),
            MapFile.Writer.valueClass(value.getClass()),
            MapFile.Writer.compression(SequenceFile.CompressionType.NONE),
            SequenceFile.Writer.blockSize(testBlockSize), SequenceFile.Writer.bufferSize((int) testBlockSize));
    for (int i = 0; i < testSize; i++) {
        key.set(getKey(i));
        value.set(getValue());
        writer.append(key, value);
    }
    IOUtils.closeStream(writer);
}
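
A MapFile written this way can be read back with MapFile.Reader; a minimal sketch, reusing conf and path from the example above (the lookup key is a placeholder, not taken from the test):

MapFile.Reader reader = new MapFile.Reader(path, conf);
try {
    Text key = new Text("key-0"); // placeholder key for illustration
    Text value = new Text();
    // get() binary-searches the index and returns null if the key is absent
    if (reader.get(key, value) != null) {
        System.out.println(key + "\t" + value);
    }
} finally {
    reader.close();
}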

From source file: org.apache.nutch.parse.ParseOutputFormat.java

License: Apache License

public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress)
        throws IOException {

    if (job.getBoolean("parse.filter.urls", true)) {
        filters = new URLFilters(job);
    }

    if (job.getBoolean("parse.normalize.urls", true)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
    }

    this.scfilters = new ScoringFilters(job);
    final int interval = job.getInt("db.fetch.interval.default", 2592000);
    final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
    int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
    final boolean isParsing = job.getBoolean("fetcher.parse", true);
    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);
    Path out = FileOutputFormat.getOutputPath(job);

    Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
    Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
    Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);

    final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "").split(" *, *");

    final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class,
            CompressionType.RECORD, progress);

    final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(), Text.class, ParseData.class,
            compType, progress);

    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job, crawl, Text.class, CrawlDatum.class,
            compType, progress);

    return new RecordWriter<Text, Parse>() {

        public void write(Text key, Parse parse) throws IOException {

            String fromUrl = key.toString();
            String fromHost = null;
            textOut.append(key, new ParseText(parse.getText()));

            ParseData parseData = parse.getData();
            // recover the signature prepared by Fetcher or ParseSegment
            String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
            if (sig != null) {
                byte[] signature = StringUtil.fromHexString(sig);
                if (signature != null) {
                    // append a CrawlDatum with a signature
                    CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
                    d.setSignature(signature);
                    crawlOut.append(key, d);
                }
            }

            // see if the parse metadata contain things that we'd like
            // to pass to the metadata of the crawlDB entry
            CrawlDatum parseMDCrawlDatum = null;
            for (String mdname : parseMDtoCrawlDB) {
                String mdvalue = parse.getData().getParseMeta().get(mdname);
                if (mdvalue != null) {
                    if (parseMDCrawlDatum == null)
                        parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, 0);
                    parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue));
                }
            }
            if (parseMDCrawlDatum != null)
                crawlOut.append(key, parseMDCrawlDatum);

            try {
                ParseStatus pstatus = parseData.getStatus();
                if (pstatus != null && pstatus.isSuccess()
                        && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                    String newUrl = pstatus.getMessage();
                    int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);

                    try {
                        if (normalizers != null) {
                            newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                        }
                    } catch (MalformedURLException mfue) {
                        newUrl = null;
                    }

                    if (filters != null) {
                        if (newUrl != null)
                            newUrl = filters.filter(newUrl);
                    }

                    String url = key.toString();
                    if (newUrl != null && !newUrl.equals(url)) {
                        String reprUrl = URLUtil.chooseRepr(url, newUrl,
                                refreshTime < Fetcher.PERM_REFRESH_TIME);
                        CrawlDatum newDatum = new CrawlDatum();
                        newDatum.setStatus(CrawlDatum.STATUS_LINKED);
                        if (reprUrl != null && !reprUrl.equals(newUrl)) {
                            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
                        }
                        crawlOut.append(new Text(newUrl), newDatum);
                    }
                }
            } catch (URLFilterException e) {
                // ignore
            }

            // collect outlinks for subsequent db update
            Outlink[] links = parseData.getOutlinks();
            int outlinksToStore = Math.min(maxOutlinks, links.length);
            if (ignoreExternalLinks) {
                try {
                    fromHost = new URL(fromUrl).getHost().toLowerCase();
                } catch (MalformedURLException e) {
                    fromHost = null;
                }
            } else {
                fromHost = null;
            }

            int validCount = 0;
            CrawlDatum adjust = null;
            List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore);
            List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
            for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                String toUrl = links[i].getToUrl();

                // Only normalize and filter if fetcher.parse = false
                if (!isParsing) {
                    toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks,
                            filters, normalizers);
                    if (toUrl == null) {
                        continue;
                    }
                }

                CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
                Text targetUrl = new Text(toUrl);
                try {
                    scfilters.initialScore(targetUrl, target);
                } catch (ScoringFilterException e) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
                    target.setScore(0.0f);
                }

                targets.add(new SimpleEntry(targetUrl, target));

                // Overwrite URL in Outlink object with normalized URL (NUTCH-1174)
                links[i].setUrl(toUrl);
                outlinkList.add(links[i]);
                validCount++;
            }

            try {
                // compute score contributions and adjustment to the original score
                adjust = scfilters.distributeScoreToOutlinks((Text) key, parseData, targets, null,
                        links.length);
            } catch (ScoringFilterException e) {
                LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
            }
            for (Entry<Text, CrawlDatum> target : targets) {
                crawlOut.append(target.getKey(), target.getValue());
            }
            if (adjust != null)
                crawlOut.append(key, adjust);

            Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
            parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks,
                    parseData.getContentMeta(), parseData.getParseMeta());
            dataOut.append(key, parseData);
            if (!parse.isCanonical()) {
                CrawlDatum datum = new CrawlDatum();
                datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
                String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
                try {
                    datum.setFetchTime(Long.parseLong(timeString));
                } catch (Exception e) {
                    LOG.warn("Can't read fetch time for: " + key);
                    datum.setFetchTime(System.currentTimeMillis());
                }
                crawlOut.append(key, datum);
            }
        }

        public void close(Reporter reporter) throws IOException {
            textOut.close();
            dataOut.close();
            crawlOut.close();
        }

    };

}
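
Since the constructor used for textOut and dataOut above is the deprecated one documented on this page, the same writers can also be created through the Option-based replacement. A sketch for textOut, assuming the surrounding variables keep the names used in getRecordWriter():

// Equivalent Option-based form of the deprecated call above; a sketch,
// not the code shipped with Nutch.
final MapFile.Writer textOut = new MapFile.Writer(job, text,
        MapFile.Writer.keyClass(Text.class),
        MapFile.Writer.valueClass(ParseText.class),
        MapFile.Writer.compression(CompressionType.RECORD),
        SequenceFile.Writer.progressable(progress));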