Example usage for org.apache.hadoop.io MapWritable remove

List of usage examples for org.apache.hadoop.io MapWritable remove

Introduction

This page shows example usage of the remove method of org.apache.hadoop.io.MapWritable.

Prototype

@Override
    public Writable remove(Object key) 

Source Link

Usage

From source file: org.apache.nutch.tools.compat.ReprUrlFixer.java

License:Apache License

/**
 * Runs the new ReprUrl logic on all crawl datums collected under one url key.
 *
 * <p>Each datum whose metadata carries a representative url, and which either
 * has a db status or was successfully fetched, is re-evaluated with
 * {@code URLUtil.chooseRepr}. When the chosen representative url equals the
 * key url, the stored representative url entry is removed from the datum's
 * metadata. Every datum, modified or not, is then emitted under the
 * original key.
 *
 * @param key      the url the datums are grouped under
 * @param values   the crawl datums for that url
 * @param output   collector that receives every datum under {@code key}
 * @param reporter progress reporter; not used by this method
 * @throws IOException if collecting a datum fails
 */
public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output,
        Reporter reporter) throws IOException {

    String url = key.toString();
    List<CrawlDatum> datums = new ArrayList<CrawlDatum>();

    // get all crawl datums for a given url key, fetch for instance can have
    // more than one under a given key if there are multiple redirects to a
    // given url. Each datum is cloned before being stored
    // (NOTE(review): presumably because the iterator may reuse the value
    // instance between calls to next() -- confirm against the framework).
    while (values.hasNext()) {
        CrawlDatum datum = values.next();
        datums.add((CrawlDatum) WritableUtils.clone(datum, conf));
    }

    // apply redirect repr url logic for each datum
    for (CrawlDatum datum : datums) {

        MapWritable metadata = datum.getMetaData();
        Text reprUrl = (Text) metadata.get(Nutch.WRITABLE_REPR_URL_KEY);
        byte status = datum.getStatus();
        boolean isCrawlDb = CrawlDatum.hasDbStatus(datum);
        boolean segFetched = (status == CrawlDatum.STATUS_FETCH_SUCCESS);

        // only if the crawl datum is from the crawldb or is a successfully
        // fetched page from the segments
        if ((isCrawlDb || segFetched) && reprUrl != null) {

            String src = reprUrl.toString();

            // both need to be well formed urls; the URL objects themselves
            // are not needed, only the outcome of parsing
            boolean bothWellFormed;
            try {
                new URL(src);
                new URL(url);
                bothWellFormed = true;
            } catch (MalformedURLException ignored) {
                // deliberate best effort: a datum with a malformed url is
                // left untouched and still collected below
                bothWellFormed = false;
            }

            // if the src and repr urls are the same after the new logic then
            // remove the repr url from the metadata as it is no longer needed
            if (bothWellFormed) {
                String reprOut = URLUtil.chooseRepr(src, url, true);
                if (reprOut.equals(url)) {
                    LOG.info("Removing " + reprOut + " from " + url);
                    metadata.remove(Nutch.WRITABLE_REPR_URL_KEY);
                }
            }
        }

        // collect each datum
        output.collect(key, datum);
    }

}