List of usage examples for org.apache.hadoop.io.MapWritable.remove
@Override
public Writable remove(Object key)
From source file: org.apache.nutch.tools.compat.ReprUrlFixer.java
License: Apache License
/** * Runs the new ReprUrl logic on all crawldatums. *///w w w.jav a 2 s .c o m public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { String url = key.toString(); Node node = null; List<CrawlDatum> datums = new ArrayList<CrawlDatum>(); // get all crawl datums for a given url key, fetch for instance can have // more than one under a given key if there are multiple redirects to a // given url while (values.hasNext()) { CrawlDatum datum = values.next(); datums.add((CrawlDatum) WritableUtils.clone(datum, conf)); } // apply redirect repr url logic for each datum for (CrawlDatum datum : datums) { MapWritable metadata = datum.getMetaData(); Text reprUrl = (Text) metadata.get(Nutch.WRITABLE_REPR_URL_KEY); byte status = datum.getStatus(); boolean isCrawlDb = (CrawlDatum.hasDbStatus(datum)); boolean segFetched = (status == CrawlDatum.STATUS_FETCH_SUCCESS); // only if the crawl datum is from the crawldb or is a successfully // fetched page from the segments if ((isCrawlDb || segFetched) && reprUrl != null) { String src = reprUrl.toString(); String dest = url; URL srcUrl = null; URL dstUrl = null; // both need to be well formed urls try { srcUrl = new URL(src); dstUrl = new URL(url); } catch (MalformedURLException e) { } // if the src and repr urls are the same after the new logic then // remove the repr url from the metadata as it is no longer needed if (srcUrl != null && dstUrl != null) { String reprOut = URLUtil.chooseRepr(src, dest, true); if (reprOut.equals(dest)) { LOG.info("Removing " + reprOut + " from " + dest); metadata.remove(Nutch.WRITABLE_REPR_URL_KEY); } } } // collect each datum output.collect(key, datum); } }