Java tutorial: NIOFetcher, a non-blocking (NIO) fetcher for Apache Nutch
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.fetcher;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
import java.nio.channels.SelectionKey;
import java.nio.channels.Selector;
import java.nio.channels.SocketChannel;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

// SLF4J logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.*;

public class NIOFetcher extends NutchTool implements Tool,
    MapRunnable<Text, CrawlDatum, Text, NutchWritable> {

  public static final int PERM_REFRESH_TIME = 5;

  public static final String CONTENT_REDIR = "content";
  public static final String PROTOCOL_REDIR = "protocol";

  public static final Logger LOG = LoggerFactory.getLogger(NIOFetcher.class);

  private OutputCollector<Text, NutchWritable> output;
  private Reporter reporter;

  private String segmentName;

  // fetch progress counters and statistics
  private AtomicInteger activeThreads = new AtomicInteger(0);
  private AtomicInteger spinWaiting = new AtomicInteger(0);

  private long start = System.currentTimeMillis(); // start time of fetcher run
  private AtomicLong lastRequestStart = new AtomicLong(start);

  private AtomicLong bytes = new AtomicLong(0);        // total bytes fetched
  private AtomicInteger pages = new AtomicInteger(0);  // total pages fetched
  private AtomicInteger errors = new AtomicInteger(0); // total pages errored
  private AtomicInteger timeouts = new AtomicInteger(0);
  private AtomicInteger actives = new AtomicInteger(0);

  private boolean storingContent;
  private boolean parsing;

  FetchItemQueues fetchQueues;
  QueueFeeder feeder;

  /** Per-connection state used by the timeout checker. */
  private static class CrawlState {
    public int state;
    public long lastOpTime;
    public SelectionKey key;

    public final static byte CONNECT = 0;
    public final static byte READ = 1;
  }

  // TODO: HTTPS support
  /**
   * Consumes fetched pages from the queue, maps HTTP status codes to
   * CrawlDatum fetch statuses and writes the results out.
   */
  private class PageHandler implements Runnable {

    private BlockingQueue<Page> pagesQueue;
    private final byte[] EMPTY_CONTENT = new byte[0];
    private Configuration conf;
    private URLFilters urlFilters;
    private URLNormalizers urlNormalizers;
    private ScoringFilters scfilters;
    private FetchOutputer fetchOutputer;
    private RedirectInfo redirectInfo = new RedirectInfo();

    // Used by the REST service
    private FetchNode fetchNode;
    private boolean reportToNutchServer;

    public PageHandler(BlockingQueue<Page> pagesQueue, Configuration conf) {
      this.pagesQueue = pagesQueue;
      this.conf = conf;
      urlFilters = new URLFilters(conf);
      urlNormalizers = new URLNormalizers(conf, "");
      this.scfilters = new ScoringFilters(conf);
      fetchOutputer = new FetchOutputer(conf, fetchQueues, reporter, segmentName,
          parsing, output, storingContent, redirectInfo, fetchNode);
    }

    @Override
    public void run() {
      while (true) {
        try {
          Page page = pagesQueue.take();
          if (page.isEndPage())
            break;
          String url = page.getUri();
          // LOG.info("Handling " + url);
          page.process();
          CrawlDatum datum = page.getDatum();
          datum.setFetchTime(System.currentTimeMillis());

          if (page.isFetchFailed()) {
            fetchOutputer.output(new Text(url), datum, null,
                ProtocolStatus.STATUS_RETRY, CrawlDatum.STATUS_FETCH_RETRY);
            continue;
          }

          int code = page.getStatusCode();
          int status = CrawlDatum.STATUS_FETCH_RETRY;
          if (code == 200) {
            status = CrawlDatum.STATUS_FETCH_SUCCESS;
          } else if (code == 410) { // page is gone
            status = CrawlDatum.STATUS_FETCH_GONE;
          } else if (code >= 300 && code < 400) { // handle redirect
            // TODO: queue the redirect target for fetching
            String newUrl = page.getHeader("Location");
            if (newUrl != null) {
              try {
                newUrl = urlNormalizers.normalize(newUrl, "");
                newUrl = urlFilters.filter(newUrl);
                if (newUrl != null) {
                  CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
                      datum.getFetchInterval());
                  // fetchOut.append(new Text(newUrl), newDatum);
                }
              } catch (IOException e) {
              }
            }
            switch (code) {
            case 300: // multiple choices, preferred value in Location
              status = CrawlDatum.STATUS_FETCH_REDIR_PERM;
              break;
            case 301: // moved permanently
            case 305: // use proxy (Location is URL of proxy)
              status = CrawlDatum.STATUS_FETCH_REDIR_PERM;
              break;
            case 302: // found (temporarily moved)
            case 303: // see other (redirect after POST)
            case 307: // temporary redirect
              status = CrawlDatum.STATUS_DB_REDIR_TEMP;
              break;
            case 304: // not modified
              status = CrawlDatum.STATUS_FETCH_NOTMODIFIED;
              break;
            default:
              status = CrawlDatum.STATUS_FETCH_REDIR_PERM;
            }
          } else if (code == 400) { // bad request, mark as GONE
            status = CrawlDatum.STATUS_FETCH_GONE;
          } else if (code == 401) { // requires authorization, but no valid auth provided
            status = CrawlDatum.STATUS_FETCH_RETRY;
          } else if (code == 404) {
            status = CrawlDatum.STATUS_FETCH_GONE;
          } else if (code == 410) { // permanently GONE
            status = CrawlDatum.STATUS_FETCH_GONE;
          } else {
            status = CrawlDatum.STATUS_FETCH_RETRY;
          }
          datum.setStatus(status);

          byte[] content = page.getContent();
          Content c = new Content(page.getUri(), page.getUri(), (content == null ?
              EMPTY_CONTENT : content), page.getHeader("Content-Type"),
              page.getHeaders(), conf);
          fetchOutputer.output(new Text(url), datum, c, null, status);
        } catch (Exception e) {
          // XXX: the failed page is not recorded in crawl_fetch
          LOG.warn(e.toString());
          continue;
        }
      }
      LOG.info("PageHandler finished");
    }

    // private void output(Text key, CrawlDatum datum, Content content,
    // ProtocolStatus pstatus, int status) {
    // output(key, datum, content, pstatus, status, 0);
    // }
    //
    // private void output(Text key, CrawlDatum datum, Content content,
    // ProtocolStatus pstatus, int status, int outlinkDepth) {
    //
    // datum.setStatus(status);
    // datum.setFetchTime(System.currentTimeMillis());
    // if (pstatus != null)
    // datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
    //
    // ParseResult parseResult = null;
    // if (content != null) {
    // Metadata metadata = content.getMetadata();
    //
    // // store the guessed content type in the crawldatum
    // if (content.getContentType() != null)
    // datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE),
    // new Text(content.getContentType()));
    //
    // // add segment to metadata
    // metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
    // // add score to content metadata so that ParseSegment can pick it up.
    // try {
    // scfilters.passScoreBeforeParsing(key, datum, content);
    // } catch (Exception e) {
    // if (LOG.isWarnEnabled()) {
    // LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
    // }
    // }
    //
    // /*
    // * Store status code in content So we can read this value during parsing
    // * (as a separate job) and decide to parse or not.
    // */
    // content.getMetadata().add(Nutch.FETCH_STATUS_KEY,
    // Integer.toString(status));
    // }
    //
    // try {
    // output.collect(key, new NutchWritable(datum));
    // if (content != null && storingContent)
    // output.collect(key, new NutchWritable(content));
    // } catch (IOException e) {
    // if (LOG.isErrorEnabled()) {
    // LOG.error("fetcher caught:" + e.toString());
    // }
    // }
    // }
  }

  /**
   * Single thread that drives all connections through a java.nio Selector:
   * it opens connections, writes HTTP requests, reads responses and hands
   * completed pages to the PageHandler.
   */
  private class FetcherThread extends Thread {

    private Configuration conf;

    // NIO state
    private Selector selector;
    private ByteBuffer readBuffer = ByteBuffer.allocate(8192);
    private Map<String, ByteBuffer> writeBuffers = new HashMap<String, ByteBuffer>();
    private Map<String, ByteArrayOutputStream> streams = new HashMap<String, ByteArrayOutputStream>();
    private HttpRequestBuilder httpRequestBuilder = new HttpRequestBuilder();

    // select timeout in ms; <= 0 means a non-blocking selectNow()
    private int selectTimeout = 2000;

    // number of requests currently in flight
    // XXX: overlaps with the global 'actives' counter
    private int inProgress = 0;

    // maximum number of concurrent requests
    private int maxInProgress = 200;

    // whether more URLs are expected
    private boolean hasUrls = true;

    // request timeout in ms
    private int timeout;

    // interval between timeout checks, in ms
    private int timeoutCheckInterval = 3000;

    // time of the last timeout check
    private long lastTimeoutCheck = 0;

    // url -> connection state
    private HashMap<String, CrawlState> states = new HashMap<String, CrawlState>();

    // queue of fetched pages handed to the PageHandler
    private BlockingQueue<Page> pagesQueue;

    public FetcherThread(BlockingQueue<Page> pagesQueue, Configuration conf)
        throws IOException {
      this.setDaemon(true); // don't hang JVM on exit
      this.setName("FetcherThread"); // use an informative name
      this.pagesQueue = pagesQueue;
      this.conf = conf;
      selectTimeout = conf.getInt("http.nio.timeout.select", 1000);
      timeout = conf.getInt("http.nio.timeout", 30000);
      maxInProgress = conf.getInt("http.nio.request.max", 200);
      timeoutCheckInterval = conf.getInt("http.nio.timeout.interval", 4000);
      selector = Selector.open();
    }

    public void fetch() {
      while (true) {
        initiateNewConnections();

        int nb;
        try {
          if (selectTimeout <= 0)
            nb = selector.selectNow();
          else
            nb = selector.select(selectTimeout);
        } catch (IOException e) {
          LOG.warn("nio select: " + e.toString());
          continue;
        }

        // if (nb == 0)
        if (System.currentTimeMillis() - lastTimeoutCheck >= timeoutCheckInterval) {
          lastTimeoutCheck = System.currentTimeMillis();
          checkTimeout();
        }

        // nothing in flight and no more URLs expected: we are done
        if (inProgress == 0 && !hasUrls)
          break;

        // process the ready channels
        Iterator<SelectionKey> iter = selector.selectedKeys().iterator();
        while (iter.hasNext()) {
          SelectionKey key = iter.next();
          iter.remove();
          // if (!key.isValid()) {
          // continue;
          // }
          try {
            if (key.isConnectable()) {
              // finish the pending connection
              connect(key);
            } else if (key.isWritable()) {
              // send the HTTP request
              write(key);
            } else if (key.isReadable()) {
              // read response data
              read(key);
            }
          } catch (Exception e) {
            FetchItem att = (FetchItem) key.attachment();
            String url = att.getUrl().toString();
            // LOG.warn(url + ": " + e);
            logError(new Text(url), e.toString());
            finishChannel(key, CrawlDatum.STATUS_FETCH_RETRY, null);
          }
        }
        // checkTimeout();
      }
      LOG.info("FetcherThread finished");
      pagesQueue.add(Page.EndPage);
    }

    /**
     * Check all in-flight connections and abort those that exceeded the
     * configured timeout.
     */
    private void checkTimeout() {
      LOG.info("Checking for timed-out connections ...");
      HashMap<String, CrawlState> copyStates = new HashMap<String, CrawlState>();
      for (Map.Entry<String, CrawlState> e : states.entrySet()) {
        copyStates.put(e.getKey(), e.getValue());
      }
      long current = System.currentTimeMillis();
      for (Map.Entry<String, CrawlState> e : copyStates.entrySet()) {
        CrawlState s = e.getValue();
        if (current - s.lastOpTime > timeout) {
          // LOG.warn(e.getKey() + ": time out");
          timeouts.incrementAndGet();
          finishChannel(s.key, CrawlDatum.STATUS_FETCH_RETRY, null);
        }
        // if (s.State == 0 && current - s.lastOp > connectTimeout)
        // {
        // LOG.warn(e.getKey() + ": connect time out");
        // finishChannel(s.key, CrawlDatum.STATUS_FETCH_RETRY, null);
        // errConnect++;
        // }
        // else if (s.State == 2 && current - s.lastOp > readTimeout)
        // {
        // LOG.warn(e.getKey() + ": read time out");
        // finishChannel(s.key, CrawlDatum.STATUS_FETCH_RETRY, null);
        // errRead++;
        // }
      }
      copyStates.clear();
    }

    /**
     * Open new connections as long as there is capacity and there are URLs
     * left in the fetch queues.
     */
    private void initiateNewConnections() {
      while (inProgress < maxInProgress && hasUrls) {
        FetchItem fit = fetchQueues.getFetchItem();
        if (fit == null) {
          if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) {
            // queues are still being fed: spin-wait
            LOG.debug(getName() + " spin-waiting ...");
            spinWaiting.incrementAndGet();
            // try {
            // Thread.sleep(500);
            // } catch (Exception e) {
            // }
            // spinWaiting.decrementAndGet();
            // break; //return;
          } else {
            // no more URLs will arrive
            hasUrls = false;
            LOG.info("Thread " + getName() + " has no more work available");
            // return;
          }
          return;
        }

        lastRequestStart.set(System.currentTimeMillis());

        // TODO: DNS/IP resolution
        SocketChannel socketChannel = null;
        SelectionKey key = null;
        try {
          int port = fit.u.getPort();
          port = port > 0 ? port : 80;
          String ipStr = IPUtils.getIPString(fit.datum); // XXX: expects the IP stored in the CrawlDatum
          // if (ipStr.length() == 0) {
          // noip++;
          // // LOG.warn(url + ": IP");
          // Page page = new Page(url.toString(), datum);
          // pagesQueue.add(page);
          // continue;
          // }

          // LOG.info("Connecting " + url.toString() + "...");
          // TODO: robots.txt
          InetSocketAddress ia = null;
          InetAddress addr = IPUtils.toIP(ipStr);
          ia = new InetSocketAddress(addr, port);
          // if (ia.isUnresolved())
          // continue;

          // LOG.info("Fetching " + fit.url);
          socketChannel = SocketChannel.open();
          socketChannel.configureBlocking(false);
          socketChannel.connect(ia);
          key = socketChannel.register(selector, SelectionKey.OP_CONNECT);
          // Attachment att = new Attachment();
          // att.url = fit.url.toString();
          // att.datum = fit.datum;
          key.attach(fit);
          streams.put(fit.url.toString(), new ByteArrayOutputStream());

          CrawlState s = new CrawlState();
          s.key = key;
          s.state = CrawlState.CONNECT;
          s.lastOpTime = System.currentTimeMillis();
          states.put(fit.url.toString(), s);

          inProgress++;
          actives.incrementAndGet();
        } catch (IOException e) {
          // TODO: error handling for failed connection attempts
          fetchQueues.finishFetchItem(fit);
          // LOG.warn(fit.u + ": " + e);
          logError(fit.url, e.toString());
          if (key != null)
            key.cancel();
          if (socketChannel != null) {
            try {
              socketChannel.close();
            } catch (IOException e1) {
              LOG.info(e.toString());
            }
          }
        }
      }
    }

    /**
     * Finish a pending connection and switch the channel to write mode.
     *
     * @param key the selection key of the connecting channel
     * @throws IOException
     */
    private void connect(SelectionKey key) throws IOException {
      SocketChannel socketChannel = (SocketChannel) key.channel();
      FetchItem att = (FetchItem) key.attachment();
      String url = att.getUrl().toString();
      socketChannel.finishConnect();
      // LOG.info(url + ": connected");
      key.interestOps(SelectionKey.OP_WRITE);
      // CrawlState stat = states.get(url);
      // stat.lastOpTime = System.currentTimeMillis();
      updateState(url);
    }

    /**
     * Refresh the last-operation timestamp for a URL.
     *
     * @param url the URL whose state to update
     */
    private void updateState(String url) {
      // TODO: guard against a missing entry
      CrawlState stat = states.get(url);
      stat.lastOpTime = System.currentTimeMillis();
    }

    /**
     * Write the HTTP GET request to the channel.
     *
     * @param key the selection key of the writable channel
     * @throws IOException
     */
    private void write(SelectionKey key) throws IOException {
      FetchItem att = (FetchItem) key.attachment();
      String url = att.getUrl().toString();
      SocketChannel socketChannel = (SocketChannel) key.channel();
      ByteBuffer writeBuffer = writeBuffers.get(url);
      if (writeBuffer == null) {
        String getRequest = httpRequestBuilder.buildGet(url);
        writeBuffer = ByteBuffer.wrap(getRequest.getBytes());
        writeBuffers.put(url, writeBuffer);
      }
      socketChannel.write(writeBuffer);
      if (!writeBuffer.hasRemaining()) {
        writeBuffers.remove(url);
        key.interestOps(SelectionKey.OP_READ);
      }
      // CrawlState stat = states.get(url);
      // stat.lastOp = System.currentTimeMillis();
      updateState(url);
      // LOG.info(url + ": requested");
    }

    /**
     * Read response data from the channel; when the stream ends, finish the
     * channel with the accumulated bytes.
     *
     * @param key the selection key of the readable channel
     * @throws IOException
     */
    private void read(SelectionKey key) throws IOException {
      FetchItem att = (FetchItem) key.attachment();
      String url = att.getUrl().toString();
      SocketChannel socketChannel = (SocketChannel) key.channel();
      readBuffer.clear();
      int numRead = 0;
      numRead = socketChannel.read(readBuffer);
      // CrawlState stat = states.get(url);
      // stat.lastOp = System.currentTimeMillis();
      // stat.State = 2;
      updateState(url);
      if (numRead > 0) {
        streams.get(url).write(readBuffer.array(), 0, numRead);
        // bytes += numRead;
        // bytesNow += numRead;
      } else if (numRead == -1) {
        ByteArrayOutputStream stream = streams.remove(url);
        finishChannel(key, CrawlDatum.STATUS_FETCH_SUCCESS, stream.toByteArray());
        // LOG.info(url + ": finished***");
      }
    }

    /**
     * Finish a channel: hand the page over to the PageHandler and release all
     * resources associated with the URL.
     *
     * @param key the selection key of the channel
     * @param status the CrawlDatum fetch status to report
     * @param bytes the fetched content, or null if the fetch failed
     */
    private void finishChannel(SelectionKey key, int status, byte[] bytes) {
      FetchItem att = (FetchItem) key.attachment();
      String url = att.getUrl().toString();
      fetchQueues.finishFetchItem(att);

      Page page = null;
      if (bytes == null)
        page = new Page(url, att.datum);
      else
        page = new Page(url, att.datum, bytes);
      pagesQueue.add(page);

      if (status == CrawlDatum.STATUS_FETCH_SUCCESS)
        updateStatus(bytes.length);

      // LOG.info(url + ": done");
      try {
        key.channel().close();
        // inProgress--;
      } catch (IOException e) {
        LOG.warn(url + ": " + e);
      }
      key.cancel();
      states.remove(url);
      streams.remove(url);
      actives.decrementAndGet();
      inProgress--;
      // sockets--;
      // closed++;
    }

    @SuppressWarnings("fallthrough")
    public void run() {
      // TODO: is the activeThreads counter still needed with a single fetcher thread?
      activeThreads.incrementAndGet(); // count threads
      fetch();
      activeThreads.decrementAndGet(); // count threads
      LOG.info("-finishing fetcher thread, activeRequests=" + actives);
    }

    private void logError(Text url, String message) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch of " + url + " failed with: " + message);
      }
      errors.incrementAndGet();
    }
  }

  public NIOFetcher() {
    super(null);
  }

  public NIOFetcher(Configuration conf) {
    super(conf);
  }

  private void updateStatus(int bytesInPage) {
    pages.incrementAndGet();
    bytes.addAndGet(bytesInPage);
  }

  private void reportStatus(int pagesLastSec, int bytesLastSec) throws IOException {
    StringBuilder status = new StringBuilder();
    Long elapsed = new Long((System.currentTimeMillis() - start) / 1000);
    float avgPagesSec = (float) pages.get() / elapsed.floatValue();
    long avgBytesSec = (bytes.get() / 125l) / elapsed.longValue();
    status.append(actives).append(" requests (").append(spinWaiting.get()).append(" waiting), ");
    status.append(fetchQueues.getQueueCount()).append(" queues, ");
    status.append(fetchQueues.getTotalSize()).append(" URLs queued, ");
    status.append(pages).append(" pages, ").append(errors).append(" errors, ")
        .append(timeouts).append(" timeouts, ");
    status.append(String.format("%.2f", avgPagesSec)).append(" pages/s (");
    status.append(pagesLastSec).append(" last sec), ");
    status.append(avgBytesSec).append(" kbits/s (").append((bytesLastSec / 125)).append(" last sec)");
    LOG.info(status.toString());
    reporter.setStatus(status.toString());
  }

  public void configure(JobConf job) {
    setConf(job);
    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
    this.storingContent = isStoringContent(job);
    this.parsing = isParsing(job);
    // if (job.getBoolean("fetcher.verbose", false)) {
    // LOG.setLevel(Level.FINE);
    // }
  }

  public void close() {
  }

  public static boolean isParsing(Configuration conf) {
    return conf.getBoolean("fetcher.parse", true);
  }

  public static boolean isStoringContent(Configuration conf) {
    return conf.getBoolean("fetcher.store.content", true);
  }

  public void run(RecordReader<Text, CrawlDatum> input,
      OutputCollector<Text, NutchWritable> output, Reporter reporter)
      throws IOException {
    this.output = output;
    this.reporter = reporter;
    this.fetchQueues = new FetchItemQueues(getConf());

    int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
    int timeoutDivisor = getConf().getInt("fetcher.threads.timeout.divisor", 2);
    if (LOG.isInfoEnabled()) {
      LOG.info("NIOFetcher: time-out divisor: " + timeoutDivisor);
    }

    int queueDepthMuliplier = getConf().getInt("fetcher.queue.depth.multiplier", 50);
    feeder = new QueueFeeder(input, fetchQueues, threadCount * queueDepthMuliplier);
    // feeder.setPriority((Thread.MAX_PRIORITY + Thread.NORM_PRIORITY) / 2);

    // the value of the time limit is either -1 or the time where it should finish
    long timelimit
        = getConf().getLong("fetcher.timelimit", -1);
    if (timelimit != -1)
      feeder.setTimeLimit(timelimit);
    feeder.start();

    // queue between the fetcher thread and the page handler
    BlockingQueue<Page> pagesQueue = new LinkedBlockingQueue<Page>();

    // start the NIO fetcher thread
    FetcherThread t = new FetcherThread(pagesQueue, getConf());
    t.start();

    // start the page handler
    PageHandler parser = new PageHandler(pagesQueue, getConf());
    Thread thread = new Thread(parser);
    thread.start();

    // select a timeout that avoids a task timeout
    long timeout = getConf().getInt("mapred.task.timeout", 10 * 60 * 1000) / timeoutDivisor;

    // Used for threshold check, holds pages and bytes processed in the last second
    int pagesLastSec;
    int bytesLastSec;

    // Set to true whenever the threshold has been exceeded for the first time
    boolean throughputThresholdExceeded = false;
    int throughputThresholdNumRetries = 0;

    int throughputThresholdPages = getConf().getInt("fetcher.throughput.threshold.pages", -1);
    if (LOG.isInfoEnabled()) {
      LOG.info("NIOFetcher: throughput threshold: " + throughputThresholdPages);
    }
    int throughputThresholdMaxRetries = getConf().getInt("fetcher.throughput.threshold.retries", 5);
    if (LOG.isInfoEnabled()) {
      LOG.info("NIOFetcher: throughput threshold retries: " + throughputThresholdMaxRetries);
    }
    long throughputThresholdTimeLimit = getConf().getLong("fetcher.throughput.threshold.check.after", -1);

    do { // wait for threads to exit
      pagesLastSec = pages.get();
      bytesLastSec = (int) bytes.get();

      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
      }

      pagesLastSec = pages.get() - pagesLastSec;
      bytesLastSec = (int) bytes.get() - bytesLastSec;

      reporter.incrCounter("FetcherStatus", "bytes_downloaded", bytesLastSec);

      reportStatus(pagesLastSec, bytesLastSec);

      LOG.info("-activeRequests=" + actives + ", spinWaiting=" + spinWaiting.get()
          + ", fetchQueues.totalSize=" + fetchQueues.getTotalSize()
          + ", fetchQueues.getQueueCount=" + fetchQueues.getQueueCount());

      if (!feeder.isAlive() && fetchQueues.getTotalSize() < 5) {
        fetchQueues.dump();
      }

      // if throughput threshold is enabled
      if (throughputThresholdTimeLimit < System.currentTimeMillis()
          && throughputThresholdPages != -1) {
        // Check if we're dropping below the threshold
        if (pagesLastSec < throughputThresholdPages) {
          throughputThresholdNumRetries++;
          LOG.warn(Integer.toString(throughputThresholdNumRetries)
              + ": dropping below configured threshold of "
              + Integer.toString(throughputThresholdPages) + " pages per second");

          // Quit if we dropped below threshold too many times
          if (throughputThresholdNumRetries == throughputThresholdMaxRetries) {
            LOG.warn("Dropped below threshold too many times, killing!");
            // Disable the threshold checker
            throughputThresholdPages = -1;
            // Empty the queues cleanly and get number of items that were dropped
            int hitByThrougputThreshold = fetchQueues.emptyQueues();
            if (hitByThrougputThreshold != 0)
              reporter.incrCounter("FetcherStatus", "hitByThrougputThreshold",
                  hitByThrougputThreshold);
          }
        }
      }
      // check timelimit
      if (!feeder.isAlive()) {
        int hitByTimeLimit = fetchQueues.checkTimelimit();
        if (hitByTimeLimit != 0)
          reporter.incrCounter("FetcherStatus", "hitByTimeLimit", hitByTimeLimit);
      }

      // some requests seem to hang, despite all intentions
      if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
        if (LOG.isWarnEnabled()) {
          // LOG.warn("Aborting with " + activeThreads + " hung threads.");
          LOG.warn("Aborting with " + actives + " hung requests.");
        }
        return;
      }
    } while (activeThreads.get() > 0);

    // LOG.info("-activeThreads=" + activeThreads);
    LOG.info("-activeRequests=" + actives);
  }

  public void fetch(Path segment) throws IOException {
    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("NIOFetcher: starting at " + sdf.format(start));
      LOG.info("NIOFetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
      timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
      LOG.info("Fetcher Timelimit set for : " + timelimit);
      getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
      LOG.info("NIOFetcher: following outlinks up to depth: "
          + Integer.toString(maxOutlinkDepth));

      int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
      int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

      int totalOutlinksToFollow = 0;
      for (int i = 0; i < maxOutlinkDepth; i++) {
        totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1)
            * maxOutlinkDepthNumLinks);
      }

      LOG.info("NIOFetcher: maximum outlinks to follow: "
          + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(FetcherInputFormat.class);
    job.setMapRunnerClass(NIOFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("NIOFetcher: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));
  }

  /**
   * Run the fetcher.
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new NIOFetcher(), args);
    System.exit(res);
  }

  public int run(String[] args) throws Exception {
    String usage = "Usage: NIOFetcher <segment>";

    if (args.length < 1) {
      System.err.println(usage);
      return -1;
    }

    Path segment = new Path(args[0]);

    try {
      fetch(segment);
      return 0;
    } catch (Exception e) {
      LOG.error("Fetcher: " + StringUtils.stringifyException(e));
      return -1;
    }
  }

  private void checkConfiguration() {
    // ensure that a value has been set for the agent name
    String agentName = getConf().get("http.agent.name");
    if (agentName == null || agentName.trim().length() == 0) {
      String message = "Fetcher: No agents listed in 'http.agent.name' property.";
      if (LOG.isErrorEnabled()) {
        LOG.error(message);
      }
      throw new IllegalArgumentException(message);
    }
  }

  @Override
  public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
    Map<String, Object> results = new HashMap<String, Object>();
    String RESULT = "result";
    String segment_dir = crawlId + "/segments";
    File segmentsDir = new File(segment_dir);
    File[] segmentsList = segmentsDir.listFiles();

    // sort segments newest first and fetch the most recent one
    // (the original comparator returned only -1 or 0 and violated the Comparator contract)
    Arrays.sort(segmentsList, new Comparator<File>() {
      @Override
      public int compare(File f1, File f2) {
        return Long.compare(f2.lastModified(), f1.lastModified());
      }
    });

    Path segment = new Path(segmentsList[0].getPath());

    try {
      fetch(segment);
      results.put(RESULT, Integer.toString(0));
      return results;
    } catch (Exception e) {
      LOG.error("NIOFetcher: " + StringUtils.stringifyException(e));
      results.put(RESULT, Integer.toString(-1));
      return results;
    }
  }
}
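Because NIOFetcher is a Hadoop Tool, it can be driven through ToolRunner like any other Nutch job. The sketch below is a minimal, hypothetical driver, not part of the listing above: the segment path and agent name are placeholders, and it assumes a segment has already been produced by the Nutch Generator.

// Hypothetical driver class; segment path and agent name are examples only.
package org.apache.nutch.fetcher;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;

public class NIOFetcherDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    // checkConfiguration() refuses to run without an agent name
    conf.set("http.agent.name", "example-test-crawler");
    // run the fetcher on one previously generated segment (placeholder path)
    int res = ToolRunner.run(conf, new NIOFetcher(),
        new String[] { "crawl/segments/20240101000000" });
    System.exit(res);
  }
}

Equivalently, the same entry point is reachable from the command line by passing the segment path as the single argument expected by run(String[]).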