org.commoncrawl.service.crawler.CrawlList.java Source code

Introduction

Here is the source code for org.commoncrawl.service.crawler.CrawlList.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.commoncrawl.service.crawler;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import java.util.concurrent.Callable;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.atomic.AtomicLong;
import java.util.zip.CRC32;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.record.Buffer;
import org.commoncrawl.util.GZIPUtils;
import org.commoncrawl.async.ConcurrentTask;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.service.crawler.PersistentCrawlTarget;
import org.commoncrawl.service.crawler.RobotRulesParser.RobotRuleSet;
import org.commoncrawl.service.crawler.filters.FilterResults;
import org.commoncrawl.service.statscollector.CrawlerStats;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FileUtils;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.IntrusiveList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFingerprint;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.GZIPUtils.UnzipResult;
import org.commoncrawl.util.IntrusiveList.IntrusiveListElement;
import org.junit.Test;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;

/**
 * CrawlList - a collection of CrawlTargets (disk backed)
 * 
 * @author rana
 *
 */
public final class CrawlList extends IntrusiveList.IntrusiveListElement<CrawlList> {

    /** offline (disk) storage support **/
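    // spill/load hysteresis: pending targets are flushed to disk once the in-memory
    // list reaches DISK_FLUSH_THRESHOLD, and a disk load is queued once it drains to
    // DISK_LOAD_THRESHOLD (see addCrawlTarget and potentiallyQueueDiskLoad below)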
    private static int DISK_FLUSH_THRESHOLD = 50;
    private static int DISK_LOAD_THRESHOLD = 10;
    private static int IDEAL_TARGET_COUNT = 25;
    private static final int MAX_ROBOTS_EXCLUSION_IN_LOOP = 3;
    private static final int MAX_FAILED_TARGETS_IN_LOOP = 50;
    private static final int IOEXCEPTION_TIMEOUT_BOOST = 60000;
    private static final int PAUSE_STATE_RETRY_DELAY = 10 * 60000; // 10 minutes

    static final AtomicLong seq = new AtomicLong();

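    /**
     * DiskQueueEntry - a prioritized disk operation request. Ordering: an entry with a
     * null list sorts first (likely a shutdown sentinel for the disk thread), load
     * requests sort ahead of flush requests, and ties are broken FIFO by sequence number.
     **/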
    public static class DiskQueueEntry implements Comparable<DiskQueueEntry> {

        final long seqNum;
        final CrawlList entry;
        final boolean isLoadRequest;

        public DiskQueueEntry(CrawlList item, boolean isLoadRequest) {
            this.seqNum = seq.getAndIncrement();
            this.entry = item;
            this.isLoadRequest = isLoadRequest;
        }

        public CrawlList getListItem() {
            return entry;
        }

        @Override
        public int compareTo(DiskQueueEntry o) {
            if (entry == null && o.entry != null)
                return -1;
            else if (o.entry == null && entry != null)
                return 1;
            else {
                if (isLoadRequest && !o.isLoadRequest)
                    return -1;
                else if (!isLoadRequest && o.isLoadRequest)
                    return 1;
                else {
                    return ((Long) seqNum).compareTo(o.seqNum);
                }
            }
        }
    }

    private static PriorityBlockingQueue<DiskQueueEntry> _diskOperationQueue = new PriorityBlockingQueue<DiskQueueEntry>();
    private static Thread _diskOperationThread = null;
    private static boolean _diskOpThreadShuttingDown = false;
    private static long _diskHeaderActiveVersionTimestamp = System.currentTimeMillis();

    /** logging **/
    private static final Log LOG = LogFactory.getLog(CrawlList.class);

    /** server responsible for servicing this domain **/
    private CrawlListHost _host;
    /** list name **/
    private String _listName;
    /** base list id **/
    private int _baseListId;
    /** unique list id **/
    private long _uniqueListId;
    /** next crawl interface used to service this list **/
    private int _nextCrawlInterface = 0;
    /** cumulative list of crawl targets associated with this queue ...*/
    private IntrusiveList<CrawlTarget> _pending = new IntrusiveList<CrawlTarget>();
    /** list of crawl targets directly scheduled for disk queue */
    private IntrusiveList<CrawlTarget> _queued = new IntrusiveList<CrawlTarget>();

    /** offline item count  - the set of crawl targets that are stored offline on disk **/
    private int _offlineTargetCount = 0;
    /** disk request pending **/
    private boolean _diskRequestPending = false;

    /** currently scheduled item **/
    private CrawlTarget _scheduled = null;
    /** fetch start time **/
    private long _fetchStartTime = -1;
    /** fetch end time **/
    private long _fetchEndTime = -1;
    /** last successful fetch time (total time in milliseconds) **/
    private int _lastRequestDownloadTime = -1;
    /** last successful request redirect count **/
    private int _lastRequestRedirectCount = 0;

    /** active connection **/
    private NIOHttpConnection _activeConnection;

    /** SubDomain Stats and Robots State Information Structure **/
    private static class DomainInfo extends IntrusiveListElement<DomainInfo> {
        public String _domainName;
        public boolean _domainFailed = false;
        public boolean _domainBlackListed = false;
        public long _robotsCRC = -1;
        public long _lastTouched;
        /** host retry counter **/
        public byte _domainRetryCounter = 0;
        /** total 400 errors **/
        public int _HTTP400Count = 0;
        /** total 500 errors **/
        public int _HTTP500Count = 0;
        /** total 200 status code count**/
        public int _HTTP200Count = 0;
        /** sequential failure count **/
        public short _SequentialHTTPFailuresCount = 0;

        public boolean _robotsReturned400;

        public boolean _robotsReturned403;
    }

    /** domain info map **/
    private IntrusiveList<DomainInfo> _domainInfo = new IntrusiveList<DomainInfo>();
    /** active domain info **/
    private DomainInfo _activeDomainInfo;

    /** the active robots rule set to apply robots policy for this host **/
    private RobotRuleSet _ruleSet = null;
    /** robots returned 400 **/
    private boolean _robotsReturned400;
    private boolean _robotsReturned403;

    /** the crc value for the active rule set - computed at pre-parse time**/
    private long _robotsCRC = 0;
    /** robots file retrieved */
    private boolean _robotsRetrieved;
    /** robots host name **/
    private String _robotsHostName;
    /** last fetched robots host Name**/
    private String _lastFetchedRobotsHostName;
    /** last fetched robots data **/
    private String _lastFetchedRobotsData;
    /** crc calculator **/
    private static CRC32 _crc32 = new CRC32();
    /** last request was io exception **/
    private boolean _lastRequestWasIOException = false;

    private static final int MAX_ITEM_RETRY = 2;

    private static final int MAX_HOST_RETRY = 7;

    private static final int DEFAULT_ITEM_RETRY_WAIT_TIME = 20000;

    private static final int DEFAULT_HOST_RETRY_WAIT_TIME = 20000;

    private static final int MIN_CRAWL_DELAY = 1;
    private static final int MAX_CRAWL_DELAY = 3500;

    private static int STATS_CHECK_CODE_SAMPLE_THRESHOLD = 50; // don't do anything until we have retrieved at least 50 urls

    private static float BAD_URL_TO_TOTAL_URL_FAILURE_THRESHOLD = .80f; // if 80% of urls are bad, fail the domain

    // private static int   SEQUENTIAL_FAILURES_ON_403_ROBOTS_TRIGGER = 500;
    private static int SEQUENTIAL_FAILURES_NO_200_TRIGGER = 10; // if we get 10 sequential failures we bail
    // private static int   SEQUENTIAL_FAILURES_SOME_200_TRIGGER = 1000;

    private static final int MAX_DNS_CACHE_ITEMS = 100;
    private static int MAX_DOMAIN_CACHE_ENTIRES = 1000;

    /** the reference to the singleton server object **/
    private static CrawlerServer _server = null;

    public enum Disposition {
        ItemAvailable, WaitingOnCompletion, WaitingOnTime, QueueEmpty
    }
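
    // disposition transitions in this class: QueueEmpty -> ItemAvailable when a target
    // is added; ItemAvailable -> WaitingOnCompletion once a target is scheduled;
    // -> WaitingOnTime when throttled or after per-loop failure caps are hit; and back
    // to QueueEmpty once both the in-memory and offline queues drain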

    /** domain's disposition (state) **/
    private Disposition _disposition;

    private static byte WWWRULE_Remove = 1 << 0;
    private static byte WWWRULE_Add = 1 << 1;

    /** www rewrite rule patterns **/
    static class WWWReWriteItem extends IntrusiveList.IntrusiveListElement<WWWReWriteItem> {

        public WWWReWriteItem(String domainName, byte ruleType) {
            _wwwRuleDomain = domainName;
            _wwwRuleType = ruleType;
            _lastUpdateTime = System.currentTimeMillis();
        }

        public String _wwwRuleDomain = null;
        public long _lastUpdateTime = -1;
        public byte _wwwRuleType = 0;
    };

    private static final int MAX_REWRITE_ITEMS = 5;

    IntrusiveList<WWWReWriteItem> _rewriteItemList = null;

    static class DNSCacheItem extends IntrusiveList.IntrusiveListElement<DNSCacheItem> {

        public DNSCacheItem(String hostName, int ipAddress, long ttl) {
            _hostName = hostName;
            _ipAddress = ipAddress;
            _ttl = ttl;
            _lastAccessTime = System.currentTimeMillis();
        }

        public String _hostName;
        public int _ipAddress;
        public long _ttl;
        public long _lastAccessTime = -1;
    }

    IntrusiveList<DNSCacheItem> _dnsCacheItem = new IntrusiveList<DNSCacheItem>();

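    /** cache (or refresh) a dns entry for the given host, keeping the cache in MRU
     *  order; when the cache is full the least recently accessed entry is evicted **/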
    public void cacheDNSEntry(String hostName, int ipAddress, long ttl) {
        DNSCacheItem oldestItem = null;
        DNSCacheItem found = null;
        for (DNSCacheItem item : _dnsCacheItem) {
            if (item._hostName.equals(hostName)) {
                item._ipAddress = ipAddress;
                item._ttl = ttl;
                item._lastAccessTime = System.currentTimeMillis();
                found = item;
            }
            oldestItem = (oldestItem == null) ? item
                    : (oldestItem._lastAccessTime > item._lastAccessTime) ? item : oldestItem;
        }

        if (found == null) {
            if (_dnsCacheItem.size() == MAX_DNS_CACHE_ITEMS) {
                //LOG.info("###DNS Cache Full for Host:" + getListName() + " Flushing Host:" + oldestItem._hostName);
                _dnsCacheItem.removeElement(oldestItem);
            }
            _dnsCacheItem.addHead(new DNSCacheItem(hostName, ipAddress, ttl));
        } else {
            _dnsCacheItem.removeElement(found);
            _dnsCacheItem.addHead(found);
        }
    }

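    /** record a www add/remove rewrite rule for a domain, using the same bounded MRU
     *  eviction scheme as the dns cache (capped at MAX_REWRITE_ITEMS) **/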
    private void addWWWReWriteItem(String originalItem, byte itemType) {
        WWWReWriteItem oldestItem = null;
        WWWReWriteItem found = null;
        if (_rewriteItemList == null) {
            _rewriteItemList = new IntrusiveList<WWWReWriteItem>();
        }

        for (WWWReWriteItem item : _rewriteItemList) {
            if (item._wwwRuleDomain.equals(originalItem)) {
                item._lastUpdateTime = System.currentTimeMillis();
                found = item;
            }
            oldestItem = (oldestItem == null) ? item
                    : (oldestItem._lastUpdateTime > item._lastUpdateTime) ? item : oldestItem;
        }
        if (found == null) {
            if (_rewriteItemList.size() == MAX_REWRITE_ITEMS) {
                _rewriteItemList.removeElement(oldestItem);
            }
            _rewriteItemList.addHead(new WWWReWriteItem(originalItem, itemType));
        }
        if (found != null && found != _rewriteItemList.getHead()) {
            _rewriteItemList.removeElement(found);
            _rewriteItemList.addHead(found);
        }
    }

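    /** construct a list for the given host - the unique list id packs the host's 32-bit
     *  ip address into the high word and the base list id into the low word, e.g. ip
     *  0x0A000001 with baseListId 3 yields unique id 0x0A00000100000003 **/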
    public CrawlList(CrawlListHost crawlHost, int baseListId) {
        _host = crawlHost;
        _baseListId = baseListId;
        _uniqueListId = (((long) crawlHost.getIPAddress()) << 32) | _baseListId;
        _listName = "List:" + _baseListId + " For:"
                + IPAddressUtils.IntegerToIPAddressString(crawlHost.getIPAddress());
        _robotsRetrieved = false;
        _disposition = Disposition.QueueEmpty;
    }

    /** host access **/
    public CrawlListHost getHost() {
        return _host;
    }

    /** server access **/
    static CrawlerServer getServerSingleton() {
        return _server;
    }

    static void setServerSingleton(CrawlerServer server) {
        _server = server;
    }

    public int getListId() {
        return _baseListId;
    }

    public long getUniqueListId() {
        return _uniqueListId;
    }

    /** host name **/
    public String getListName() {
        return _listName;
    }

    /** disposition **/
    Disposition getDisposition() {
        return _disposition;
    }

    void updateLastModifiedTime(long time) {
        _host.updateLastModifiedTime(time);
    }

    /** get next crawl interface used to service this list 
     * 
     */
    public int getNextCrawlInterface() {
        return _nextCrawlInterface;
    }

    /** set next crawl interface to use for this list 
     * 
     */
    public void setNextCrawlInterface(int crawlInterface) {
        _nextCrawlInterface = crawlInterface;
    }

    /** get the pending urls count **/
    synchronized int getPendingURLCount() {
        return _pending.size();
    }

    /** get the offline url count 
     */
    synchronized int getOfflineURLCount() {
        return _offlineTargetCount;
    }

    boolean isScheduled() {
        return _scheduled != null;
    }

    int getActiveURLCount() {
        return (_scheduled != null) ? 1 : 0;
    }

    void updateLastFetchStartTime(long newTime) {
        _fetchStartTime = newTime;
        getHost().updateLastFetchStartTime(newTime);
    }

    long getLastFetchStartTime() {
        return _fetchStartTime;
    }

    /** get fetch time in milliseconds for last request **/
    int getLastRequestFetchTime() {
        if (_fetchStartTime != -1 && _fetchEndTime != -1) {
            return (int) Math.max(0, _fetchEndTime - _fetchStartTime);
        }
        return 0;
    }

    int getLastSuccessfulDownloadTime() {
        return _lastRequestDownloadTime;
    }

    public synchronized void stopCrawl() {

        // add anything scheduled to pending ... 
        if (_scheduled != null) {
            _pending.addTail(_scheduled);
            _scheduled = null;
        }
        if (_pending.size() != 0 || !_robotsRetrieved) {
            _disposition = Disposition.ItemAvailable;
        } else {
            _disposition = Disposition.QueueEmpty;
        }
    }

    /** add a new crawl target to the host queue **/
    public synchronized void addCrawlTarget(CrawlTarget target, boolean toFrontOfQueue) {

        // LOG.info("DOMAIN:" + this.getHostName() + " ADDING TGT:" + target.getOriginalURL());

        Disposition oldDisposition = _disposition;

        if (toFrontOfQueue) {
            target.setFlags(target.getFlags() | CrawlURL.Flags.IsHighPriorityURL);
            _pending.addHead(target);
        } else {
            // if offline != 0 add to pending 
            if (_offlineTargetCount == 0) {
                _pending.addTail(target);
            }
            // otherwise add to queued 
            else {
                //LOG.info("### QUEUED Adding to Queued List for CrawlList:" + getListName());
                _queued.addTail(target);
            }
        }

        if (_disposition == Disposition.QueueEmpty) {
            _disposition = Disposition.ItemAvailable;
        }

        if (oldDisposition != _disposition && getHost().getActiveList() == this) {
            getHost().listDispositionChanged(this, oldDisposition, _disposition);
        }

        if (_pending.size() >= DISK_FLUSH_THRESHOLD || _queued.size() != 0) {
            if (!_diskRequestPending) {
                _diskRequestPending = true;
                _diskOperationQueue.add(new DiskQueueEntry(this, false));
            }
        }
    }

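    /** returns true if a robots.txt fetch is required for the given domain. The check is
     *  tiered: (1) the in-memory rule set already matches the domain, (2) the domain's
     *  cached crc matches the in-memory crc, or is zero (the empty rule set), (3) the
     *  host-level rule-set cache has an entry for the cached crc. Only if all tiers
     *  miss does the caller need to schedule a robots fetch. **/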
    private boolean activeDomainRequiresRobotsFetch(String activeDomainName) {

        if (_robotsRetrieved && _robotsHostName != null && _robotsHostName.equalsIgnoreCase(activeDomainName)) {
            // no, the active robots file matches the active domain name. no need to fetch anything ... 
            return false;
        } else {

            DomainInfo domainInfo = getDomainInfoFromDomain(activeDomainName);

            // get the cached crc for the active domain if it exists ... 
            long cachedRobotsCRC = (domainInfo == null) ? -1 : domainInfo._robotsCRC;

            // if cached crc found ...   
            if (cachedRobotsCRC != -1) {
                // if cached robots file matches the active robots file's crc ... 
                if (_robotsRetrieved && cachedRobotsCRC == _robotsCRC) {
                    //LOG.info("### Skipping Robots Fetch. Cached CRC == robotsCRC");
                    // no need to refetch 
                    return false;
                }
                // otherwise, check the host's cache ... 
                else {
                    // special case for the empty rule set 
                    if (cachedRobotsCRC == 0) {

                        _robotsCRC = cachedRobotsCRC;
                        _robotsHostName = activeDomainName;
                        _robotsReturned400 = domainInfo._robotsReturned400;
                        _robotsReturned403 = domainInfo._robotsReturned403;
                        _ruleSet = RobotRulesParser.getEmptyRules();
                        _robotsRetrieved = true;
                        if (Environment.detailLogEnabled())
                            LOG.info("### Skipping Robots Fetch. Cached CRC is Zero, indicating empty rule set.");

                        return false;
                    } else {

                        // check the rule set cache in the host (by crc)
                        RobotRuleSet ruleSet = _host.getCachedRobotsEntry(cachedRobotsCRC);
                        // if cached object found .... 
                        if (ruleSet != null) {
                            _robotsCRC = cachedRobotsCRC;
                            _robotsHostName = activeDomainName;
                            _robotsReturned400 = domainInfo._robotsReturned400;
                            _robotsReturned403 = domainInfo._robotsReturned403;
                            _ruleSet = ruleSet;
                            _robotsRetrieved = true;

                            if (Environment.detailLogEnabled())
                                LOG.info(
                                        "### Skipping Robots Fetch. Cached CRC is Non-Zero and cached rule-set found via host.");

                            return false;
                        }
                    }
                }
            }
        }
        return true;
    }

    public void setActiveDomainName(String hostName) {
        _activeDomainInfo = getDomainInfoFromDomain(hostName);
    }

    public DomainInfo getActiveDomain() {
        return _activeDomainInfo;
    }

    private DomainInfo getDomainInfoFromDomain(String domainName) {
        DomainInfo oldestItem = null;
        DomainInfo found = null;
        for (DomainInfo item : _domainInfo) {
            if (item._domainName.equals(domainName)) {
                if (getServerSingleton() != null) {
                    if (item._lastTouched < getServerSingleton().getFilterUpdateTime()) {
                        if (CrawlerServer.getEngine() != null) {
                            item._domainBlackListed = CrawlerServer.getEngine().isBlackListedHost(domainName);
                        } else {
                            item._domainBlackListed = false;
                        }
                    }
                }
                item._lastTouched = System.currentTimeMillis();
                found = item;
            }
            oldestItem = (oldestItem == null) ? item
                    : (oldestItem._lastTouched > item._lastTouched) ? item : oldestItem;
        }

        if (found == null) {
            if (_domainInfo.size() == MAX_DOMAIN_CACHE_ENTIRES) {
                _domainInfo.removeElement(oldestItem);
            }
            found = new DomainInfo();

            found._domainName = domainName;
            found._lastTouched = System.currentTimeMillis();
            if (getServerSingleton() != null) {
                if (CrawlerServer.getEngine() != null) {
                    found._domainBlackListed = CrawlerServer.getEngine().isBlackListedHost(domainName);
                } else {
                    found._domainBlackListed = false;
                }
            }
            _domainInfo.addHead(found);
        }
        return found;
    }

    private long checkDomainCacheForRobotsCRC(String hostName) {
        for (DomainInfo aliasInfo : _domainInfo) {
            if (aliasInfo._domainName.equalsIgnoreCase(hostName)) {
                aliasInfo._lastTouched = System.currentTimeMillis();
                _domainInfo.removeElement(aliasInfo);
                _domainInfo.addHead(aliasInfo);
                if (aliasInfo._robotsCRC != -1) {
                    if (Environment.detailLogEnabled())
                        LOG.info("### Found Robots Match in Cache for host:" + hostName);
                }
                return aliasInfo._robotsCRC;
            }
        }
        return -1;
    }

    private void updateRobotsCRCForDomain(long crc, String domainName, boolean robotsReturned400,
            boolean robotsReturned403) {
        DomainInfo domainInfo = getDomainInfoFromDomain(domainName);
        domainInfo._robotsCRC = crc;
        domainInfo._robotsReturned400 = robotsReturned400;
        domainInfo._robotsReturned403 = robotsReturned403;
    }

    private void resetRobotsState() {
        // flip robots status ... 
        _robotsRetrieved = false;
        _robotsReturned400 = false;
        _robotsReturned403 = false;
        _robotsHostName = null;
        _robotsCRC = 0;
        _ruleSet = RobotRulesParser.getEmptyRules();
    }

    private CrawlTarget buildRobotsRequest(String hostName) {

        CrawlTarget targetOut = null;

        // reset the robots state ... 
        resetRobotsState();

        // log the situation 
        // LOG.info("####Robots-fetching robots for host:" + hostName);

        // and set up some initial robots state 
        _robotsHostName = hostName;

        //build a robots.txt url
        URL robotsURL = null;
        try {
            robotsURL = new URL(getHost().getScheme(), _robotsHostName, "/robots.txt");
        } catch (MalformedURLException e) {
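            // fall through with robotsURL null - the failure case is handled below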

        }

        if (robotsURL == null) {
            if (Environment.detailLogEnabled())
                LOG.error("####Robots Unable to fetch Robots for host:" + _robotsHostName);
            // cheat 
            _robotsRetrieved = true;
            // and update the robot info in the alias map 
            updateRobotsCRCForDomain(_robotsCRC, _robotsHostName, _robotsReturned400, _robotsReturned403);
        } else {
            // ok , the robots url is good 
            targetOut = new CrawlTarget(0, this);
            // set the url ... 
            targetOut.setOriginalURL(robotsURL.toString());
            // and mark the target as a robots get 
            targetOut.setFlags(CrawlURL.Flags.IsRobotsURL);

            CrawlerStats crawlerStats = CrawlerServer.getEngine().getCrawlerStats();

            synchronized (crawlerStats) {
                crawlerStats.setActvieRobotsRequests(crawlerStats.getActvieRobotsRequests() + 1);
            }

        }
        return targetOut;
    }

    public boolean populateIPAddressForTarget(String hostName, CrawlTarget target) {
        for (DNSCacheItem item : _dnsCacheItem) {
            if (item._hostName.equalsIgnoreCase(hostName)) {
                if (item._ttl >= System.currentTimeMillis()) {
                    //LOG.info("###Using Cached IP Address for target:" + target.getActiveURL() + " Cached IP:" + item._ipAddress + " TTL:" + item._ttl);
                    target.setServerIP(item._ipAddress);
                    target.setServerIPTTL(item._ttl);
                    return true;
                }
                return false;
            }
        }
        return false;
    }

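    /** apply any cached www rewrite rule to the target url: WWWRULE_Add prefixes "www."
     *  to the host name, otherwise the leading "www." is stripped (the substring(4) below) **/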
    private void applyRewriteRulesToTarget(String hostName, CrawlTarget target) {

        if (_rewriteItemList != null) {
            for (WWWReWriteItem item : _rewriteItemList) {
                if (item._wwwRuleDomain.equalsIgnoreCase(hostName)) {
                    if ((item._wwwRuleType & WWWRULE_Add) != 0) {
                        target.setOriginalURL(target.getOriginalURL().replaceFirst(hostName, "www." + hostName));
                    } else {
                        target.setOriginalURL(
                                target.getOriginalURL().replaceFirst(hostName, hostName.substring(4)));
                    }
                    break;
                }
            }
        }
    }

    static CrawlURLMetadata rewriteTestMetadata = new CrawlURLMetadata();
    static FilterResults rewriteFilterResults = new FilterResults();

    /** get next crawl candidate */
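    // loop overview: pop pending targets one at a time; schedule a robots fetch when the
    // active domain's rules aren't cached; fail malformed, robots-excluded, and
    // blacklisted urls inline; and bail to WaitingOnTime once the per-loop
    // robots-exclusion or failed-target caps are hit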
    public synchronized CrawlTarget getNextTarget() {

        if (_scheduled != null) {
            throw new RuntimeException("Scheduled Not Null and getNextTarget called!");
        }

        int maxRobotsExclusionInLoop = (CrawlerServer.getServer() != null)
                ? CrawlerServer.getServer().getMaxRobotsExlusionsInLoopOverride()
                : -1;
        if (maxRobotsExclusionInLoop == -1) {
            maxRobotsExclusionInLoop = MAX_ROBOTS_EXCLUSION_IN_LOOP;
        }

        // target out is currently null
        CrawlTarget targetOut = null;

        Disposition oldDisposition = _disposition;

        int robotsExcludedCount = 0;
        int failedTargetsCount = 0;

        String domainName = "";

        while (targetOut == null && getNextPending(false) != null
                && getDisposition() == CrawlList.Disposition.ItemAvailable) {

            // pop the next target off of the queue ... 
            CrawlTarget potentialTarget = getNextPending(true);

            // mark request start time 
            potentialTarget.setRequestStartTime(System.currentTimeMillis());

            // get the host name (fast)
            domainName = URLUtils.fastGetHostFromURL(potentialTarget.getActiveURL());

            // if not valid ... fail explicitly 
            if (domainName == null || domainName.length() == 0) {
                // explicitly fail this url ... 
                CrawlTarget.failURL(
                        potentialTarget.createFailureCrawlURLObject(CrawlURL.FailureReason.MalformedURL, null),
                        potentialTarget, CrawlURL.FailureReason.MalformedURL, null);
            } else {

                /*
                // potentially rewrite domain name 
                if (getServer().getDNSRewriteFilter() != null) { 
                  synchronized (rewriteTestMetadata) { 
                    if (getServer().getDNSRewriteFilter().filterItem(domainName, "", rewriteTestMetadata, rewriteFilterResults) == FilterResult.Filter_Modified) {
                      LOG.info("### FILTER Rewrote DomainName:" + domainName + " To:" + rewriteFilterResults.getRewrittenDomainName());
                      domainName = rewriteFilterResults.getRewrittenDomainName();
                    }
                  }
                }
                */

                // set the active host name 
                setActiveDomainName(domainName);

                // check to see if the domain has been marked as failed or the host has been marked as failed ...
                if (!getActiveDomain()._domainFailed && !getActiveDomain()._domainBlackListed
                        && !_host.isFailedServer()) {

                    // if the active target does not match the current robots file ...  
                    if (activeDomainRequiresRobotsFetch(domainName)) {

                        // add the target back to the head of the queue... 
                        _pending.addHead(potentialTarget);

                        // and build a robots request ... 
                        targetOut = buildRobotsRequest(domainName);

                    }
                    // otherwise ... go ahead try to fetch the next url in the queue 
                    else {

                        // now if disposition is still item available ... 
                        if (potentialTarget != null && _disposition == Disposition.ItemAvailable) {

                            targetOut = potentialTarget;

                            URL theTargetURL = null;

                            try {
                                theTargetURL = new URL(targetOut.getOriginalURL());
                            } catch (MalformedURLException e) {
                                theTargetURL = null;
                                LOG.error("Error parsing URL:" + targetOut.getOriginalURL() + " for Host:"
                                        + domainName);
                            }

                            if (theTargetURL == null) {
                                // explicitly fail this url ... 
                                CrawlTarget
                                        .failURL(
                                                targetOut.createFailureCrawlURLObject(
                                                        CrawlURL.FailureReason.MalformedURL, null),
                                                targetOut, CrawlURL.FailureReason.MalformedURL, null);

                                // and set target out to null!!
                                targetOut = null;
                            } else {

                                boolean robotsExcluded = !_ruleSet.isAllowed(theTargetURL);
                                boolean serverExcluded = false;
                                if (!robotsExcluded) {
                                    serverExcluded = CrawlerServer.getServer().isURLInBlockList(theTargetURL);
                                }

                                // validate against the robots file ...
                                if (robotsExcluded || serverExcluded) {
                                    //track number of robots exclusion in this loop 
                                    ++robotsExcludedCount;
                                    // inform host 
                                    _host.incrementCounter(CrawlListHost.CounterId.RobotsExcludedCount, 1);
                                    // explicitly fail this url ...
                                    if (robotsExcluded) {
                                        if (Environment.detailLogEnabled())
                                            LOG.info(
                                                    "### ROBOTS Excluded URL:" + theTargetURL + " via Robots File");
                                        CrawlTarget.failURL(
                                                targetOut.createFailureCrawlURLObject(
                                                        CrawlURL.FailureReason.RobotsExcluded, null),
                                                targetOut, CrawlURL.FailureReason.RobotsExcluded, null);
                                    } else {
                                        if (Environment.detailLogEnabled())
                                            LOG.info("### ROBOTS Excluded URL:" + theTargetURL + " via Blacklist");
                                        CrawlTarget.failURL(
                                                targetOut.createFailureCrawlURLObject(
                                                        CrawlURL.FailureReason.BlackListedURL, null),
                                                targetOut, CrawlURL.FailureReason.BlackListedURL, null);
                                    }

                                    // and set target out to null
                                    targetOut = null;

                                    // if robots processed in loop exceeds maximum 
                                    if (robotsExcludedCount >= maxRobotsExclusionInLoop) {

                                        if (_pending.size() != 0 || _offlineTargetCount != 0) {
                                            // wait on time ...
                                            _disposition = Disposition.WaitingOnTime;
                                        } else {
                                            _disposition = Disposition.QueueEmpty;
                                        }
                                        // and break out
                                        break;
                                    }
                                }
                                //
                            }
                        }
                    }
                }
                // otherwise ... if the domain has failed ... 
                else {
                    if (potentialTarget != null) {
                        int failureReason = CrawlURL.FailureReason.TooManyErrors;
                        String failureDesc = "Host Failed due to too many errors";
                        if (getActiveDomain()._domainBlackListed) {
                            failureReason = CrawlURL.FailureReason.BlackListedURL;
                            failureDesc = "Host Black Listed";
                        } else if (getHost().isBlackListedHost()) {
                            failureReason = CrawlURL.FailureReason.BlackListedHost;
                            failureDesc = "Black Listed Host";
                        }

                        // fail the url and move on ...
                        //TODO: DISABLING THIS BECAUSE FAILING FOR ABOVE REASONS IS NOT REALLY A PERSISTENT FAILURE ATTRIBUTABLE TO THE URL 
                        // CrawlTarget.failURL(potentialTarget.createFailureCrawlURLObject(failureReason, failureDesc),potentialTarget, failureReason,null);
                    }
                    // set 
                    targetOut = null;
                    // increment failed item count 
                    failedTargetsCount++;
                    // now, if failed count exceeds max failures in loop 
                    if (failedTargetsCount >= MAX_FAILED_TARGETS_IN_LOOP) {

                        if (_pending.size() != 0 || _offlineTargetCount != 0) {
                            // wait on time ...
                            _disposition = Disposition.WaitingOnTime;
                        } else {
                            _disposition = Disposition.QueueEmpty;
                        }
                        // and break out
                        break;
                    }
                }
            }
        }

        // ok, if we have a target ... fetch it ... 
        if (targetOut != null) {

            // ok before we can fetch this guy, we need to check to see if the associated host is in a paused state ... 
            if (_host.isPaused()) {
                LOG.info("***getNextItem for Host:" + domainName + " is Paused!!");
                // null target out, which will set us in a waiting on time state again 
                targetOut = null;
            }

            // now again, if target out is not null 
            if (targetOut != null) {

                if (Environment.detailLogEnabled())
                    LOG.info("getNextItem for Host:" + domainName + " Returned URL:" + targetOut.getOriginalURL()
                            + " object:" + targetOut.toString());
                // set scheduled item pointer ... 
                _scheduled = targetOut;

                // set the active host name 
                setActiveDomainName(domainName);
                // get ip address info (if available)
                populateIPAddressForTarget(domainName, _scheduled);

                // change disposition ... 
                _disposition = Disposition.WaitingOnCompletion;

                //set initial fetch start time ... 
                updateLastFetchStartTime(System.currentTimeMillis());
            }
        }

        // if target out is null, and pending size is zero but there are offline targets ... 
        if (targetOut == null) {

            if (_pending.size() != 0 || _offlineTargetCount != 0) {
                // then set disposition to waiting on time ... 
                _disposition = CrawlList.Disposition.WaitingOnTime;
            } else {
                _disposition = CrawlList.Disposition.QueueEmpty;
            }
        }

        // check to see if we need to load more items from disk 
        potentiallyQueueDiskLoad();

        if (targetOut != null) {
            // finally rewrite target url if necessary 
            applyRewriteRulesToTarget(domainName, targetOut);
        }

        if (targetOut != null) {
            // LOG.info("### getNextIem for Host:" + domainName + " Returned URL:" + targetOut.getActiveURL());
        }
        return targetOut;
    }

    void fetchStarting(CrawlTarget target, NIOHttpConnection connection) {
        _activeConnection = connection;
    }

    /** fetch started callback - called from CrawlTarget **/
    void fetchStarted(CrawlTarget target) {

        // record fetch start time ...  
        updateLastFetchStartTime(System.currentTimeMillis());

        // and notify host as well 
        _host.updateLastFetchStartTime(getLastFetchStartTime());

        if (_scheduled == target) {

            if (Environment.detailLogEnabled())
                LOG.info("Fetch Started URL:" + target.getOriginalURL());
        } else {
            if (_scheduled == null) {
                LOG.error("fetchStarted - scheduled target is null and fetch started target is:"
                        + target.getOriginalURL().toString() + " list:" + target.getSourceList().getListName());
            } else {
                LOG.error("fetchStarted - scheduled target is: " + _scheduled.getOriginalURL().toString() + " list:"
                        + _scheduled.getSourceList().getListName() + " and fetch started target is:"
                        + target.getOriginalURL().toString() + " list:" + target.getSourceList().getListName());
            }
        }

    }

    /** if the in-memory queue is exhausted or below threshold and there are offline targets, queue up a load from disk for this domain **/
    private void potentiallyQueueDiskLoad() {
        if (_pending.size() <= DISK_LOAD_THRESHOLD && (!_diskRequestPending || _pending.size() == 0)
                && _offlineTargetCount != 0) {
            _diskRequestPending = true;
            _diskOperationQueue.add(new DiskQueueEntry(this, true));
        }
    }

    static class RobotRuleResult {
        public RobotRuleSet ruleSet;
        public long crcValue;
    };

    /** fetch succeeded **/
    void fetchSucceeded(final CrawlTarget target, int downloadTime, final NIOHttpHeaders httpHeaders,
            final Buffer contentBuffer) {

        _lastRequestWasIOException = false;
        _lastRequestDownloadTime = downloadTime;
        _lastRequestRedirectCount = target.getRedirectCount();
        _fetchEndTime = System.currentTimeMillis();

        _activeConnection = null;

        getHost().incrementCounter(CrawlListHost.CounterId.SuccessfullGetCount, 1);

        // reset host's io error count
        _host.resetCounter(CrawlListHost.CounterId.ConsecutiveIOErrorCount);

        if (getActiveDomain() != null)
            getActiveDomain()._domainRetryCounter = 0;

        Disposition oldDisposition = _disposition;

        final String originalHost = URLUtils.fastGetHostFromURL(target.getOriginalURL());
        final String activeHost = URLUtils.fastGetHostFromURL(target.getActiveURL());

        if (originalHost != null && activeHost != null) {
            // update our server ip information from information contained within crawl target ...
            cacheDNSEntry(activeHost, target.getServerIP(), target.getServerIPTTL());
            // if the target was redirected ... cache the original ip address and ttl as well ... 
            if (target.isRedirected()) {
                if (target.getOriginalRequestData() != null) {
                    cacheDNSEntry(originalHost, target.getOriginalRequestData()._serverIP,
                            target.getOriginalRequestData()._serverIPTTL);
                }
            }
        }

        final int resultCode = NIOHttpConnection.getHttpResponseCode(httpHeaders);

        if (resultCode == 200) {

            getHost().incrementCounter(CrawlListHost.CounterId.Http200Count, 1);

            if (getActiveDomain() != null) {
                getActiveDomain()._HTTP200Count++;
                getActiveDomain()._SequentialHTTPFailuresCount = 0;
            }

            // validate www rewrite rule if not set and target was redirected ... 
            if (target.isRedirected()) {
                /* this is broken for the new list design
                if (!originalHost.equalsIgnoreCase(activeHost)) { 
                  // if redirect strips the www then ... 
                  if ((originalHost.startsWith("www.") || originalHost.startsWith("WWW.")) && activeHost.equalsIgnoreCase(originalHost.substring(4))) { 
                    addWWWReWriteItem(originalHost,WWWRULE_Remove);
                  }
                  // else if redirect adds the www then ...
                  else if ((activeHost.startsWith("www.") || activeHost.startsWith("WWW.")) && originalHost.equalsIgnoreCase(activeHost.substring(4))) {
                    addWWWReWriteItem(originalHost,WWWRULE_Add);
                  }
                }
                */
            }
        } else if (resultCode >= 400 && resultCode < 500) {
            if (resultCode == 403) {
                // inform host for stats tracking purposes 
                _host.incrementCounter(CrawlListHost.CounterId.Http403Count, 1);
            }
            if (getActiveDomain() != null)
                getActiveDomain()._SequentialHTTPFailuresCount++;
        } else if (resultCode >= 500 && resultCode < 600) {
            if (getActiveDomain() != null) {
                getActiveDomain()._SequentialHTTPFailuresCount++;
            }
        }

        if (_scheduled != target) {
            if (_scheduled == null)
                LOG.error("Host:" + getHost().getIPAddressAsString() + " List:" + getListName()
                        + " fetchSucceeded Target is:" + target.getOriginalURL() + " ActiveTarget is NULL!");
            else
                LOG.error("Host:" + getHost().getIPAddressAsString() + " List:" + getListName()
                        + " fetchSucceeded Target is:" + target.getOriginalURL() + " " + target.toString()
                        + " ActiveTarget is:" + _scheduled.getOriginalURL() + " " + _scheduled.toString());
        } else {

            // clear active ... 
            _scheduled = null;

            // if this is the robots target ... (non-zero mask test, since the flag may not be bit zero)
            if ((target.getFlags() & CrawlURL.Flags.IsRobotsURL) != 0) {

                final CrawlerStats crawlerStats = CrawlerServer.getEngine().getCrawlerStats();

                // process the robots data if any ... 
                // check for null queue (in case of unit test);
                if (resultCode == 200) {

                    _robotsRetrieved = true;

                    synchronized (crawlerStats) {
                        crawlerStats.setRobotsRequestsSucceeded(crawlerStats.getRobotsRequestsSucceeded() + 1);
                        crawlerStats.setRobotsRequestsQueuedForParse(
                                crawlerStats.getRobotsRequestsQueuedForParse() + 1);
                    }

                    LOG.info("### Scheduling Robots Parse for:" + target.getActiveURL());

                    // transition to a waiting on completion disposition ... 
                    _disposition = Disposition.WaitingOnCompletion;

                    if (getServerSingleton() != null) {
                        // schedule a robots parser parse attempt ... 
                        getServerSingleton().registerThreadPool("robots", 5)
                                .execute(new ConcurrentTask<RobotRuleResult>(getServerSingleton().getEventLoop(),

                                        new Callable<RobotRuleResult>() {

                                            public RobotRuleResult call() throws Exception {

                                                try {

                                                    TextBytes contentData = new TextBytes(contentBuffer.get());

                                                    String contentEncoding = httpHeaders
                                                            .findValue("Content-Encoding");

                                                    if (contentEncoding != null
                                                            && contentEncoding.equalsIgnoreCase("gzip")) {

                                                        if (Environment.detailLogEnabled())
                                                            LOG.info("GZIP Encoding Detected for Robots File For:"
                                                                    + activeHost);

                                                        UnzipResult result = GZIPUtils.unzipBestEffort(
                                                                contentData.getBytes(),
                                                                CrawlEnvironment.CONTENT_SIZE_LIMIT);

                                                        if (result == null) {
                                                            contentData = null;
                                                            if (Environment.detailLogEnabled())
                                                                LOG.info(
                                                                        "GZIP Decoder returned NULL for Robots File For:"
                                                                                + activeHost);
                                                        } else {
                                                            contentData.set(result.data.get(),
                                                                    result.data.getOffset(),
                                                                    result.data.getCount());
                                                        }
                                                    }

                                                    try {
                                                        if (contentData != null) {
                                                            String robotsTxt = contentData.toString().trim()
                                                                    .toLowerCase();
                                                            if (robotsTxt.startsWith("<html")
                                                                    || robotsTxt.startsWith("<!doctype html")) {
                                                                contentData = null;

                                                                CrawlerServer.getEngine().logRobots(
                                                                        System.currentTimeMillis(), _robotsHostName,
                                                                        resultCode, null,
                                                                        CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,
                                                                        CrawlerEngine.RobotsParseFlag_ContentWasHTML);
                                                            } else {
                                                                CrawlerServer.getEngine().logRobots(
                                                                        System.currentTimeMillis(), _robotsHostName,
                                                                        resultCode, robotsTxt,
                                                                        CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,
                                                                        0);

                                                                synchronized (this) {
                                                                    _lastFetchedRobotsData = robotsTxt;
                                                                    _lastFetchedRobotsHostName = _robotsHostName;
                                                                }
                                                            }
                                                        } else {
                                                            CrawlerServer.getEngine().logRobots(
                                                                    System.currentTimeMillis(), _robotsHostName,
                                                                    resultCode, null,
                                                                    CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,
                                                                    CrawlerEngine.RobotsParseFlag_ContentDecodeFailed);
                                                        }
                                                    } catch (Exception e) {
                                                        LOG.error(CCStringUtils.stringifyException(e));
                                                        CrawlerServer.getEngine().logRobots(
                                                                System.currentTimeMillis(), _robotsHostName,
                                                                resultCode, null,
                                                                CrawlerEngine.RobotsLogEventType.HTTP_GET_Complete,
                                                                CrawlerEngine.RobotsParseFlag_ContentDecodeFailed);
                                                    }

                                                    if (Environment.detailLogEnabled())
                                                        LOG.info("Parsing Robots File for Host:" + activeHost);
                                                    RobotRuleResult result = new RobotRuleResult();

                                                    if (contentData != null) {
                                                        synchronized (_crc32) {
                                                            _crc32.reset();
                                                            _crc32.update(contentData.getBytes(),
                                                                    contentData.getOffset(),
                                                                    contentData.getLength());
                                                            result.crcValue = _crc32.getValue();
                                                        }
                                                        RobotRulesParser parser = new RobotRulesParser(
                                                                getServerSingleton().getConfig());
                                                        result.ruleSet = parser.parseRules(contentData.getBytes(),
                                                                contentData.getOffset(), contentData.getLength());
                                                    } else {
                                                        result.ruleSet = RobotRulesParser.getEmptyRules();
                                                        result.crcValue = 0;
                                                    }
                                                    return result;
                                                } catch (Exception e) {
                                                    LOG.error(CCStringUtils.stringifyException(e));
                                                    throw e;
                                                }
                                            }
                                        },

                                        new ConcurrentTask.CompletionCallback<RobotRuleResult>() {

                                            public void taskComplete(RobotRuleResult loadResult) {

                                                synchronized (crawlerStats) {
                                                    crawlerStats.setRobotsRequestsQueuedForParse(
                                                            crawlerStats.getRobotsRequestsQueuedForParse() - 1);
                                                }

                                                if (loadResult != null) {

                                                    // compute stats from the freshly parsed rules; the _ruleSet
                                                    // member still holds the placeholder empty rule set here
                                                    boolean disallowsAll = !loadResult.ruleSet.isAllowed("/");
                                                    boolean robotsHadCrawlDelay = loadResult.ruleSet.getCrawlDelay() != -1;
                                                    boolean explicitMention = loadResult.ruleSet.explicitMention;

                                                    int logFlags = 0;
                                                    if (disallowsAll)
                                                        logFlags |= CrawlerEngine.RobotsParseFlag_ExcludesAll;
                                                    if (explicitMention)
                                                        logFlags |= CrawlerEngine.RobotsParseFlag_ExplicitMention;
                                                    if (robotsHadCrawlDelay)
                                                        logFlags |= CrawlerEngine.RobotsParseFlag_HasCrawlDelay;

                                                    synchronized (crawlerStats) {
                                                        crawlerStats.setRobotsRequestsSuccessfullParse(
                                                                crawlerStats.getRobotsRequestsSuccessfullParse()
                                                                        + 1);
                                                        if (disallowsAll) {
                                                            crawlerStats.setRobotsFileExcludesAllContent(
                                                                    crawlerStats.getRobotsFileExcludesAllContent()
                                                                            + 1);
                                                            if (explicitMention)
                                                                crawlerStats.setRobotsFileExplicitlyExcludesAll(
                                                                        crawlerStats
                                                                                .getRobotsFileExplicitlyExcludesAll()
                                                                                + 1);
                                                        }
                                                        if (explicitMention) {
                                                            crawlerStats.setRobotsFileHasExplicitMention(
                                                                    crawlerStats.getRobotsFileHasExplicitMention()
                                                                            + 1);
                                                        }
                                                        if (robotsHadCrawlDelay) {
                                                            crawlerStats.setRobotsFileHadCrawlDelay(
                                                                    crawlerStats.getRobotsFileHadCrawlDelay() + 1);
                                                        }
                                                    }

                                                    CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),
                                                            _robotsHostName, 0, null,
                                                            CrawlerEngine.RobotsLogEventType.Parse_Succeeded,
                                                            logFlags);

                                                    _ruleSet = loadResult.ruleSet;
                                                    _robotsCRC = loadResult.crcValue;

                                                    _host.cacheRobotsFile(_ruleSet, _robotsCRC);
                                                } else {

                                                    CrawlerServer.getEngine().logRobots(System.currentTimeMillis(),
                                                            _robotsHostName, 0, null,
                                                            CrawlerEngine.RobotsLogEventType.Parse_Failed, 0);

                                                    synchronized (crawlerStats) {
                                                        crawlerStats.setRobotsRequestsFailedParse(
                                                                crawlerStats.getRobotsRequestsFailedParse() + 1);
                                                    }

                                                    // LOG.error("####Robots parsing for host:" + activeHost + " failed.");
                                                    _ruleSet = RobotRulesParser.getEmptyRules();
                                                    _robotsCRC = 0;
                                                }

                                                //if (Environment.detailLogEnabled())
                                                LOG.info("####Robots RETRIEVED for Host:" + activeHost
                                                        + " CrawlDelay IS:" + getCrawlDelay(false));

                                                if (originalHost != null && activeHost != null) {
                                                    updateRobotsCRCForDomain(_robotsCRC, originalHost,
                                                            _robotsReturned400, _robotsReturned403);
                                                    if (activeHost.compareToIgnoreCase(originalHost) != 0) {
                                                        updateRobotsCRCForDomain(_robotsCRC, activeHost,
                                                                _robotsReturned400, _robotsReturned403);
                                                    }
                                                }

                                                Disposition oldDisposition = _disposition;

                                                if (getNextPending(false) != null) {
                                                    _disposition = Disposition.ItemAvailable;
                                                } else {
                                                    _disposition = Disposition.WaitingOnTime;
                                                }

                                                if (oldDisposition != _disposition) {
                                                    // notify queue 
                                                    getHost().listDispositionChanged(CrawlList.this, oldDisposition,
                                                            _disposition);
                                                }
                                            }

                                            public void taskFailed(Exception e) {
                                                if (Environment.detailLogEnabled())
                                                    LOG.error("####Robots parsing for host:" + _robotsHostName
                                                            + " failed with exception" + e);
                                                _ruleSet = RobotRulesParser.getEmptyRules();

                                                Disposition oldDisposition = _disposition;

                                                if (getNextPending(false) != null) {
                                                    _disposition = Disposition.ItemAvailable;
                                                } else {
                                                    _disposition = Disposition.WaitingOnTime;
                                                }

                                                if (oldDisposition != _disposition) {
                                                    // notify queue 
                                                    getHost().listDispositionChanged(CrawlList.this, oldDisposition,
                                                            _disposition);
                                                }
                                            }
                                        }));
                    }
                    // explicitly return here (in order to wait for the async completion event)
                    return;
                }
                //otherwise ... 
                else {

                    synchronized (crawlerStats) {
                        crawlerStats.setRobotsRequestsFailed(crawlerStats.getRobotsRequestsFailed() + 1);
                    }

                    CrawlerServer.getEngine().logRobots(System.currentTimeMillis(), _robotsHostName, resultCode,
                            null, CrawlerEngine.RobotsLogEventType.HTTP_GET_Failed, 0);

                    _robotsCRC = 0;
                    if (Environment.detailLogEnabled())
                        LOG.info("####Robots GET for Host:" + activeHost + " FAILED With Result Code:" + resultCode);
                    //TODO: MAKE THIS MORE ROBUST ... 
                    // clear robots flag ... 
                    _robotsRetrieved = true;
                    // see if the result code was in the 4xx range (flagging 403 specifically)
                    if (resultCode >= 400 && resultCode <= 499) {
                        _robotsReturned400 = true;
                        if (resultCode == 403)
                            _robotsReturned403 = true;

                    }
                    // for now, assume no robots rules for any error conditions ... 
                    _ruleSet = RobotRulesParser.getEmptyRules();

                    if (originalHost != null && activeHost != null) {
                        updateRobotsCRCForDomain(_robotsCRC, originalHost, _robotsReturned400, _robotsReturned403);
                        if (activeHost.compareToIgnoreCase(originalHost) != 0) {
                            updateRobotsCRCForDomain(_robotsCRC, activeHost, _robotsReturned400,
                                    _robotsReturned403);
                        }
                    }

                }
            }

            if (getServerSingleton() != null && getServerSingleton().failHostsOnStats()) {
                // update active host stats and check for failure ... 
                checkActiveHostStatsForFailure();
            }

            // if there are no more items in the queue 
            if (getNextPending(false) == null) {
                // if offline count is zero then mark this domain's queue as empty
                if (_offlineTargetCount == 0) {
                    _disposition = Disposition.QueueEmpty;
                }
                // otherwise put us in a wait state and potentially queue up a disk load 
                else {
                    _disposition = Disposition.WaitingOnTime;
                    // potentially queue up a disk load 
                    potentiallyQueueDiskLoad();
                }
            } else {
                // if we are ready to fetch the next item ... 
                if (calculateNextWaitTime() < System.currentTimeMillis()) {
                    _disposition = Disposition.ItemAvailable;
                } else {
                    // transition to a new wait state ... 
                    _disposition = Disposition.WaitingOnTime;
                }
            }

            if (oldDisposition != _disposition) {
                // either way ... notify queue 
                getHost().listDispositionChanged(this, oldDisposition, _disposition);
            }
        }
    }

    /** get total failure count **/
    private final int getTotalFailureCount() {
        if (getActiveDomain() != null)
            return getActiveDomain()._HTTP400Count + getActiveDomain()._HTTP500Count;
        return 0;
    }

    /** check to see if we should fail the host based on collected stats **/
    private boolean checkActiveHostStatsForFailure() {

        boolean failHost = false;

        if (getActiveDomain() != null) {
            String errorReason = null;

            /*
            if (getActiveDomain()._SequentialHTTPFailuresCount >= SEQUENTIAL_FAILURES_ON_403_ROBOTS_TRIGGER && _robotsReturned403) { 
              errorReason  ="Too Many Sequential Errors AFTER Robots Returned 403. HTTP200 Count:" + getActiveDomain()._HTTP200Count;
              failHost =true;
            }
            */
            if ((getActiveDomain()._HTTP200Count == 0
                    && getActiveDomain()._SequentialHTTPFailuresCount >= SEQUENTIAL_FAILURES_NO_200_TRIGGER)) {
                errorReason = "Too Many Sequential Errors. RobotsReturned400:" + _robotsReturned400 + " 400 Count:"
                        + getActiveDomain()._HTTP400Count + " 500 Count:" + getActiveDomain()._HTTP500Count
                        + " 200 Count:" + getActiveDomain()._HTTP200Count;
                failHost = true;
            } else {

                int totalFailureCount = getTotalFailureCount();
                int totalRequestCount = totalFailureCount + getActiveDomain()._HTTP200Count;

                if (totalRequestCount != 0 && totalRequestCount >= STATS_CHECK_CODE_SAMPLE_THRESHOLD) {

                    float badToGoodPercent = (float) totalFailureCount / (float) totalRequestCount;
                    if (badToGoodPercent >= BAD_URL_TO_TOTAL_URL_FAILURE_THRESHOLD) {
                        failHost = true;
                        errorReason = "Bad To Good URL Pct:" + badToGoodPercent + " exceeded Threshold:"
                                + BAD_URL_TO_TOTAL_URL_FAILURE_THRESHOLD + " RobotsReturned400:"
                                + _robotsReturned400 + " 400 Count:" + getActiveDomain()._HTTP400Count
                                + " 500 Count:" + getActiveDomain()._HTTP500Count + " 200 Count:"
                                + getActiveDomain()._HTTP200Count;
                    }
                }
            }

            if (failHost) {
                failActiveDomain(CrawlURL.FailureReason.TooManyErrors, errorReason);
                LOG.error("#### HOST FAILURE - List:" + getListName() + " Host:" + getActiveDomain()._domainName
                        + " Reason:" + errorReason);
            }
        }
        return failHost;
    }
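
    /*
     * Illustrative sketch (a restatement for clarity, not part of the crawler's control flow):
     * the failure check above reduces to two rules; every parameter here stands in for one of
     * this class's tuning constants or per-domain counters.
     */
    private static boolean wouldFailHostSketch(int http200Count, int sequentialFailures, int http400Count,
            int http500Count, int sampleThreshold, float failureRatioThreshold, int sequentialFailureTrigger) {
        // rule 1: no successful fetches at all, and too many back-to-back failures
        if (http200Count == 0 && sequentialFailures >= sequentialFailureTrigger) {
            return true;
        }
        // rule 2: once enough samples exist, fail when failures / total meets the threshold
        // e.g. 40 failures out of 50 requests gives 0.8, which trips a 0.5 threshold
        int totalFailures = http400Count + http500Count;
        int totalRequests = totalFailures + http200Count;
        return totalRequests >= sampleThreshold
                && ((float) totalFailures / (float) totalRequests) >= failureRatioThreshold;
    }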

    private static final int FAIL_STRATEGY_RETRY_ITEM = 0; // increment the failure count on the item and retry 
    private static final int FAIL_STRATEGY_RETRY_HOST = 1; //  increment the failure count on the host and retry ... 
    private static final int FAIL_STRATEGY_FAIL_ITEM = 2; //  immediately fail the item ... 
    //  private static final int FAIL_STRATEGY_FAIL_HOST  = 3;    // immediately fail the host ... 

    private static final int failureCodeStrategyTable[] = {

            FAIL_STRATEGY_RETRY_ITEM, // UNKNOWN          - result: Inc Fail Count on Item, potentially reschedule
            FAIL_STRATEGY_FAIL_ITEM, // UnknownProtocol   - result: Immediately Fail Item
            FAIL_STRATEGY_FAIL_ITEM, // MalformedURL      - result: Immediately Fail Item
            FAIL_STRATEGY_RETRY_ITEM, // Timeout          - result: Inc Fail Count on Item, potentially reschedule
            FAIL_STRATEGY_FAIL_ITEM, // DNSFailure        - result: Immediately Fail Item
            FAIL_STRATEGY_RETRY_HOST, // ResolverFailure  - result: Inc Fail Count on Host, potentially reschedule
            FAIL_STRATEGY_RETRY_ITEM, // IOException      - result: Inc Fail Count on Item, potentially reschedule
            FAIL_STRATEGY_FAIL_ITEM, // RobotsExcluded    - result: Immediately Fail Item
            FAIL_STRATEGY_FAIL_ITEM, // NoData             = 9;
            FAIL_STRATEGY_RETRY_ITEM, // RobotsParseError  = 10;
            FAIL_STRATEGY_FAIL_ITEM, // RedirectFailed     = 11;
            FAIL_STRATEGY_RETRY_ITEM, // RuntimeError      = 12;
            FAIL_STRATEGY_RETRY_HOST, // ConnectTimeout    = 13;
            FAIL_STRATEGY_FAIL_ITEM, // BlackListedHost    = 14;
            FAIL_STRATEGY_FAIL_ITEM, // BlackListedURL     = 15;
            FAIL_STRATEGY_FAIL_ITEM, // TooManyErrors      = 16;
            FAIL_STRATEGY_FAIL_ITEM, // InCache            = 17;
            FAIL_STRATEGY_FAIL_ITEM // InvalidResponseCode = 18;

    };
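
    /*
     * Illustrative sketch (assumes, as the table comments above suggest, that failure codes are
     * contiguous integers starting at CrawlURL.FailureReason.UNKNOWN == 1): this mirrors the
     * table lookup performed in fetchFailed below.
     */
    private static int strategyForFailureSketch(int failureReason) {
        // out-of-range codes keep the default retry-item strategy
        if (failureReason < 1 || failureReason > failureCodeStrategyTable.length) {
            return FAIL_STRATEGY_RETRY_ITEM;
        }
        return failureCodeStrategyTable[failureReason - 1];
    }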

    private void failURL(CrawlTarget target, int failureReason, String errorDescription) {

        // explicitly fail the item ... 
        CrawlTarget.failURL(target.createFailureCrawlURLObject(failureReason, errorDescription), target,
                failureReason, errorDescription);
    }

    private synchronized void failActiveDomain(int failureReason, String errorDescription) {

        if (getActiveDomain() != null) {

            LOG.error("### Failing Active Domain:" + getActiveDomain()._domainName + " in List:" + getListName()
                    + " ReasonCode:" + failureReason + " Description:" + errorDescription);

            // _disposition = Disposition.QueueEmpty;
            getActiveDomain()._domainFailed = true;

            /*
            // fail scheduled url  ... 
            if (_scheduled != null) { 
              _scheduled = null;
            }
            // just remove all pending urls from list for now ...
            _pending.removeAll();
            // reset offline count... 
            _offlineTargetCount = 0;
            // reset disk operation pending indiciator ... 
            _diskRequestPending = false;
            */
            getHost().incrementCounter(CrawlListHost.CounterId.FailedDomainCount, 1);

            CrawlerServer.getEngine().failDomain(getActiveDomain()._domainName);
        }
    }

    synchronized void fetchFailed(CrawlTarget target, int failureReason, String description) {

        _activeConnection = null;
        _lastRequestRedirectCount = target.getRedirectCount();
        _fetchEndTime = System.currentTimeMillis();

        getHost().incrementCounter(CrawlListHost.CounterId.FailedGetCount, 1);

        if (getActiveDomain() != null) {
            getActiveDomain()._SequentialHTTPFailuresCount++;
        }

        _lastRequestWasIOException = false;

        //check to see if the error is an io exception or a timeout 
        if (failureReason == CrawlURL.FailureReason.IOException
                || failureReason == CrawlURL.FailureReason.Timeout) {
            // increment host failure counter ... 
            _host.incrementCounter(CrawlListHost.CounterId.ConsecutiveIOErrorCount, 1);
            _lastRequestWasIOException = true;
        }

        // the rest is similar to a host retry strategy ... 
        Disposition oldDisposition = _disposition;

        if (_scheduled != target) {
            if (_scheduled == null)
                LOG.error("Host:" + getHost().getIPAddressAsString() + " List:" + getListName()
                        + " fetchFailed Target is:" + target.getOriginalURL() + " ActiveTarget is NULL!");
            else
                LOG.error("Host:" + getHost().getIPAddressAsString() + " List:" + getListName()
                        + " fetchFailed Target is:" + target.getOriginalURL() + " ActiveTarget is:"
                        + _scheduled.getOriginalURL());
        } else {

            // reset active and scheduled ... 
            _scheduled = null;

            // if we failed on the robots get ... 
            if ((target.getFlags() & CrawlURL.Flags.IsRobotsURL) != 0) {

                CrawlerStats crawlerStats = CrawlerServer.getEngine().getCrawlerStats();

                synchronized (crawlerStats) {
                    crawlerStats.setRobotsRequestsFailed(crawlerStats.getRobotsRequestsFailed() + 1);
                }

                //TODO: FIGURE THIS OUT LATER ... FOR NOW .. ON A FAILURE OF ROBOTS.TXT GET, WE ASSUME NO ROBOTS.TXT FILE ... 

                //LOG.warn("Robots Fetch for host:"+getHostName() + " Failed with Reason:" + failureReason +" Desc:" + description);
                //LOG.warn("Assuming NO-ROBOTS FILE");

                CrawlerServer.getEngine().logRobots(System.currentTimeMillis(), _robotsHostName, 0, null,
                        CrawlerEngine.RobotsLogEventType.HTTP_GET_Failed, 0);

                target.logFailure(CrawlerServer.getEngine(), failureReason, description);

                _robotsRetrieved = true;
                _ruleSet = RobotRulesParser.getEmptyRules();

                // and clear scheduled ... 
                _scheduled = null;

                updateLastFetchStartTime(-1);

                // and transition to wait state .... 
                _disposition = Disposition.WaitingOnTime;
            }
            // otherwise pass on to underlying crawl target handler ... 
            else {

                // default failure strategy ... 
                int failureStrategy = FAIL_STRATEGY_RETRY_ITEM;

                // if failure code is within known failure codes ... 
                if (failureReason >= CrawlURL.FailureReason.UNKNOWN
                        && failureReason <= CrawlURL.FailureReason.InvalidResponseCode) {
                    // use table to map strategy ... 
                    failureStrategy = failureCodeStrategyTable[failureReason - 1];
                }

                switch (failureStrategy) {

                case FAIL_STRATEGY_RETRY_HOST:
                case FAIL_STRATEGY_RETRY_ITEM: {

                    // increment retry counter ... 
                    getActiveDomain()._domainRetryCounter++;
                    // increment retry counter on target ... 
                    target.incrementRetryCounter();
                    // IFF server failed ... 
                    // OR retry count on item exceeded ... 
                    // OR this item is a high priority dispatch item ...
                    // THEN immediately fail this item 
                    // ELSE queue up this item for subsequent retry
                    if (_host.isFailedServer() || target.getRetryCount() >= MAX_ITEM_RETRY
                            || ((target.getFlags() & CrawlURL.Flags.IsHighPriorityURL) != 0)) {
                        failURL(target, failureReason, description);
                    } else {
                        // and add it back to the pending list ... 
                        _pending.addTail(target);
                    }
                }
                    break;

                case FAIL_STRATEGY_FAIL_ITEM: {
                    failURL(target, failureReason, description);
                }
                    break;

                /*        
                        case FAIL_STRATEGY_FAIL_HOST: { 
                          // just put the entire host in a fail state ...
                          failDomain(failureReason,description);
                        }break;
                */
                }

                switch (failureStrategy) {

                case FAIL_STRATEGY_RETRY_ITEM:
                case FAIL_STRATEGY_FAIL_ITEM:
                case FAIL_STRATEGY_RETRY_HOST: {

                    // check to see if there are items in the pending queue .... 
                    if (_pending.size() == 0 && _offlineTargetCount == 0) {
                        // if not... transition to Queue Empty 
                        _disposition = Disposition.QueueEmpty;
                    } else {
                        long waitTime = calculateNextWaitTime();
                        // if we can fetch the next item ... 
                        if (waitTime <= System.currentTimeMillis()) {
                            if (_pending.size() != 0)
                                // shift to an available disposition ... 
                                _disposition = Disposition.ItemAvailable;
                            else
                                // shift to waiting on time disposition (to wait for disk queue load).
                                _disposition = Disposition.WaitingOnTime;
                        } else {
                            // wait on time ...
                            _disposition = Disposition.WaitingOnTime;
                        }
                    }
                }
                    break;
                }

                if (description == null)
                    description = "";
                if (Environment.detailLogEnabled())
                    LOG.error("Fetch Failed for URL:" + target.getOriginalURL() + " Reason:" + failureReason
                            + " Description:" + description + " Strategy:" + failureStrategy + " OldDisp:"
                            + oldDisposition + " NewDisp:" + _disposition);
            }

            if (_disposition == Disposition.WaitingOnCompletion) {
                LOG.error("### BUG Fetch Failed for URL:" + target.getOriginalURL()
                        + " failed to transition List to proper disposition!");
            }

            if (getServerSingleton() != null && getServerSingleton().failHostsOnStats()) {
                // update active host stats and check for failure ... 
                checkActiveHostStatsForFailure();
            }

            // notify queue if disposition changed ... 
            if (_disposition != oldDisposition) {
                getHost().listDispositionChanged(this, oldDisposition, _disposition);
            }
        }
    }

    /** clear a pre-existing wait state **/
    synchronized void clearWaitState() {

        Disposition oldDisposition = _disposition;

        // if robots retrieval is pending ... 
        if (_robotsRetrieved == false) {

            // LOG.debug("clearWaitState called on Host:"+getHostName()+ " after initial robots fetch");

            // explicitly transition to available (to retry robots fetch ... )
            _disposition = Disposition.ItemAvailable;
        }
        // otherwise if the pending, offline, and queued counts are all zero ...
        else if ((_pending.size() == 0 && _offlineTargetCount == 0 && _queued.size() == 0)) {
            // LOG.debug("clearWaitState called on Host:"+getHostName()+ " and queue is empty. transitioning to QueueEmpty");
            // transition to queue empty disposition
            _disposition = Disposition.QueueEmpty;
        } else {

            // if no request is currently scheduled ...
            if (_scheduled == null) {
                // if there are items to be read from the in memory list ... 
                if (_pending.size() != 0) {
                    // LOG.error("clearWaitState called Host:"+getHostName()+ " and getNextPendingReturned object. transitioning to ItemAvailable");
                    // immediately transition to an available state ... 
                    _disposition = Disposition.ItemAvailable;
                } else {
                    if (!_diskRequestPending) {
                        _diskRequestPending = true;
                        _diskOperationQueue.add(new DiskQueueEntry(this, true));
                    }
                    _disposition = Disposition.WaitingOnTime;
                }
            }
            // otherwise... we are waiting on completion now ... 
            else {
                LOG.warn("clearWaitState called on already scheduled list:" + getListName());
                _disposition = Disposition.WaitingOnCompletion;
            }
        }

        getHost().listDispositionChanged(this, oldDisposition, _disposition);
    }

    /** calculate the next allowable fetch time **/
    public long calculateNextWaitTime() {

        // ok check to see if the related host is paused ...
        if (_host.isPaused()) {
            LOG.info("*** host is paused. pausing crawl for: " + PAUSE_STATE_RETRY_DELAY + " milliseconds");
            // ok suspend for pause delay 
            return System.currentTimeMillis() + PAUSE_STATE_RETRY_DELAY;
        }

        if (_fetchStartTime == -1) {
            return System.currentTimeMillis();
        } else {
            // first calculate crawl delay based on robots delay value * number of hops to service last request 
            //int crawlDelay = (getCrawlDelay(true) * (_lastRequestRedirectCount+1));
            int crawlDelay = getCrawlDelay(true);

            // if the crawl delay is the default host crawl delay 
            if (crawlDelay == _host.getCrawlDelay()) {
                // see if fetch time is available 
                int lastDocFetchTime = getLastRequestFetchTime();

                if (lastDocFetchTime != 0) {
                    // calculate alternate crawl delay based on fetch time ... 
                    int alternateCrawlDelay = lastDocFetchTime * 4;
                    if (alternateCrawlDelay > crawlDelay) {
                        crawlDelay = alternateCrawlDelay;
                        if (Environment.detailLogEnabled())
                            LOG.info("### CRAWLDELAY Using Alternate Crawl Delay of:" + alternateCrawlDelay
                                    + " for URL:" + getNextPending(false));
                    }
                }
            }

            /*
            if (_lastRequestDownloadTime != -1) { 
              // next see if host took more than crawl delay millseconds to respond
              if (_lastRequestDownloadTime >= getCrawlDelay()) { 
                // add request time to crawl delay 
                crawlDelay += _lastRequestDownloadTime;
              }
              // add one second for every 2 seconds of request time 
              else { 
                crawlDelay += 1000 * (_lastRequestDownloadTime / 2000);
              }
            }
            */
            // base the next fetch time on the last fetch start time plus the computed crawl delay
            if (_fetchStartTime != -1) {
                return _fetchStartTime + crawlDelay;
            }
            return System.currentTimeMillis() + crawlDelay;
        }
    }
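
    /*
     * Illustrative sketch (not part of the original source): a condensed restatement of the delay
     * arithmetic above, ignoring the paused-host and override paths. With a 1000ms host delay and
     * a 400ms last document fetch, the alternate delay (4 x 400 = 1600ms) wins.
     */
    private static long nextWaitTimeSketch(long fetchStartTime, int hostCrawlDelayMs, int lastDocFetchTimeMs) {
        int crawlDelay = hostCrawlDelayMs;
        // the "alternate crawl delay" rule: four times the last fetch time, if that is larger
        int alternateDelay = lastDocFetchTimeMs * 4;
        if (lastDocFetchTimeMs != 0 && alternateDelay > crawlDelay) {
            crawlDelay = alternateDelay;
        }
        return fetchStartTime + crawlDelay;
    }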

    /** getNextPending */
    private synchronized CrawlTarget getNextPending(boolean removeItem) {

        CrawlTarget targetOut = null;

        if (_pending.size() != 0) {
            targetOut = _pending.getHead();
            if (removeItem && targetOut != null) {
                _pending.removeElement(targetOut);
            }
        }
        return targetOut;
    }

    /** indicates whether the robots file has been retrieved for this host */
    public boolean robotsRetrieved() {
        return _robotsRetrieved;
    }

    /** compute the effective crawl delay, optionally checking for a per-URL crawl-rate override **/
    private final int getCrawlDelay(boolean checkForOverride) {

        if (checkForOverride) {
            CrawlTarget potentialTarget = getNextPending(false);

            if (potentialTarget != null) {

                try {
                    URL targetURL = new URL(potentialTarget.getActiveURL());
                    // validate against the server for crawl delay
                    //LOG.info("Checking Crawl Delay for url:" + targetURL.toString());
                    int overridenCrawlDelay = CrawlerServer.getServer().checkForCrawlRateOverride(targetURL);
                    if (overridenCrawlDelay != -1) {
                        if (Environment.detailLogEnabled())
                            LOG.info("### CRAWLDELAY - Overriding Crawl Delay for URL:" + targetURL + " Delay is:"
                                    + overridenCrawlDelay);
                        return overridenCrawlDelay;
                    }
                } catch (MalformedURLException e) {
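                    // ignore the malformed URL and fall through to the default delay calculation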

                }
            }
        }

        int crawlDelayOut = 0;

        if (_ruleSet == null || _ruleSet.getCrawlDelay() == -1) {
            crawlDelayOut += getHost().getCrawlDelay();
        } else {
            crawlDelayOut += (int) Math.min(_ruleSet.getCrawlDelay(), MAX_CRAWL_DELAY);
            crawlDelayOut = Math.max(MIN_CRAWL_DELAY, crawlDelayOut);
        }

        if (_lastRequestWasIOException) {
            crawlDelayOut += IOEXCEPTION_TIMEOUT_BOOST;
        }

        return crawlDelayOut;
    }
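
    /*
     * Illustrative sketch (not part of the original source): the robots-supplied Crawl-delay is
     * clamped into [MIN_CRAWL_DELAY, MAX_CRAWL_DELAY] by the method above, so an abusive
     * ten-minute delay is capped rather than honored verbatim.
     */
    private static int clampRobotsDelaySketch(long robotsDelayMs, int minDelayMs, int maxDelayMs) {
        return Math.max(minDelayMs, (int) Math.min(robotsDelayMs, (long) maxDelayMs));
    }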

    /** clear this list's state **/
    public synchronized void clear() {
        _pending.removeAll();
        _scheduled = null;
        _diskRequestPending = false;
        _offlineTargetCount = 0;
    }

    public synchronized void dumpDetailsToHTML(StringBuffer sb) {
        //    synchronized (_pending) { 
        sb.append("ListName:" + getListName() + "\n");
        sb.append("RobotsRetrieved:" + _robotsRetrieved + "\n");
        sb.append("Disposition:" + _disposition + "\n");
        sb.append("Scheduled:" + ((_scheduled != null) ? _scheduled.getOriginalURL() : "null") + "\n");
        sb.append("PendingCount:" + _pending.size() + "\n");
        sb.append("QueuedCount:" + _queued.size() + "\n");
        sb.append("OfflineCount:" + _offlineTargetCount + "\n");
        sb.append("ActiveConnection:" + _activeConnection + "\n");
        sb.append("LastFetchedRobotsHost:" + _lastFetchedRobotsHostName + "\n");

        if (_pending.size() != 0) {

            sb.append("next 100 scheduled urls:\n");

            int itemCount = 0;
            CrawlTarget target = _pending.getHead();
            while (target != null && itemCount < 100) {

                sb.append("[" + (itemCount++) + "]:<a href='" + target.getOriginalURL() + "'>"
                        + target.getOriginalURL() + "</a>\n");

                target = target.getNext();
            }
        }

        sb.append("\n\nLastFetchedRobotsData:\n\n");
        if (_lastFetchedRobotsData != null) {
            sb.append(_lastFetchedRobotsData);
            sb.append("\n");
        }

        //    }
    }

    /**************
     * Disk Operation Support 
     */

    public static int getPendingDiskOperationCount() {
        return _diskOperationQueue.size();
    }

    public static void stopDiskQueueingThread() {

        if (_diskOperationThread != null) {
            _diskOpThreadShuttingDown = true;
            LOG.info("shutting down Disk Queue Thread - sending null item to queue");
            _diskOperationQueue.add(new DiskQueueEntry(null, false));
            try {
                LOG.info("Waiting for Disk Queue Thread to Die");
                _diskOperationThread.join();
                LOG.info("Done Waiting for Disk Queue Thread");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }

            _diskOpThreadShuttingDown = false;
            _diskOperationThread = null;
        }
    }

    public static void startDiskQueueingThread(final EventLoop serverEventLoop, final File baseStoragePath) {

        // and finally start the blocking writer thread ...
        _diskOperationThread = new Thread(new Runnable() {

            public void run() {

                for (;;) {
                    try {

                        DiskQueueEntry entry = _diskOperationQueue.take();

                        // if buffer item is null... this is considered an eof condition ... break out ... 
                        if (entry.getListItem() == null) {
                            LOG.info(
                                    "### DiskThread:Received Null Item ... Shutting down CrawlDomain Disk Queue Thread");
                            // no matter what ... break out ...
                            break;
                        }
                        // otherwise .. figure out what to do with the domain ... 
                        else {

                            if (_diskOpThreadShuttingDown == false) {

                                final CrawlList domain = entry.getListItem();

                                try {
                                    if (Environment.detailLogEnabled())
                                        LOG.info("### DiskThread: Got List:" + domain.getListName());
                                    // build a hierarchical path for the given domain id ...
                                    File logFilePath = null;
                                    String listName = null;
                                    synchronized (domain) {
                                        logFilePath = FileUtils.buildHierarchicalPathForId(baseStoragePath,
                                                domain.getUniqueListId());
                                        listName = domain.getListName();
                                    }
                                    // get the immediate parent directory ... 
                                    File parentDirectory = logFilePath.getParentFile();
                                    // and recursively create the directory chain (if necessary).
                                    parentDirectory.mkdirs();

                                    IntrusiveList<CrawlTarget> flushList = null;

                                    int desiredLoadAmount = 0;

                                    boolean truncateFile = false;

                                    synchronized (domain) {
                                        if (domain._offlineTargetCount == 0) {
                                            truncateFile = true;
                                        }
                                    }

                                    if (truncateFile && logFilePath.exists()) {

                                        if (Environment.detailLogEnabled())
                                            LOG.info("### DiskThread: Truncating Existing Log File for List:"
                                                    + listName);

                                        LogFileHeader header = new LogFileHeader();

                                        RandomAccessFile file = new RandomAccessFile(logFilePath, "rw");

                                        try {
                                            writeLogFileHeader(file, header);
                                        } finally {
                                            file.close();
                                        }
                                    }

                                    // now lock access to the domain's pending queue
                                    synchronized (domain) {

                                        // if a disk request was pending ...
                                        if (domain._diskRequestPending) {

                                            // reset disk request pending flag here to prevent race condition ...
                                            domain._diskRequestPending = false;

                                            // figure out what action to take with respect to the domain ...

                                            // if list count exceeds flush threshold 
                                            if (domain._pending.size() >= DISK_FLUSH_THRESHOLD
                                                    || domain._queued.size() != 0) {

                                                if (domain._queued.size() == 0) {
                                                    LinkedList<CrawlTarget> candidates = new LinkedList<CrawlTarget>();
                                                    for (CrawlTarget candidate : domain._pending) {
                                                        if ((candidate.getFlags()
                                                                & CrawlURL.Flags.IsHighPriorityURL) == 0) {
                                                            // add candidates in proper order ... 
                                                            candidates.add(candidate);
                                                        }
                                                    }

                                                    // if there are low priority candidates we can flush ... 
                                                    if (candidates.size() != 0) {
                                                        // create a new flush list ... 
                                                        flushList = new IntrusiveList<CrawlTarget>();

                                                        // reverse candidate list and start removing items from pending 
                                                        for (CrawlTarget candidate : Lists.reverse(candidates)) {
                                                            domain._pending.removeElement(candidate);
                                                            flushList.addHead(candidate);
                                                            // if we are back to ideal target count bail ... 
                                                            if (domain._pending.size() <= IDEAL_TARGET_COUNT)
                                                                break;
                                                        }

                                                        if (Environment.detailLogEnabled())
                                                            LOG.info("### DiskThread: List:" + domain.getListName()
                                                                    + " Created FetchList FROM PENDING of Size:"
                                                                    + flushList.size());

                                                        // increment offline target count ...
                                                        domain._offlineTargetCount += flushList.size();
                                                    }
                                                } else {
                                                    flushList = domain._queued.detach(domain._queued.getHead());

                                                    if (Environment.detailLogEnabled())
                                                        LOG.info("### DiskThread: List:" + domain.getListName()
                                                                + " Created FetchList FROM QUEUED of Size:"
                                                                + flushList.size());

                                                    // increment offline target count ...
                                                    domain._offlineTargetCount += flushList.size();

                                                }

                                                /*
                                                // walk one past IDEAL target item count... 
                                                int i=0;
                                                CrawlTarget target = domain._pending.getHead();  
                                                while (i<IDEAL_TARGET_COUNT) { 
                                                  target = target.getNext();
                                                  ++i;
                                                }
                                                    
                                                // and extract a sub-list starting at the target ...  
                                                flushList = domain._pending.detach(target);
                                                */
                                                //and immediately update offline target count in domain ... 
                                                //domain._offlineTargetCount += flushList.size();
                                            }
                                            // otherwise ... 
                                            else {
                                                // check queued size ... 
                                                if (domain._queued.size() != 0) {
                                                    // if pending size <= DISK_LOAD_THRESHOLD 
                                                    if (domain._offlineTargetCount == 0
                                                            && domain._pending.size() <= DISK_LOAD_THRESHOLD) {
                                                        if (Environment.detailLogEnabled())
                                                            LOG.info(
                                                                    "### DiskThread: Moving Items from Queued List to Pending List for CrawlList:"
                                                                            + domain.getListName());
                                                        // move over items from queued to pending 

                                                        while (domain._queued.getHead() != null) {
                                                            domain._pending.addTail(domain._queued.removeHead());
                                                            if (domain._pending
                                                                    .size() == (DISK_FLUSH_THRESHOLD - 1))
                                                                break;
                                                        }
                                                    }

                                                    // now if the queued size reaches the ideal target count, flush to disk ...
                                                    if (domain._queued.size() >= IDEAL_TARGET_COUNT) {
                                                        if (Environment.detailLogEnabled())
                                                            LOG.info(
                                                                    "### DiskThread: Queued Size Exceeds Flush Threshold. Flushing to Disk for CrawlList:"
                                                                            + domain.getListName());
                                                        // extract a sub-list starting at head of queued list   
                                                        flushList = domain._queued.detach(domain._queued.getHead());
                                                        //and immediately update offline target count in domain ... 
                                                        domain._offlineTargetCount += flushList.size();
                                                    }
                                                }

                                                // check to see if a load is desired ...
                                                if (domain._pending.size() <= DISK_LOAD_THRESHOLD) {
                                                    // calculate load amount ... 
                                                    desiredLoadAmount = IDEAL_TARGET_COUNT - domain._pending.size();
                                                }
                                            }
                                        } else {
                                            if (Environment.detailLogEnabled())
                                                LOG.info("### DiskThread: Skipping List:" + domain.getListName());
                                        }
                                    }

                                    // now figure out what to do ... 
                                    if (flushList != null) {
                                        if (Environment.detailLogEnabled())
                                            LOG.info("### DiskThread: Flushing " + flushList.size()
                                                    + " Items To Disk for Domain:" + domain.getListName());
                                        // flush crawl targets to disk ... 
                                        appendTargetsToLogFile(logFilePath, flushList);
                                        // clear list ... 
                                        flushList.removeAll();
                                    }
                                    // ... if load is desired ... 
                                    if (desiredLoadAmount != 0) {

                                        IntrusiveList<CrawlTarget> loadList = new IntrusiveList<CrawlTarget>();

                                        int loadCount = readTargetsFromLogFile(domain, logFilePath,
                                                desiredLoadAmount, loadList);

                                        // if (Environment.detailLogEnabled())
                                        LOG.info("### DiskThread: Disk Queue Loaded " + loadCount
                                                + " Items From Disk for Domain:" + domain.getListName());

                                        if (loadCount != 0) {
                                            // time to lock domain again ... 
                                            synchronized (domain) {
                                                // and reduce offline count ... 
                                                domain._offlineTargetCount -= loadList.size();
                                                // load new items into domain's list ... 
                                                domain._pending.attach(loadList);
                                            }
                                        }
                                    }
                                } catch (IOException e) {
                                    LOG.error("### DiskThread:" + CCStringUtils.stringifyException(e));
                                }
                            }
                        }
                    } catch (InterruptedException e) {
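                        // interrupted while blocking on the queue ... loop and re-check for work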

                    } catch (Exception e) {
                        LOG.fatal("### DiskThread: Encountered Unhandled Exception:"
                                + CCStringUtils.stringifyException(e));
                    }
                }

                LOG.info("### DiskThread: Exiting CrawlDomain Disk Queue Thread");
            }
        });
        // launch the writer thread ... 
        _diskOperationThread.start();
    }
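
    /*
     * Illustrative sketch (not part of the original source): the thread above is stopped via a
     * "poison pill" - a sentinel DiskQueueEntry whose list item is null - rather than by
     * interruption. The pattern in isolation:
     */
    private static void drainUntilPoisonPillSketch(PriorityBlockingQueue<DiskQueueEntry> queue)
            throws InterruptedException {
        for (;;) {
            DiskQueueEntry entry = queue.take(); // blocks until an entry arrives
            if (entry.getListItem() == null) {
                break; // sentinel observed: stop processing
            }
            // ... process the entry's list here ...
        }
    }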

    private static class LogFileHeader {

        public static final int LogFileHeaderBytes = 0xCC00CC00;
        public static final int LogFileVersion = 1;

        public LogFileHeader() {
            _readPos = 0;
            _writePos = 0;
            _itemCount = 0;
        }

        public long _readPos;
        public long _writePos;
        public int _itemCount;

        public void writeHeader(DataOutput stream) throws IOException {
            stream.writeInt(LogFileHeaderBytes);
            stream.writeInt(LogFileVersion);
            stream.writeLong(_diskHeaderActiveVersionTimestamp);
            stream.writeLong(_readPos);
            stream.writeLong(_writePos);
            stream.writeInt(_itemCount);
        }

        public void readHeader(DataInput stream) throws IOException {
            int headerBytes = stream.readInt();
            int version = stream.readInt();
            long timestamp = stream.readLong();

            if (headerBytes != LogFileHeaderBytes || version != LogFileVersion) {
                throw new IOException("Invalid CrawlLog File Header Detected!");
            }

            _readPos = stream.readLong();
            _writePos = stream.readLong();
            _itemCount = stream.readInt();

            // if timestamps don't match ... 
            if (timestamp != _diskHeaderActiveVersionTimestamp) {
                // then reset cursors ... everything in the file is invalid ...
                _writePos = 0;
                _readPos = 0;
                _itemCount = 0;
            }
        }
    }
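
    /*
     * Illustrative note (not part of the original source): the header is a fixed-width preamble,
     * so its on-disk footprint follows directly from the field widths written above:
     * magic(int) + version(int) + timestamp(long) + readPos(long) + writePos(long) + itemCount(int).
     */
    private static final int LOG_FILE_HEADER_SIZE_SKETCH = 4 + 4 + 8 + 8 + 8 + 4; // 36 bytes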

    private static final class CustomByteArrayOutputStream extends ByteArrayOutputStream {
        public CustomByteArrayOutputStream(int initialSize) {
            super(initialSize);
        }

        public byte[] getBuffer() {
            return buf;
        }
    }

    private static void appendTargetsToLogFile(File logFileName, IntrusiveList<CrawlTarget> list)
            throws IOException {

        LogFileHeader header = new LogFileHeader();

        boolean preExistingHeader = logFileName.exists();

        RandomAccessFile file = new RandomAccessFile(logFileName, "rw");

        try {
            long headerOffset = 0;

            if (preExistingHeader) {

                headerOffset = readLogFileHeader(file, header);

                if (header._writePos == 0) {
                    file.seek(headerOffset);
                } else {
                    // seek to the appropriate write position
                    file.seek(header._writePos);
                }
            } else {
                headerOffset = writeLogFileHeader(file, header);
            }

            CustomByteArrayOutputStream bufferOutputStream = new CustomByteArrayOutputStream(1 << 17);
            DataOutputStream dataOutputStream = new DataOutputStream(bufferOutputStream);
            CRC32 crc = new CRC32();

            for (CrawlTarget target : list) {

                PersistentCrawlTarget persistentTarget = target.createPersistentTarget();

                bufferOutputStream.reset();
                // write to intermediate stream ... 
                persistentTarget.write(dataOutputStream);
                // and crc the data ... 
                crc.reset();
                crc.update(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size());
                // write out length first 
                file.writeInt(bufferOutputStream.size());
                //crc next
                long computedValue = crc.getValue();
                //TODO: waste of space - write 32 bit values as long because having problems with java sign promotion rules during read...
                file.writeLong(computedValue);
                // and then the data 
                file.write(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size());
            }

            // now update header ... 
            header._itemCount += list.size();
            header._writePos = file.getFilePointer();

            // now write out header anew ... 
            writeLogFileHeader(file, header);

        } finally {
            if (file != null) {
                file.close();
            }
        }
    }
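
    /*
     * Illustrative sketch (not part of the original source): each record appended above is framed
     * as [length:int][crc:long][payload]. A minimal in-memory version of that framing, for an
     * arbitrary payload, looks like this:
     */
    private static byte[] frameRecordSketch(byte[] payload) throws IOException {
        CustomByteArrayOutputStream out = new CustomByteArrayOutputStream(payload.length + 12);
        DataOutputStream dataOut = new DataOutputStream(out);
        CRC32 crc = new CRC32();
        crc.update(payload, 0, payload.length);
        dataOut.writeInt(payload.length); // length first ...
        dataOut.writeLong(crc.getValue()); // ... crc next, widened to a long (see the note above)
        dataOut.write(payload); // ... then the data itself
        return out.toByteArray();
    }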

    private static int readTargetsFromLogFile(CrawlList domain, File logFileName, int desiredReadAmount,
            IntrusiveList<CrawlTarget> targetsOut) throws IOException {

        int itemsRead = 0;

        if (logFileName.exists()) {

            RandomAccessFile file = new RandomAccessFile(logFileName, "rw");

            LogFileHeader header = new LogFileHeader();

            try {

                long headerOffset = readLogFileHeader(file, header);

                // seek to the appropriate read position
                if (header._readPos != 0)
                    file.seek(header._readPos);

                int itemsToRead = Math.min(desiredReadAmount, header._itemCount);

                PersistentCrawlTarget persistentTarget = new PersistentCrawlTarget();
                CRC32 crc = new CRC32();
                CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 16);

                for (int i = 0; i < itemsToRead; ++i) {
                    // read length ... 
                    int urlDataLen = file.readInt();
                    long urlDataCRC = file.readLong();

                    buffer.reset();

                    if (urlDataLen > buffer.getBuffer().length) {
                        buffer = new CustomByteArrayOutputStream(((urlDataLen / 65536) + 1) * 65536);
                    }
                    file.readFully(buffer.getBuffer(), 0, urlDataLen);
                    crc.reset();
                    crc.update(buffer.getBuffer(), 0, urlDataLen);

                    long computedValue = crc.getValue();

                    // validate crc values ... 
                    if (computedValue != urlDataCRC) {
                        throw new IOException("Crawl Target Log File Corrupt");
                    } else {
                        //populate a persistentTarget from the (in memory) data stream
                        DataInputStream bufferReader = new DataInputStream(
                                new ByteArrayInputStream(buffer.getBuffer(), 0, urlDataLen));

                        persistentTarget.clear();
                        persistentTarget.readFields(bufferReader);

                        //populate a new crawl target structure ... 
                        CrawlTarget newTarget = new CrawlTarget(domain, persistentTarget);

                        targetsOut.addTail(newTarget);
                    }
                }

                itemsRead = itemsToRead;

                // now update header ... 
                header._itemCount -= itemsRead;
                // now if item count is non zero ... 
                if (header._itemCount != 0) {
                    // set read cursor to next record location 
                    header._readPos = file.getFilePointer();
                }
                // otherwise ... 
                else {
                    // reset both cursors ... 
                    header._readPos = 0;
                    header._writePos = 0;
                }

                // now write out header anew ... 
                writeLogFileHeader(file, header);
            } finally {
                if (file != null) {
                    file.close();
                }
            }
        }
        return itemsRead;
    }
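
    /*
     * Illustrative sketch (not part of the original source): the read-side counterpart of the
     * record framing - read the length and expected crc, then validate the payload before
     * trusting it, exactly as the loop above does per record.
     */
    private static byte[] readFramedRecordSketch(DataInput in) throws IOException {
        int length = in.readInt();
        long expectedCRC = in.readLong();
        byte[] payload = new byte[length];
        in.readFully(payload);
        CRC32 crc = new CRC32();
        crc.update(payload, 0, payload.length);
        if (crc.getValue() != expectedCRC) {
            throw new IOException("Crawl Target Log File Corrupt");
        }
        return payload;
    }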

    private static long writeLogFileHeader(RandomAccessFile file, LogFileHeader header) throws IOException {

        // set the position at zero .. 
        file.seek(0);
        // and write header to disk ... 
        header.writeHeader(file);

        //took sync out because it was becoming a severe bottleneck
        // file.getFD().sync();

        return file.getFilePointer();
    }

    private static long readLogFileHeader(RandomAccessFile file, LogFileHeader header) throws IOException {

        file.seek(0);

        header.readHeader(file);

        return file.getFilePointer();
    }

    public static class CrawlDomainTester {

        public static void main(String[] args) {
            try {
                new CrawlDomainTester().testGetNextItemCode();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        @Test
        public void testGetNextItemCode() throws Exception {

            /*
            CrawlListHost host = new CrawlListHost(null, 1);
            CrawlList list = host.getCrawlList(1);

            list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list, "http://www.redirecttest.com"), false);
            list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list, "http://www.redirecttest.com/foobar"), false);
            list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list, "http://www.blogger.com"), false);
            list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list, "http://blogger.com"), false);
            list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list, "http://www.blogger.com"), false);
            list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list, "http://foo.blogger.com"), false);
            list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list, "http://failed.domain/bar"), false);
            list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list, "http://failed.domain/zzz"), false);
            list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list, "http://####/foo/zzz"), false);
            list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list, "garbage"), false);
            list.addCrawlTarget(CrawlTarget.createTestCrawlTarget(list, ""), false);
            list._offlineTargetCount = 100;

            CrawlTarget target = null;
            NIOHttpHeaders headers = new NIOHttpHeaders();
            headers.add(null, "HTTP/1.1 200 OK");

            while (list.getDisposition() != CrawlList.Disposition.QueueEmpty) {
                if (list.getDisposition() == CrawlList.Disposition.ItemAvailable) {
                    target = list.getNextTarget();
                    if (target == null)
                        System.out.println("Target:NULL");
                    else
                        System.out.println("Target:" + target.getOriginalURL());
                }
                else if (list.getDisposition() == CrawlList.Disposition.WaitingOnCompletion) {
                    if (target != null) {
                        if (target.getActiveURL().startsWith("http://www.redirecttest.com")) {
                            target.setRedirectURL(target.getActiveURL().replaceFirst("http://www.redirecttest.com", "http://redirecttest.com"));
                            target.setFlags(target.getFlags() | CrawlURL.Flags.IsRedirected);
                        }
                        list.fetchStarted(target);
                        if (target.getActiveURL().startsWith("http://failed.domain")) {
                            list.getActiveDomain()._domainFailed = true;
                        }
                        list.fetchSucceeded(target, 0, headers, null);

                        target = null;
                    }
                    else {
                        list._disposition = Disposition.WaitingOnTime;
                    }
                }
                else if (list.getDisposition() == CrawlList.Disposition.WaitingOnTime) {
                    System.out.println("clearing WaitState");
                    list.clearWaitState();
                }
            }
            */
        }
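
        // The commented-out test above walks CrawlList's disposition state
        // machine: ItemAvailable (a target can be dequeued via getNextTarget),
        // WaitingOnCompletion (the in-flight fetch must be reported back via
        // fetchStarted/fetchSucceeded), WaitingOnTime (a politeness delay,
        // cleared via clearWaitState), and QueueEmpty (terminal).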

        //@Test
        public void testDiskQueue() throws Exception {

            String hostName = "poodleskirtcentral.com";
            long domainFP = URLFingerprint.generate64BitURLFPrint(hostName);
            File logFilePath = FileUtils.buildHierarchicalPathForId(new File("/foo"), domainFP);

            System.out.println(logFilePath.getAbsolutePath());
        }
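
        // buildHierarchicalPathForId presumably shards queue log files across
        // nested directories keyed off the 64-bit fingerprint, so millions of
        // hosts do not land in a single directory. A hypothetical sketch of
        // that kind of mapping (the real logic lives in
        // org.commoncrawl.util.FileUtils and may differ):
        //
        //   File shard = new File(root, Long.toHexString(id & 0xFF));
        //   File path  = new File(shard, Long.toHexString(id));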

        /*
        //@Test
        public void testDiskWriter() throws Exception {
            // initialize ...
            Configuration conf = new Configuration();

            conf.addResource("nutch-default.xml");
            conf.addResource("nutch-site.xml");
            conf.addResource("hadoop-default.xml");
            conf.addResource("hadoop-site.xml");
            conf.addResource("commoncrawl-default.xml");
            conf.addResource("commoncrawl-site.xml");

            CrawlEnvironment.setHadoopConfig(conf);
            CrawlEnvironment.setDefaultHadoopFSURI("file:///");
            CrawlEnvironment.setCrawlSegmentDataDirectory("./tests/crawlSegmentSamples/");

            EventLoop eventLoop = new EventLoop();
            eventLoop.start();

            DNSCache cache = new DNSCache() {
                public DNSResult resolveName(CrawlSegmentHost host) {
                    return null;
                }
            };

            CrawlList.DISK_FLUSH_THRESHOLD = 5;
            CrawlList.DISK_LOAD_THRESHOLD = 2;
            CrawlList.IDEAL_TARGET_COUNT = 3;

            File basePath = new File("./data/diskQueueTest");
            basePath.mkdir();
            CrawlList.startDiskQueueingThread(eventLoop, basePath);

            CrawlSegmentDetail detailCC06 = SegmentLoader.loadCrawlSegment(1, 1, "cc06", null, cache, null, null);

            CrawlListHost host = new CrawlListHost(null, 0);

            int domainCount = 0;
            CrawlList firstDomain = null;

            for (CrawlSegmentHost segmentHost : detailCC06.getHosts()) {
                CrawlList domain = new CrawlList(host, segmentHost.getListId());
                if (domainCount == 0)
                    firstDomain = domain;
                if (domainCount == 0)
                    System.out.println("Domain:" + domain.getListName() + " FP:" + domain.getListId());
                for (CrawlSegmentURL segmentURL : segmentHost.getUrlTargets()) {
                    if (domainCount == 0)
                        System.out.println("\tAdding Target::" + segmentURL.getUrl());
                    CrawlTarget target = new CrawlTarget(1, domain, segmentHost, segmentURL);
                    domain.addCrawlTarget(target, false);
                }

                if (++domainCount == 10)
                    break;
            }
            while (true) {
                synchronized (firstDomain) {
                    if (firstDomain._pending.size() < CrawlList.DISK_FLUSH_THRESHOLD)
                        break;
                    Thread.sleep(5000);
                }
            }

            while (true) {
                if (firstDomain.getDisposition() == CrawlList.Disposition.ItemAvailable) {
                    CrawlTarget nextTarget = null;
                    while ((nextTarget = firstDomain.getNextTarget()) != null) {
                        System.out.println("Domain: " + firstDomain.getListName() + " Got Target:" + nextTarget.getOriginalURL());
                        firstDomain.fetchStarted(nextTarget);
                        firstDomain.fetchSucceeded(nextTarget, 0, null, null);
                        if (firstDomain.getDisposition() == CrawlList.Disposition.WaitingOnTime) {
                            firstDomain.clearWaitState();
                        }
                    }
                }
                else {
                    System.out.println("Domain Queue Empty ... Waiting");
                    Thread.sleep(5000);
                }
            }
            //CrawlDomain._diskOperationThread.join();
        }
        */
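
        // Note: the commented-out test above shrinks DISK_FLUSH_THRESHOLD,
        // DISK_LOAD_THRESHOLD, and IDEAL_TARGET_COUNT so the spill-to-disk and
        // reload paths trigger after only a handful of targets instead of the
        // production defaults.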
    }

    @Override
    public String toString() {
        return "List Id:" + _baseListId + " Name:" + _listName;
    }
}