uk.bl.wa.indexer.WARCIndexer.java Source code


Introduction

Here is the source code for uk.bl.wa.indexer.WARCIndexer.java, the core indexer class of the UK Web Archive's warc-indexer: it takes a web archive (ARC/WARC) record and generates a corresponding Solr record.
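
A minimal usage sketch follows; the input filename, the reading loop and the printed field are illustrative assumptions, not part of this class:

import java.io.File;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;

import uk.bl.wa.indexer.WARCIndexer;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;

public class WARCIndexerExample {
    public static void main(String[] args) throws Exception {
        // Uses the default configuration found on the classpath:
        WARCIndexer indexer = new WARCIndexer();
        File warc = new File("example.warc.gz"); // hypothetical input file
        ArchiveReader reader = ArchiveReaderFactory.get(warc);
        for (ArchiveRecord record : reader) {
            // extract() returns null for records the filters reject:
            SolrRecord solr = indexer.extract(warc.getName(), record);
            if (solr != null) {
                System.out.println(solr.getFieldValue(SolrFields.ID));
            }
        }
        reader.close();
    }
}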

Source

package uk.bl.wa.indexer;

/*
 * #%L
 * warc-indexer
 * $Id:$
 * $HeadURL:$
 * %%
 * Copyright (C) 2013 - 2014 The UK Web Archive
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import static org.archive.format.warc.WARCConstants.HEADER_KEY_TYPE;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.TimeZone;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.ProtocolException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpHeaders;
import org.apache.log4j.PropertyConfigurator;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.tika.mime.MediaType;
import org.archive.format.warc.WARCConstants;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;
import org.archive.util.ArchiveUtils;
import org.archive.wayback.accesscontrol.staticmap.StaticMapExclusionFilterFactory;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;

import uk.bl.wa.analyser.payload.WARCPayloadAnalysers;
import uk.bl.wa.analyser.text.TextAnalysers;
import uk.bl.wa.annotation.Annotations;
import uk.bl.wa.annotation.Annotator;
import uk.bl.wa.extract.LinkExtractor;
import uk.bl.wa.parsers.HtmlFeatureParser;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.solr.SolrWebServer;
import uk.bl.wa.util.HashedCachedInputStream;
import uk.bl.wa.util.Instrument;

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigRenderOptions;

/**
 * 
 * Core indexer class that takes a web archive record and generates a Solr record.
 * 
 * TODO Currently a rather crude, monolithic code structure. Should pull the different metadata generation logic out into separate classes or at least methods.
 * 
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 * 
 */
public class WARCIndexer {
    private static Log log = LogFactory.getLog(WARCIndexer.class);

    private List<String> url_excludes;
    private List<String> protocol_includes;
    private List<String> response_includes;
    private List<String> record_type_includes;

    private MessageDigest md5 = null;
    private AggressiveUrlCanonicalizer canon = new AggressiveUrlCanonicalizer();

    /** Text extraction, storage and ID-generation options: */
    private boolean extractText;
    private boolean storeText;
    private boolean hashUrlId;

    /** Wayback-style URI filtering: */
    private StaticMapExclusionFilterFactory smef = null;

    /** Hook to the solr server: */
    private boolean checkSolrForDuplicates = false;
    private SolrWebServer solrServer = null;

    /** Payload Analysers */
    private long inMemoryThreshold;
    private long onDiskThreshold;
    private WARCPayloadAnalysers wpa;

    /** Text Analysers */
    private TextAnalysers txa;

    /** Annotations */
    private Annotator ant = null;

    // Paired with the HtmlFeatureParser link-extractor:
    private final boolean addNormalisedURL;
    private final AggressiveUrlCanonicalizer urlNormaliser = new AggressiveUrlCanonicalizer();

    /* ------------------------------------------------------------ */

    /**
     * Default constructor, using the default configuration from the classpath.
     */
    public WARCIndexer() throws NoSuchAlgorithmException {
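        // Round-trips the default classpath configuration through its concise
        // rendering, yielding a self-contained, fully-resolved Config: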
        this(ConfigFactory.parseString(ConfigFactory.load().root().render(ConfigRenderOptions.concise())));
    }

    /**
     * Preferred constructor; allows configuration to be passed in from the execution environment.
     */
    public WARCIndexer(Config conf) throws NoSuchAlgorithmException {
        log.info("Initialising WARCIndexer...");
        try {
            Properties props = new Properties();
            props.load(getClass().getResourceAsStream("/log4j-override.properties"));
            PropertyConfigurator.configure(props);
        } catch (IOException e1) {
            log.error("Failed to load log4j config from properties file.");
        }

        // Optional configurations:
        this.extractText = conf.getBoolean("warc.index.extract.content.text");
        log.info("Extract text = " + extractText);
        this.storeText = conf.getBoolean("warc.index.extract.content.text_stored");
        log.info("Store text = " + storeText);
        this.hashUrlId = conf.getBoolean("warc.solr.use_hash_url_id");
        log.info("hashUrlId = " + hashUrlId);
        addNormalisedURL = conf.hasPath(HtmlFeatureParser.CONF_LINKS_NORMALISE)
                ? conf.getBoolean(HtmlFeatureParser.CONF_LINKS_NORMALISE)
                : HtmlFeatureParser.DEFAULT_LINKS_NORMALISE;
        this.checkSolrForDuplicates = conf.getBoolean("warc.solr.check_solr_for_duplicates");
        if (!this.hashUrlId && this.checkSolrForDuplicates) {
            log.warn(
                    "Checking Solr for duplicates may not work as expected when using the timestamp+md5(URL) key.");
            log.warn("You need to use the payload-hash+md5(URL) key option to resolve revisit records.");
        }
        // URLs to exclude:
        this.url_excludes = conf.getStringList("warc.index.extract.url_exclude");
        // Protocols to include:
        this.protocol_includes = conf.getStringList("warc.index.extract.protocol_include");
        // Response codes to include:
        this.response_includes = conf.getStringList("warc.index.extract.response_include");
        // Record types to include:
        this.record_type_includes = conf.getStringList("warc.index.extract.record_type_include");

        // URL Filtering options:
        if (conf.getBoolean("warc.index.exclusions.enabled")) {
            smef = new StaticMapExclusionFilterFactory();
            smef.setFile(conf.getString("warc.index.exclusions.file"));
            smef.setCheckInterval(conf.getInt("warc.index.exclusions.check_interval"));
            try {
                smef.init();
            } catch (IOException e) {
                log.error("Failed to load exclusions file.");
                throw new RuntimeException(
                        "StaticMapExclusionFilterFactory failed with IOException when loading " + smef.getFile());
            }
        }

        // Instantiate required helpers:
        md5 = MessageDigest.getInstance("MD5");

        // Also hook up to Solr server for queries:
        if (this.checkSolrForDuplicates) {
            log.info("Initialisating connection to Solr...");
            solrServer = new SolrWebServer(conf);
        }

        // Set up hash-cache properties:
        this.inMemoryThreshold = conf.getBytes("warc.index.extract.inMemoryThreshold");
        this.onDiskThreshold = conf.getBytes("warc.index.extract.onDiskThreshold");
        log.info("Hashing & Caching thresholds are: < " + this.inMemoryThreshold + " in memory, < "
                + this.onDiskThreshold + " on disk.");

        // Set up analysers
        log.info("Setting up analysers...");
        this.wpa = new WARCPayloadAnalysers(conf);
        this.txa = new TextAnalysers(conf);

        // Log so it's clear this completed ok:
        log.info("Initialisation of WARCIndexer complete.");
    }
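
    /*
     * Illustrative configuration (HOCON) for the keys read above; the values
     * shown are examples rather than the project defaults:
     *
     *   warc.index.extract.content.text = true
     *   warc.index.extract.content.text_stored = false
     *   warc.solr.use_hash_url_id = false
     *   warc.solr.check_solr_for_duplicates = false
     *   warc.index.extract.url_exclude = []
     *   warc.index.extract.protocol_include = [ http, https ]
     *   warc.index.extract.response_include = [ "2" ]
     *   warc.index.extract.record_type_include = [ response, revisit ]
     *   warc.index.exclusions.enabled = false
     *   warc.index.extract.inMemoryThreshold = 100 MB
     *   warc.index.extract.onDiskThreshold = 1 GB
     */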

    /**
     * @param ann the annotations to apply, wrapped in an Annotator
     */
    public void setAnnotations(Annotations ann) {
        this.ant = new Annotator(ann);
    }

    /**
     * @return the checkSolrForDuplicates
     */
    public boolean isCheckSolrForDuplicates() {
        return checkSolrForDuplicates;
    }

    /**
     * @param checkSolrForDuplicates the checkSolrForDuplicates to set
     */
    public void setCheckSolrForDuplicates(boolean checkSolrForDuplicates) {
        this.checkSolrForDuplicates = checkSolrForDuplicates;
    }

    /**
     * This extracts metadata and text from the ArchiveRecord and creates a suitable SolrRecord.
     * 
     * @param archiveName the name of the archive file the record came from
     * @param record the archive record to index
     * @return the populated SolrRecord, or null if the record was filtered out
     * @throws IOException
     */
    public SolrRecord extract(String archiveName, ArchiveRecord record) throws IOException {
        return this.extract(archiveName, record, this.extractText);
    }

    /**
     * This extracts metadata from the ArchiveRecord and creates a suitable SolrRecord.
     * Strips the extracted-text field unless isTextIncluded is set.
     * 
     * @param archiveName the name of the archive file the record came from
     * @param record the archive record to index
     * @param isTextIncluded whether to retain the extracted text in the record
     * @return the populated SolrRecord, or null if the record was filtered out
     * @throws IOException
     */
    public SolrRecord extract(String archiveName, ArchiveRecord record, boolean isTextIncluded) throws IOException {
        final long start = System.nanoTime();
        ArchiveRecordHeader header = record.getHeader();
        SolrRecord solr = new SolrRecord(archiveName, header);

        if (!header.getHeaderFields().isEmpty()) {
            if (header.getHeaderFieldKeys().contains(HEADER_KEY_TYPE)) {
                if (!checkRecordType((String) header.getHeaderValue(HEADER_KEY_TYPE))) {
                    return null;
                }
            } // else we're processing ARCs

            if (header.getUrl() == null)
                return null;
            String fullUrl = header.getUrl();
            log.debug(
                    "Current heap usage: " + FileUtils.byteCountToDisplaySize(Runtime.getRuntime().totalMemory()));
            log.debug("Processing " + fullUrl + " from " + archiveName);

            // Check the filters:
            if (!this.checkProtocol(fullUrl))
                return null;
            if (!this.checkUrl(fullUrl))
                return null;
            if (!this.checkExclusionFilter(fullUrl))
                return null;

            // --- Basic headers ---

            // Basic metadata:
            solr.setField(SolrFields.SOURCE_FILE, archiveName + "@" + header.getOffset());
            // solr.setField(SolrFields.SOURCE_FILE_OFFSET,
            // "" + header.getOffset());
            byte[] md5digest = md5.digest(fullUrl.getBytes("UTF-8"));
            String urlMd5Base64 = new String(Base64.encodeBase64(md5digest));
            solr.setField(SolrFields.SOLR_URL, fullUrl);
            if (addNormalisedURL) {
                solr.setField(SolrFields.SOLR_URL_NORMALISED, urlNormaliser.canonicalize(fullUrl));
            }
            // Get the length, but beware: this value also includes the HTTP headers, i.e. it is the full record length, not just the payload length:
            long content_length = header.getLength();

            // Also pull out the file extension, if any:
            solr.addField(SolrFields.CONTENT_TYPE_EXT, parseExtension(fullUrl));
            // Strip down very long URLs to avoid "org.apache.commons.httpclient.URIException: Created (escaped) uuri > 2083"
            // Trac #2271: replace string-splitting with URI-based methods.
            URL url = null;
            if (fullUrl.length() > 2000)
                fullUrl = fullUrl.substring(0, 2000);
            try {
                url = new URL(fullUrl);
            } catch (MalformedURLException e) {
                // Some URIs cause problems, so fall back to canonicalising the
                // full URL and re-prefixing the scheme:
                log.error(e.getMessage());
                try {
                    url = new URL("http://" + canon.urlStringToKey(fullUrl));
                } catch (Exception e2) {
                    // If this fails, abandon all hope.
                    log.error(e2.getMessage());
                    return null;
                }
            }
            // Spot 'slash pages':
            if (url.getPath().equals("/") || url.getPath().equals("")
                    || url.getPath().matches("/index\\.[a-z]+$")) {
                solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_SLASHPAGE);
            // Spot 'robots.txt':
            } else if (url.getPath().equals("/robots.txt")) {
                solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_ROBOTS_TXT);
            } else {
                solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_NORMAL);
            }
            // Record the domain (strictly, the host):
            String host = url.getHost();
            solr.setField(SolrFields.SOLR_HOST, host);
            solr.setField(SolrFields.DOMAIN, LinkExtractor.extractPrivateSuffixFromHost(host));
            solr.setField(SolrFields.PUBLIC_SUFFIX, LinkExtractor.extractPublicSuffixFromHost(host));

            Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#archeaders", start);

            InputStream tikainput = null;

            // Only parse HTTP headers for HTTP URIs
            if (fullUrl.startsWith("http")) {
                // Parse HTTP headers:
                String statusCode = null;
                if (record instanceof WARCRecord) {
                    // There are not always headers! The code should check first.
                    String statusLine = HttpParser.readLine(record, "UTF-8");
                    if (statusLine != null && statusLine.startsWith("HTTP")) {
                        String[] firstLine = statusLine.split(" ");
                        if (firstLine.length > 1) {
                            statusCode = firstLine[1].trim();
                            try {
                                this.processHeaders(solr, statusCode, HttpParser.parseHeaders(record, "UTF-8"));
                            } catch (ProtocolException p) {
                                log.error("ProtocolException [" + statusCode + "]: "
                                        + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@"
                                        + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY), p);
                            }
                        } else {
                            log.warn("Could not parse status line: " + statusLine);
                        }
                    } else {
                        log.warn("Invalid status line: " + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME)
                                + "@" + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY));
                    }
                    // No need for this, as the headers have already been read from the InputStream (above):
                    // WARCRecordUtils.getPayload(record);
                    tikainput = record;
                } else if (record instanceof ARCRecord) {
                    ARCRecord arcr = (ARCRecord) record;
                    statusCode = "" + arcr.getStatusCode();
                    this.processHeaders(solr, statusCode, arcr.getHttpHeaders());
                    arcr.skipHttpHeader();
                    tikainput = arcr;
                } else {
                    log.error("FAIL! Unsupported archive record type.");
                    return solr;
                }

                // Skip recording non-content URLs (i.e. 2xx responses only please):
                if (!this.checkResponseCode(statusCode)) {
                    log.debug("Skipping this record based on status code " + statusCode + ": " + header.getUrl());
                    return null;
                }
            }

            // For non-HTTP records no HTTP headers were consumed above, so
            // fall back on the record itself as the payload stream:
            if (tikainput == null)
                tikainput = record;

            // Update the content_length based on what's available:
            content_length = tikainput.available();
            // Record the length:
            solr.setField(SolrFields.CONTENT_LENGTH, "" + content_length);

            // -----------------------------------------------------
            // Headers have been processed, payload ready to cache:
            // -----------------------------------------------------

            // Create an appropriately cached version of the payload, to allow analysis.
            final long hashStreamStart = System.nanoTime();
            HashedCachedInputStream hcis = new HashedCachedInputStream(header, tikainput, content_length);
            tikainput = hcis.getInputStream();
            String hash = hcis.getHash();
            Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#hashstreamwrap", hashStreamStart);

            // Prepare crawl date information:
            String waybackDate = header.getDate().replaceAll("[^0-9]", "");
            Date crawlDate = getWaybackDate(waybackDate);
            String crawlDateString = parseCrawlDate(waybackDate);

            // Optionally use a hash-based ID to store only one version of a URL:
            String id = null;
            if (hashUrlId) {
                id = hash + "/" + urlMd5Base64;
            } else {
                id = waybackDate + "/" + urlMd5Base64;
            }
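            // Illustrative ID shapes (not literal values):
            //   hashUrlId == true:  "<payload-hash>/<base64(md5(url))>"
            //   hashUrlId == false: "<14-digit-timestamp>/<base64(md5(url))>"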
            // Set these last:
            solr.setField(SolrFields.ID, id);
            solr.setField(SolrFields.HASH, hash);

            // -----------------------------------------------------
            // Payload has been cached, ready to check crawl dates:
            // -----------------------------------------------------

            // Query for currently known crawl dates:
            HashSet<Date> currentCrawlDates = new HashSet<Date>();
            if (this.checkSolrForDuplicates && solrServer != null) {
                SolrQuery q = new SolrQuery("id:\"" + id + "\"");
                q.addField(SolrFields.CRAWL_DATES);
                try {
                    QueryResponse results = solrServer.query(q);
                    if (results.getResults().size() > 0) {
                        SolrDocument fr = results.getResults().get(0);
                        if (fr.containsKey(SolrFields.CRAWL_DATES)) {
                            for (Object cds : fr.getFieldValues(SolrFields.CRAWL_DATES)) {
                                currentCrawlDates.add((Date) cds);
                            }
                        }
                    } else {
                        log.debug("No matching entries found.");
                    }
                } catch (SolrServerException e) {
                    log.error("Failed to query Solr for existing crawl dates: " + e, e);
                    // FIXME retry?
                }
            }

            // Is the current date unknown? (inc. no-solr-check case):
            if (!currentCrawlDates.contains(crawlDate)) {
                //  Dates to be merged under the CRAWL_DATES field:
                solr.mergeField(SolrFields.CRAWL_DATES, crawlDateString);
                solr.mergeField(SolrFields.CRAWL_YEARS, extractYear(header.getDate()));
            } else {
                // Otherwise, ensure that all the known dates (i.e. including this one) are copied over:
                for (Date ccd : currentCrawlDates) {
                    solr.addField(SolrFields.CRAWL_DATES, formatter.format(ccd));
                    solr.addField(SolrFields.CRAWL_YEARS, getYearFromDate(ccd));
                }
                // TODO This could optionally skip re-submission instead?
            }

            // Sort the dates and find the earliest:
            List<Date> dateList = new ArrayList<Date>(currentCrawlDates);
            dateList.add(crawlDate);
            Collections.sort(dateList);
            Date firstDate = dateList.get(0);
            solr.getSolrDocument().setField(SolrFields.CRAWL_DATE, firstDate);
            solr.setField(SolrFields.CRAWL_YEAR, getYearFromDate(firstDate));

            // Use the current value as the waybackDate:
            solr.setField(SolrFields.WAYBACK_DATE, waybackDate);

            // If this is a revisit record, we should just return an update to the crawl_dates:
            if (WARCConstants.WARCRecordType.revisit.name().equals(header.getHeaderValue(HEADER_KEY_TYPE))) {
                if (currentCrawlDates.contains(crawlDate)) {
                    return null;
                }
                SolrRecord revisited = new SolrRecord();
                revisited.setField(SolrFields.ID, id);
                revisited.mergeField(SolrFields.CRAWL_DATES, crawlDateString);
                revisited.mergeField(SolrFields.CRAWL_YEARS, extractYear(header.getDate()));
                return revisited;
            }

            // -----------------------------------------------------
            // Apply any annotations:
            // -----------------------------------------------------
            if (ant != null) {
                try {
                    ant.applyAnnotations(url.toURI(), solr.getSolrDocument());
                } catch (URISyntaxException e) {
                    log.error("Failed to annotate " + url + " : " + e, e);
                }
            }

            // -----------------------------------------------------
            // Payload duplication has been checked, ready to parse:
            // -----------------------------------------------------

            final long analyzeStart = System.nanoTime();
            // Mark the start of the payload.
            tikainput.mark((int) content_length);

            // Pass on to other extractors as required, resetting the stream before each:
            this.wpa.analyse(header, tikainput, solr);
            Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#analyzetikainput", analyzeStart);

            // Clear up the caching of the payload:
            hcis.cleanup();

            // Derive normalised/simplified content type:
            processContentType(solr, header, content_length);

            // -----------------------------------------------------
            // Payload analysis complete, now performing text analysis:
            // -----------------------------------------------------

            this.txa.analyse(solr);

            // Remove the Text Field if required
            if (!isTextIncluded) {
                solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT);
            } else {
                // Otherwise, decide whether to store or both store and index
                // the text:
                if (!storeText) {
                    // Copy the text into the indexed (but not stored) field:
                    solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_NOT_STORED,
                            (String) solr.getField(SolrFields.SOLR_EXTRACTED_TEXT).getFirstValue());
                    // Take the text out of the original (stored) field.
                    solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT);
                }
            }
        }
        Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#solrdocCreation", "WARCIndexer.extract#total", start);
        return solr;
    }

    /**
     * @param date the date to extract the year from
     * @return the four-digit year as a String
     */
    private synchronized String getYearFromDate(Date date) {
        calendar.setTime(date);
        return Integer.toString(calendar.get(Calendar.YEAR));
    }

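    // Shared by getYearFromDate(); Calendar is not thread-safe, hence that
    // method is synchronized.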
    private final Calendar calendar = Calendar.getInstance();

    /* ----------------------------------- */

    private void processHeaders(SolrRecord solr, String statusCode, Header[] httpHeaders) {
        try {
            // This is a simple test that the status code setting worked:
            int statusCodeInt = Integer.parseInt(statusCode);
            if (statusCodeInt < 0 || statusCodeInt > 1000)
                throw new Exception("Status code out of range: " + statusCodeInt);
            // Get the other headers:
            for (Header h : httpHeaders) {
                // Get the type from the server (header names are case-insensitive):
                if (h.getName().equalsIgnoreCase(HttpHeaders.CONTENT_TYPE)
                        && solr.getField(SolrFields.CONTENT_TYPE_SERVED) == null) {
                    String servedType = h.getValue();
                    if (servedType.length() > 200)
                        servedType = servedType.substring(0, 200);
                    solr.addField(SolrFields.CONTENT_TYPE_SERVED, servedType);
                }
                // Also, grab the X-Powered-By or Server headers if present:
                if (h.getName().equalsIgnoreCase("X-Powered-By"))
                    solr.addField(SolrFields.SERVER, h.getValue());
                if (h.getName().equalsIgnoreCase(HttpHeaders.SERVER))
                    solr.addField(SolrFields.SERVER, h.getValue());
            }
        } catch (NumberFormatException e) {
            log.error("Exception when parsing status code: " + statusCode + ": " + e);
            solr.addParseException("when parsing statusCode", e);
        } catch (Exception e) {
            log.error("Exception when parsing headers: " + e);
            solr.addParseException("when parsing headers", e);
        }
    }

    /**
     * @param fullUrl the URL to examine
     * @return the lower-cased file extension, or null if there is none
     */
    protected static String parseExtension(String fullUrl) {
        if (fullUrl.lastIndexOf("/") != -1) {
            String path = fullUrl.substring(fullUrl.lastIndexOf("/"));
            if (path.indexOf("?") != -1) {
                path = path.substring(0, path.indexOf("?"));
            }
            if (path.indexOf("&") != -1) {
                path = path.substring(0, path.indexOf("&"));
            }
            if (path.indexOf(".") != -1) {
                String ext = path.substring(path.lastIndexOf("."));
                ext = ext.toLowerCase();
                // Avoid odd/malformed extensions:
                // if( ext.contains("%") )
                // ext = ext.substring(0, path.indexOf("%"));
                ext = ext.replaceAll("[^0-9a-z]", "");
                return ext;
            }
        }
        return null;
    }
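
    // Illustrative behaviour (example inputs, not taken from the original):
    //   parseExtension("http://example.org/img/logo.PNG?x=1") -> "png"
    //   parseExtension("http://example.org/path/")            -> null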

    /**
     * Timestamp parsing, for the Crawl Date.
     */

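    // NB: SimpleDateFormat is not thread-safe, so concurrent use of this
    // static formatter would need external synchronisation.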
    public static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
    static {
        formatter.setTimeZone(TimeZone.getTimeZone("GMT"));
    }

    /**
     * Returns a Java Date object representing the crawled date.
     * 
     * @param timestamp a Wayback-style timestamp of 12, 14, 16 or 17+ digits
     * @return the parsed Date, or the current date/time if parsing fails
     */
    public static Date getWaybackDate(String timestamp) {
        Date date = new Date();
        try {
            if (timestamp.length() == 12) {
                date = ArchiveUtils.parse12DigitDate(timestamp);
            } else if (timestamp.length() == 14) {
                date = ArchiveUtils.parse14DigitDate(timestamp);
            } else if (timestamp.length() == 16) {
                date = ArchiveUtils.parse17DigitDate(timestamp + "0");
            } else if (timestamp.length() >= 17) {
                date = ArchiveUtils.parse17DigitDate(timestamp.substring(0, 17));
            }
        } catch (ParseException p) {
            log.error("Failed to parse timestamp '" + timestamp + "': " + p);
        }
        return date;
    }
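
    // Illustrative inputs (example values, not from the original source):
    //   "201301281135"      (12 digits) -> 2013-01-28T11:35:00Z
    //   "20130128113510"    (14 digits) -> 2013-01-28T11:35:10Z
    //   "20130128113510123" (17 digits) -> parsed to millisecond precision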

    /**
     * Returns a formatted String representing the crawled date.
     * 
     * @param waybackDate a Wayback-style timestamp
     * @return the date formatted as yyyy-MM-dd'T'HH:mm:ss'Z'
     */
    protected static String parseCrawlDate(String waybackDate) {
        return formatter.format(getWaybackDate(waybackDate));
    }

    /**
     * @param timestamp a date string from which to extract the year
     * @return the four-digit year, or "unknown" if it cannot be determined
     */
    public static String extractYear(String timestamp) {
        // Default to 'unknown':
        String waybackYear = "unknown";
        String waybackDate = timestamp.replaceAll("[^0-9]", "");
        if (waybackDate.length() >= 4)
            waybackYear = waybackDate.substring(0, 4);
        // Reject bad values by resetting to 'unknown':
        if ("0000".equals(waybackYear))
            waybackYear = "unknown";
        return waybackYear;
    }
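
    // e.g. extractYear("2013-01-28T11:35:10Z") -> "2013" (illustrative).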

    /**
     * Derives the normalised/simplified content-type fields.
     * 
     * @param solr the record to update
     * @param header the (W)ARC record header
     * @param content_length the payload length in bytes
     */
    private void processContentType(SolrRecord solr, ArchiveRecordHeader header, long content_length) {
        // Get the current content-type:
        String contentType = (String) solr.getFieldValue(SolrFields.SOLR_CONTENT_TYPE);

        // Store the raw content type from Tika:
        solr.setField(SolrFields.CONTENT_TYPE_TIKA, contentType);

        // Also get the other content types:
        MediaType mt_tika = MediaType.parse(contentType);
        if (solr.getField(SolrFields.CONTENT_TYPE_DROID) != null) {
            MediaType mt_droid = MediaType
                    .parse((String) solr.getField(SolrFields.CONTENT_TYPE_DROID).getFirstValue());
            if (mt_tika == null || mt_tika.equals(MediaType.OCTET_STREAM)) {
                contentType = mt_droid.toString();
            } else if (mt_droid.getBaseType().equals(mt_tika.getBaseType())
                    && mt_droid.getParameters().get("version") != null) {
                // Union of results:
                mt_tika = new MediaType(mt_tika, mt_droid.getParameters());
                contentType = mt_tika.toString();
            }
            if (mt_droid.getParameters().get("version") != null) {
                solr.addField(SolrFields.CONTENT_VERSION, mt_droid.getParameters().get("version"));
            }
        }

        // Fall back on the WARC-Identified-Payload-Type or header MIME type
        // if the identified content type is empty:
        if (contentType != null && contentType.isEmpty()) {
            if (header.getHeaderFieldKeys().contains("WARC-Identified-Payload-Type")) {
                contentType = ((String) header.getHeaderFields().get("WARC-Identified-Payload-Type"));
            } else {
                contentType = header.getMimetype();
            }
        }

        // Determine content type:
        if (contentType != null)
            solr.setField(SolrFields.FULL_CONTENT_TYPE, contentType);

        // If zero-length, then change to application/x-empty for the 'content_type' field.
        if (content_length == 0)
            contentType = "application/x-empty";

        // Content-Type can still be null
        if (contentType != null) {
            // Strip parameters out of main type field:
            solr.setField(SolrFields.SOLR_CONTENT_TYPE, contentType.replaceAll(";.*$", ""));

            // Also add a more general, simplified type, as appropriate:
            if (contentType.matches("^image/.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "image");
            } else if (contentType.matches("^audio/.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "audio");
            } else if (contentType.matches("^video/.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "video");
            } else if (contentType.matches("^text/htm.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "html");
            } else if (contentType.matches("^application/pdf.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "pdf");
            } else if (contentType.matches("^.*word$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "word");
            } else if (contentType.matches("^.*excel$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "excel");
            } else if (contentType.matches("^.*powerpoint$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "powerpoint");
            } else if (contentType.matches("^text/plain.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "text");
            } else {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "other");
            }

            // Remove text from JavaScript, CSS, ...
            if (contentType.startsWith("application/javascript") || contentType.startsWith("text/javascript")
                    || contentType.startsWith("text/css")) {
                solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT);
            }
        }
    }

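    // NB: entries in url_excludes are treated as regular-expression fragments,
    // matched anywhere within the URL.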
    private boolean checkUrl(String url) {
        for (String exclude : url_excludes) {
            if (!"".equals(exclude) && url.matches(".*" + exclude + ".*")) {
                return false;
            }
        }
        return true;
    }

    private boolean checkProtocol(String url) {
        for (String include : protocol_includes) {
            if ("".equals(include) || url.startsWith(include)) {
                return true;
            }
        }
        return false;
    }

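    // Matching is by prefix: e.g. a response_include entry of "2" admits
    // every 2xx status code (illustrative value).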
    private boolean checkResponseCode(String statusCode) {
        if (statusCode == null)
            return false;
        // Check for match:
        for (String include : response_includes) {
            if ("".equals(include) || statusCode.startsWith(include)) {
                return true;
            }
        }
        // Exclude
        return false;
    }

    private boolean checkRecordType(String type) {
        for (String include : record_type_includes) {
            if (type.equals(include)) {
                return true;
            }
        }
        return false;
    }

    private boolean checkExclusionFilter(String uri) {
        // Default to no exclusions:
        if (smef == null)
            return true;
        // Otherwise:
        ExclusionFilter ef = smef.get();
        CaptureSearchResult r = new CaptureSearchResult();
        // r.setOriginalUrl(uri);
        r.setUrlKey(uri);
        try {
            if (ef.filterObject(r) == ExclusionFilter.FILTER_INCLUDE) {
                return true;
            }
        } catch (Exception e) {
            log.error("Exclusion filtering failed with exception: " + e, e);
        }
        log.debug("EXCLUDING this URL due to filter: " + uri);
        // Exclude:
        return false;
    }

}