dk.netarkivet.wayback.batch.copycode.NetarchiveSuiteWARCRecordToSearchResultAdapter.java Source code

Introduction

Here is the source code for dk.netarkivet.wayback.batch.copycode.NetarchiveSuiteWARCRecordToSearchResultAdapter.java

Source

/* File:        $Id: NetarchiveSuiteUrlOperations.java 2251 2012-02-08 13:03:03Z mss $
 * Revision:    $Revision: 2251 $
 * Author:      $Author: mss $
 * Date:        $Date: 2012-02-08 14:03:03 +0100 (Wed, 08 Feb 2012) $
 *
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
package dk.netarkivet.wayback.batch.copycode;

import java.io.File;
import java.io.IOException;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.StatusLine;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.EncodingUtil;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.arc.ARCConstants;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCRecord;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourcestore.indexer.HTTPRecordAnnotater;
import org.archive.wayback.util.Adapter;
import org.archive.wayback.util.url.IdentityUrlCanonicalizer;

/**
 * Adapts certain WARCRecords into SearchResults. DNS and response records are
 * mostly straightforward, but SearchResult objects generated from revisit 
 * records contain lots of "placeholder" fields, which are expected to be
 * understood by later processes traversing a stream of SearchResult objects.
 * 
 * This is cut'n'pasted from 
 * org.archive.wayback.resourcestore.indexer.WARCRecordToSearchResultAdapter
 */
public class NetarchiveSuiteWARCRecordToSearchResultAdapter implements Adapter<WARCRecord, CaptureSearchResult> {

    private static final Logger LOGGER = Logger
            .getLogger(NetarchiveSuiteWARCRecordToSearchResultAdapter.class.getName());

    private static final String VERSION = "0.1.0";
    private static final String WARC_FILEDESC_VERSION = "warc/warcinfo" + VERSION;

    private final static String DEFAULT_VALUE = "-";
    private UrlCanonicalizer canonicalizer = null;
    private HTTPRecordAnnotater annotater = null;

    private boolean processAll = false;

    public NetarchiveSuiteWARCRecordToSearchResultAdapter() {
        canonicalizer = new IdentityUrlCanonicalizer();
        annotater = new HTTPRecordAnnotater();
    }

    /*
     * Calls adaptInner, returning null if an IOException or OutOfMemoryError
     * is thrown:
     */
    public CaptureSearchResult adapt(WARCRecord rec) {
        try {
            return adaptInner(rec);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } catch (OutOfMemoryError e) {
            e.printStackTrace();
            return null;
        }
    }

    private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {

        ArchiveRecordHeader header = rec.getHeader();

        String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
        //      if(type.equals(WARCConstants.WARCINFO)) {
        //         LOGGER.info("Skipping record type : " + type);
        //         return null;
        //      }

        CaptureSearchResult result = genericResult(rec);

        if (type.equals(WARCConstants.RESPONSE)) {
            String mime = annotater.transformHTTPMime(header.getMimetype());
            if (mime != null && mime.equals("text/dns")) {
                // Close the record to complete reading, so the digest is valid.
                // TODO: Do we want to use the WARC header digest for this?
                rec.close();
                result.setDigest(transformWARCDigest(rec.getDigestStr()));
                result.setMimeType(mime);
            } else {
                result = adaptWARCHTTPResponse(result, rec);
            }
        } else if (type.equals(WARCConstants.REVISIT)) {
            // also set the mime type:
            result.setMimeType("warc/revisit");

        } else if (type.equals(WARCConstants.REQUEST)) {

            if (processAll) {
                // also set the mime type:
                result.setMimeType("warc/request");
            } else {
                result = null;
            }
        } else if (type.equals(WARCConstants.METADATA)) {

            if (processAll) {
                // also set the mime type:
                result.setMimeType("warc/metadata");
            } else {
                result = null;
            }
        } else if (type.equals(WARCConstants.WARCINFO)) {

            result.setMimeType(WARC_FILEDESC_VERSION);

        } else {
            LOGGER.info("Skipping record type : " + type);
        }

        return result;
    }

    // ALL HELPER METHODS BELOW:

    /*
     * Extract all common WARC fields into a CaptureSearchResult. This is the
     * same for all WARC record types:
     *  
     *    file, offset, timestamp, digest, urlKey, originalUrl 
     */
    private CaptureSearchResult genericResult(WARCRecord rec) {

        CaptureSearchResult result = new CaptureSearchResult();

        result.setMimeType(DEFAULT_VALUE);
        result.setHttpCode(DEFAULT_VALUE);
        result.setRedirectUrl(DEFAULT_VALUE);

        ArchiveRecordHeader header = rec.getHeader();

        String file = transformWARCFilename(header.getReaderIdentifier());
        long offset = header.getOffset();

        result.setCaptureTimestamp(transformWARCDate(header.getDate()));
        result.setFile(file);
        result.setOffset(offset);
        result.setDigest(transformWARCDigest(header.getHeaderValue(WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));

        String origUrl = header.getUrl();
        if (origUrl == null) {
            String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
            if (type.equals(WARCConstants.WARCINFO)) {
                String filename = header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME).toString();
                result.setOriginalUrl("filedesc:" + filename);
                result.setUrlKey("filedesc:" + filename);
            } else {
                result.setOriginalUrl(DEFAULT_VALUE);
                result.setUrlKey(DEFAULT_VALUE);
            }

        } else {
            result.setOriginalUrl(origUrl);
            try {
                String urlKey = canonicalizer.urlStringToKey(origUrl);
                result.setUrlKey(urlKey);
            } catch (URIException e) {
                String shortUrl = (origUrl.length() < 100) ? origUrl : origUrl.substring(0, 100);
                LOGGER.warning("FAILED canonicalize(" + shortUrl + "):" + file + " " + offset);
                result.setUrlKey(origUrl);
            }
        }
        return result;
    }

    /**
     * Borrowed (copied) from org.archive.io.arc.ARCRecord...
     * 
     * @param bytes Array of bytes to examine for an EOL.
     * @return Count of end-of-line characters or zero if none.
     */
    private int getEolCharsCount(byte[] bytes) {
        int count = 0;
        if (bytes != null && bytes.length >= 1 && bytes[bytes.length - 1] == '\n') {
            count++;
            if (bytes.length >= 2 && bytes[bytes.length - 2] == '\r') {
                count++;
            }
        }
        return count;
    }

    private String transformWARCFilename(String readerIdentifier) {
        String warcName = readerIdentifier;
        int index = warcName.lastIndexOf(File.separator);
        if (index > 0 && (index + 1) < warcName.length()) {
            warcName = warcName.substring(index + 1);
        }
        return warcName;
    }

    private String transformWARCDigest(final Object o) {
        if (o == null) {
            return DEFAULT_VALUE;
        }
        String orig = o.toString();
        if (orig.startsWith("sha1:")) {
            return orig.substring(5);
        }
        return orig;
        //      return (o == null) ? DEFAULT_VALUE : o.toString();
    }

    /*
     * Transform input date to 14-digit timestamp:
     * 2007-08-29T18:00:26Z => 20070829180026
     */
    private static String transformWARCDate(final String input) {

        StringBuilder output = new StringBuilder(14);

        output.append(input.substring(0, 4));
        output.append(input.substring(5, 7));
        output.append(input.substring(8, 10));
        output.append(input.substring(11, 13));
        output.append(input.substring(14, 16));
        output.append(input.substring(17, 19));

        return output.toString();
    }

    /*
     * Currently the WARCReader doesn't parse HTTP headers. This method parses
     * them, then calls the common ARC/WARC shared record parsing code, which
     * addresses HTTP headers, and possibly even parses HTML content to look
     * for Robot Meta tags.
     */
    private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, WARCRecord rec)
            throws IOException {

        ArchiveRecordHeader header = rec.getHeader();
        // Need to parse the document's HTTP message and headers here: WARCReader
        // does not implement this... yet.

        byte[] statusBytes = HttpParser.readRawLine(rec);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            throw new RecoverableIOException("Failed to read http status where one " + " was expected: "
                    + ((statusBytes == null) ? "(null)" : new String(statusBytes)));
        }
        String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
                ARCConstants.DEFAULT_ENCODING);
        if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) {
            throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine status = new StatusLine(statusLine);
        result.setHttpCode(String.valueOf(status.getStatusCode()));

        Header[] headers = HttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING);

        annotater.annotateHTTPContent(result, rec, headers, header.getMimetype());

        return result;
    }

    public UrlCanonicalizer getCanonicalizer() {
        return canonicalizer;
    }

    public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
        this.canonicalizer = canonicalizer;
    }

    public boolean isProcessAll() {
        return processAll;
    }

    public void setProcessAll(boolean processAll) {
        this.processAll = processAll;
    }

    /**
     * @return the annotater
     */
    public HTTPRecordAnnotater getAnnotater() {
        return annotater;
    }

    /**
     * @param annotater the annotater to set
     */
    public void setAnnotater(HTTPRecordAnnotater annotater) {
        this.annotater = annotater;
    }
}
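
Usage note: the adapter is typically driven by a WARC reader. The sketch below is not part of the file above; it shows one plausible way to feed records from a WARC file through the adapter, assuming it sits in the same package as the adapter (or imports it). The file name and the printed fields are illustrative assumptions; null results (e.g. skipped request/metadata records) must be filtered out.

import java.io.File;

import org.archive.io.ArchiveRecord;
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.io.warc.WARCRecord;
import org.archive.wayback.core.CaptureSearchResult;

// Minimal usage sketch; "example.warc.gz" is a hypothetical input file.
public class AdapterUsageSketch {
    public static void main(String[] args) throws Exception {
        NetarchiveSuiteWARCRecordToSearchResultAdapter adapter =
                new NetarchiveSuiteWARCRecordToSearchResultAdapter();
        // Open the WARC file and adapt each record in turn.
        WARCReader reader = WARCReaderFactory.get(new File("example.warc.gz"));
        try {
            for (ArchiveRecord record : reader) {
                CaptureSearchResult result = adapter.adapt((WARCRecord) record);
                if (result != null) {
                    // e.g. 20070829180026 http://example.com/ text/html 200 <digest>
                    System.out.println(result.getCaptureTimestamp() + " "
                            + result.getOriginalUrl() + " "
                            + result.getMimeType() + " "
                            + result.getHttpCode() + " "
                            + result.getDigest());
                }
            }
        } finally {
            reader.close();
        }
    }
}

By default the adapter uses an IdentityUrlCanonicalizer, so url keys equal the original URLs; a stricter canonicalizer can be supplied via setCanonicalizer before adapting records.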