org.archive.jbs.arc.ArchiveRecordProxy.java Source code

Introduction

Here is the source code for org.archive.jbs.arc.ArchiveRecordProxy.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.archive.jbs.arc;

import java.io.IOException;

import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.StatusLine;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCRecord;

/**
 * Proxy object for ARC and WARC records.  It reads the headers from
 * either one and extracts out just the metadata we need for parsing.
 * ARC records are mapped to the corresponding WARC record type.
 *
 * This is not a general purpose (W)ARC reading class.  It is tailored
 * to the needs of jbs.Parse.
 *
 * A sizeLimit is passed to the constructor to limit the number of
 * bytes that are read from the record body, to avoid blowing up the
 * heapspace.  Some (W)ARC records can have 100+GB record bodies --
 * for example video stream captures.
 *
 * Note that for HTTP response records, the 'length' field in this
 * proxy object is the length of the HTTP response *body*.  The
 * length stored in the (W)ARC header includes the HTTP status line
 * and headers, which we don't want.
 *
 * For other record types, the length is as given in the (W)ARC record
 * header information.
 */
public class ArchiveRecordProxy {
    /** Size of the scratch buffer used to read record bytes that do not fit in the body buffer. */
    private static final int DRAIN_BUFFER_SIZE = 1024 * 1024;

    /** Number of leading characters of a WARC-Date value that are sliced into the compact date. */
    private static final int WARC_DATE_MIN_LENGTH = 19;

    private String warcRecordType;
    private String warcContentType;
    private String url;
    private String digest;
    private String date;
    private long length;
    private String code;
    private byte[] body;

    /**
     * Construct an ARCRecord proxy.  Read at most sizeLimit
     * bytes from the record body.
     *
     * The WARC record type and content type are derived from the URL
     * prefix of the ARC record ("http", "filedesc", "dns:" or "ftp:").
     * A body is only read for the "http" and "ftp:" cases; for the
     * other two the body stays null and the length stays 0.
     *
     * @param arc       the ARC record to read; it is closed by this constructor
     * @param sizeLimit maximum number of body bytes to keep in memory
     * @throws IOException if the URL prefix is not one of the recognized
     *                     ones, or if reading the record fails
     */
    public ArchiveRecordProxy(ARCRecord arc, int sizeLimit) throws IOException {
        ARCRecordMetaData header = (ARCRecordMetaData) arc.getHeader();

        this.url = header.getUrl();
        // No digest until after the record is fully read.
        this.date = header.getDate();
        this.code = header.getStatusCode();

        if (url.startsWith("http")) {
            this.warcRecordType = WARCConstants.WARCRecordType.response.toString();
            this.warcContentType = WARCConstants.HTTP_RESPONSE_MIMETYPE;

            // Move the file position past the HTTP headers to the start of
            // the HTTP response body.
            arc.skipHttpHeader();

            this.length = remainingArcBytes(arc, header);
            this.body = readBytes(arc, this.length, sizeLimit);
        } else if (url.startsWith("filedesc")) {
            this.warcRecordType = WARCConstants.WARCRecordType.warcinfo.toString();
            this.warcContentType = "application/arcinfo";
        } else if (url.startsWith("dns:")) {
            this.warcRecordType = WARCConstants.WARCRecordType.response.toString();
            this.warcContentType = "text/dns";
        } else if (url.startsWith("ftp:")) {
            this.warcRecordType = WARCConstants.WARCRecordType.resource.toString();

            // The mime-type in the ARC header tells us whether this is part of the
            // control conversation or the actual downloaded file.
            if ("text/plain".equals(header.getMimetype())) {
                this.warcContentType = "text/plain";
            } else {
                this.warcContentType = "application/octet-stream";
            }

            // HACK: We set a bogus HTTP status code 200 here because
            //       later in the indexing workflow, non-200 codes are
            //       filtered out so that we don't index 404 pages and
            //       such.
            this.code = "200";

            this.length = remainingArcBytes(arc, header);
            this.body = readBytes(arc, this.length, sizeLimit);
        } else {
            throw new IOException("Unknown ARC record type: " + url);
        }

        // Must "close" the record to complete the digest computation.
        arc.close();
        this.digest = "sha1:" + arc.getDigestStr();
    }

    /**
     * Construct a WARCRecord proxy.  Read at most sizeLimit
     * bytes from the record body.
     *
     * A body is read only for records whose content type is the HTTP
     * response mime-type, and for "resource" records whose URL starts
     * with "ftp://".  For every other record the body stays null and
     * the length stays 0.
     *
     * For HTTP responses the status code is taken from the first line
     * that parses as an HTTP status line.  It is set to the empty
     * string when a line starts with "HTTP" but does not parse, and it
     * stays null if the record ends before any line starting with
     * "HTTP" is seen.
     *
     * @param warc      the WARC record to read
     * @param sizeLimit maximum number of body bytes to keep in memory
     * @throws IOException if the WARC-Date header is missing or too
     *                     short, or if reading the record fails
     */
    public ArchiveRecordProxy(WARCRecord warc, int sizeLimit) throws IOException {
        ArchiveRecordHeader header = warc.getHeader();

        this.warcRecordType = (String) header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
        this.warcContentType = (String) header.getHeaderValue(WARCConstants.CONTENT_TYPE);

        this.url = header.getUrl();
        this.digest = (String) header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST);

        // Convert to familiar YYYYMMDDHHMMSS format
        this.date = toCompactDate((String) header.getHeaderValue(WARCConstants.HEADER_KEY_DATE));

        // Check if HTTP (not dns or the like) and then read the HTTP
        // headers so that the file position is at the response body
        if (WARCConstants.HTTP_RESPONSE_MIMETYPE.equals(this.warcContentType)) {
            // Sometimes an HTTP response will have whitespace (such as
            // blank lines) before the actual HTTP status line like:
            //   [blank]
            //   HTTP/1.0 200 OK
            // so we have to gobble them up.
            String line;
            while ((line = HttpParser.readLine(warc, "utf-8")) != null) {
                line = line.trim();

                // If an empty line, or an invalid HTTP-status line: skip it!
                if (line.length() == 0 || !line.startsWith("HTTP")) {
                    continue;
                }

                try {
                    // Now get on with parsing the status line.
                    StatusLine statusLine = new StatusLine(line);
                    this.code = Integer.toString(statusLine.getStatusCode());
                } catch (HttpException e) {
                    // The line started with "HTTP", but was not a full,
                    // valid HTTP-Status line.  Assume that we won't see
                    // one, so stop looking.  Set the HTTP code to a value
                    // indicating there was no valid HTTP code.
                    this.code = "";
                }
                break;
            }

            // Skip over the HTTP headers, we just want the body of the HTTP response.
            skipHttpHeaders(warc);

            // The length of the HTTP response body is equal to the number
            // of bytes remaining in the WARC record.
            this.length = header.getLength() - warc.getPosition();

            this.body = readBytes(warc, this.length, sizeLimit);
        } else if (WARCConstants.WARCRecordType.resource.toString().equals(this.warcRecordType)
                // We check for "ftp://" here because we don't want to waste the time to copy the
                // bytes for resource record types we don't care about.  If we want to pass along
                // all resource records, simply remove this check.
                // The null check keeps a resource record without a URL from
                // failing with a NullPointerException; it is simply not read.
                && this.url != null && this.url.startsWith("ftp://")) {
            // HACK: We set a bogus HTTP status code 200 here because
            //       later in the indexing workflow, non-200 codes are
            //       filtered out so that we don't index 404 pages and
            //       such.
            this.code = "200";

            // The length of the FTP response body is equal to the number
            // of bytes remaining in the WARC record.
            this.length = header.getLength() - warc.getPosition();

            this.body = readBytes(warc, this.length, sizeLimit);
        }

    }

    /**
     * Compute the number of body bytes remaining in an ARC record.
     *
     * The result is the header length minus the current position.
     * When the body is empty the position is already one byte past the
     * header length, which yields -1; that case is mapped to 0.
     */
    private static long remainingArcBytes(ARCRecord arc, ARCRecordMetaData header) throws IOException {
        long remaining = header.getLength() - arc.getPosition();
        return (remaining == -1) ? 0 : remaining;
    }

    /**
     * Convert a WARC-Date value such as 2010-01-02T03:04:05Z into the
     * 14-digit YYYYMMDDHHMMSS form by copying the digit groups at fixed
     * offsets.  The separators between the groups are not inspected.
     *
     * @throws IOException if the value is null or shorter than the 19
     *                     characters that are sliced
     */
    private static String toCompactDate(String warcDate) throws IOException {
        if (warcDate == null || warcDate.length() < WARC_DATE_MIN_LENGTH) {
            throw new IOException("Missing or malformed WARC-Date header: " + warcDate);
        }
        return new StringBuilder(14)
                .append(warcDate, 0, 4)
                .append(warcDate, 5, 7)
                .append(warcDate, 8, 10)
                .append(warcDate, 11, 13)
                .append(warcDate, 14, 16)
                .append(warcDate, 17, 19)
                .toString();
    }

    /**
     * Skip over the HTTP headers.  We use a simple 3-state FSM to
     * search for the byte sequence: \n( |\r)*\n
     *
     * Rather than use the Apache HttpParser.read* methods, I
     * implemented this here to avoid any assumptions that class makes
     * about character encoding.
     *
     * And for sure, avoid HttpParser.parseHeaders() as it is strict
     * about the header format and will throw an exception if they are
     * malformed.  We don't care, we just want to skip them.
     *
     * Reading stops silently if the record ends before the sequence is
     * found.
     */
    private static void skipHttpHeaders(WARCRecord warc) throws IOException {
        // State 0: inside a header line.
        // State 1: just saw '\n', possibly followed by ' ' or '\r'.
        // State 2: saw the terminating '\n'; headers are done.
        int ch;
        int state = 0;
        while ((state != 2) && (ch = warc.read()) >= 0) {
            switch (state) {
            case 0:
                if (ch == '\n') {
                    state = 1;
                }
                break;

            case 1:
                if (ch == '\n') {
                    state = 2;
                } else if (ch != '\r' && ch != ' ') {
                    state = 0;
                }
                break;

            default:
                break;
            }
        }
    }

    /**
     * Read the bytes of the record into a buffer and return the buffer.
     * Give a size limit to the buffer to prevent from exploding memory,
     * but still read all the bytes from the stream even if the buffer
     * is full.  This way, the file position will be advanced to the end
     * of the record.
     *
     * If the effective buffer size is 0 (because sizeLimit or
     * contentLength is 0) an empty array is returned immediately and
     * nothing is read from the record.
     *
     * @param record        the record to read from; it is switched to strict mode
     * @param contentLength number of bytes expected to remain in the record
     * @param sizeLimit     maximum size of the returned buffer
     * @return a buffer of size min(sizeLimit, contentLength)
     * @throws IOException if contentLength is negative, or if the number
     *                     of bytes actually read differs from contentLength
     * @throws IllegalArgumentException if sizeLimit is negative
     */
    private static byte[] readBytes(ArchiveRecord record, long contentLength, int sizeLimit) throws IOException {
        // Ensure the record does strict reading.
        record.setStrict(true);

        // Fail with a meaningful message instead of letting the array
        // allocation below throw NegativeArraySizeException.
        if (contentLength < 0) {
            throw new IOException("Negative content length for ArchiveRecord: " + contentLength);
        }
        if (sizeLimit < 0) {
            throw new IllegalArgumentException("sizeLimit must not be negative: " + sizeLimit);
        }

        byte[] bytes = new byte[(int) Math.min(sizeLimit, contentLength)];

        if (bytes.length == 0) {
            return bytes;
        }

        // NOTE: Do not use read(byte[]) because ArchiveRecord does NOT over-ride
        //       the implementation inherited from InputStream.  And since it does
        //       not over-ride it, it won't do the digesting on it.  Must use either
        //       read(byte[],offset,length) or read().
        //
        // The bounds check comes before the read so that we never issue a
        // zero-length read once the buffer is full.
        int pos = 0;
        while (pos < bytes.length) {
            int n = record.read(bytes, pos, bytes.length - pos);
            if (n < 0) {
                break;
            }
            pos += n;
        }

        // Now that the bytes[] buffer has been filled, read the remainder
        // of the record so that the digest is computed over the entire
        // content.  The scratch buffer is only allocated if there is
        // actually something left to read.
        byte[] buf = null;
        long count = 0;
        int available;
        while ((available = record.available()) > 0) {
            if (buf == null) {
                buf = new byte[DRAIN_BUFFER_SIZE];
            }
            int n = record.read(buf, 0, Math.min(buf.length, available));
            if (n < 0) {
                // End of stream despite available() > 0: stop here and let
                // the sanity check below report the mismatch.
                break;
            }
            count += n;
        }

        // Sanity check.  The number of bytes read into our bytes[]
        // buffer, plus the count of extra stuff read after it should
        // equal the contentLength passed into this function.
        if (pos + count != contentLength) {
            throw new IOException("Incorrect number of bytes read from ArchiveRecord: expected=" + contentLength
                    + " bytes.length=" + bytes.length + " pos=" + pos + " count=" + count);
        }

        return bytes;
    }

    /** Returns the WARC record type; for ARC records, the type the record was mapped to. */
    public String getWARCRecordType() {
        return this.warcRecordType;
    }

    /** Returns the WARC content type; for ARC records, the content type the record was mapped to. */
    public String getWARCContentType() {
        return this.warcContentType;
    }

    /** Returns the URL taken from the record header. */
    public String getUrl() {
        return this.url;
    }

    /**
     * Returns the digest.  For ARC records this is "sha1:" followed by
     * the digest string of the closed record; for WARC records it is
     * the value of the payload digest header.
     */
    public String getDigest() {
        return this.digest;
    }

    /**
     * Returns the record date.  For WARC records it has been converted
     * to YYYYMMDDHHMMSS; for ARC records it is the header date as-is.
     */
    public String getDate() {
        return this.date;
    }

    /** Returns the body length computed by the constructor, or 0 if no body was read. */
    public long getLength() {
        return this.length;
    }

    /** Returns the HTTP status code as a string; may be null or empty if none was found. */
    public String getHttpStatusCode() {
        return this.code;
    }

    /**
     * Returns the body bytes that were kept, or null if no body was
     * read for this record.  The array is the internal buffer, not a
     * copy.
     */
    public byte[] getHttpResponseBody() {
        return this.body;
    }

}