it.unimi.di.law.warc.io.AbstractWarcReader_NYU.java Source code

Java tutorial

Introduction

Here is the source code for it.unimi.di.law.warc.io.AbstractWarcReader_NYU.java

Source

package it.unimi.di.law.warc.io;

/*       
 * Copyright (C) 2013 Paolo Boldi, Massimo Santini, and Sebastiano Vigna 
 *
 *  This program is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

// RELEASE-STATUS: DIST

import it.unimi.di.law.warc.records.AbstractWarcRecord;
import it.unimi.di.law.warc.records.WarcHeader;
import it.unimi.di.law.warc.records.WarcRecord;
import it.unimi.di.law.warc.util.BoundSessionInputBuffer;
import org.apache.http.Header;
import org.apache.http.HttpException;
import org.apache.http.ParseException;
import org.apache.http.ProtocolVersion;
import org.apache.http.impl.io.AbstractMessageParser;
import org.apache.http.impl.io.HttpTransportMetricsImpl;
import org.apache.http.impl.io.SessionInputBufferImpl;
import org.apache.http.io.SessionInputBuffer;
import org.apache.http.message.BasicLineParser;
import org.apache.http.message.HeaderGroup;
import org.apache.http.message.LineParser;
import org.apache.http.message.ParserCursor;
import org.apache.http.util.CharArrayBuffer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;

public abstract class AbstractWarcReader_NYU implements WarcReader {
    private static final boolean VERSION = Boolean
            .parseBoolean(System.getProperty("it.unimi.di.law.warc.io.version", "true"));
    private static final Logger LOGGER = LoggerFactory.getLogger(AbstractWarcReader_NYU.class);

    private static final int BUFFER_SIZE = 1024;

    private final CharArrayBuffer line = new CharArrayBuffer(BUFFER_SIZE);
    private final HttpTransportMetricsImpl metrics = new HttpTransportMetricsImpl();
    private final LineParser parser = new BasicLineParser(WarcRecord.PROTOCOL_VERSION);

    private SessionInputBuffer buffer;
    private BoundSessionInputBuffer payload = null;
    private ProtocolVersion version;
    private boolean skip;

    protected void setInput(final InputStream input) {
        final SessionInputBufferImpl bufferImpl = new SessionInputBufferImpl(metrics, BUFFER_SIZE, 0, null, null);
        bufferImpl.bind(input);
        this.buffer = bufferImpl;
        this.payload = null;
        this.skip = false;
    }

    private ProtocolVersion parseHead() throws IOException {
        this.line.clear();
        int read = this.buffer.readLine(this.line);
        if (LOGGER.isTraceEnabled())
            LOGGER.trace("Protocol header '{}'.", new String(this.line.toCharArray()));
        if (read == -1)
            return null;
        ParserCursor cursor = new ParserCursor(0, this.line.length());
        try {
            return parser.parseProtocolVersion(this.line, cursor);
        } catch (ParseException e) {
            throw new WarcFormatException("Can't parse WARC version header.", e);
        }
    }

    protected WarcRecord read(final boolean consecutive) throws IOException {

        if (consecutive && this.payload != null) {
            this.payload.consume();
            this.payload = null;
            this.line.clear();

            // check WARC version
            if (!(this.version.getMajor() == 0 && this.version.getMinor() == 18)) {
                this.buffer.readLine(this.line);
                this.buffer.readLine(this.line);
            } else {
                if (!this.skip) {
                    this.buffer.readLine(this.line);
                    this.skip = true;
                }
            }
            if (line.length() != 0)
                throw new WarcFormatException("Missing CRLFs at WARC record end, got \"" + line + "\"");
            this.line.clear();
        }

        // first header line
        this.version = parseHead();
        if (this.version == null)
            return null;
        if (VERSION && (this.version.getMajor() != 1 || this.version.getMinor() != 0))
            throw new IllegalArgumentException("Unsupported WARC version " + this.version);

        // rest of headers

        final HeaderGroup warcHeaders = new HeaderGroup();
        try {
            warcHeaders.setHeaders(AbstractMessageParser.parseHeaders(this.buffer, -1, -1, null));
        } catch (HttpException e) {
            throw new WarcFormatException("Can't parse WARC headers", e);
        }

        // payload

        final Header payloadLengthHeader = WarcHeader.getFirstHeader(warcHeaders, WarcHeader.Name.CONTENT_LENGTH);
        if (payloadLengthHeader == null)
            throw new WarcFormatException("Missing 'Content-Length' WARC header");
        long payloadLength = -1;
        try {
            payloadLength = Long.parseLong(payloadLengthHeader.getValue());
        } catch (NumberFormatException e) {
            throw new WarcFormatException(
                    "Can't parse 'Content-Length' WARC header (is \"" + payloadLengthHeader.getValue() + "\")", e);
        }
        this.payload = new BoundSessionInputBuffer(this.buffer, payloadLength);

        return AbstractWarcRecord.fromPayload(warcHeaders, this.payload);
    }

}