eu.scape_project.tb.wc.archd.hdreader.ArcRecordReader.java Source code

Introduction

Here is the source code for eu.scape_project.tb.wc.archd.hdreader.ArcRecordReader.java
Source

/*
 *  Copyright 2012 The SCAPE Project Consortium.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package eu.scape_project.tb.wc.archd.hdreader;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;

import static org.archive.io.warc.WARCConstants.*;

/**
 * Hadoop {@link RecordReader} that walks the records of one ARC or WARC
 * container file and hands each of them to the mapper as an {@link ArcRecord}
 * keyed by the record identifier.
 *
 * <p>Only the path of the {@link FileSplit} is used; the split's start offset
 * and length are not consulted, the archive is always read from its first
 * record.
 *
 * <p>Instances are not thread-safe; each instance owns its own date parsers
 * so that separate instances can safely be used by separate threads.
 *
 * @author shsdev https://github.com/shsdev
 * @version 0.2
 */
public final class ArcRecordReader extends RecordReader<Text, ArcRecord> {

    /**
     * Name under which the HTTP status line shows up in the parsed header
     * array, e.g. {@code HttpClient-Bad-Header-Line-Failed-Parse : HTTP/1.0 200 OK}.
     */
    private static final String STATUS_LINE_HEADER = "HttpClient-Bad-Header-Line-Failed-Parse";

    private Iterator<ArchiveRecord> recordIterator;
    /** Offset of the current record inside the archive file; drives {@link #getProgress()}. */
    private long position = 0;
    private long fileLength = 0;
    // SimpleDateFormat is not thread-safe, hence one parser pair per reader
    // instance. ARC timestamps are GMT and WARC-Date values are UTC (the
    // trailing 'Z' is matched as a literal), so both parsers are pinned to UTC.
    private final SimpleDateFormat arcDateFormat = newUtcFormat("yyyyMMddHHmmss");
    private final SimpleDateFormat warcDateformat = newUtcFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
    /** Record most recently returned by the archive iterator (visibility kept as in the original). */
    ArchiveRecord nativeArchiveRecord;
    private Text currentKey;
    private ArcRecord currentArcRecord;
    /** Archive reader and underlying stream, kept so that {@link #close()} can release them. */
    private ArchiveReader archiveReader;
    private FSDataInputStream inputStream;
    /** True once {@link #currentArcRecord} has been filled from {@link #nativeArchiveRecord}. */
    private boolean valueLoaded = false;

    /**
     * This constructor is disabled
     *
     * @param fileSplit unused
     * @param tac unused
     */
    private ArcRecordReader(FileSplit fileSplit, TaskAttemptContext tac) {
        // disabled
    }

    /**
     * This constructor will be used for hadoop initialisation
     */
    public ArcRecordReader() {
    }

    /**
     * Opens the archive file named by the split and prepares the record
     * iterator.
     *
     * @param is the split to read; must be a {@link FileSplit}
     * @param tac task context supplying the Hadoop configuration
     * @throws IOException if the file system or the archive reader factory
     *         reports a failure; the input stream is released in that case
     */
    @Override
    public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
        FileSplit fileSplit = (FileSplit) is;
        Path path = fileSplit.getPath();
        FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());

        FileStatus fileStatus = fileSystem.getFileStatus(path);
        fileLength = fileStatus.getLen();
        position = 0;
        nativeArchiveRecord = null;
        valueLoaded = false;

        inputStream = fileSystem.open(path);
        boolean initialised = false;
        try {
            archiveReader = ArchiveReaderFactory.get(path.getName(), inputStream, true);
            recordIterator = archiveReader.iterator();
            currentKey = new Text();
            currentArcRecord = new ArcRecord();
            initialised = true;
        } finally {
            if (!initialised) {
                // Do not leak the stream; let the original failure propagate
                // instead of continuing with a null iterator.
                try {
                    close();
                } catch (IOException ignored) {
                    // the exception that aborted initialisation is the relevant one
                }
            }
        }
    }

    /**
     * Advances to the next archive record and updates the current key.
     *
     * @return {@code true} if a record is available, {@code false} at the end
     *         of the archive
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!recordIterator.hasNext()) {
            return false;
        }
        nativeArchiveRecord = recordIterator.next();
        valueLoaded = false;
        // Track progress here so it advances even if the value is never requested.
        position = nativeArchiveRecord.getHeader().getOffset();
        currentKey.set(getID(nativeArchiveRecord));
        return true;
    }

    /**
     * @return the identifier of the current record
     */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return this.currentKey;
    }

    /**
     * Fills the reusable {@link ArcRecord} from the current archive record.
     * The record stream is consumed on the first call; further calls for the
     * same record return the already populated instance instead of reading the
     * exhausted stream again.
     *
     * @return the populated (reused) value object
     * @throws IOException if reading the record fails or its payload does not
     *         fit into an {@code int}-sized buffer
     */
    @Override
    public ArcRecord getCurrentValue() throws IOException, InterruptedException {
        if (valueLoaded) {
            return currentArcRecord;
        }

        ArchiveRecordHeader header = nativeArchiveRecord.getHeader();
        long recordLength = header.getLength();
        long contentBegin = Math.max(0L, header.getContentBegin());
        long positionInFile = header.getOffset();
        long contentSize = recordLength - contentBegin;
        if (contentSize > Integer.MAX_VALUE) {
            // A plain (int) cast would silently wrap around and truncate the payload.
            throw new IOException("Record at offset " + positionInFile
                    + " is too large to be buffered: " + contentSize + " bytes");
        }

        currentArcRecord.clear();
        currentArcRecord.setUrl(getResourceUrl(nativeArchiveRecord));
        currentArcRecord.setDate(getResourceDate(nativeArchiveRecord));
        currentArcRecord.setType(getType(nativeArchiveRecord));
        Header[] headers = getHttpHeaders(nativeArchiveRecord);
        currentArcRecord.setHttpReturnCode(getHttpReturnCode(nativeArchiveRecord, headers));
        currentArcRecord.setMimeType(getMimeType(nativeArchiveRecord, headers)); // to support ARC and WARC
        nativeArchiveRecord.skip(contentBegin);
        currentArcRecord.setContents(nativeArchiveRecord, (int) contentSize);

        position = positionInFile;
        valueLoaded = true;
        return currentArcRecord;
    }

    /**
     * @return the offset of the current record relative to the file length,
     *         limited to at most 1; 0 when the file length is not positive
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (fileLength <= 0) {
            // avoids NaN / Infinity for empty files or before initialisation
            return 0.0f;
        }
        return Math.min(1.0f, (float) position / fileLength);
    }

    /**
     * Closes the archive reader and the underlying input stream. Safe to call
     * more than once and before {@link #initialize}.
     *
     * @throws IOException if closing either resource fails
     */
    @Override
    public void close() throws IOException {
        ArchiveReader reader = archiveReader;
        FSDataInputStream stream = inputStream;
        archiveReader = null;
        inputStream = null;
        try {
            if (reader != null) {
                reader.close();
            }
        } finally {
            // Closed explicitly as well, in case the reader did not take care of it.
            if (stream != null) {
                stream.close();
            }
        }
    }

    /** Creates a date parser for the given pattern that interprets its input as UTC. */
    private static SimpleDateFormat newUtcFormat(String pattern) {
        SimpleDateFormat format = new SimpleDateFormat(pattern);
        format.setTimeZone(TimeZone.getTimeZone("UTC"));
        return format;
    }

    /**
     * @return the WARC record type header for WARC records, {@code "response"}
     *         for everything else
     */
    private String getType(ArchiveRecord nativeRecord) {
        if (nativeRecord instanceof WARCRecord) {
            WARCRecord warcRecord = (WARCRecord) nativeRecord;
            return warcRecord.getHeader().getHeaderValue(HEADER_KEY_TYPE).toString();
        } else {
            return "response";
        }
    }

    /**
     * @return the record identifier of an ARC record, the record-id header of
     *         a WARC record, or the resource URL for any other record type
     */
    private String getID(ArchiveRecord nativeRecord) {
        if (nativeRecord instanceof ARCRecord) {
            return nativeRecord.getHeader().getRecordIdentifier();
        } else if (nativeRecord instanceof WARCRecord) {
            WARCRecord warcRecord = (WARCRecord) nativeRecord;
            return warcRecord.getHeader().getHeaderValue(HEADER_KEY_ID).toString();
        }
        return getResourceUrl(nativeRecord);
    }

    /**
     * Returns the HTTP headers of the record. For WARC records with content
     * headers they are parsed from the record stream, which consumes that part
     * of the stream.
     *
     * @return the headers, or an empty array if there are none
     */
    private Header[] getHttpHeaders(ArchiveRecord nativeRecord) throws IOException {
        if (nativeRecord instanceof ARCRecord) {
            return ((ARCRecord) nativeRecord).getHttpHeaders();
        } else if (nativeRecord instanceof WARCRecord) {
            WARCRecord warcRecord = (WARCRecord) nativeRecord;
            if (warcRecord.hasContentHeaders()) {
                return HttpParser.parseHeaders(nativeRecord, DEFAULT_ENCODING);
            }
        }
        return new Header[0];
    }

    /**
     * Determines the HTTP status code of the record.
     *
     * @return the status code reported by an ARC record, or for other records
     *         the code taken from the status line found in the first header;
     *         -1 if no status code can be determined
     */
    private int getHttpReturnCode(ArchiveRecord nativeRecord, Header[] headers) throws IOException {
        if (nativeRecord instanceof ARCRecord) {
            return ((ARCRecord) nativeRecord).getStatusCode();
        }

        //first line is of the format   HttpClient-Bad-Header-Line-Failed-Parse : HTTP/1.0 200 OK
        if (headers == null || headers.length == 0) {
            return -1;
        }
        Header firstHeader = headers[0];
        if (!STATUS_LINE_HEADER.equals(firstHeader.getName())) {
            return -1;
        }
        String statusLine = firstHeader.getValue();
        if (statusLine == null) {
            return -1;
        }
        statusLine = statusLine.trim();
        if (!statusLine.startsWith("HTTP/1.")) {
            return -1;
        }
        // version, code and (optional, possibly multi-word) reason phrase,
        // e.g. "HTTP/1.1 404 Not Found" or "HTTP/1.1 200"
        String[] elements = statusLine.split("\\s+", 3);
        if (elements.length < 2) {
            return -1;
        }
        try {
            return Integer.parseInt(elements[1]);
        } catch (NumberFormatException ex) {
            // malformed status line: treated like a missing status code
            return -1;
        }
    }

    /**
     * Parses the record date: the header date for ARC records, the date header
     * field for all other records. Both are interpreted as UTC.
     *
     * @throws IOException if the date string does not match the expected format
     */
    private Date getResourceDate(ArchiveRecord nativeRecord) throws IOException {
        try {
            if (nativeRecord instanceof ARCRecord) {
                return arcDateFormat.parse(nativeRecord.getHeader().getDate());
            } else {
                return warcDateformat.parse(nativeRecord.getHeader().getHeaderValue(HEADER_KEY_DATE).toString());
            }
        } catch (ParseException e) {
            throw new IOException("Failed to parse the date", e);
        }
    }

    /**
     * @return the URL of an ARC record or the URI header of any other record;
     *         {@code null} if that header is absent
     */
    private String getResourceUrl(ArchiveRecord nativeRecord) {
        if (nativeRecord instanceof ARCRecord) {
            return nativeRecord.getHeader().getUrl();
        }
        Object url = nativeRecord.getHeader().getHeaderValue(HEADER_KEY_URI);
        return url != null ? url.toString() : null;
    }

    /**
     * Determines the MIME type to report for the record.
     *
     * @return for WARC records carrying a {@code Content-Type} HTTP header the
     *         value of that header, otherwise the MIME type of the record
     *         header
     */
    private String getMimeType(ArchiveRecord nativeRecord, Header[] headers) {

        // *** 4 cases are covered here
        // 1) ARCRecord
        // 2) WARCRecord with a HTTPHeader (WARC RESPONSE records)
        // 3) WARCRecord - no HttpHeader (WARCINFO record [the WARC container header] and DNS records)
        // 4) Neither a ARCRecord nor a WARCRecord (REQUEST and METADATA records of WARCs)
        // *** 1, 3, 4 do return the record MIME TYPE (which is the content MIME TYPE of ARC records
        //     and the record MIME TYPE of WARC REQUEST and METADATA records)
        // *** 2 returns the MIME TYPE stored in the HTTPHeader of the RESPONSE (content) record.
        //     Otherwise this record would return "application/http; msgtype=response", which is
        //     true too but not the information we want: the MIME TYPE of the content stream.

        // CASE 2:
        if (nativeRecord instanceof WARCRecord && headers != null) {
            for (Header currentHeader : headers) {
                // header names are case-insensitive; equalsIgnoreCase is also locale-independent
                if ("content-type".equalsIgnoreCase(currentHeader.getName())) {
                    return currentHeader.getValue();
                }
            }
        }

        // CASE 1, 3, 4:
        return nativeRecord.getHeader().getMimetype();
    }
}