is.landsbokasafn.deduplicator.indexer.WarcFileIterator.java Source code

Java tutorial

Introduction

Here is the source code for is.landsbokasafn.deduplicator.indexer.WarcFileIterator.java

Source

/* Copyright (C) 2006-2014 National and University Library of Iceland (NULI)
 * 
 * This file is part of the DeDuplicator (Heritrix add-on module).
 * 
 *  NULI licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package is.landsbokasafn.deduplicator.indexer;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.StatusLine;
import org.apache.commons.httpclient.util.EncodingUtil;
import org.archive.format.warc.WARCConstants;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.io.warc.WARCRecord;

/**
 * A {@link CrawlDataIterator} over a single WARC file.
 * <p>
 * Only <code>response</code> and <code>revisit</code> records whose target URI starts with
 * <code>http</code> are turned into {@link CrawlDataItem}s; every other record is skipped.
 * The iterator reads one item ahead: the item returned by the next call to {@link #next()} has
 * already been parsed by the time {@link #hasNext()} reports <code>true</code>.
 */
public class WarcFileIterator implements CrawlDataIterator {

    private File warcFile;
    private Iterator<ArchiveRecord> recordIterator = null;
    private WARCReader reader = null;

    /** The pre-read item that the next call to {@link #next()} will return, or null if none. */
    private CrawlDataItem nextItem = null;

    /**
     * True once the record iterator has been drained or {@link #close()} has been invoked.
     * Guards against touching the record iterator after its reader has been closed.
     */
    private boolean exhausted = false;

    /**
     * Creates an uninitialized iterator. {@link #initialize(String)} must be invoked before use.
     */
    public WarcFileIterator() {
    }

    /**
     * Convenience constructor. Equivalent to using no-arg constructor and then invoking initialize with the 
     * same parameter.
     * @param source The WARC file to iterate over
     * @throws IOException
     */
    public WarcFileIterator(String source) throws IOException {
        initialize(source);
    }

    /**
     * Points this iterator at a WARC file and pre-reads the first item.
     *
     * @param source Path of the WARC file to iterate over
     * @throws IllegalArgumentException if no file exists at <code>source</code>
     * @throws IOException if reading the first item fails, or if a reader left over from an earlier
     *             call to this method can not be closed
     */
    public void initialize(String source) throws IOException {
        warcFile = new File(source);
        if (!warcFile.exists()) {
            throw new IllegalArgumentException("No such file " + warcFile.getAbsolutePath());
        }
        // Discard state from any earlier initialize() call so that we read the new file
        // rather than continuing with the previous reader.
        if (reader != null) {
            reader.close();
            reader = null;
        }
        recordIterator = null;
        exhausted = false;
        readNextItem();
    }

    /**
     * Advances to the next response or revisit record and stores the resulting item in
     * {@link #nextItem}, leaving it null when no further item is available.
     * <p>
     * The reader is opened lazily on first use and closed as soon as the record iterator reports
     * that it has no more records.
     *
     * @throws IOException if parsing a record fails
     */
    private void readNextItem() throws IOException {
        // Invalidate any previous items
        nextItem = null;
        if (exhausted) {
            // The reader has already been closed; its iterator must not be used again.
            return;
        }
        // Open file if needed
        if (reader == null) {
            try {
                reader = WARCReaderFactory.get(warcFile);
                recordIterator = reader.iterator();
            } catch (IOException e) {
                // Best effort: report the problem and behave as an empty iterator.
                System.out.println("Failed to open and read " + warcFile.getAbsolutePath());
                return;
            }
        }
        // Find next response record
        while (recordIterator.hasNext() && nextItem == null) {
            WARCRecord record = (WARCRecord) recordIterator.next();
            ArchiveRecordHeader header = record.getHeader();

            String url = header.getUrl();
            if (url == null || !url.startsWith("http")) {
                continue;
            }

            WARCRecordType type = parseRecordType(header);
            if (type == null) {
                // Missing or unrecognized record type, skip the record instead of failing the file
                continue;
            }

            switch (type) {
            case response:
                nextItem = processResponse(record, header);
                break;
            case revisit:
                nextItem = processRevisit(record, header);
                break;
            default:
                // For anything else, do nothing
            }
        }
        if (!recordIterator.hasNext()) {
            // Nothing left to read. Remember that so later calls do not use the closed reader.
            exhausted = true;
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Resolves the record type named by a record's type header.
     *
     * @param header The header of the record to examine
     * @return The matching record type, or null if the type header is absent or its value does not
     *         name a known {@link WARCRecordType}
     */
    private static WARCRecordType parseRecordType(ArchiveRecordHeader header) {
        Object rawType = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
        if (rawType == null) {
            return null;
        }
        String typeName = rawType.toString();
        for (WARCRecordType candidate : WARCRecordType.values()) {
            if (candidate.name().equals(typeName)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Builds a {@link CrawlDataItem} from a record's WARC header and, when present, the HTTP
     * status line and headers at the start of the record's content.
     * <p>
     * URL, payload digest, timestamp and record id are taken from the WARC header. If the first
     * line of the record content is an HTTP status line, the status code is set and the
     * <code>Content-Type</code> and <code>ETag</code> HTTP headers (matched case-insensitively) are
     * copied to the item's mime type and etag respectively.
     *
     * @param record The record to read the HTTP status line and headers from
     * @param header The WARC header of <code>record</code>
     * @return A new item with its revisit flag set to false
     * @throws IOException if reading from the record fails
     */
    protected static CrawlDataItem processResponse(WARCRecord record, ArchiveRecordHeader header)
            throws IOException {
        CrawlDataItem cdi = new CrawlDataItem();
        cdi.setURL(header.getUrl());
        cdi.setContentDigest((String) header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST));
        cdi.setRevisit(false);
        cdi.setTimestamp(header.getDate());
        cdi.setWarcRecordId((String) header.getHeaderValue(WARCConstants.HEADER_KEY_ID));

        // Process the HTTP header, if any
        byte[] statusBytes = HttpParser.readRawLine(record);
        int eolCharCount = getEolCharsCount(statusBytes);
        // A first line without a line terminator is not treated as a status line
        if (eolCharCount > 0) {
            String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount,
                    WARCConstants.DEFAULT_ENCODING);
            if ((statusLine != null) && StatusLine.startsWithHTTP(statusLine)) {
                StatusLine status = new StatusLine(statusLine);
                cdi.setStatusCode(status.getStatusCode());
                Header[] headers = HttpParser.parseHeaders(record, WARCConstants.DEFAULT_ENCODING);
                for (Header h : headers) {
                    if (h.getName().equalsIgnoreCase("Content-Type")) {
                        cdi.setMimeType(h.getValue());
                    } else if (h.getName().equalsIgnoreCase("ETag")) {
                        cdi.setEtag(h.getValue());
                    }
                }
            }
        }

        return cdi;
    }

    /**
     * Builds a {@link CrawlDataItem} for a revisit record. Starts from the same data as
     * {@link #processResponse(WARCRecord, ArchiveRecordHeader)} and adds the original URL, the
     * original timestamp and the revisit profile from the WARC header.
     *
     * @param record The revisit record
     * @param header The WARC header of <code>record</code>
     * @return A new item with its revisit flag set to true
     * @throws IOException if reading from the record fails
     */
    private CrawlDataItem processRevisit(WARCRecord record, ArchiveRecordHeader header) throws IOException {
        CrawlDataItem cdi = processResponse(record, header);
        cdi.setOriginalURL((String) header.getHeaderValue(WARCConstants.HEADER_KEY_REFERS_TO_TARGET_URI));
        cdi.setOriginalTimestamp((String) header.getHeaderValue(WARCConstants.HEADER_KEY_REFERS_TO_DATE));
        cdi.setRevisitProfile((String) header.getHeaderValue(WARCConstants.HEADER_KEY_PROFILE));
        // Constant-first comparison so that a record lacking a profile header does not cause a
        // NullPointerException; such a record is handled like any other non "not modified" revisit.
        if (!WARCConstants.PROFILE_REVISIT_NOT_MODIFIED.equals(cdi.getRevisitProfile())) {
            // ETags are of questionable value in this scenario, null it out, if any
            cdi.setEtag(null);
        }

        cdi.setRevisit(true);

        return cdi;
    }

    /**
     * @return true if an item has been pre-read and is waiting to be returned by {@link #next()}
     */
    @Override
    public boolean hasNext() {
        return nextItem != null;
    }

    /**
     * Returns the pre-read item and reads ahead to the one after it.
     *
     * @return The next item, or null if {@link #hasNext()} is false
     * @throws IOException if reading ahead fails
     */
    @Override
    public CrawlDataItem next() throws IOException {
        CrawlDataItem next = nextItem;
        readNextItem();
        return next;
    }

    /**
     * Closes the underlying reader, if one has been opened. No further records are read from the
     * file after this call.
     *
     * @throws IOException if closing the reader fails
     */
    @Override
    public void close() throws IOException {
        // Make sure a subsequent next() does not try to read from the closed reader
        exhausted = true;
        if (reader != null) {
            reader.close();
        }
    }

    /**
     * @param bytes Array of bytes to examine for an EOL.
     * @return Count of end-of-line characters or zero if none.
     * 
     * Borrowed from {@link org.archive.io.arc.ARCRecord}
     */
    private static int getEolCharsCount(byte[] bytes) {
        int count = 0;
        if (bytes != null && bytes.length >= 1 && bytes[bytes.length - 1] == '\n') {
            count++;
            if (bytes.length >= 2 && bytes[bytes.length - 2] == '\r') {
                count++;
            }
        }
        return count;
    }

    /**
     * @return A human readable description of the kind of source this iterator reads
     */
    @Override
    public String getSourceType() {
        return "Iterator over a single WARC (ISO-28500) file.";
    }

}