uk.bl.wa.util.HashedCachedInputStream.java Source code

Java tutorial

Introduction

Here is the source code for uk.bl.wa.util.HashedCachedInputStream.java

Source

/**
 * 
 */
package uk.bl.wa.util;

/*
 * #%L
 * warc-indexer
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import static org.archive.format.warc.WARCConstants.HEADER_KEY_PAYLOAD_DIGEST;
import static org.archive.format.warc.WARCConstants.HEADER_KEY_TYPE;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

import org.apache.commons.codec.digest.MessageDigestAlgorithms;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.format.warc.WARCConstants;
import org.archive.io.ArchiveRecordHeader;
import org.archive.util.Base32;
import org.jwat.common.RandomAccessFileInputStream;

/**
 * Utility method that takes a given input stream and caches the
 * content in RAM, on disk, based on some size limits.
 * 
 * Also calculates the hash of the whole stream.
 * 
 * @author anj
 *
 */
public class HashedCachedInputStream {
    private static Log log = LogFactory.getLog(HashedCachedInputStream.class);

    /** Digest engine used to hash the whole payload (SHA-1, matching WARC payload digests). */
    private MessageDigest digest = null;

    /** Payload digest copied from the WARC record header, if the header carries one. */
    private String headerHash = null;

    /** Computed payload hash, in the form {@code "sha1:<base32>"} (or the header hash, see init). */
    private String hash = null;

    /** True when the payload is cached in RAM rather than in a temporary file. */
    private boolean inMemory;

    /** Temporary cache file; only set when the payload is cached on disk. */
    private File cacheFile;
    /** Open read handle onto {@link #cacheFile}; created lazily by {@link #getInputStream()}. */
    private RandomAccessFile rafCache;
    /** Record URL (sanitised), used in log messages. */
    private String url;

    /** In-memory cache of the payload; only set when {@link #inMemory} is true. */
    private byte[] cacheBytes;

    /** True when the payload exceeded {@link #onDiskThreshold} and the cache is incomplete. */
    private boolean truncated = false;

    // Thresholds:
    private long inMemoryThreshold = 1024 * 1024; // Up to 1MB allowed in RAM.
    private long onDiskThreshold = 1024 * 1024 * 100; // Up to 100MB cached on disk. 

    /**
     * Constructor allowing the cache thresholds to be overridden.
     * 
     * @param header the WARC/ARC record header (used for the URL, offset and any payload digest)
     * @param in the payload stream to hash and cache
     * @param length the declared payload length in bytes
     * @param inMemoryThreshold payloads strictly below this many bytes are cached in RAM
     * @param onDiskThreshold at most this many bytes are cached on disk; longer payloads are truncated
     */
    public HashedCachedInputStream(ArchiveRecordHeader header, InputStream in, long length, long inMemoryThreshold,
            long onDiskThreshold) {
        this.inMemoryThreshold = inMemoryThreshold;
        this.onDiskThreshold = onDiskThreshold;
        init(header, in, length);
    }

    /**
     * Constructor using the default thresholds: processes the payload for its hash and
     * makes the content available via {@link #getInputStream()}.
     * 
     * @param header the WARC/ARC record header (used for the URL, offset and any payload digest)
     * @param in the payload stream to hash and cache
     * @param length the declared payload length in bytes
     */
    public HashedCachedInputStream(ArchiveRecordHeader header, InputStream in, long length) {
        init(header, in, length);
    }

    /**
     * Reads the payload, computing its SHA-1 hash and caching up to {@link #onDiskThreshold}
     * bytes (in RAM below {@link #inMemoryThreshold}, otherwise in a temp file).
     * 
     * Any failure during processing is logged and swallowed, leaving the affected fields null;
     * this mirrors the historical behaviour, including the hash-mismatch RuntimeException
     * below, which is caught by this method's own catch block and therefore only logged.
     * 
     * @param header the WARC/ARC record header
     * @param in the payload stream
     * @param length the declared payload length in bytes
     */
    private void init(ArchiveRecordHeader header, InputStream in, long length) {
        url = Normalisation.sanitiseWARCHeaderValue(header.getUrl());
        try {
            digest = MessageDigest.getInstance(MessageDigestAlgorithms.SHA_1);
        } catch (NoSuchAlgorithmException e) {
            log.error("Hashing: " + url + "@" + header.getOffset(), e);
        }

        try {
            if (header.getHeaderFieldKeys().contains(HEADER_KEY_PAYLOAD_DIGEST)) {
                headerHash = (String) header.getHeaderValue(HEADER_KEY_PAYLOAD_DIGEST);
            }

            // Create a suitable output stream for caching the content. Keep a typed
            // reference to the in-memory buffer so no downcast is needed later:
            ByteArrayOutputStream memoryCache = null;
            OutputStream cache;
            if (length < inMemoryThreshold) {
                inMemory = true;
                memoryCache = new ByteArrayOutputStream();
                cache = memoryCache;
            } else {
                inMemory = false;
                cacheFile = File.createTempFile("warc-indexer", ".cache");
                cacheFile.deleteOnExit();
                cache = new FileOutputStream(cacheFile);
            }

            DigestInputStream dinput = new DigestInputStream(in, digest);

            // Cache at most onDiskThreshold bytes:
            long toCopy = Math.min(length, this.onDiskThreshold);
            try {
                IOUtils.copyLarge(dinput, cache, 0, toCopy);
            } finally {
                // Close in a finally so the FileOutputStream is not leaked if the copy fails.
                cache.close();
            }

            // Read the remainder of the stream, to get the hash of the whole payload:
            if (length > this.onDiskThreshold) {
                truncated = true;
                IOUtils.skip(dinput, length - this.onDiskThreshold);
            }

            hash = "sha1:" + Base32.encode(digest.digest());

            // For response records, check the hash is consistent with any header hash.
            // The length comparison skips header digests in a different algorithm/encoding:
            if ((headerHash != null) && (hash.length() == headerHash.length())) {
                if (header.getHeaderFieldKeys().contains(HEADER_KEY_TYPE) && header.getHeaderValue(HEADER_KEY_TYPE)
                        .equals(WARCConstants.WARCRecordType.response.toString())) {
                    if (!headerHash.equals(hash)) {
                        log.error("Hashes are not equal for this input!");
                        log.error(" - payload hash from header = " + headerHash);
                        log.error(" - payload hash from content = " + hash);
                        // Caught by the catch block below, so this only logs — TODO confirm
                        // whether a hard failure was ever intended here.
                        throw new RuntimeException("Hash check failed!");
                    } else {
                        log.debug("Hashes were found to match for " + url);
                    }
                } else {
                    // For revisit records, use the hash of the revisited payload:
                    // TODO this should actually only do it for revisit type records.
                    this.hash = this.headerHash;
                }
            }

            // Expose the in-memory cache as a byte array:
            if (inMemory && memoryCache != null) {
                this.cacheBytes = memoryCache.toByteArray();
            }
        } catch (Exception e) {
            log.error("Hashing: " + url + "@" + header.getOffset(), e);
        }
    }

    /**
     * @return the payload hash as {@code "sha1:<base32>"}, or null if hashing failed
     */
    public String getHash() {
        return hash;
    }

    /**
     * This returns the content. {@link #cleanup()} should be called after use as this avoids a build-up of
     * temporary files.
     * @return a {@link InputStream#mark(int)}-capable InputStream with the content given in the constructor,
     *         or null if the on-disk cache file could not be opened
     */
    public InputStream getInputStream() {
        if (inMemory) {
            if (this.cacheBytes != null) {
                return new ByteArrayInputStream(this.cacheBytes);
            } else {
                log.error("Found a NULL byte array!");
                return new ByteArrayInputStream(new byte[] {});
            }
        } else {
            try {
                rafCache = new RandomAccessFile(cacheFile, "r");
            } catch (FileNotFoundException e) {
                // Log via the class logger rather than printStackTrace():
                log.error("Could not open cache file " + cacheFile + " for " + url, e);
                return null;
            }
            return new RandomAccessFileInputStream(rafCache);
        }
    }

    /**
     * @return true when the payload exceeded the on-disk threshold and the cache is incomplete
     */
    public boolean isTruncated() {
        return truncated;
    }

    /**
     * Closes all references to the cache file (if any) and deletes the file.
     * 
     * Failure to call this method after use of the content will lead to a build-up of temporary files for as
     * long as the JVM is running.
     */
    public void cleanup() {
        if (rafCache != null) {
            try {
                rafCache.close();
            } catch (Exception e) {
                log.warn("Exception closing RandomAccessFile cache for " + cacheFile + " for " + url, e);
            }
        }

        if (this.cacheFile != null && cacheFile.exists()) {
            try {
                if (!this.cacheFile.delete()) {
                    log.warn("Unable to delete " + cacheFile + " for " + url);
                }
            } catch (Exception e) {
                log.warn("Exception deleting " + cacheFile + " for " + url, e);
            }
        }
    }

}