dk.netarkivet.common.utils.archive.ArchiveBatchJob.java Source code

Introduction

Here is the source code for dk.netarkivet.common.utils.archive.ArchiveBatchJob.java
Source

/* File:        $Id$
 * Revision:    $Revision$
 * Author:      $Author$
 * Date:        $Date$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
package dk.netarkivet.common.utils.archive;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.NetarkivetException;
import dk.netarkivet.common.utils.batch.ArchiveBatchFilter;

/**
 * Abstract class defining a batch job to run on a set of ARC/WARC files.
 * Each implementation is required to define initialize(), processRecord() and
 * finish() methods. The bitarchive application then ensures that the batch
 * job runs initialize(), runs processRecord() on each record in each file in
 * the archive, and then runs finish().
 */
public abstract class ArchiveBatchJob extends ArchiveBatchJobBase {

    /**
     * Processes a single record from an archive file.
     * Exceptions should be handled with the handleException() method.
     *
     * @param record the object to be processed.
     * @param os The OutputStream to which output data is written
     */
    public abstract void processRecord(ArchiveRecordBase record, OutputStream os);

    /**
     * Returns an ArchiveBatchFilter object which restricts the set of records in the
     * archive on which this batch-job is performed. The default value is
     * a neutral filter which allows all records.
     *
     * @return A filter telling which records should be given to processRecord().
     */
    public ArchiveBatchFilter getFilter() {
        return ArchiveBatchFilter.NO_FILTER;
    }

    /**
     * Accepts only arc(.gz) and warc(.gz) files. Runs through all records and calls
     * processRecord() on every record that is allowed by getFilter().
     *
     * <p>Failure handling, as implemented below:
     * <ul>
     * <li>An IOException while opening the file is reported through
     * handleException() and the method returns false without reading any record.</li>
     * <li>A NetarkivetException thrown while filtering or processing a record is
     * reported through handleOurException(); the result becomes false, but
     * processing continues with the next record.</li>
     * <li>Any other exception thrown while filtering or processing a record, or an
     * IOException while closing a record, is reported through handleException();
     * the result becomes false and the remaining records are not read.</li>
     * <li>An IOException while closing the reader is reported through
     * handleException() but does not by itself change the result.</li>
     * <li>Any other unexpected exception is reported through handleException() and
     * the method returns false.</li>
     * </ul>
     *
     * @param archiveFile The arc(.gz) or warc(.gz) file to be processed.
     * @param os the OutputStream to which output is to be written
     * @throws ArgumentNotValid if either argument is null
     * @return true, if file processed successful, otherwise false
     */
    public final boolean processFile(File archiveFile, OutputStream os) throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(archiveFile, "archiveFile");
        ArgumentNotValid.checkNotNull(os, "os");
        Log log = LogFactory.getLog(getClass().getName());
        // The name is used in every log message below; look it up once.
        String fileName = archiveFile.getName();
        // Offset handed to the exception handlers. It is only advanced after a
        // record has been closed successfully.
        long arcFileIndex = 0;
        boolean success = true;
        log.info("Processing archive file: " + fileName);

        try { // This outer try-catch block catches all unexpected exceptions
            // Create an ArchiveReader for the file.
            ArchiveReader archiveReader;
            try {
                archiveReader = ArchiveReaderFactory.get(archiveFile);
            } catch (IOException e) { // Some IOException
                handleException(e, archiveFile, arcFileIndex);

                return false; // Can't process file after exception
            }

            try {
                Iterator<? extends ArchiveRecord> it = archiveReader.iterator();
                /* Process all records from this Iterator: */
                log.debug("Starting processing records in archive file '" + fileName + "'.");
                if (!it.hasNext()) {
                    log.debug("No records found in archive file '" + fileName + "'.");
                }
                while (it.hasNext()) {
                    log.trace("At begin of processing-loop");
                    // Get a record from the file and wrap it for the job.
                    ArchiveRecord archiveRecord = it.next();
                    ArchiveRecordBase record = ArchiveRecordBase.wrapArchiveRecord(archiveRecord);
                    // Process with the job
                    try {
                        if (!getFilter().accept(record)) {
                            // NOTE(review): rejected records skip the close/offset
                            // step below, so arcFileIndex is not advanced for them
                            // - confirm that this is intended.
                            continue;
                        }
                        // This runs once per accepted record; avoid building the
                        // message when debug logging is switched off.
                        if (log.isDebugEnabled()) {
                            log.debug("Processing record #" + noOfRecordsProcessed + " in archive file '"
                                    + fileName + "'.");
                        }
                        processRecord(record, os);
                        ++noOfRecordsProcessed;
                    } catch (NetarkivetException e) {
                        // Our exceptions don't stop us
                        success = false;

                        // With our exceptions, we assume that just the
                        // processing of this record got stopped, and we can
                        // easily find the next
                        handleOurException(e, archiveFile, arcFileIndex);
                    } catch (Exception e) {
                        success = false; // Strange exceptions do stop us

                        handleException(e, archiveFile, arcFileIndex);
                        // With strange exceptions, we don't know
                        // if we've skipped records
                        break;
                    }
                    // Close the record
                    try {
                        // FIXME: It is unclear how to compute this offset for
                        // warc-files. For arc-files the computation was
                        // getBodyOffset() + getMetaData().getLength().
                        // TODO maybe this works, maybe not...
                        long arcRecordOffset = archiveRecord.getHeader().getContentBegin()
                                + archiveRecord.getHeader().getLength();
                        archiveRecord.close();
                        // Only advance the offset once the record closed cleanly.
                        arcFileIndex = arcRecordOffset;
                    } catch (IOException ioe) { // Couldn't close an WARCRecord
                        success = false;

                        handleException(ioe, archiveFile, arcFileIndex);
                        // If close fails, we don't know if we've skipped
                        // records
                        break;
                    }
                    log.trace("At end of processing-loop");
                }
            } finally {
                try {
                    archiveReader.close();
                } catch (IOException e) { //Some IOException
                    // TODO Discuss whether exceptions on close cause
                    // filesFailed addition
                    handleException(e, archiveFile, arcFileIndex);
                }
            }
        } catch (Exception unexpectedException) {
            handleException(unexpectedException, archiveFile, arcFileIndex);
            return false;
        }
        return success;
    }

}