Java tutorial
/* File: $Id$ * Revision: $Revision$ * Author: $Author$ * Date: $Date$ * * The Netarchive Suite - Software to harvest and preserve websites * Copyright 2004-2012 The Royal Danish Library, the Danish State and * University Library, the National Library of France and the Austrian * National Library. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ package dk.netarkivet.common.utils.warc; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.util.Iterator; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.archive.io.ArchiveRecord; import org.archive.io.warc.WARCReader; import org.archive.io.warc.WARCReaderFactory; import org.archive.io.warc.WARCRecord; import dk.netarkivet.common.exceptions.ArgumentNotValid; import dk.netarkivet.common.exceptions.NetarkivetException; import dk.netarkivet.common.utils.batch.FileBatchJob; import dk.netarkivet.common.utils.batch.WARCBatchFilter; /** * Abstract class defining a batch job to run on a set of WARC files. * Each implementation is required to define initialize() , processRecord() and * finish() methods. The bitarchive application then ensures that the batch * job run initialize(), runs processRecord() on each record in each file in * the archive, and then runs finish(). */ public abstract class WARCBatchJob extends FileBatchJob { /** The total number of records processed. */ protected int noOfRecordsProcessed = 0; /** * Initialize the job before running. * This is called before the processRecord() calls start coming. * @param os The OutputStream to which output data is written */ public abstract void initialize(OutputStream os); /** * Exceptions should be handled with the handleException() method. * @param os The OutputStream to which output data is written * @param record the object to be processed. */ public abstract void processRecord(WARCRecord record, OutputStream os); /** * Finish up the job. * This is called after the last processRecord() call. * @param os The OutputStream to which output data is written */ public abstract void finish(OutputStream os); /** * returns a BatchFilter object which restricts the set of warc records * in the archive on which this batch-job is performed. The default value * is a neutral filter which allows all records. * * @return A filter telling which records should be given to * processRecord(). */ public WARCBatchFilter getFilter() { return WARCBatchFilter.NO_FILTER; } /** * Accepts only WARC and WARCGZ files. Runs through all records and calls * processRecord() on every record that is allowed by getFilter(). * Does nothing on a non-arc file. * * @param warcFile The WARC or WARCGZ file to be processed. * @param os the OutputStream to which output is to be written * @throws ArgumentNotValid if either argument is null * @return true, if file processed successful, otherwise false */ public final boolean processFile(File warcFile, OutputStream os) throws ArgumentNotValid { ArgumentNotValid.checkNotNull(warcFile, "warcFile"); ArgumentNotValid.checkNotNull(os, "os"); Log log = LogFactory.getLog(getClass().getName()); long arcFileIndex = 0; boolean success = true; log.info("Processing WARCfile: " + warcFile.getName()); try { // This outer try-catch block catches all unexpected exceptions //Create an WARCReader and retrieve its Iterator: WARCReader warcReader = null; try { warcReader = WARCReaderFactory.get(warcFile); } catch (IOException e) { //Some IOException handleException(e, warcFile, arcFileIndex); return false; // Can't process file after exception } try { Iterator<? extends ArchiveRecord> it = warcReader.iterator(); /* Process all records from this Iterator: */ log.debug("Starting processing records in WARCfile '" + warcFile.getName() + "'."); if (!it.hasNext()) { log.debug("No WARCRecords found in WARCfile '" + warcFile.getName() + "'."); } WARCRecord record = null; while (it.hasNext()) { log.trace("At begin of processing-loop"); // Get a record from the file record = (WARCRecord) it.next(); // Process with the job try { if (!getFilter().accept(record)) { continue; } log.debug("Processing WARCRecord #" + noOfRecordsProcessed + " in WARCfile '" + warcFile.getName() + "'."); processRecord(record, os); ++noOfRecordsProcessed; } catch (NetarkivetException e) { // Our exceptions don't stop us success = false; // With our exceptions, we assume that just the // processing of this record got stopped, and we can // easily find the next handleOurException(e, warcFile, arcFileIndex); } catch (Exception e) { success = false; // Strange exceptions do stop us handleException(e, warcFile, arcFileIndex); // With strange exceptions, we don't know // if we've skipped records break; } // Close the record try { // TODO maybe this works, maybe not... long arcRecordOffset = record.getHeader().getContentBegin() + record.getHeader().getLength(); record.close(); arcFileIndex = arcRecordOffset; } catch (IOException ioe) { // Couldn't close an WARCRecord success = false; handleException(ioe, warcFile, arcFileIndex); // If close fails, we don't know if we've skipped // records break; } log.trace("At end of processing-loop"); } } finally { try { warcReader.close(); } catch (IOException e) { //Some IOException // TODO Discuss whether exceptions on close cause // filesFailed addition handleException(e, warcFile, arcFileIndex); } } } catch (Exception unexpectedException) { handleException(unexpectedException, warcFile, arcFileIndex); return false; } return success; } /** * Private method that handles our exception. * @param e the given exception * @param warcFile The WARC File where the exception occurred. * @param index The offset in the WARC File where the exception occurred. */ private void handleOurException(NetarkivetException e, File warcFile, long index) { handleException(e, warcFile, index); } /** * When the org.archive.io.arc classes throw IOExceptions while reading, * this is where they go. Subclasses are welcome to override the default * functionality which simply logs and records them in a list. * TODO Actually use the warcfile/index entries in the exception list * * @param e An Exception thrown by the org.archive.io.arc classes. * @param warcfile The arcFile that was processed while the Exception * was thrown * @param index The index (in the WARC file) at which the Exception * was thrown * @throws ArgumentNotValid if e is null */ public void handleException(Exception e, File warcfile, long index) throws ArgumentNotValid { ArgumentNotValid.checkNotNull(e, "e"); Log log = LogFactory.getLog(getClass().getName()); log.debug("Caught exception while running batch job " + "on file " + warcfile + ", position " + index + ":\n" + e.getMessage(), e); addException(warcfile, index, ExceptionOccurrence.UNKNOWN_OFFSET, e); } /** * Returns a representation of the list of Exceptions recorded for this * WARC batch job. * If called by a subclass, a method overriding handleException() * should always call super.handleException(). * * @return All Exceptions passed to handleException so far. */ public Exception[] getExceptionArray() { List<ExceptionOccurrence> exceptions = getExceptions(); Exception[] exceptionList = new Exception[exceptions.size()]; int i = 0; for (ExceptionOccurrence e : exceptions) { exceptionList[i++] = e.getException(); } return exceptionList; } /** * * @return the number of records processed. */ public int noOfRecordsProcessed() { return noOfRecordsProcessed; } }