/* $Id$
 * $Date$
 * $Revision$
 * $Author$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
package dk.netarkivet.common.utils.arc;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.NetarkivetException;
import dk.netarkivet.common.utils.batch.ARCBatchFilter;
import dk.netarkivet.common.utils.batch.FileBatchJob;

/**
 * Abstract class defining a batch job to run on a set of ARC files.
 * Each implementation is required to define the initialize(), processRecord()
 * and finish() methods. The bitarchive application then ensures that the
 * batch job runs initialize(), runs processRecord() on each record in each
 * file in the archive, and then runs finish().
 */
public abstract class ARCBatchJob extends FileBatchJob {

    /** The total number of records processed. */
    protected int noOfRecordsProcessed = 0;

    /**
     * Initialize the job before running.
     * This is called before the processRecord() calls start coming.
     *
     * @param os The OutputStream to which output data is written
     */
    @Override
    public abstract void initialize(OutputStream os);

    /**
     * Process one record, writing any output to the given stream.
     * Exceptions should be handled with the handleException() method.
     *
     * @param record the object to be processed.
     * @param os The OutputStream to which output data is written
     */
    public abstract void processRecord(ARCRecord record, OutputStream os);

    /**
     * Finish up the job.
     * This is called after the last processRecord() call.
     *
     * @param os The OutputStream to which output data is written
     */
    @Override
    public abstract void finish(OutputStream os);

    /**
     * Returns an ARCBatchFilter object which restricts the set of ARC records
     * in the archive on which this batch job is performed. The default value
     * is a neutral filter which allows all records.
     *
     * @return A filter telling which records should be given to
     * processRecord().
     */
    public ARCBatchFilter getFilter() {
        return ARCBatchFilter.NO_FILTER;
    }
    /**
     * Accepts only ARC and ARCGZ files. Runs through all records and calls
     * processRecord() on every record that is allowed by getFilter().
     * Does nothing on a non-ARC file.
     *
     * @param arcFile The ARC or ARCGZ file to be processed.
     * @param os the OutputStream to which output is to be written
     * @return true if the file was processed successfully, otherwise false
     * @throws ArgumentNotValid if either argument is null
     */
    @Override
    public final boolean processFile(File arcFile, OutputStream os)
            throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(arcFile, "arcFile");
        ArgumentNotValid.checkNotNull(os, "os");
        Log log = LogFactory.getLog(getClass().getName());
        long arcFileIndex = 0;
        boolean success = true;
        log.info("Processing ARCfile: " + arcFile.getName());
        try {
            // This outer try-catch block catches all unexpected exceptions
            // Create an ARCReader and retrieve its Iterator:
            ARCReader arcReader = null;
            try {
                arcReader = ARCReaderFactory.get(arcFile);
            } catch (IOException e) {
                // Some IOException
                handleException(e, arcFile, arcFileIndex);
                return false; // Can't process file after exception
            }
            try {
                Iterator<? extends ArchiveRecord> it = arcReader.iterator();
                /* Process all records from this Iterator: */
                log.debug("Starting processing records in ARCfile '"
                        + arcFile.getName() + "'.");
                if (!it.hasNext()) {
                    log.debug("No ARCRecords found in ARCfile '"
                            + arcFile.getName() + "'.");
                }
                ARCRecord record = null;
                while (it.hasNext()) {
                    log.trace("At begin of processing-loop");
                    // Get a record from the file
                    record = (ARCRecord) it.next();
                    // Process with the job
                    try {
                        if (!getFilter().accept(record)) {
                            continue;
                        }
                        log.debug("Processing ARCRecord #"
                                + noOfRecordsProcessed
                                + " in ARCfile '" + arcFile.getName() + "'.");
                        processRecord(record, os);
                        ++noOfRecordsProcessed;
                    } catch (NetarkivetException e) {
                        // Our exceptions don't stop us
                        success = false;
                        // With our exceptions, we assume that just the
                        // processing of this record got stopped, and we can
                        // easily find the next
                        handleOurException(e, arcFile, arcFileIndex);
                    } catch (Exception e) {
                        success = false; // Strange exceptions do stop us
                        handleException(e, arcFile, arcFileIndex);
                        // With strange exceptions, we don't know
                        // if we've skipped records
                        break;
                    }
                    // Close the record
                    try {
                        long arcRecordOffset = record.getBodyOffset()
                                + record.getMetaData().getLength();
                        record.close();
                        arcFileIndex = arcRecordOffset;
                    } catch (IOException ioe) {
                        // Couldn't close an ARCRecord
                        success = false;
                        handleException(ioe, arcFile, arcFileIndex);
                        // If close fails, we don't know if we've skipped
                        // records
                        break;
                    }
                    log.trace("At end of processing-loop");
                }
            } finally {
                try {
                    arcReader.close();
                } catch (IOException e) {
                    // Some IOException
                    // TODO Discuss whether exceptions on close cause
                    // filesFailed addition
                    handleException(e, arcFile, arcFileIndex);
                }
            }
        } catch (Exception unexpectedException) {
            handleException(unexpectedException, arcFile, arcFileIndex);
            return false;
        }
        return success;
    }

    /**
     * Private method that handles our exception.
     *
     * @param e the given exception
     * @param arcFile The ARC file where the exception occurred.
     * @param index The offset in the ARC file where the exception occurred.
     */
    private void handleOurException(NetarkivetException e, File arcFile,
            long index) {
        handleException(e, arcFile, index);
    }
    /**
     * When the org.archive.io.arc classes throw IOExceptions while reading,
     * this is where they go. Subclasses are welcome to override the default
     * functionality, which simply logs the exception and records it in a
     * list; an overriding method should always call super.handleException()
     * so that the exception is still recorded.
     * TODO Actually use the arcfile/index entries in the exception list
     *
     * @param e An Exception thrown by the org.archive.io.arc classes.
     * @param arcfile The ARC file that was being processed when the
     * Exception was thrown
     * @param index The index (in the ARC file) at which the Exception
     * was thrown
     * @throws ArgumentNotValid if e is null
     */
    public void handleException(Exception e, File arcfile, long index)
            throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(e, "e");
        Log log = LogFactory.getLog(getClass().getName());
        log.debug("Caught exception while running batch job "
                + "on file " + arcfile + ", position " + index + ":\n"
                + e.getMessage(), e);
        addException(arcfile, index, ExceptionOccurrence.UNKNOWN_OFFSET, e);
    }

    /**
     * Returns a representation of the list of Exceptions recorded for this
     * ARC batch job.
     *
     * @return All Exceptions passed to handleException so far.
     */
    public Exception[] getExceptionArray() {
        List<ExceptionOccurrence> exceptions = getExceptions();
        Exception[] exceptionList = new Exception[exceptions.size()];
        int i = 0;
        for (ExceptionOccurrence e : exceptions) {
            exceptionList[i++] = e.getException();
        }
        return exceptionList;
    }

    /**
     * @return the number of records processed.
     */
    public int noOfRecordsProcessed() {
        return noOfRecordsProcessed;
    }
}
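
/*
 * Illustrative sketch, not part of the original source: a minimal concrete
 * ARCBatchJob that writes the URL of every processed record to the output
 * stream, one per line. The class name and its behaviour are invented for
 * demonstration; EXCLUDE_FILE_HEADERS is assumed to be available on
 * ARCBatchFilter as a filter that skips the ARC file's own header record
 * (use NO_FILTER, defined above, to process every record).
 */
class ExampleUrlListingJob extends ARCBatchJob {

    /** No setup is needed for this simple job. */
    @Override
    public void initialize(OutputStream os) {
    }

    /** Skip the ARC file header record; pass all other records through. */
    @Override
    public ARCBatchFilter getFilter() {
        return ARCBatchFilter.EXCLUDE_FILE_HEADERS;
    }

    /** Write the record's URL to the output stream. */
    @Override
    public void processRecord(ARCRecord record, OutputStream os) {
        try {
            os.write((record.getMetaData().getUrl() + "\n")
                    .getBytes("UTF-8"));
        } catch (IOException e) {
            // A RuntimeException thrown here is caught by processFile(),
            // which records it via handleException() and stops processing
            // the current file.
            throw new RuntimeException("Failed to write record URL", e);
        }
    }

    /** No cleanup is needed. */
    @Override
    public void finish(OutputStream os) {
    }
}

/*
 * Usage sketch (also illustrative): the bitarchive application normally
 * drives the lifecycle, but run by hand it would look roughly like this:
 *
 *   ARCBatchJob job = new ExampleUrlListingJob();
 *   job.initialize(System.out);
 *   boolean ok = job.processFile(new File("example.arc"), System.out);
 *   job.finish(System.out);
 *   for (Exception e : job.getExceptionArray()) {
 *       // inspect any failures recorded by handleException()
 *   }
 */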