org.archive.crawler.reporting.CrawlerLoggerModule.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.crawler.reporting.CrawlerLoggerModule.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.reporting;

import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.FileHandler;
import java.util.logging.Formatter;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;

import org.apache.commons.httpclient.URIException;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.crawler.framework.Engine;
import org.archive.crawler.io.NonFatalErrorFormatter;
import org.archive.crawler.io.RuntimeErrorFormatter;
import org.archive.crawler.io.StatisticsLogFormatter;
import org.archive.crawler.io.UriErrorFormatter;
import org.archive.crawler.io.UriProcessingFormatter;
import org.archive.crawler.util.Logs;
import org.archive.io.GenerationFileHandler;
import org.archive.modules.SimpleFileLoggerProvider;
import org.archive.modules.extractor.UriErrorLoggerModule;
import org.archive.net.UURI;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;

/**
 * Module providing all expected whole-crawl logging facilities
 * 
 * @contributor pjack
 * @contributor gojomo
 */
public class CrawlerLoggerModule implements UriErrorLoggerModule, Lifecycle, InitializingBean, Checkpointable,
        SimpleFileLoggerProvider, DisposableBean {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1L;

    protected ConfigPath path = new ConfigPath(Engine.LOGS_DIR_NAME, "${launchId}/logs");

    public ConfigPath getPath() {
        return path;
    }

    public void setPath(ConfigPath cp) {
        this.path.merge(cp);
    }

    /**
     * Whether to include the "extra info" field for each entry in crawl.log.
     * "Extra info" is arbitrary JSON. It is the last field of the log line.
     */
    protected boolean logExtraInfo = false;

    public boolean getLogExtraInfo() {
        return logExtraInfo;
    }

    public void setLogExtraInfo(boolean logExtraInfo) {
        this.logExtraInfo = logExtraInfo;
    }

    // manifest support
    /** abbreviation label for config files in manifest */
    public static final char MANIFEST_CONFIG_FILE = 'C';
    /** abbreviation label for report files in manifest */
    public static final char MANIFEST_REPORT_FILE = 'R';
    /** abbreviation label for log files in manifest */
    public static final char MANIFEST_LOG_FILE = 'L';

    // key log names
    private static final String LOGNAME_CRAWL = "crawl";
    private static final String LOGNAME_ALERTS = "alerts";
    private static final String LOGNAME_PROGRESS_STATISTICS = "progress-statistics";
    private static final String LOGNAME_URI_ERRORS = "uri-errors";
    private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors";
    private static final String LOGNAME_NONFATAL_ERRORS = "nonfatal-errors";

    protected ConfigPath crawlLogPath = new ConfigPath(Logs.CRAWL.getFilename(), Logs.CRAWL.getFilename());

    public ConfigPath getCrawlLogPath() {
        return crawlLogPath;
    }

    public void setCrawlLogPath(ConfigPath cp) {
        this.crawlLogPath.merge(cp);
    }

    protected ConfigPath alertsLogPath = new ConfigPath(Logs.ALERTS.getFilename(), Logs.ALERTS.getFilename());

    public ConfigPath getAlertsLogPath() {
        return alertsLogPath;
    }

    public void setAlertsLogPath(ConfigPath cp) {
        this.alertsLogPath.merge(cp);
    }

    protected ConfigPath progressLogPath = new ConfigPath(Logs.PROGRESS_STATISTICS.getFilename(),
            Logs.PROGRESS_STATISTICS.getFilename());

    public ConfigPath getProgressLogPath() {
        return progressLogPath;
    }

    public void setProgressLogPath(ConfigPath cp) {
        this.progressLogPath.merge(cp);
    }

    protected ConfigPath uriErrorsLogPath = new ConfigPath(Logs.URI_ERRORS.getFilename(),
            Logs.URI_ERRORS.getFilename());

    public ConfigPath getUriErrorsLogPath() {
        return uriErrorsLogPath;
    }

    public void setUriErrorsLogPath(ConfigPath cp) {
        this.uriErrorsLogPath.merge(cp);
    }

    protected ConfigPath runtimeErrorsLogPath = new ConfigPath(Logs.RUNTIME_ERRORS.getFilename(),
            Logs.RUNTIME_ERRORS.getFilename());

    public ConfigPath getRuntimeErrorsLogPath() {
        return runtimeErrorsLogPath;
    }

    public void setRuntimeErrorsLogPath(ConfigPath cp) {
        this.runtimeErrorsLogPath.merge(cp);
    }

    protected ConfigPath nonfatalErrorsLogPath = new ConfigPath(Logs.NONFATAL_ERRORS.getFilename(),
            Logs.NONFATAL_ERRORS.getFilename());

    public ConfigPath getNonfatalErrorsLogPath() {
        return nonfatalErrorsLogPath;
    }

    public void setNonfatalErrorsLogPath(ConfigPath cp) {
        this.nonfatalErrorsLogPath.merge(cp);
    }

    /** suffix to use on active logs */
    //    public static final String CURRENT_LOG_SUFFIX = ".log";

    /**
     * Crawl progress logger.
     *
     * No exceptions.  Logs summary result of each url processing.
     */
    private transient Logger uriProcessing;

    /**
     * This logger contains unexpected runtime errors.
     *
     * Would contain errors trying to set up a job or failures inside
     * processors that they are not prepared to recover from.
     */
    private transient Logger runtimeErrors;

    /**
     * This logger is for job-scoped logging, specifically recoverable 
     * errors which happen and are handled within a particular processor.
     *
     * Examples would be socket timeouts, exceptions thrown by 
     * extractors, etc.
     */
    private transient Logger nonfatalErrors;

    /**
     * Special log for URI format problems, wherever they may occur.
     */
    private transient Logger uriErrors;

    /**
     * Statistics tracker writes here at regular intervals.
     */
    private transient Logger progressStats;

    /**
     * Record of fileHandlers established for loggers,
     * assisting file rotation.
     */
    transient private Map<Logger, FileHandler> fileHandlers;

    private StringBuffer manifest = new StringBuffer();

    private transient AlertThreadGroup atg;

    public CrawlerLoggerModule() {

    }

    public void start() {
        if (isRunning) {
            return;
        }
        this.atg = AlertThreadGroup.current();
        try {
            FileUtils.ensureWriteableDirectory(getPath().getFile());
            setupLogs();
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
        isRunning = true;
    }

    protected boolean isRunning = false;

    public boolean isRunning() {
        return this.isRunning;
    }

    public void stop() {
        isRunning = false;
    }

    public void destroy() {
        closeLogFiles();
    }

    protected void setupLogs() throws IOException {
        String logsPath = getPath().getFile().getAbsolutePath() + File.separatorChar;
        uriProcessing = Logger.getLogger(LOGNAME_CRAWL + "." + logsPath);
        runtimeErrors = Logger.getLogger(LOGNAME_RUNTIME_ERRORS + "." + logsPath);
        nonfatalErrors = Logger.getLogger(LOGNAME_NONFATAL_ERRORS + "." + logsPath);
        uriErrors = Logger.getLogger(LOGNAME_URI_ERRORS + "." + logsPath);
        progressStats = Logger.getLogger(LOGNAME_PROGRESS_STATISTICS + "." + logsPath);

        this.fileHandlers = new HashMap<Logger, FileHandler>();
        setupLogFile(uriProcessing, getCrawlLogPath().getFile().getAbsolutePath(),
                new UriProcessingFormatter(getLogExtraInfo()), true);

        setupLogFile(runtimeErrors, getRuntimeErrorsLogPath().getFile().getAbsolutePath(),
                new RuntimeErrorFormatter(getLogExtraInfo()), true);

        setupLogFile(nonfatalErrors, getNonfatalErrorsLogPath().getFile().getAbsolutePath(),
                new NonFatalErrorFormatter(getLogExtraInfo()), true);

        setupLogFile(uriErrors, getUriErrorsLogPath().getFile().getAbsolutePath(), new UriErrorFormatter(), true);

        setupLogFile(progressStats, getProgressLogPath().getFile().getAbsolutePath(), new StatisticsLogFormatter(),
                true);

        setupAlertLog(logsPath);
    }

    private void setupLogFile(Logger logger, String filename, Formatter f, boolean shouldManifest)
            throws IOException, SecurityException {
        logger.setLevel(Level.INFO); // set all standard loggers to INFO
        GenerationFileHandler fh = GenerationFileHandler.makeNew(filename, false, shouldManifest);
        fh.setFormatter(f);
        logger.addHandler(fh);
        addToManifest(filename, MANIFEST_LOG_FILE, shouldManifest);
        logger.setUseParentHandlers(false);
        this.fileHandlers.put(logger, fh);
    }

    public Logger setupSimpleLog(String logName) {
        Logger logger = Logger.getLogger(logName + ".log");

        Formatter f = new Formatter() {
            public String format(java.util.logging.LogRecord record) {
                return ArchiveUtils.getLog17Date(record.getMillis()) + " " + record.getMessage() + '\n';
            }
        };

        ConfigPath logPath = new ConfigPath(logName + ".log", logName + ".log");
        logPath.setBase(getPath());
        try {
            setupLogFile(logger, logPath.getFile().getAbsolutePath(), f, true);
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }

        return logger;
    }

    private void setupAlertLog(String logsPath) throws IOException {
        Logger logger = Logger.getLogger(LOGNAME_ALERTS + "." + logsPath);
        String filename = getAlertsLogPath().getFile().getAbsolutePath();
        GenerationFileHandler fh = GenerationFileHandler.makeNew(filename, false, true);
        fh.setFormatter(new SimpleFormatter());
        AlertThreadGroup.current().addLogger(logger);
        AlertHandler.ensureStaticInitialization();
        logger.addHandler(fh);
        addToManifest(filename, MANIFEST_LOG_FILE, true);
        logger.setUseParentHandlers(false);
        this.fileHandlers.put(logger, fh);
    }

    public void rotateLogFiles() throws IOException {
        rotateLogFiles("." + ArchiveUtils.get14DigitDate());
    }

    protected void rotateLogFiles(String generationSuffix) throws IOException {
        rotateLogFiles(generationSuffix, false);
    }

    protected void rotateLogFiles(String generationSuffix, boolean mergeOld) throws IOException {
        for (Logger l : fileHandlers.keySet()) {
            GenerationFileHandler gfh = (GenerationFileHandler) fileHandlers.get(l);
            GenerationFileHandler newGfh = gfh.rotate(generationSuffix, "", mergeOld);

            if (gfh.shouldManifest()) {
                addToManifest((String) newGfh.getFilenameSeries().get(1), MANIFEST_LOG_FILE,
                        newGfh.shouldManifest());
            }

            l.removeHandler(gfh);
            l.addHandler(newGfh);
            fileHandlers.put(l, newGfh);
        }
    }

    /**
     * Close all log files and remove handlers from loggers.
     */
    public void closeLogFiles() {
        if (fileHandlers != null) {
            for (Logger l : fileHandlers.keySet()) {
                GenerationFileHandler gfh = (GenerationFileHandler) fileHandlers.get(l);
                gfh.close();
                l.removeHandler(gfh);
            }
        }
    }

    /**
     * Add a file to the manifest of files used/generated by the current
     * crawl.
     * 
     * TODO: Its possible for a file to be added twice if reports are
     * force generated midcrawl.  Fix.
     *
     * @param file The filename (with absolute path) of the file to add
     * @param type The type of the file
     * @param bundle Should the file be included in a typical bundling of
     *           crawler files.
     *
     * @see #MANIFEST_CONFIG_FILE
     * @see #MANIFEST_LOG_FILE
     * @see #MANIFEST_REPORT_FILE
     */
    public void addToManifest(String file, char type, boolean bundle) {
        manifest.append(type + (bundle ? "+" : "-") + " " + file + "\n");
    }

    public void startCheckpoint(Checkpoint checkpointInProgress) {
    }

    /**
     * Run checkpointing.
     * 
     * <p>Default access only to be called by Checkpointer.
     * @throws Exception
     */
    public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
        // Rotate off crawler logs.
        rotateLogFiles("." + checkpointInProgress.getName(), checkpointInProgress.getForgetAllButLatest());
    }

    public void finishCheckpoint(Checkpoint checkpointInProgress) {
    }

    protected Checkpoint recoveryCheckpoint;

    @Autowired(required = false)
    public void setRecoveryCheckpoint(Checkpoint checkpoint) {
        this.recoveryCheckpoint = checkpoint;
    }

    public Logger getNonfatalErrors() {
        return nonfatalErrors;
    }

    public Logger getProgressStats() {
        return progressStats;
    }

    public Logger getRuntimeErrors() {
        return runtimeErrors;
    }

    public Logger getUriErrors() {
        return uriErrors;
    }

    public Logger getUriProcessing() {
        return uriProcessing;
    }

    public int getAlertCount() {
        if (atg != null) {
            return atg.getAlertCount();
        } else {
            return -1;
        }
    }

    public void resetAlertCount() {
        if (atg != null) {
            atg.resetAlertCount();
        }
    }

    /**
     * Log a URIException from deep inside other components to the crawl's
     * shared log.
     *
     * @param e URIException encountered
     * @param u CrawlURI where problem occurred
     * @param l String which could not be interpreted as URI without exception
     */
    public void logUriError(URIException e, UURI u, CharSequence l) {
        Object[] array = { u, l };
        uriErrors.log(Level.INFO, e.getMessage(), array);
    }

    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        getPath().getFile().mkdirs();
        this.atg = AlertThreadGroup.current();
        this.setupLogs();
    }

    public void afterPropertiesSet() throws Exception {
        ConfigPath[] paths = { crawlLogPath, alertsLogPath, progressLogPath, uriErrorsLogPath, runtimeErrorsLogPath,
                nonfatalErrorsLogPath };
        for (ConfigPath cp : paths) {
            if (cp.getBase() == null) {
                cp.setBase(getPath());
            }
        }
    }
}