org.archive.crawler.framework.CheckpointService.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.crawler.framework.CheckpointService.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.framework;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.comparator.LastModifiedFileComparator;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.crawler.reporting.CrawlStatSnapshot;
import org.archive.spring.ConfigPath;
import org.archive.spring.ConfigPathConfigurer;
import org.archive.spring.HasValidator;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.context.Lifecycle;
import org.springframework.context.support.AbstractApplicationContext;
import org.springframework.validation.Validator;

/**
 * Executes checkpoints, and offers convenience methods for enumerating 
 * available Checkpoints and injecting a recovery-Checkpoint after 
 * build and before launch (setRecoveryCheckpointByName).
 * 
 * Offers optional automatic checkpointing at a configurable interval 
 * in minutes. 
 * 
 * @contributor stack
 * @contributor gojomo
 * @contributor pjack
 */
public class CheckpointService implements Lifecycle, ApplicationContextAware, HasValidator {
    private final static Logger LOGGER = Logger.getLogger(CheckpointService.class.getName());

    /** Next overall series checkpoint number */
    protected int nextCheckpointNumber = 1;

    protected Checkpoint checkpointInProgress;

    protected Checkpoint lastCheckpoint;

    protected CrawlStatSnapshot lastCheckpointSnapshot = null;

    /** service for auto-checkpoint tasks at an interval */
    protected Timer timer = new Timer(true);
    protected TimerTask checkpointTask = null;
    protected ConfigPath checkpointsDir = new ConfigPath("checkpoints subdirectory", "checkpoints");

    public ConfigPath getCheckpointsDir() {
        return checkpointsDir;
    }

    /**
     * Checkpoints directory
     */
    public void setCheckpointsDir(ConfigPath checkpointsDir) {
        this.checkpointsDir = checkpointsDir;
    }

    protected long checkpointIntervalMinutes = -1;

    public long getCheckpointIntervalMinutes() {
        return checkpointIntervalMinutes;
    }

    /**
     * Period at which to create automatic checkpoints; -1 means
     * no auto checkpointing. 
     */
    public void setCheckpointIntervalMinutes(long interval) {
        long oldVal = checkpointIntervalMinutes;
        this.checkpointIntervalMinutes = interval;
        if (checkpointIntervalMinutes != oldVal) {
            setupCheckpointTask();
        }
    }

    protected boolean forgetAllButLatest = false;

    public boolean getForgetAllButLatest() {
        return forgetAllButLatest;
    }

    /**
     * True to save only the latest checkpoint, false to save all of them.
     * Default is false.
     */
    public void setForgetAllButLatest(boolean forgetAllButLatest) {
        boolean oldVal = this.forgetAllButLatest;
        this.forgetAllButLatest = forgetAllButLatest;
        if (this.forgetAllButLatest != oldVal) {
            setupCheckpointTask();
        }
    }

    protected Checkpoint recoveryCheckpoint;

    @Autowired(required = false)
    public void setRecoveryCheckpoint(Checkpoint checkpoint) {
        this.recoveryCheckpoint = checkpoint;
        checkpoint.getCheckpointDir().setBase(getCheckpointsDir());
    }

    public Checkpoint getRecoveryCheckpoint() {
        return this.recoveryCheckpoint;
    }

    protected CrawlController controller;

    public CrawlController getCrawlController() {
        return this.controller;
    }

    @Autowired
    public void setCrawlController(CrawlController controller) {
        this.controller = controller;
    }

    // ApplicationContextAware implementation, for eventing
    protected AbstractApplicationContext appCtx;

    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        this.appCtx = (AbstractApplicationContext) applicationContext;
    }

    /**
     * Create a new Checkpointer
     */
    public CheckpointService() {
    }

    public synchronized void start() {
        if (isRunning) {
            return;
        }
        // report if checkpoint incomplete/invalid
        if (getRecoveryCheckpoint() != null) {
            File cpDir = getRecoveryCheckpoint().getCheckpointDir().getFile();
            if (!Checkpoint.hasValidStamp(cpDir)) {
                LOGGER.severe(
                        "checkpoint '" + cpDir.getAbsolutePath() + "' missing validity stamp file; checkpoint data "
                                + "may be missing or otherwise corrupt.");
            }
            this.lastCheckpoint = getRecoveryCheckpoint();
            String serial = getRecoveryCheckpoint().getShortName().substring(2);
            try {
                Number lastCheckpointNumber = Checkpoint.INDEX_FORMAT.parse(serial);
                this.nextCheckpointNumber = lastCheckpointNumber.intValue() + 1;
            } catch (ParseException e) {
                LOGGER.warning("failed to parse serial from " + lastCheckpoint.getShortName() + " - " + e);
            }
        }
        this.isRunning = true;
        setupCheckpointTask();
    }

    /**
     * Setup checkpointTask according to current interval. (An already-scheduled
     * task, if any, is canceled.)
     */
    protected synchronized void setupCheckpointTask() {
        if (checkpointTask != null) {
            checkpointTask.cancel();
        }
        if (!isRunning) {
            // don't setup before start (or after finish), even if
            // triggered by interval change
            return;
        }
        // Convert period from minutes to milliseconds.
        long periodMs = getCheckpointIntervalMinutes() * (60L * 1000L);
        if (periodMs <= 0) {
            return;
        }
        checkpointTask = new TimerTask() {
            public void run() {
                if (isCheckpointing()) {
                    LOGGER.info("CheckpointTimerThread skipping checkpoint, " + "already checkpointing: State: "
                            + controller.getState());
                    return;
                }
                LOGGER.info("TimerThread request checkpoint");
                requestCrawlCheckpoint();
            }
        };
        this.timer.schedule(checkpointTask, periodMs, periodMs);
        LOGGER.info("Installed Checkpoint TimerTask to checkpoint every " + periodMs + " milliseconds.");
    }

    protected boolean isRunning = false;

    public synchronized boolean isRunning() {
        return isRunning;
    }

    public synchronized void stop() {
        LOGGER.info("Cleaned up Checkpoint TimerThread.");
        this.timer.cancel();
        this.isRunning = false;
    }

    /**
     * @return Returns the nextCheckpoint index.
     */
    public int getNextCheckpointNumber() {
        return this.nextCheckpointNumber;
    }

    /**
     * Run a checkpoint of the crawler
     */
    public synchronized String requestCrawlCheckpoint() throws IllegalStateException {
        if (isCheckpointing()) {
            throw new IllegalStateException("Checkpoint already running.");
        }

        // prevent redundant auto-checkpoints when crawler paused or stopping
        if (controller.isPaused() || controller.getState().equals(CrawlController.State.STOPPING)) {
            if (controller.getStatisticsTracker().getSnapshot().sameProgressAs(lastCheckpointSnapshot)) {
                LOGGER.info("no progress since last checkpoint; ignoring");
                System.err.println("no progress since last checkpoint; ignoring");
                return null;
            }
        }

        long checkpointStart = System.currentTimeMillis();
        Map<String, Checkpointable> toCheckpoint = appCtx.getBeansOfType(Checkpointable.class);
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("checkpointing beans " + toCheckpoint);
        }

        checkpointInProgress = new Checkpoint();
        try {
            checkpointInProgress.setForgetAllButLatest(getForgetAllButLatest());
            checkpointInProgress.generateFrom(getCheckpointsDir(), getNextCheckpointNumber());

            // pre (incl. acquire necessary locks)
            long startStart = System.currentTimeMillis();
            for (Checkpointable c : toCheckpoint.values()) {
                c.startCheckpoint(checkpointInProgress);
            }
            LOGGER.info("all startCheckpoint() completed in " + (System.currentTimeMillis() - startStart) + "ms");

            // flush/write
            long doStart = System.currentTimeMillis();
            for (Checkpointable c : toCheckpoint.values()) {
                long doMs = System.currentTimeMillis();
                c.doCheckpoint(checkpointInProgress);
                long doDuration = System.currentTimeMillis() - doMs;
                LOGGER.fine("doCheckpoint() " + c + " in " + doDuration + "ms");
            }
            LOGGER.info("all doCheckpoint() completed in " + (System.currentTimeMillis() - doStart) + "ms");

            if (getForgetAllButLatest() && lastCheckpoint != null) {
                try {
                    long deleteStart = System.currentTimeMillis();
                    FileUtils.deleteDirectory(lastCheckpoint.getCheckpointDir().getFile());
                    lastCheckpoint = null;
                    LOGGER.info("deleted old checkpoint in " + (System.currentTimeMillis() - deleteStart) + "ms");
                } catch (IOException e) {
                    LOGGER.log(Level.SEVERE, "problem deleting last checkpoint directory "
                            + lastCheckpoint.getCheckpointDir().getFile(), e);
                }
            }

            checkpointInProgress.setSuccess(true);

            appCtx.publishEvent(new CheckpointSuccessEvent(this, checkpointInProgress));
        } catch (Exception e) {
            checkpointFailed(e);
        } finally {
            checkpointInProgress.writeValidity(controller.getStatisticsTracker().getProgressStamp());
            lastCheckpointSnapshot = controller.getStatisticsTracker().getSnapshot();
            // close (incl. release locks)
            long finishStart = System.currentTimeMillis();
            for (Checkpointable c : toCheckpoint.values()) {
                c.finishCheckpoint(checkpointInProgress);
            }
            LOGGER.info("all finishCheckpoint() completed in " + (System.currentTimeMillis() - finishStart) + "ms");
        }
        LOGGER.info("completed checkpoint " + checkpointInProgress.getName() + " in "
                + (System.currentTimeMillis() - checkpointStart) + "ms");

        this.nextCheckpointNumber++;
        String nameToReport = checkpointInProgress.getSuccess() ? checkpointInProgress.getName() : null;
        this.lastCheckpoint = this.checkpointInProgress;
        this.checkpointInProgress = null;
        return nameToReport;
    }

    /**
     * @return True if a checkpoint is in progress.
     */
    public boolean isCheckpointing() {
        return this.checkpointInProgress != null;
    }

    /**
     * Note that a checkpoint failed
     *
     * @param e Exception checkpoint failed on.
     */
    protected void checkpointFailed(Exception e) {
        LOGGER.log(Level.SEVERE, " Checkpoint failed", e);
    }

    protected void checkpointFailed(final String message) {
        LOGGER.warning(message);
    }

    public boolean hasAvailableCheckpoints() {
        if (getRecoveryCheckpoint() != null || isRunning()) {
            return false;
        }
        return (findAvailableCheckpointDirectories() != null && findAvailableCheckpointDirectories().size() > 0);
    }

    /**
     * Returns a list of available, valid (contains 'valid' file) 
     * checkpoint directories, as File instances, with the more 
     * recently-written appearing first. 
     * 
     * @return List of valid checkpoint directory File instances
     */
    @SuppressWarnings("unchecked")
    public List<File> findAvailableCheckpointDirectories() {
        File[] dirs = getCheckpointsDir().getFile().listFiles((FileFilter) FileFilterUtils.directoryFileFilter());
        if (dirs == null) {
            return Collections.EMPTY_LIST;
        }
        Arrays.sort(dirs, LastModifiedFileComparator.LASTMODIFIED_REVERSE);
        LinkedList<File> dirsList = new LinkedList<File>(Arrays.asList(dirs));
        Iterator<File> iter = dirsList.iterator();
        while (iter.hasNext()) {
            File cpDir = iter.next();
            if (!Checkpoint.hasValidStamp(cpDir)) {
                LOGGER.warning("checkpoint '" + cpDir + "' missing validity stamp file; ignoring");
                iter.remove();
            }
        }
        return dirsList;
    }

    /**
     * Given the name of a valid checkpoint subdirectory in the checkpoints
     * directory, create a Checkpoint instance, and insert it into all 
     * Checkpointable beans. 
     * 
     * @param selectedCheckpoint
     */
    public synchronized void setRecoveryCheckpointByName(String selectedCheckpoint) {
        if (isRunning) {
            throw new RuntimeException("may not set recovery Checkpoint after launch");
        }
        Checkpoint recoveryCheckpoint = new Checkpoint();
        recoveryCheckpoint.getCheckpointDir().setBase(getCheckpointsDir());
        recoveryCheckpoint.getCheckpointDir().setPath(selectedCheckpoint);
        recoveryCheckpoint.getCheckpointDir().setConfigurer(appCtx.getBean(ConfigPathConfigurer.class));
        recoveryCheckpoint.afterPropertiesSet();
        setRecoveryCheckpoint(recoveryCheckpoint);
        Map<String, Checkpointable> toSetRecovery = appCtx.getBeansOfType(Checkpointable.class);

        for (Checkpointable c : toSetRecovery.values()) {
            c.setRecoveryCheckpoint(recoveryCheckpoint);
        }
    }

    protected static Validator VALIDATOR = new CheckpointValidator();

    @Override
    public Validator getValidator() {
        return VALIDATOR;
    }
} //EOC