org.archive.crawler.processor.recrawl.PersistProcessor.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.crawler.processor.recrawl.PersistProcessor.java

Source

/* PersistProcessor.java
 * 
 * Created on Feb 17, 2005
 *
 * Copyright (C) 2007 Internet Archive.
 * 
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 * 
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 * 
 * Heritrix is distributed in the hope that it will be useful, 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.processor.recrawl;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.logging.ConsoleHandler;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.io.CrawlerJournal;
import org.archive.util.FileUtils;
import org.archive.util.IoUtils;
import org.archive.util.OneLineSimpleLogger;
import org.archive.util.SURT;
import org.archive.util.bdbje.EnhancedEnvironment;
import org.archive.util.iterator.LineReadingIterator;

import st.ata.util.AList;

import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.bind.tuple.StringBinding;
import com.sleepycat.collections.StoredIterator;
import com.sleepycat.collections.StoredSortedMap;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.EnvironmentConfig;

/**
 * Superclass for Processors which utilize BDB-JE for URI state
 * (including most notably history) persistence. Includes many static 
 * utility methods (including a main()). 
 * 
 * @author gojomo
 */
public abstract class PersistProcessor extends Processor {

    private static final long serialVersionUID = 1L;

    private static final Logger logger = Logger.getLogger(PersistProcessor.class.getName());

    /** name of history Database */
    public static final String URI_HISTORY_DBNAME = "uri_history";

    /**
     * Builds the configuration used whenever the history Database is opened:
     * non-transactional, created on demand, and using deferred writes.
     * 
     * @return DatabaseConfig for history Database
     */
    protected static DatabaseConfig historyDatabaseConfig() {
        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setTransactional(false);
        dbConfig.setAllowCreate(true);
        dbConfig.setDeferredWrite(true);
        return dbConfig;
    }

    /**
     * Usual constructor; both arguments are forwarded unchanged to the
     * {@link Processor} superclass constructor.
     * 
     * @param name first superclass constructor argument
     * @param string second superclass constructor argument (presumably a
     *            description; confirm against Processor)
     */
    public PersistProcessor(String name, String string) {
        super(name, string);
    }

    /**
     * Return a preferred String key for persisting the given CrawlURI's
     * AList state. 
     * 
     * @param curi CrawlURI
     * @return String key
     */
    public String persistKeyFor(CrawlURI curi) {
        // use a case-sensitive SURT for uniqueness and sorting benefits
        return SURT.fromURI(curi.getUURI().toString(), true);
    }

    /**
     * Whether the current CrawlURI's state should be persisted (to log or
     * direct to database)
     * 
     * @param curi CrawlURI
     * @return true if state should be stored; false to skip persistence
     */
    protected boolean shouldStore(CrawlURI curi) {
        // TODO: don't store some codes, such as 304 unchanged?
        return curi.isSuccess();
    }

    /**
     * Whether the current CrawlURI's state should be loaded
     * 
     * @param curi CrawlURI
     * @return true if state should be loaded; false to skip loading
     */
    protected boolean shouldLoad(CrawlURI curi) {
        // TODO: don't load some (prereqs?)
        return true;
    }

    /**
     * Copies entries from an existing environment db to a new one. If
     * historyMap is not provided, only logs the entries that would have been 
     * copied.
     * 
     * The source environment is opened read-only; the iterator, the source
     * database and the source environment are released even when copying
     * fails part-way, so the source directory is not left locked.
     * 
     * @param sourceDir existing environment database directory
     * @param historyMap new environment db (or null for a dry run)
     * @return number of records
     * @throws DatabaseException
     */
    private static int copyPersistEnv(File sourceDir, StoredSortedMap<String, AList> historyMap)
            throws DatabaseException {
        int count = 0;

        // open the source env history DB, copying entries to target env
        EnhancedEnvironment sourceEnv = setupCopyEnvironment(sourceDir, true);
        try {
            StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();
            Database sourceHistoryDB = sourceEnv.openDatabase(null, URI_HISTORY_DBNAME,
                    historyDatabaseConfig());
            try {
                StoredSortedMap<String, AList> sourceHistoryMap = new StoredSortedMap<String, AList>(
                        sourceHistoryDB, new StringBinding(),
                        new SerialBinding<AList>(sourceClassCatalog, AList.class), true);

                Iterator<Entry<String, AList>> iter = sourceHistoryMap.entrySet().iterator();
                try {
                    while (iter.hasNext()) {
                        Entry<String, AList> item = iter.next();
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine(item.getKey() + " " + item.getValue().toPrettyString());
                        }

                        if (historyMap != null) {
                            historyMap.put(item.getKey(), item.getValue());
                        }
                        count++;
                    }
                } finally {
                    // release the iterator before its database is closed
                    StoredIterator.close(iter);
                }
            } finally {
                sourceHistoryDB.close();
            }
        } finally {
            sourceEnv.close();
        }

        return count;
    }

    /**
     * Populates an environment db from a persist log. If historyMap is
     * not provided, only logs the entries that would have been populated.
     * 
     * Each non-empty line is expected to hold exactly two space-separated
     * fields: the key, then a Base64-encoded serialized AList. Lines that
     * do not match are logged and skipped. The reader is always closed
     * before this method returns, whether normally or by exception.
     * 
     * @param persistLogReader
     *            persist log; closed by this method
     * @param historyMap
     *            new environment db (or null for a dry run)
     * @return number of records
     * @throws UnsupportedEncodingException
     * @throws DatabaseException
     */
    private static int populatePersistEnvFromLog(BufferedReader persistLogReader,
            StoredSortedMap<String, AList> historyMap) throws UnsupportedEncodingException, DatabaseException {
        int count = 0;

        try {
            Iterator<String> iter = new LineReadingIterator(persistLogReader);
            while (iter.hasNext()) {
                String line = iter.next();
                if (line.length() == 0) {
                    continue;
                }
                String[] splits = line.split(" ");
                if (splits.length != 2) {
                    logger.severe("bad line: " + line);
                    continue;
                }

                AList alist = (AList) IoUtils
                        .deserializeFromByteArray(Base64.decodeBase64(splits[1].getBytes("UTF-8")));
                if (alist == null) {
                    // defensive: nothing usable was deserialized, so there is
                    // nothing to log or store for this line
                    logger.severe("bad line: " + line);
                    continue;
                }

                if (logger.isLoggable(Level.FINE)) {
                    logger.fine(splits[0] + " " + alist.toPrettyString());
                }

                if (historyMap != null) {
                    try {
                        historyMap.put(splits[0], alist);
                    } catch (Exception e) {
                        logger.log(Level.SEVERE, "caught exception after loading " + count
                                + " urls from the persist log (perhaps crawl was stopped by user?)", e);

                        // seems to finish most cleanly when we return rather than throw something
                        return count;
                    }
                }

                count++;
            }
        } finally {
            // covers normal completion, the early return above, and any
            // exception thrown while decoding a line
            IOUtils.closeQuietly(persistLogReader);
        }

        return count;
    }

    /**
     * Populates a new environment db from an old environment db or a persist
     * log. If path to new environment is not provided, only logs the entries 
     * that would have been populated.
     * 
     * @param sourcePath
     *            source of old entries: can be a path to an existing
     *            environment db, or a URL or path to a persist log
     * @param envFile
     *            path to new environment db (or null for a dry run); the
     *            directory is created if it does not exist
     * @return number of records
     * @throws DatabaseException
     * @throws IOException
     *             if the source cannot be read, or the target directory
     *             cannot be created
     */
    public static int populatePersistEnv(String sourcePath, File envFile) throws DatabaseException, IOException {
        int count = 0;
        StoredSortedMap<String, AList> historyMap = null;
        EnhancedEnvironment targetEnv = null;
        StoredClassCatalog classCatalog = null;
        Database historyDB = null;

        if (envFile != null) {
            // set up target environment; the second isDirectory() check
            // tolerates the directory appearing between the test and mkdirs()
            if (!envFile.isDirectory() && !envFile.mkdirs() && !envFile.isDirectory()) {
                throw new IOException("unable to create BDB env directory " + envFile);
            }
            targetEnv = setupCopyEnvironment(envFile);
            boolean ready = false;
            try {
                classCatalog = targetEnv.getClassCatalog();
                historyDB = targetEnv.openDatabase(null, URI_HISTORY_DBNAME, historyDatabaseConfig());
                historyMap = new StoredSortedMap<String, AList>(historyDB, new StringBinding(),
                        new SerialBinding<AList>(classCatalog, AList.class), true);
                ready = true;
            } finally {
                if (!ready) {
                    // do not leave the target env open (and locked) when
                    // the history database could not be set up
                    closeTarget(historyDB, targetEnv);
                }
            }
        }

        try {
            count = copyPersistSourceToHistoryMap(null, sourcePath, historyMap);
        } finally {
            // in finally block so that we unlock the target env even if we
            // failed to populate it
            if (envFile != null) {
                logger.info(count + " records imported from " + sourcePath + " to BDB env " + envFile);
                closeTarget(historyDB, targetEnv);
            } else {
                logger.info(count + " records found in " + sourcePath);
            }
        }

        return count;
    }

    /**
     * Syncs and closes the target history database, then closes the target
     * environment. Each step is attempted even if an earlier one throws, so
     * a failed sync cannot leave the environment open.
     * 
     * @param historyDB target history database, or null if never opened
     * @param targetEnv target environment, or null if never opened
     * @throws DatabaseException if any of the sync/close calls fails
     */
    private static void closeTarget(Database historyDB, EnhancedEnvironment targetEnv)
            throws DatabaseException {
        try {
            if (historyDB != null) {
                try {
                    historyDB.sync();
                } finally {
                    historyDB.close();
                }
            }
        } finally {
            if (targetEnv != null) {
                targetEnv.close();
            }
        }
    }

    /**
     * Populates a given StoredSortedMap (history map) from an old 
     * environment db or a persist log. If a map is not provided, only 
     * logs the entries that would have been populated.
     * 
     * The source is chosen as follows: a directory is treated as an existing
     * environment db; a regular file is read as a persist log; anything else
     * is parsed as a URL to a persist log.
     * 
     * @param context
     *            passed to {@code FileUtils.maybeRelative} together with
     *            sourcePath to locate the source file; may be null
     *            (presumably the base directory for relative paths;
     *            confirm against FileUtils)
     * @param sourcePath
     *            source of old entries: can be a path to an existing
     *            environment db, or a URL or path to a persist log
     * @param historyMap
     *            map to populate (or null for a dry run)
     * @return number of records
     * @throws DatabaseException
     * @throws IOException
     * @throws MalformedURLException
     *             if sourcePath is neither an existing file or directory
     *             nor a valid URL
     * @throws UnsupportedEncodingException
     */
    public static int copyPersistSourceToHistoryMap(File context, String sourcePath,
            StoredSortedMap<String, AList> historyMap)
            throws DatabaseException, IOException, MalformedURLException, UnsupportedEncodingException {
        // delegate depending on the source
        File sourceFile = FileUtils.maybeRelative(context, sourcePath);
        if (sourceFile.isDirectory()) {
            return copyPersistEnv(sourceFile, historyMap);
        }

        BufferedReader persistLogReader;
        if (sourceFile.isFile()) {
            persistLogReader = CrawlerJournal.getBufferedReader(sourceFile);
        } else {
            URL sourceUrl = new URL(sourcePath);
            persistLogReader = CrawlerJournal.getBufferedReader(sourceUrl);
        }

        // the reader is closed by populatePersistEnvFromLog
        return populatePersistEnvFromLog(persistLogReader, historyMap);
    }

    /**
     * Utility main for importing a log into a BDB-JE environment or moving a
     * database between environments (2 arguments), or simply dumping a log
     * to stderr in a more readable format (1 argument). 
     * 
     * @param args command-line arguments
     * @throws DatabaseException
     * @throws IOException
     */
    public static void main(String[] args) throws DatabaseException, IOException {
        // route this class's log output through a console handler of our own
        Handler handler = new ConsoleHandler();
        handler.setLevel(Level.ALL);
        handler.setFormatter(new OneLineSimpleLogger());
        logger.addHandler(handler);
        logger.setUseParentHandlers(false);

        if (args.length == 2) {
            logger.setLevel(Level.INFO);
            populatePersistEnv(args[0], new File(args[1]));
        } else if (args.length == 1) {
            // FINE so that every entry is logged during the dry run
            logger.setLevel(Level.FINE);
            populatePersistEnv(args[0], null);
        } else {
            System.out.println("Arguments: ");
            System.out.println("    source [target]");
            System.out.println("...where source is either a txtser log file or BDB env dir");
            System.out.println("and target, if present, is a BDB env dir. ");
        }
    }

    /**
     * Opens a writable environment in the given directory; equivalent to
     * {@code setupCopyEnvironment(env, false)}.
     * 
     * @param env environment directory
     * @return the opened environment
     * @throws DatabaseException
     */
    public static EnhancedEnvironment setupCopyEnvironment(File env) throws DatabaseException {
        return setupCopyEnvironment(env, false);
    }

    /**
     * Opens an environment in the given directory, with creation allowed
     * and the requested read-only setting.
     * 
     * @param env environment directory
     * @param readOnly whether to open the environment read-only
     * @return the opened environment
     * @throws DatabaseException
     * @throws IllegalArgumentException
     *             if opening the environment raised one; rethrown with a
     *             message naming the directory and the original as cause
     */
    public static EnhancedEnvironment setupCopyEnvironment(File env, boolean readOnly) throws DatabaseException {
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);
        envConfig.setReadOnly(readOnly);
        try {
            return new EnhancedEnvironment(env, envConfig);
        } catch (IllegalArgumentException iae) {
            throw new IllegalArgumentException(
                    "problem with specified environment " + env + "; is it already open?", iae);
        }
    }
}