org.archive.bdb.BdbModule.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.bdb.BdbModule.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.bdb;

import java.io.Closeable;
import java.io.File;
import java.io.FileFilter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.IOFileFilter;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.spring.ConfigPath;
import org.archive.util.FilesystemLinkMaker;
import org.archive.util.IdentityCacheable;
import org.archive.util.ObjectIdentityBdbManualCache;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.TextUtils;
import org.archive.util.bdbje.EnhancedEnvironment;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;

import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DatabaseNotFoundException;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.util.DbBackup;

/**
 * Utility module for managing a shared BerkeleyDB-JE environment
 * 
 * @contributor pjack
 * @contributor gojomo
 */
public class BdbModule implements Lifecycle, Checkpointable, Closeable, DisposableBean {
    /** Logger shared by all instances, named after this class. */
    private static final Logger LOGGER = Logger.getLogger(BdbModule.class.getName());

    /**
     * Pairs an open {@link Database} handle with the {@link BdbConfig} it was
     * opened with; this is the value type of the module's {@code databases}
     * map. The handle is declared transient, so default serialization writes
     * only the config.
     */
    private static class DatabasePlusConfig implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Config object supplied when the database was opened. */
        public BdbConfig config;

        /** Live database handle; transient, hence not serialized. */
        public transient Database database;
    }

    /**
     * Serializable stand-in for {@link DatabaseConfig}, which cannot itself be
     * serialized. It also guards against invalid configurations: every
     * database opened through this module must be deferred-write, because
     * otherwise it cannot sync(), and a checkpoint cannot run without a
     * sync() first. For that reason {@link #setDeferredWrite(boolean)}
     * ignores its argument.
     *
     * @author pjack
     */
    public static class BdbConfig implements Serializable {
        private static final long serialVersionUID = 1L;

        /** Whether the database may be created if it does not yet exist. */
        protected boolean allowCreate;
        /** Whether duplicate keys (kept sorted) are permitted. */
        protected boolean sortedDuplicates;
        /** Whether the database is opened transactionally. */
        protected boolean transactional;
        /** Deferred-write flag; starts true and this class never clears it. */
        protected boolean deferredWrite = true;

        /** Creates a config with every flag false except deferred-write. */
        public BdbConfig() {
        }

        /** @return the allow-create flag */
        public boolean isAllowCreate() {
            return this.allowCreate;
        }

        /** @param allowCreate new value of the allow-create flag */
        public void setAllowCreate(boolean allowCreate) {
            this.allowCreate = allowCreate;
        }

        /** @return the sorted-duplicates flag */
        public boolean getSortedDuplicates() {
            return this.sortedDuplicates;
        }

        /** @param sortedDuplicates new value of the sorted-duplicates flag */
        public void setSortedDuplicates(boolean sortedDuplicates) {
            this.sortedDuplicates = sortedDuplicates;
        }

        /** @return the transactional flag */
        public boolean isTransactional() {
            return this.transactional;
        }

        /** @param transactional new value of the transactional flag */
        public void setTransactional(boolean transactional) {
            this.transactional = transactional;
        }

        /**
         * Always leaves deferred-write enabled, whatever is requested.
         *
         * @param b ignored; the flag is unconditionally set to true
         */
        public void setDeferredWrite(boolean b) {
            this.deferredWrite = true;
        }

        /**
         * Builds a fresh {@link DatabaseConfig} carrying this object's four
         * flags.
         *
         * @return a new, independent DatabaseConfig
         */
        public DatabaseConfig toDatabaseConfig() {
            final DatabaseConfig dbConfig = new DatabaseConfig();
            dbConfig.setDeferredWrite(this.deferredWrite);
            dbConfig.setTransactional(this.transactional);
            dbConfig.setAllowCreate(this.allowCreate);
            dbConfig.setSortedDuplicates(this.sortedDuplicates);
            return dbConfig;
        }
    }

    /**
     * Directory of the BDB environment. {@link #start()} passes its file to
     * {@link #setup(File, boolean)}, and checkpoint subdirectories are created
     * beneath it.
     */
    protected ConfigPath dir = new ConfigPath("bdbmodule subdirectory", "state");

    /** @return the configured environment directory */
    public ConfigPath getDir() {
        return this.dir;
    }

    /** @param dir replacement environment directory */
    public void setDir(ConfigPath dir) {
        this.dir = dir;
    }

    /**
     * Cache size as a percentage, handed to the environment config by
     * {@link #setup(File, boolean)} only when positive and when
     * {@code cacheSize} is not positive. The default of -1 leaves it unset.
     */
    protected int cachePercent = -1;

    /** @return the configured cache percentage, or a non-positive value if unset */
    public int getCachePercent() {
        return this.cachePercent;
    }

    /** @param cachePercent cache percentage; non-positive values are not applied */
    public void setCachePercent(int cachePercent) {
        this.cachePercent = cachePercent;
    }

    /**
     * Explicit cache size, handed to the environment config by
     * {@link #setup(File, boolean)} when positive; in that case it takes
     * precedence over {@code cachePercent}. The default of -1 leaves it unset.
     */
    protected int cacheSize = -1;

    /** @return the configured cache size, or a non-positive value if unset */
    public int getCacheSize() {
        return this.cacheSize;
    }

    /** @param cacheSize cache size; non-positive values are not applied */
    public void setCacheSize(int cacheSize) {
        this.cacheSize = cacheSize;
    }

    /**
     * Value passed to the environment config's shared-cache setting by
     * {@link #setup(File, boolean)}. Default is true.
     */
    protected boolean useSharedCache = true;

    /** @return whether the shared-cache setting will be enabled */
    public boolean getUseSharedCache() {
        return this.useSharedCache;
    }

    /** @param useSharedCache whether to enable the shared-cache setting */
    public void setUseSharedCache(boolean useSharedCache) {
        this.useSharedCache = useSharedCache;
    }

    /**
     * Expected number of concurrent threads. {@link #setup(File, boolean)}
     * uses it to tune 'je.lock.nLockTables', starting from this value minus
     * one and stepping down to a prime, following the JE FAQ:
     * http://www.oracle.com/technology/products/berkeley-db/faq/je_faq.html#33
     */
    protected int expectedConcurrency = 64;

    /** @return the expected number of concurrent threads */
    public int getExpectedConcurrency() {
        return this.expectedConcurrency;
    }

    /** @param expectedConcurrency expected number of concurrent threads */
    public void setExpectedConcurrency(int expectedConcurrency) {
        this.expectedConcurrency = expectedConcurrency;
    }

    /**
     * Whether checkpoints collect and retain the BDB log files they need by
     * hard-linking them into the checkpoint directory. Default is true.
     * May not work on Windows (especially on pre-NTFS filesystems).
     * If false, the BDB 'je.cleaner.expunge' value is set to 'false' as
     * well, meaning BDB will *not* delete obsolete JDB files but only
     * rename them to '.DEL'. Those files must then be deleted manually to
     * free disk space; however, any '.DEL' file referenced in a
     * checkpoint's 'jdbfiles.manifest' should be retained to keep that
     * checkpoint valid.
     */
    protected boolean useHardLinkCheckpoints = true;

    /** @return whether checkpoints hard-link the log files they need */
    public boolean getUseHardLinkCheckpoints() {
        return this.useHardLinkCheckpoints;
    }

    /** @param useHardLinkCheckpoints whether checkpoints should hard-link log files */
    public void setUseHardLinkCheckpoints(boolean useHardLinkCheckpoints) {
        this.useHardLinkCheckpoints = useHardLinkCheckpoints;
    }

    /** The shared JE environment; assigned in setup(), null before that. */
    private transient EnhancedEnvironment bdbEnvironment;

    /** Class catalog obtained from the environment in setup(). */
    private transient StoredClassCatalog classCatalog;

    /** Object-identity caches handed out by this module, keyed by database name. */
    @SuppressWarnings("rawtypes")
    private Map<String, ObjectIdentityCache> oiCaches = new ConcurrentHashMap<String, ObjectIdentityCache>();

    /** Databases opened through openDatabase(), keyed by database name. */
    private Map<String, DatabasePlusConfig> databases = new ConcurrentHashMap<String, DatabasePlusConfig>();

    /** Lifecycle flag: set by start(), cleared by stop(). */
    protected boolean isRunning;

    /** Creates an unstarted module; no environment is opened until {@link #start()}. */
    public BdbModule() {
        super();
    }

    /**
     * Starts the module if it is not already running. When a recovery
     * checkpoint has been set, the environment directory is first restored
     * via {@link #doRecover()} and the environment is then opened without
     * permission to create it; otherwise the environment is opened with
     * create allowed.
     *
     * If recovery or setup fails, the running flag is rolled back so the
     * module is not left flagged as running without an environment, and a
     * later start() can retry.
     *
     * @throws IllegalStateException wrapping any DatabaseException or
     *         IOException raised during recovery or setup
     */
    public synchronized void start() {
        if (isRunning()) {
            return;
        }

        // flag is raised before the work begins, as before
        isRunning = true;
        boolean started = false;
        try {
            boolean isRecovery = false;
            if (recoveryCheckpoint != null) {
                isRecovery = true;
                doRecover();
            }

            setup(getDir().getFile(), !isRecovery);
            started = true;
        } catch (DatabaseException e) {
            throw new IllegalStateException(e);
        } catch (IOException e) {
            throw new IllegalStateException(e);
        } finally {
            if (!started) {
                // failed start: do not claim to be running
                isRunning = false;
            }
        }
    }

    /** @return true once start() has been entered and stop() has not since been called */
    public boolean isRunning() {
        return this.isRunning;
    }

    /**
     * Clears the running flag only. Databases and the environment are left
     * open; they are released by {@link #close()}.
     */
    public void stop() {
        this.isRunning = false;
    }

    /**
     * Configures and opens the BDB-JE environment in the given directory and
     * fetches its class catalog.
     *
     * @param f directory for the environment; made writeable/created first
     * @param create whether the environment may be created. When false (the
     *        recovery case in start()), a backup is started and immediately
     *        ended after opening, to freeze the last log file.
     * @throws DatabaseException if the environment cannot be opened
     * @throws IOException if the directory cannot be prepared
     */
    protected void setup(File f, boolean create) throws DatabaseException, IOException {
        EnvironmentConfig config = new EnvironmentConfig();
        config.setAllowCreate(create);
        config.setLockTimeout(75, TimeUnit.MINUTES); // set to max

        if (getCacheSize() > 0) {
            config.setCacheSize(getCacheSize());
            if (getCachePercent() > 0) {
                LOGGER.warning("cachePercent and cacheSize are both set. Only cacheSize will be used.");
            }
        } else if (getCachePercent() > 0) {
            config.setCachePercent(getCachePercent());
        }

        config.setSharedCache(getUseSharedCache());

        // we take the advice literally from...
        // http://www.oracle.com/technology/products/berkeley-db/faq/je_faq.html#33
        // i.e. use the largest prime below the expected concurrency
        long nLockTables = getExpectedConcurrency() - 1;
        // stop at 1: BigInteger.isProbablePrime() tests the absolute value, so
        // an unbounded walk from a start of 1 or less would end on -2
        while (nLockTables > 1 && !BigInteger.valueOf(nLockTables).isProbablePrime(Integer.MAX_VALUE)) {
            nLockTables--;
        }
        if (nLockTables < 1) {
            // expectedConcurrency <= 1: fall back to a single lock table
            nLockTables = 1;
        }
        config.setConfigParam("je.lock.nLockTables", Long.toString(nLockTables));

        // triple this value to 6K because stats show many faults
        config.setConfigParam("je.log.faultReadSize", "6144");

        if (!getUseHardLinkCheckpoints()) {
            // to support checkpoints by textual manifest only, 
            // prevent BDB's cleaner from deleting log files
            config.setConfigParam("je.cleaner.expunge", "false");
        } // else leave whatever other setting was already in place

        org.archive.util.FileUtils.ensureWriteableDirectory(f);
        this.bdbEnvironment = new EnhancedEnvironment(f, config);
        this.classCatalog = this.bdbEnvironment.getClassCatalog();
        if (!create) {
            // freeze last log file -- so that originating checkpoint isn't fouled
            DbBackup dbBackup = new DbBackup(bdbEnvironment);
            dbBackup.startBackup();
            dbBackup.endBackup();
        }
    }

    /**
     * Closes the given database by looking up its name and delegating to
     * {@link #closeDatabase(String)}. A DatabaseException raised on the way
     * is logged at SEVERE rather than propagated.
     *
     * @param db database handle previously opened through this module
     */
    public void closeDatabase(Database db) {
        try {
            final String dbName = db.getDatabaseName();
            closeDatabase(dbName);
        } catch (DatabaseException e) {
            LOGGER.log(Level.SEVERE, "Error getting db name", e);
        }
    }

    /**
     * Forgets the named database and then syncs and closes it. An unknown
     * name only produces a warning; a DatabaseException during sync/close is
     * logged at WARNING and not propagated.
     *
     * @param name name the database was opened under
     */
    public void closeDatabase(String name) {
        final DatabasePlusConfig removed = databases.remove(name);
        if (removed != null) {
            final Database handle = removed.database;
            try {
                handle.sync();
                handle.close();
            } catch (DatabaseException e) {
                LOGGER.log(Level.WARNING, "Error closing db " + name, e);
            }
        } else {
            LOGGER.warning("No such database: " + name);
        }
    }

    /**
     * Open a Database inside this BdbModule's environment, and remember it
     * so that {@link #close()} can close it automatically.
     *
     * If a database of that name is already open, it is shared only when
     * the very same config object is passed again; any other config object
     * is treated as a probable name collision.
     *
     * @param name database name
     * @param config configuration to open the database with
     * @param usePriorData if false, any existing database of this name is
     *        truncated before opening
     * @return the open (possibly shared) database handle
     * @throws DatabaseException if truncating or opening fails
     * @throws IllegalStateException if the module has not been started, or
     *         the name is already open under a different config object
     */
    public Database openDatabase(String name, BdbConfig config, boolean usePriorData) throws DatabaseException {
        if (bdbEnvironment == null) {
            // proper initialization hasn't occurred
            throw new IllegalStateException("BdbModule not started");
        }
        // one lookup instead of containsKey()+get(): the entry cannot vanish
        // between the check and the read if another thread closes it
        DatabasePlusConfig existing = databases.get(name);
        if (existing != null) {
            if (existing.config == config) {
                // object-identical configs: OK to share DB
                return existing.database;
            }
            // unshared config object: might be name collision; error
            throw new IllegalStateException("Database already exists: " + name);
        }

        if (!usePriorData) {
            try {
                bdbEnvironment.truncateDatabase(null, name, false);
            } catch (DatabaseNotFoundException e) {
                // nothing to truncate: database does not exist yet
            }
        }
        DatabasePlusConfig dpc = new DatabasePlusConfig();
        dpc.database = bdbEnvironment.openDatabase(null, name, config.toDatabaseConfig());
        dpc.config = config;
        databases.put(name, dpc);
        return dpc.database;
    }

    /** @return the class catalog obtained from the environment in setup(); null before that */
    public StoredClassCatalog getClassCatalog() {
        return this.classCatalog;
    }

    /**
     * Opens (via {@link #openDatabase(String, BdbConfig, boolean)}) the named
     * database with StoredQueue's own database config and wraps it in a
     * {@link StoredQueue}.
     *
     * @param <K> element type
     * @param dbname name of the backing database
     * @param clazz element class
     * @param usePriorData if false, existing contents are truncated first
     * @return a queue over the named database
     * @throws RuntimeException wrapping any DatabaseException
     */
    public <K extends Serializable> StoredQueue<K> getStoredQueue(String dbname, Class<K> clazz,
            boolean usePriorData) {
        try {
            return new StoredQueue<K>(openDatabase(dbname, StoredQueue.databaseConfig(), usePriorData), clazz,
                    getClassCatalog());
        } catch (DatabaseException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Creates a new ObjectIdentityBdbManualCache backed by the BDB database
     * of the given name and registers it under that name, replacing any
     * cache registered there before. If 'recycle' is true, values already
     * in the database are reused; otherwise the database is truncated first
     * so the cache starts empty.
     *
     * @param <V> cached value type
     * @param dbName name of the backing database
     * @param recycle whether to keep values already in the database
     * @param valueClass class of the cached values
     * @return the newly initialized cache
     * @throws DatabaseException if truncation or initialization fails
     */
    public <V extends IdentityCacheable> ObjectIdentityBdbManualCache<V> getOIBCCache(String dbName,
            boolean recycle, Class<? extends V> valueClass) throws DatabaseException {
        final boolean startEmpty = !recycle;
        if (startEmpty) {
            try {
                bdbEnvironment.truncateDatabase(null, dbName, false);
            } catch (DatabaseNotFoundException e) {
                // no such database yet, so nothing to discard
            }
        }
        final ObjectIdentityBdbManualCache<V> cache = new ObjectIdentityBdbManualCache<V>();
        cache.initialize(bdbEnvironment, dbName, valueClass, classCatalog);
        oiCaches.put(dbName, cache);
        return cache;
    }

    /**
     * Convenience overload of
     * {@link #getObjectCache(String, boolean, Class, Class)} that uses
     * valueClass as both the declared and the concrete value class.
     *
     * @param <V> cached value type
     * @param dbName name of the backing database
     * @param recycle whether to keep values already in the database
     * @param valueClass class of the cached values
     * @return the existing or newly created cache for dbName
     * @throws DatabaseException if creating the cache fails
     */
    public <V extends IdentityCacheable> ObjectIdentityCache<V> getObjectCache(String dbName, boolean recycle,
            Class<V> valueClass) throws DatabaseException {
        final Class<V> declaredClass = valueClass;
        return getObjectCache(dbName, recycle, declaredClass, valueClass);
    }

    /**
     * Get an ObjectIdentityCache backed by the BDB database of the given
     * name. A cache already registered under dbName is returned as-is;
     * otherwise one is created through
     * {@link #getOIBCCache(String, boolean, Class)}. If 'recycle' is true,
     * a newly created cache reuses values already in the database;
     * otherwise it starts empty.
     *
     * @param <V> declared value type
     * @param dbName name of the backing database
     * @param recycle whether a newly created cache keeps existing values
     * @param declaredClass declared value class; not otherwise used in the body
     * @param valueClass concrete class of the cached values
     * @return the existing or newly created cache
     * @throws DatabaseException if creating the cache fails
     */
    public <V extends IdentityCacheable> ObjectIdentityCache<V> getObjectCache(String dbName, boolean recycle,
            Class<V> declaredClass, Class<? extends V> valueClass) throws DatabaseException {
        @SuppressWarnings("unchecked")
        ObjectIdentityCache<V> cache = oiCaches.get(dbName);
        if (cache == null) {
            // nothing registered under this name yet: build and register one
            cache = getOIBCCache(dbName, recycle, valueClass);
        }
        return cache;
    }

    /**
     * Serialization hook that performs plain default serialization only.
     * NOTE(review): the class header does not list Serializable directly, so
     * this presumably only matters if one of the implemented interfaces
     * extends it -- confirm.
     *
     * @param out stream to write the default serialized form to
     * @throws IOException if the write fails
     */
    private void writeObject(ObjectOutputStream out) throws IOException {
        out.defaultWriteObject();
    }

    /**
     * Checkpoint phase hook; this module does nothing at checkpoint start.
     *
     * @param checkpointInProgress the checkpoint being started (unused)
     */
    public void startCheckpoint(Checkpoint checkpointInProgress) {
        // intentionally empty: all work happens in doCheckpoint()
    }

    /**
     * Writes this module's part of a checkpoint:
     * <ol>
     * <li>syncs every object-identity cache and every open database, then
     *     forces a BDB checkpoint;</li>
     * <li>under a DbBackup, writes 'jdbfiles.manifest' into a subdirectory
     *     of the environment directory named after the checkpoint. Each
     *     manifest line is "filename,length". When hard-link checkpoints are
     *     enabled, each log file is also hard-linked into that subdirectory
     *     under the full manifest entry (including the ",length" suffix),
     *     which is the name doRecover() looks for;</li>
     * <li>if the checkpoint asks to forget all but the latest, deletes the
     *     other checkpoint-named ("cpNNNNN-timestamp") subdirectories.</li>
     * </ol>
     * A failed hard link is logged at SEVERE but does not abort the
     * checkpoint.
     *
     * @param checkpointInProgress the checkpoint being written
     * @throws IOException on file errors, or wrapping a DatabaseException
     */
    public void doCheckpoint(final Checkpoint checkpointInProgress) throws IOException {
        // First sync objectCaches
        for (@SuppressWarnings("rawtypes")
        ObjectIdentityCache oic : oiCaches.values()) {
            oic.sync();
        }

        try {
            // sync all databases
            for (DatabasePlusConfig dbc : databases.values()) {
                dbc.database.sync();
            }

            // Do a force checkpoint.  That's what a sync does (i.e. doSync).
            CheckpointConfig chkptConfig = new CheckpointConfig();
            chkptConfig.setForce(true);

            // Mark Hayes of sleepycat says:
            // "The default for this property is false, which gives the current
            // behavior (allow deltas).  If this property is true, deltas are
            // prohibited -- full versions of internal nodes are always logged
            // during the checkpoint. When a full version of an internal node
            // is logged during a checkpoint, recovery does not need to process
            // it at all.  It is only fetched if needed by the application,
            // during normal DB operations after recovery. When a delta of an
            // internal node is logged during a checkpoint, recovery must
            // process it by fetching the full version of the node from earlier
            // in the log, and then applying the delta to it.  This can be
            // pretty slow, since it is potentially a large amount of
            // random I/O."
            // chkptConfig.setMinimizeRecoveryTime(true);
            bdbEnvironment.checkpoint(chkptConfig);
            LOGGER.fine("Finished bdb checkpoint.");

            DbBackup dbBackup = new DbBackup(bdbEnvironment);
            try {
                dbBackup.startBackup();

                File envCpDir = new File(dir.getFile(), checkpointInProgress.getName());
                org.archive.util.FileUtils.ensureWriteableDirectory(envCpDir);
                File logfilesList = new File(envCpDir, "jdbfiles.manifest");
                String[] filedata = dbBackup.getLogFilesInBackupSet();
                for (int i = 0; i < filedata.length; i++) {
                    File f = new File(dir.getFile(), filedata[i]);
                    // manifest entry becomes "filename,length"
                    filedata[i] += "," + f.length();
                    if (getUseHardLinkCheckpoints()) {
                        // link is named with the whole manifest entry
                        File hardLink = new File(envCpDir, filedata[i]);
                        if (!FilesystemLinkMaker.makeHardLink(f.getAbsolutePath(), hardLink.getAbsolutePath())) {
                            LOGGER.log(Level.SEVERE, "unable to create required checkpoint link " + hardLink);
                        }
                    }
                }
                FileUtils.writeLines(logfilesList, Arrays.asList(filedata));
                LOGGER.fine("Finished processing bdb log files.");
            } finally {
                dbBackup.endBackup();
            }
        } catch (DatabaseException e) {
            throw new IOException(e);
        }

        if (checkpointInProgress.getForgetAllButLatest()) {
            File[] oldEnvCpDirs = dir.getFile().listFiles(new FilenameFilter() {
                @Override
                public boolean accept(File dir, String name) {
                    return !name.equals(checkpointInProgress.getName())
                            && TextUtils.matches("cp\\d{5}-\\d{14}", name);
                }
            });
            // listFiles() returns null if the directory cannot be listed;
            // in that case there is nothing we can clean up
            if (oldEnvCpDirs != null) {
                for (File d : oldEnvCpDirs) {
                    FileUtils.deleteDirectory(d);
                }
            }
        }
    }

    /**
     * Restores the environment directory to the state recorded by the
     * recovery checkpoint's 'jdbfiles.manifest' (lines of "filename,length"):
     * <ol>
     * <li>for each manifest line, if the checkpoint directory holds a file
     *     named with that whole line, it is hard-linked back into the
     *     environment directory under the plain filename, replacing any
     *     file already there;</li>
     * <li>every '.jdb'/'.del' file in the environment directory is then
     *     checked: files listed in the manifest are kept, '.del' files whose
     *     '.jdb' name is listed are renamed back, and everything else is
     *     deleted (or moved aside if deletion fails);</li>
     * <li>manifest entries still unmatched afterwards are logged at SEVERE
     *     as a corrupt checkpoint.</li>
     * </ol>
     * Length mismatches are only logged.
     *
     * @throws IOException if the manifest cannot be read or is malformed,
     *         the environment directory cannot be listed, or a needed
     *         '.del' file cannot be renamed
     */
    @SuppressWarnings("unchecked")
    protected void doRecover() throws IOException {
        File cpDir = new File(dir.getFile(), recoveryCheckpoint.getName());
        File logfilesList = new File(cpDir, "jdbfiles.manifest");
        List<String> filesAndLengths = FileUtils.readLines(logfilesList);
        HashMap<String, Long> retainLogfiles = new HashMap<String, Long>();
        for (String line : filesAndLengths) {
            String[] fileAndLength = line.split(",");
            if (fileAndLength.length < 2) {
                // fail with a clear message instead of an array-index error
                throw new IOException("Malformed line in " + logfilesList + ": " + line);
            }
            long expectedLength;
            try {
                expectedLength = Long.parseLong(fileAndLength[1]);
            } catch (NumberFormatException e) {
                throw new IOException("Malformed length in " + logfilesList + ": " + line, e);
            }
            retainLogfiles.put(fileAndLength[0], expectedLength);

            // check for files in checkpoint directory; relink to environment as necessary
            // (checkpoint links are named with the whole "filename,length" line)
            File cpFile = new File(cpDir, line);
            File destFile = new File(dir.getFile(), fileAndLength[0]);
            if (cpFile.exists()) {
                if (cpFile.length() != expectedLength) {
                    LOGGER.warning(cpFile.getName() + " expected " + expectedLength + " actual " + cpFile.length());
                    // TODO: is truncation necessary? 
                }
                if (destFile.exists()) {
                    if (!destFile.delete()) {
                        LOGGER.log(Level.SEVERE, "unable to delete obstructing file " + destFile);
                    }
                }

                boolean status = FilesystemLinkMaker.makeHardLink(cpFile.getAbsolutePath(),
                        destFile.getAbsolutePath());
                if (!status) {
                    LOGGER.log(Level.SEVERE, "unable to create required restore link " + destFile);
                }
            }

        }

        IOFileFilter filter = FileFilterUtils.orFileFilter(FileFilterUtils.suffixFileFilter(".jdb"),
                FileFilterUtils.suffixFileFilter(".del"));
        filter = FileFilterUtils.makeFileOnly(filter);

        // reverify environment directory is as it was at checkpoint time, 
        // deleting any extra files
        File[] envFiles = dir.getFile().listFiles((FileFilter) filter);
        if (envFiles == null) {
            // listFiles() returns null when the directory cannot be listed
            throw new IOException("Unable to list environment directory " + dir.getFile());
        }
        for (File f : envFiles) {
            if (retainLogfiles.containsKey(f.getName())) {
                // named file still exists under original name
                long expectedLength = retainLogfiles.get(f.getName());
                if (f.length() != expectedLength) {
                    LOGGER.warning(f.getName() + " expected " + expectedLength + " actual " + f.length());
                    // TODO: truncate? this unexpected length mismatch
                    // probably only happens if there was already a recovery
                    // where the affected file was the last of the set, in 
                    // which case BDB appends a small amount of (harmless?) data
                    // to the previously-undersized file
                }
                retainLogfiles.remove(f.getName());
                continue;
            }
            // file as now-named not in restore set; check if un-".DEL" renaming needed
            String undelName = f.getName().replace(".del", ".jdb");
            if (retainLogfiles.containsKey(undelName)) {
                // file if renamed matches desired file name
                long expectedLength = retainLogfiles.get(undelName);
                if (f.length() != expectedLength) {
                    LOGGER.warning(f.getName() + " expected " + expectedLength + " actual " + f.length());
                    // TODO: truncate to expected size?
                }
                if (!f.renameTo(new File(f.getParentFile(), undelName))) {
                    throw new IOException("Unable to rename " + f + " to " + undelName);
                }
                retainLogfiles.remove(undelName);
                // file was needed and now lives under its .jdb name; do not
                // fall through to the delete below
                continue;
            }
            // file not needed; delete/move-aside
            if (!f.delete()) {
                LOGGER.warning("unable to delete " + f);
                org.archive.util.FileUtils.moveAsideIfExists(f);
            }
            // TODO: log/warn of ruined later checkpoints? 
        }
        if (retainLogfiles.size() > 0) {
            // some needed files weren't present
            LOGGER.severe("Checkpoint corrupt, needed log files missing: " + retainLogfiles);
        }

    }

    /**
     * Checkpoint phase hook; this module does nothing at checkpoint finish.
     *
     * @param checkpointInProgress the checkpoint being finished (unused)
     */
    public void finishCheckpoint(Checkpoint checkpointInProgress) {
        // intentionally empty: all work happens in doCheckpoint()
    }

    /**
     * Checkpoint to recover from, if any. When non-null at {@link #start()},
     * the environment directory is restored from it before the environment
     * is opened.
     */
    protected Checkpoint recoveryCheckpoint;

    /**
     * Optionally autowired setter for the recovery checkpoint.
     *
     * @param checkpoint checkpoint to recover from at start, or null for none
     */
    @Autowired(required = false)
    public void setRecoveryCheckpoint(Checkpoint checkpoint) {
        recoveryCheckpoint = checkpoint;
    }

    /**
     * Releases everything this module holds: closes each object-identity
     * cache, closes each tracked database, then syncs and closes the
     * environment. Does nothing if the class catalog was never set up.
     * Every failure is logged at SEVERE and the shutdown carries on with
     * the next resource.
     */
    public void close() {
        if (classCatalog != null) {
            for (@SuppressWarnings("rawtypes")
            ObjectIdentityCache cache : oiCaches.values()) {
                try {
                    cache.close();
                } catch (Exception e) {
                    LOGGER.log(Level.SEVERE, "Error closing oiCache " + cache, e);
                }
            }

            // iterate over a snapshot: closeDatabase() removes entries from the map
            final List<String> openNames = new ArrayList<String>(databases.keySet());
            for (String dbName : openNames) {
                try {
                    closeDatabase(dbName);
                } catch (Exception e) {
                    LOGGER.log(Level.SEVERE, "Error closing db " + dbName, e);
                }
            }

            try {
                bdbEnvironment.sync();
                bdbEnvironment.close();
            } catch (Exception e) {
                LOGGER.log(Level.SEVERE, "Error closing environment.", e);
            }
        }
    }

    /**
     * Looks up a database previously opened through this module.
     *
     * @param name name the database was opened under
     * @return the tracked handle, or null if no database is tracked under that name
     */
    public Database getDatabase(String name) {
        final DatabasePlusConfig entry = databases.get(name);
        return (entry == null) ? null : entry.database;
    }

    /** Serial number that keeps synthesized temp map database names unique. */
    protected long sn = 0;

    /**
     * Creates a database-backed DisposableStoredSortedMap for transient
     * reporting requirements. Calling the returned map's dispose() method
     * when done also drops the database from this module's tracking.
     *
     * @param <K> key type
     * @param <V> value type
     * @param dbName Database name to use; if null a name will be synthesized
     * @param keyClass Class of keys; should be a Java primitive type
     * @param valueClass Class of values; may be any serializable type
     * @param allowDuplicates whether duplicate keys allowed
     * @param usePriorData whether to keep existing contents; when false the
     *        database is truncated first and may be created if absent
     * @return the new map
     * @throws RuntimeException wrapping any DatabaseException from opening
     */
    public <K, V> DisposableStoredSortedMap<K, V> getStoredMap(String dbName, Class<K> keyClass,
            Class<V> valueClass, boolean allowDuplicates, boolean usePriorData) {
        final BdbConfig mapConfig = new BdbConfig();
        mapConfig.setSortedDuplicates(allowDuplicates);
        mapConfig.setAllowCreate(!usePriorData);

        // synthesize a per-module unique name when none was given
        final String openName = (dbName != null) ? dbName
                : "tempMap-" + System.identityHashCode(this) + "-" + (sn++);

        final Database mapDb;
        try {
            mapDb = openDatabase(openName, mapConfig, usePriorData);
        } catch (DatabaseException e) {
            throw new RuntimeException(e);
        }

        // prefer a primitive tuple binding; fall back to serialization
        EntryBinding<V> valueBinding = TupleBinding.getPrimitiveBinding(valueClass);
        if (valueBinding == null) {
            valueBinding = new SerialBinding<V>(classCatalog, valueClass);
        }

        return new DisposableStoredSortedMap<K, V>(mapDb, TupleBinding.getPrimitiveBinding(keyClass),
                valueBinding, true) {
            @Override
            public void dispose() {
                super.dispose();
                // stop tracking the database once the map has been disposed
                if (BdbModule.this.databases.remove(openName) == null) {
                    BdbModule.LOGGER.log(Level.WARNING, "No such database: " + openName);
                }
            }
        };
    }

    /**
     * Bean-disposal callback; simply delegates to {@link #close()}.
     *
     * @throws Exception declared by the overridden method
     */
    @Override
    public void destroy() throws Exception {
        this.close();
    }

}