uk.ac.susx.tag.classificationframework.datastructures.CacheManager.java Source code

Introduction

Here is the source code for uk.ac.susx.tag.classificationframework.datastructures.CacheManager.java, a helper class for assigning and managing MongoDB-backed caches of document-processing results for a FeatureExtractionPipeline.

Source

package uk.ac.susx.tag.classificationframework.datastructures;

/*
 * #%L
 * CacheManager.java - classificationframework - CASM Consulting - 2013
 * %%
 * Copyright (C) 2013 - 2014 CASM Consulting
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.ServerAddress;
import org.apache.commons.math.fraction.BigFraction;
import uk.ac.susx.tag.classificationframework.exceptions.CachingException;
import uk.ac.susx.tag.classificationframework.featureextraction.pipelines.FeatureExtractionPipeline;

import java.net.UnknownHostException;
import java.util.List;
import java.util.Set;

/**
 * Caching procedure:
 *
 *  CacheManager cm = new CacheManager(hostname, port);
 *
 *  FeatureExtractionPipeline pipeline = PipelineFactory.createCMUPipeline(true, true);
 *
 *  cm.assignCache(databaseName, collectionName, pipeline);
 *
 *  // Then later...
 *
 *  cm.close();
 *
 * Note the following:
 *
 *  - If you change the configuration of any of the pipeline's DocProcessors, or add or remove
 *    DocProcessors, then you should call pipeline.updateCachingConfiguration().
 *
 *  - If you want the pipeline to use the cache but NOT update it, call pipeline.setUpdateCache(false).
 *
 *  - The cache manager can be used in a try-with-resources statement (see the sketch below).
 *
 *  - There are convenience static methods (cache() and reCache()) for using a pipeline to cache a
 *    collection of Instances.
 *
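 *  A minimal try-with-resources sketch (the host, port, and database/collection names below are
 *  illustrative assumptions, not fixed values):
 *
 *      try (CacheManager cm = new CacheManager("localhost", 27017)) {
 *          cm.assignCache("exampleDatabase", "exampleCollection", pipeline);
 *          for (Instance document : documents)
 *              pipeline.processDocument(document);
 *      } // cm.close() is called automatically when the try block exits
 *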
 * User: Andrew D. Robertson
 * Date: 21/01/2014
 * Time: 17:32
 */
public class CacheManager implements AutoCloseable {

    private MongoClient client;

    public CacheManager(MongoClient client) {
        this.client = client;
        ensureHashingIndexExists();
    }

    public CacheManager(String hostname, int port) throws UnknownHostException {
        this(new MongoClient(new ServerAddress(hostname, port)));
    }

    @Override
    public void close() {
        client.close();
    }

    /**
     * Use this method to assign a cache to a pipeline. As a side benefit, the mapping from the
     * pipeline's configuration hash to its string representation is recorded in the "hashingIndex"
     * collection of the "cacheManagerMetadata" database.
     *
     * @param databaseName Database you want to do caching in
     * @param collectionName Collection you want to do caching in
     * @param pipeline Pipeline to which you wish to assign a cache
     * @param allowUpdates Allow the pipeline to cache new instances if it has to perform new processing
     * @param overwriteCollidingHashes If true, and the manager discovers a colliding hash (same hash,
     *                                 different string value), overwrite the old string value
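     *
     * For example (the database and collection names here are illustrative assumptions):
     *
     *   cm.assignCache("exampleDatabase", "exampleCollection", pipeline, false, false);
     *   // use the cache read-only: never add new entries, and fail on colliding hashes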
     */
    public void assignCache(String databaseName, String collectionName, FeatureExtractionPipeline pipeline,
            boolean allowUpdates, boolean overwriteCollidingHashes) {
        DB db = client.getDB(databaseName);
        DBCollection cache = db.collectionExists(collectionName) ? db.getCollection(collectionName)
                : setupCollection(db, collectionName);
        pipeline.setCache(cache, allowUpdates);
        addToHashingIndex(pipeline.getCacheConfiguration(), pipeline.getCacheConfigurationString(),
                overwriteCollidingHashes);
    }

    public void assignCache(String databaseName, String collectionName, FeatureExtractionPipeline pipeline) {
        assignCache(databaseName, collectionName, pipeline, true, true);
    }

    /**
     * Run the document-processing phase of the pipeline over a collection of Instances, caching the results.
     * @param hostname Mongo host name
     * @param port Mongo port
     * @param databaseName Mongo database name
     * @param collectionName Mongo collection name
     * @param pipeline Pipeline used for processing the Instances
     * @param documents the Instances
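     *
     * For example (host, port, and database/collection names are illustrative assumptions):
     *
     *   CacheManager.cache("localhost", 27017, "exampleDatabase", "exampleCollection", pipeline, documents);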
     */
    public static void cache(String hostname, int port, String databaseName, String collectionName,
            FeatureExtractionPipeline pipeline, Iterable<Instance> documents) throws UnknownHostException {

        try (CacheManager cm = new CacheManager(hostname, port)) {
            cm.assignCache(databaseName, collectionName, pipeline);
            for (Instance document : documents) {
                pipeline.processDocument(document);
            }
        }
    }

    /**
     * Cache the document-processed version of each Instance (as in cache()), except that any previously
     * cached versions are replaced.
     */
    public static void reCache(String hostname, int port, String databaseName, String collectionName,
            FeatureExtractionPipeline pipeline, Iterable<Instance> documents) throws UnknownHostException {

        try (CacheManager cm = new CacheManager(hostname, port)) {
            cm.assignCache(databaseName, collectionName, pipeline);
            for (Instance document : documents) {
                pipeline.reCache(document);
            }
        }
    }

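    /**
     * Instance version of reCache(): re-caches each document using this manager's existing Mongo
     * client rather than opening and closing a new connection.
     */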
    public void reCache(String databaseName, String collectionName, FeatureExtractionPipeline pipeline,
            Iterable<Instance> documents) {
        assignCache(databaseName, collectionName, pipeline);
        for (Instance document : documents) {
            pipeline.reCache(document);
        }
    }

    /*
     * Checking methods
     */

    /**
     * Determine how many documents are cached in a particular collection under a particular pipeline configuration.
     */
    public long numberCached(String databaseName, String collectionName, FeatureExtractionPipeline pipeline) {
        return client.getDB(databaseName).getCollection(collectionName)
                .count(new BasicDBObject("pipelineConfig", pipeline.getCacheConfiguration()));
    }

    /**
     * Given some dataset of instances and a pipeline to process them, calculate the fraction of those
     * instances which are already in the specified cache.
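     *
     * For example, to decide whether re-processing a dataset from scratch is worthwhile
     * (the names and threshold are illustrative assumptions):
     *
     *   double cachedFraction = cm.fractionCachedOfDataset("exampleDatabase", "exampleCollection", pipeline, documents);
     *   boolean mostlyCached = cachedFraction > 0.9;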
     */
    public double fractionCachedOfDataset(String databaseName, String collectionName,
            FeatureExtractionPipeline pipeline, Iterable<Instance> documents) {
        long total = 0;
        long cached = 0;
        DB database = client.getDB(databaseName);
        if (database.collectionExists(collectionName)) {
            DBCollection dataset = database.getCollection(collectionName);
            if (dataset.findOne() == null)
                return 0.0; // No items in collection
            int config = pipeline.getCacheConfiguration();

            for (Instance document : documents) {
                total++;
                if (null != dataset
                        .findOne(new BasicDBObject("pipelineConfig", config).append("instanceID", document.id))) {
                    cached++;
                }
            }
            // Guard against an empty documents iterable: BigFraction rejects a zero denominator
            return total == 0 ? 0.0 : new BigFraction(cached, total).doubleValue();
        } else {
            return 0.0;
        }
    }

    /*
     * Listing methods
     */

    public List<String> getDatabaseNames() {
        return client.getDatabaseNames();
    }

    public Set<String> getCollectionNames(String databaseName) {
        return client.getDB(databaseName).getCollectionNames();
    }

    public List<Integer> getCacheConfigs(String databaseName, String collectionName) {
        DBCollection collection = client.getDB(databaseName).getCollection(collectionName);
        return collection.distinct("pipelineConfig"); // Shh... It'll all be okay... We only ever add ints to this field
    }

    /*
     * Delete methods
     */

    public void deleteHashingIndex() {
        client.getDB("cacheManagerMetadata").getCollection("hashingIndex").drop();
    }

    public void deleteCacheManagerMetaData() {
        client.getDB("cacheManagerMetadata").dropDatabase();
    }

    public void deleteDatabase(String databaseName) {
        client.getDB(databaseName).dropDatabase();
    }

    public void deleteDataset(String databaseName, String collectionName) {
        client.getDB(databaseName).getCollection(collectionName).drop();
    }

    public void deleteCache(String databaseName, String collectionName, FeatureExtractionPipeline pipeline) {
        deleteCache(databaseName, collectionName, pipeline.getCacheConfiguration());
    }

    public void deleteCache(String databaseName, String collectionName, int configuration) {
        DB db = client.getDB(databaseName);
        if (db.collectionExists(collectionName)) {
            DBCollection collection = db.getCollection(collectionName);
            collection.remove(new BasicDBObject("pipelineConfig", configuration));
        }
    }

    private DBCollection setupCollection(DB database, String collectionName) {
        DBCollection collection = database.getCollection(collectionName);

        // Setup for compound index
        BasicDBObject obj = new BasicDBObject();
        obj.put("pipelineConfig", 1);
        obj.put("instanceID", 1);

        // Create options
        BasicDBObject opts = new BasicDBObject();
        opts.put("unique", true);

        collection.ensureIndex(obj, opts);

        //        collection.createIndex(new BasicDBObject("instanceID", 1));
        //        collection.createIndex(new BasicDBObject("pipelineConfig", 1));

        return collection;
    }

    /**
     * Ensure that the hashing index has been created.
     */
    private void ensureHashingIndexExists() {
        DB metadataDB = client.getDB("cacheManagerMetadata");
        if (!metadataDB.collectionExists("hashingIndex")) {
            DBCollection hashingIndex = metadataDB.getCollection("hashingIndex");
            hashingIndex.ensureIndex(new BasicDBObject("hashValue", 1), new BasicDBObject("unique", true));
        }
    }

    /**
     * Add a (hash, string value) pair to the hashing index.
     * @param hashValue The hash of a pipeline's tokeniser and DocProcessor configuration
     * @param stringValue The string representation of that configuration
     * @param overwrite If true, and the hash collides with an existing entry (i.e. the string value differs), overwrite the old string value
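     *
     * Each entry in the hashing index is a document of the form:
     *
     *   { "hashValue" : (int configuration hash), "stringValue" : (configuration string) }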
     */
    private void addToHashingIndex(int hashValue, String stringValue, boolean overwrite) {
        DBCollection hashingIndex = client.getDB("cacheManagerMetadata").getCollection("hashingIndex");

        DBObject currentIndex = hashingIndex.findOne(new BasicDBObject("hashValue", hashValue));

        BasicDBObject hash = new BasicDBObject();
        hash.put("hashValue", hashValue);
        hash.put("stringValue", stringValue);

        if (currentIndex == null) { // No current index for this hash, so just add this one
            hashingIndex.insert(hash);
        } else if (!((String) currentIndex.get("stringValue")).equals(stringValue)) { // Problem! The hashes match, but the string representations are different!
            if (overwrite) { // Overwrite the old
                hashingIndex.update(currentIndex, hash);
            } else {
                throw new CachingException(
                        "The hash configuration of this pipeline has been seen before, but their string representations are different.");
            }
        }
    }
}