org.dspace.util.SolrUpgradePre6xStatistics.java Source code

Introduction

Here is the source code for org.dspace.util.SolrUpgradePre6xStatistics.java, a command-line tool that upgrades legacy numeric id references in SOLR statistics records to DSpace 6 UUIDs.

Source

/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.util;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.UUID;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.dspace.content.Bitstream;
import org.dspace.content.Community;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BitstreamService;
import org.dspace.content.service.CollectionService;
import org.dspace.content.service.CommunityService;
import org.dspace.content.service.ItemService;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.eperson.EPerson;
import org.dspace.eperson.Group;
import org.dspace.eperson.factory.EPersonServiceFactory;
import org.dspace.eperson.service.EPersonService;
import org.dspace.eperson.service.GroupService;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;

/**
 * CLI tool to upgrade legacy id references in SOLR statistics to DSpace 6 UUIDs.
 * 
 * This command will need to be run iteratively over each statistics shard until all legacy id values have
 * been replaced.
 * 
 * If a legacy id cannot be resolved from the database, the value is flagged with an "-unmigrated"
 * suffix so that the record is not selected again by the migration query.
 *   "field:* AND NOT(field:*-*)" can be used to locate remaining legacy ids
 * 
 * See DS-3602 for the origin of this issue.  This code is targeted for inclusion in the DSpace 6.1 release.
 * 
 * Recommendation: for a large repository, run this command with -Xmx2000m if possible.
 * 
 * In testing, processing 1,000,000 statistics records took roughly 60 minutes.
 * 
 * @author Terry Brady, Georgetown University Library
 */
public class SolrUpgradePre6xStatistics {
    //Command line parameter constants
    private static final String INDEX_NAME_OPTION = "i";
    private static final String NUMREC_OPTION = "n";
    private static final String BATCH_OPTION = "b";
    private static final String TYPE_OPTION = "t";
    private static final String HELP_OPTION = "h";
    private static final int NUMREC_DEFAULT = 100000;
    private static final int BATCH_DEFAULT = 10000;

    //After processing each batch of updates to SOLR, evaluate whether the hibernate cache needs to be cleared
    private static final int CACHE_LIMIT = 20000;

    private static final String INDEX_DEFAULT = "statistics";
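    // MIGQUERY matches records whose id, scopeId, or epersonid is still a legacy (purely numeric) value.
    // UUIDs (and "-unmigrated" placeholders) always contain "-", so "field:* AND -(field:*-*)" selects
    // only values that have not yet been processed.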
    private static final String MIGQUERY = "(id:* AND -(id:*-*)) OR (scopeId:* AND -(scopeId:*-*)) OR (epersonid:* AND -(epersonid:*-*))";

    //Counters to determine the number of items to process
    private int numRec = NUMREC_DEFAULT;
    private int batchSize = BATCH_DEFAULT;

    //Cache management
    private int numProcessed = 0;
    private long totalCache = 0;
    private long numUncache = 0;
    private List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
    private Context context;

    //Enum to identify the named SOLR statistics fields to update
    private enum FIELD {
        id, scopeId, owningComm, owningColl, owningItem, epersonid, owner, submitter, actor;
    }

    //Logger
    private static final Logger log = Logger.getLogger(SolrUpgradePre6xStatistics.class);

    //DSpace Services
    private ConfigurationService configurationService = DSpaceServicesFactory.getInstance()
            .getConfigurationService();
    protected CommunityService communityService = ContentServiceFactory.getInstance().getCommunityService();
    protected CollectionService collectionService = ContentServiceFactory.getInstance().getCollectionService();
    protected ItemService itemService = ContentServiceFactory.getInstance().getItemService();
    protected BitstreamService bitstreamService = ContentServiceFactory.getInstance().getBitstreamService();
    protected EPersonService epersonService = EPersonServiceFactory.getInstance().getEPersonService();
    protected GroupService groupService = EPersonServiceFactory.getInstance().getGroupService();

    // This code operates on one shard at a time, so the SOLR web service is accessed directly rather
    // than through the DSpace Solr Logger, which only writes to the current shard
    private HttpSolrServer server;

    //Allows for smart use of hibernate cache
    private Item lastItem = null;
    private Bitstream lastBitstream = null;

    //Report on process times
    private long startTime = -1;
    private long lastTime = -1;

    /**
     * Construct the utility class from the command line options
     * @param indexName name of the statistics shard to update
     * @param numRec    maximum number of records to process
     * @param batchSize number of records to send to SOLR in each batch update
     * @throws IOException
     * @throws SolrServerException
     */
    public SolrUpgradePre6xStatistics(String indexName, int numRec, int batchSize)
            throws SolrServerException, IOException {
        String serverPath = configurationService.getProperty("solr-statistics.server");
        serverPath = serverPath.replaceAll("statistics$", indexName);
        System.out.println("Connecting to " + serverPath);
        server = new HttpSolrServer(serverPath);
        server.setMaxTotalConnections(1);
        this.numRec = numRec;
        this.batchSize = batchSize;
        refreshContext();
    }

    /*
     * Process a batch of updates to SOLR
     */
    private void batchUpdateStats() throws SolrServerException, IOException {
        if (docs.size() > 0) {
            server.add(docs);
            server.commit(true, true);
            docs.clear();
        }
    }

    /**
     * Refresh the DSpace Context object in order to periodically release objects from memory
     * @throws IOException
     * @throws SolrServerException
     */
    private void refreshContext() throws SolrServerException, IOException {
        if (context != null) {
            try {
                totalCache += numUncache + context.getCacheSize();
            } catch (SQLException e) {
                log.warn(e.getMessage());
            }
        }
        this.context = new Context(Context.Mode.READ_ONLY);
        lastItem = null;
        lastBitstream = null;
        numUncache = 0;
    }

    /*
     * Compute the number of items that have been cached by hibernate.
     * 
     * @param fromStart if true, include items cached before the last context refresh
     */
    private long getCacheCounts(boolean fromStart) {
        long count = 0;
        try {
            count = context.getCacheSize();
        } catch (Exception e) {
            //no action
        }
        count += this.numUncache;
        if (fromStart) {
            count += totalCache;
        }
        return count;
    }

    /**
     * Compute the time since the last batch was processed
     * 
     * @param fromStart
     *            if true, report on processing time since the start of the program
     * @return elapsed time in ms: since the start time if fromStart is true, otherwise since the last call
     */
    private long logTime(boolean fromStart) {
        long ret = 0;
        long cur = new Date().getTime();
        if (lastTime == -1) {
            startTime = cur;
        } else if (fromStart) {
            ret = cur - startTime;
        } else {
            ret = cur - lastTime;
        }
        lastTime = cur;
        return ret;
    }

    /*
     * Format ms count as h:mm:ss
     * 
     * @param dur Duration in ms
     * 
     * @return duration formatted as h:mm:ss
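     * 
     * Example: 3725000 ms formats as "1:02:05"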
     */
    private String duration(long dur) {
        long sec = dur / 1000;
        long hh = sec / 3600;
        long mm = (sec % 3600) / 60;
        long ss = (sec % 60);
        return String.format("%d:%02d:%02d", hh, mm, ss);
    }

    /**
     * Print a status message appended with the processing time for the operation
     * 
     * @param numProcessed
     *            number of records processed so far
     * @param fromStart
     *            if true, report on processing time since the start of the program
     */
    private void printTime(int numProcessed, boolean fromStart) {
        long dur = logTime(fromStart);
        long totalDur = logTime(true);
        String stotalDur = duration(totalDur);
        long cacheSize = 0;
        try {
            cacheSize = context.getCacheSize();
        } catch (SQLException e) {
            log.error("Cannot get cache size", e);
        }
        String label = fromStart ? "TOTAL" : "Processed";
        System.out.println(String.format("%s (%s; %s; %s)", String.format("\t%,12d %10s...", numProcessed, label),
                String.format("%,6d sec; %s", dur / 1000, stotalDur),
                String.format("DB cache: %,6d/%,8d", cacheSize, getCacheCounts(fromStart)),
                String.format("Docs: %,6d", docs.size())));
    }

    /*
     * Create command line option processor
     */
    private static Options makeOptions() {
        Options options = new Options();
        options.addOption(HELP_OPTION, "help", false, "Get help on options for this command.");
        options.addOption(INDEX_NAME_OPTION, "index-name", true,
                "The names of the indexes to process. At least one is required (default=statistics)");
        options.addOption(NUMREC_OPTION, "num-rec", true, "Total number of records to update (defaut=100,000).");
        options.addOption(BATCH_OPTION, "batch-size", true,
                "Number of records to batch update to SOLR at one time (default=10,000).");
        return options;
    }

    /**
     * A utility method to print out all available command-line options and exit
     * given the specified code.
     *
     * @param options
     *            the supported options.
     * @param exitCode
     *            the exit code to use. The method will call System#exit(int) with
     *            the given code.
     */
    private static void printHelpAndExit(Options options, int exitCode) {
        HelpFormatter myhelp = new HelpFormatter();
        myhelp.printHelp(SolrUpgradePre6xStatistics.class.getSimpleName() + "\n", options);
        System.out.println("\n\nCommand Defaults");
        System.out.println(
                "\tsolr-upgrade-statistics-6x [-i statistics] [-n num_recs_to_process] [-b num_rec_to_update_at_once]");
        System.out.println("");
        System.out.println(
                "\tAfter upgrading to DSpace 6, this process should be run iteratively over every statistics shard ");
        System.out.println("\t\tuntil there are no remaining records with legacy ids present.");
        System.out.println("\t\tThis process can be run while the system is in use.");
        System.out.println("");
        System.out.println("\tIt will take 20-30 min to process 1,000,000 legacy records. ");
        System.out.println("");
        System.out.println("\tUse the -n option to manage the workload on your server. ");
        System.out.println("\t\tTo process all records, set -n to 10000000 or to 100000000 (10M or 100M)");
        System.out.println("\tIf possible, please allocate 2GB of memory to this process (e.g. -Xmx2000m)");
        System.out.println("");
        System.out.println("\tThis process will rewrite most solr statistics records and may temporarily double ");
        System.out.println(
                "\t\tthe size of your statistics repositories.  Consider optimizing your solr repos when complete.");

        System.exit(exitCode);
    }

    /**
     * Entry point for command-line invocation
     * 
     * @param args
     *            command-line arguments; see help for description
     * @throws ParseException
     *             if the command-line arguments cannot be parsed
     */
    public static void main(String[] args) throws ParseException {
        CommandLineParser parser = new PosixParser();
        Options options = makeOptions();

        System.out.println(" * This process should be run iteratively over every statistics shard ");
        System.out.println(" * until there are no remaining records with legacy ids present.");
        System.out.println(" * This process can be run while the system is in use.");
        System.out.println(" * It is likely to take 1 hour/1,000,000 legacy records to be udpated.");
        System.out.println(" *");
        System.out.println(" * This process will rewrite most solr statistics records and may temporarily double ");
        System.out.println(
                " *\tthe size of your statistics repositories.  Consider optimizing your solr repos when complete.");
        System.out.println(" * -------------------------------------------------------------------");

        String indexName = INDEX_DEFAULT;
        int numrec = NUMREC_DEFAULT;
        int batchSize = BATCH_DEFAULT;
        try {
            CommandLine line = parser.parse(options, args);
            if (line.hasOption(HELP_OPTION)) {
                printHelpAndExit(options, 0);
            }

            if (line.hasOption(INDEX_NAME_OPTION)) {
                indexName = line.getOptionValue(INDEX_NAME_OPTION, INDEX_DEFAULT);
            } else {
                System.err.println("No index name provided, defaulting to : " + INDEX_DEFAULT);
            }

            if (line.hasOption(NUMREC_OPTION)) {
                numrec = Integer.parseInt(line.getOptionValue(NUMREC_OPTION, "" + NUMREC_DEFAULT));
            }
            if (line.hasOption(BATCH_OPTION)) {
                batchSize = Integer.parseInt(line.getOptionValue(BATCH_OPTION, "" + BATCH_DEFAULT));
            }

        } catch (ParseException e) {
            System.err.println("Cannot read command options");
            printHelpAndExit(options, 1);
        }

        try {
            SolrUpgradePre6xStatistics upgradeStats = new SolrUpgradePre6xStatistics(indexName, numrec, batchSize);
            upgradeStats.run();
        } catch (SolrServerException | SQLException | IOException e) {
            log.error("Error querying stats", e);
        }
    }

    /*
     * Report on the existence of legacy id records within a shard
     */
    private void runReport() throws SolrServerException {
        System.out.println();
        System.out.println("=================================================================");
        System.out.println("\t*** Statistics Records with Legacy Id ***\n");
        long total = runReportQuery();
        System.out.println("\t--------------------------------------");
        System.out.println(String.format("\t%,12d\t%s", total, "TOTAL"));
        System.out.println("=================================================================");
        System.out.println();
    }

    /*
     * Report on the existence of specific legacy id records within a shard
     */
    private long runReportQuery() throws SolrServerException {
        StringBuilder sb = new StringBuilder(MIGQUERY);
        SolrQuery sQ = new SolrQuery();
        sQ.setQuery(sb.toString());
        sQ.setFacet(true);
        sQ.addFacetField("type");
        sQ.addFacetField("scopeType");
        QueryResponse sr = server.query(sQ);

        long total = 0;
        long unexpected = 0;
        for (FacetField ff : sr.getFacetFields()) {
            String s = ff.getName().equals("type") ? "View" : "Search";
            for (FacetField.Count count : ff.getValues()) {
                String name = count.getName();
                int id = Integer.parseInt(name);
                if (id == Constants.COMMUNITY) {
                    name = "Community " + s;
                } else if (id == Constants.COLLECTION) {
                    name = "Collection " + s;
                } else if (id == Constants.ITEM) {
                    name = "Item " + s;
                } else if (id == Constants.BITSTREAM) {
                    name = "Bistream " + s;
                } else {
                    /*
                     * In testing, I discovered some unexpected values in the scopeType field. It
                     * looks like they may have been a result of a CSV import/export error. This
                     * will group any unexpected values into one report line.
                     */
                    unexpected += count.getCount();
                    continue;
                }
                System.out.println(String.format("\t%,12d\t%s", count.getCount(), name));
                total += count.getCount();
            }
        }
        if (unexpected > 0) {
            System.out.println(String.format("\t%,12d\t%s", unexpected, "Unexpected Type & Full Site"));
            total += unexpected;
        }
        long rem = sr.getResults().getNumFound() - total;
        if (rem > 0) {
            System.out.println(String.format("\t%,12d\t%s", rem, "Other Records"));
            total += rem;
        }
        return total;
    }

    /*
     * Process records with a legacy id. The following sequence is applied in
     * order to optimize hibernate caching:
     * 
     * Communities and Collections - retain in the cache, since each is likely to be re-used.
     * Items - retain in the cache until a new item is processed.
     * Bitstreams - retain in the cache until a new bitstream is processed.
     */
    private void run() throws SolrServerException, SQLException, IOException {
        runReport();
        logTime(false);
        for (int processed = updateRecords(MIGQUERY); (processed != 0)
                && (numProcessed < numRec); processed = updateRecords(MIGQUERY)) {
            printTime(numProcessed, false);
            batchUpdateStats();
            if (context.getCacheSize() > CACHE_LIMIT) {
                refreshContext();
            }
        }
        printTime(numProcessed, true);

        if (numProcessed > 0) {
            runReport();
        }
    }

    /*
     * Update a batch of statistics records that still contain legacy ids
     * 
     * @param query Query that retrieves the statistics records to be updated
     * 
     * @return number of items processed. 0 indicates that no more work is available
     * (or the max processed has been reached).
     */
    private int updateRecords(String query) throws SolrServerException, SQLException, IOException {
        int initNumProcessed = numProcessed;
        SolrQuery sQ = new SolrQuery();
        sQ.setQuery(query);
        sQ.setRows(batchSize);

        // Ensure that records are grouped by id.
        // Sorting directly on id fails due to the presence of both numeric (legacy) and string (UUID)
        // values in the field; wrapping the field in Solr's ord function seems to help.
        sQ.addSort("type", SolrQuery.ORDER.desc);
        sQ.addSort("scopeType", SolrQuery.ORDER.desc);
        sQ.addSort("ord(owningItem)", SolrQuery.ORDER.desc);
        sQ.addSort("id", SolrQuery.ORDER.asc);
        sQ.addSort("scopeId", SolrQuery.ORDER.asc);

        QueryResponse sr = server.query(sQ);
        SolrDocumentList sdl = sr.getResults();

        for (int i = 0; i < sdl.size() && (numProcessed < numRec); i++) {
            SolrDocument sd = sdl.get(i);
            SolrInputDocument input = ClientUtils.toSolrInputDocument(sd);
            input.remove("_version_");
            for (FIELD col : FIELD.values()) {
                mapField(input, col);
            }

            docs.add(input);
            ++numProcessed;
        }
        return numProcessed - initNumProcessed;
    }

    /*
     * Map solr fields from legacy ids to UUIDs.
     * 
     * The id field is interpreted using the type field. The scopeId field is
     * interpreted using the scopeType field.
     * 
     * Legacy ids that cannot be mapped are flagged with an "-unmigrated" suffix
     * 
     * @param input The SOLR statistics document to be updated
     * 
     * @param col The SOLR field to update (if present)
     */
    private void mapField(SolrInputDocument input, FIELD col) throws SQLException {
        SolrInputField ifield = input.get(col.name());
        if (ifield != null) {
            Collection<Object> vals = ifield.getValues();
            ArrayList<String> newvals = new ArrayList<>();
            for (Object ovalx : vals) {
                //DS-3436 documented an issue in which multi-values in shards were converted to a comma separated string
                //It also produced strings containing "\" at the end of a value
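                //e.g. a merged value such as 12,13\ (a hypothetical example) is handled as the two legacy ids 12 and 13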
                for (String oval : ovalx.toString().split(",")) {
                    oval = oval.replace("\\", "");
                    try {
                        UUID uuid = null;
                        if (col == FIELD.owner) {
                            if (oval.length() > 1) {
                                String owntype = oval.substring(0, 1);
                                int legacy = Integer.parseInt(oval.substring(1));
                                uuid = mapOwner(owntype, legacy);
                            }
                        } else {
                            int legacy = Integer.parseInt(oval);
                            if (col == FIELD.id) {
                                Object otype = input.getFieldValue("type");
                                if (otype != null) {
                                    int type = Integer.parseInt(otype.toString());
                                    uuid = mapType(type, legacy);
                                }
                            } else if (col == FIELD.scopeId) {
                                Object otype = input.getFieldValue("scopeType");
                                if (otype != null) {
                                    int type = Integer.parseInt(otype.toString());
                                    uuid = mapType(type, legacy);
                                }
                            } else {
                                uuid = mapId(col, legacy);
                            }
                        }
                        if (uuid != null) {
                            if (!newvals.contains(uuid.toString())) {
                                newvals.add(uuid.toString());
                            }
                        } else {
                            String s = oval + "-unmigrated";
                            if (!newvals.contains(s)) {
                                newvals.add(s);
                            }
                        }
                    } catch (NumberFormatException e) {
                        log.warn("Non numeric legacy id " + col.name() + ":" + oval);
                    }
                }
            }
            if (newvals.size() > 0) {
                input.removeField(col.name());
                for (String nv : newvals) {
                    input.addField(col.name(), nv);
                }
            }
        }
    }

    /*
     * Determine if the last processed item should be cleared from the hibernate
     * cache
     * 
     * @param item Current item being processed
     */
    private void checkLastItem(Item item) throws SQLException {
        if (item != null) {
            if (lastItem == null) {
                lastItem = item;
            } else if (!lastItem.getID().equals(item.getID())) {
                numUncache++;
                context.uncacheEntity(lastItem);
                lastItem = item;
            }
        }
    }

    /*
     * Determine if the last processed bitstream should be cleared from the
     * hibernate cache
     * 
     * @param bitstream Current bitstream being processed
     */
    private void checkLastBitstream(Bitstream bitstream) throws SQLException {
        if (bitstream != null) {
            if (lastBitstream == null) {
                lastBitstream = bitstream;
            } else if (!lastBitstream.getID().equals(bitstream.getID())) {
                numUncache++;
                context.uncacheEntity(lastBitstream);
                lastBitstream = bitstream;
            }
        }
    }

    /*
     * Retrieve the UUID corresponding to a legacy id found in a SOLR statistics
     * record
     * 
     * @param col Solr Statistic Field being processed
     * 
     * @param val Value to lookup as a legacy id
     */
    private UUID mapId(FIELD col, int val) throws SQLException {

        if (col == FIELD.owningComm) {
            Community comm = communityService.findByLegacyId(context, val);
            return comm == null ? null : comm.getID();
        }
        if (col == FIELD.owningColl) {
            org.dspace.content.Collection coll = collectionService.findByLegacyId(context, val);
            return coll == null ? null : coll.getID();
        }
        if (col == FIELD.owningItem) {
            Item item = itemService.findByLegacyId(context, val);
            checkLastItem(item);
            return item == null ? null : item.getID();
        }
        if (col == FIELD.epersonid || col == FIELD.actor || col == FIELD.submitter) {
            EPerson per = epersonService.findByLegacyId(context, val);
            return per == null ? null : per.getID();
        }
        return null;
    }

    /*
     * Retrieve the UUID corresponding to a legacy id found in a SOLR statistics
     * record
     * 
     * @param type Identifying type field for id OR scopeType field for scopeId
     * 
     * @param val Value to lookup as a legacy id
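     * 
     * Example: type = Constants.ITEM with val = 123 resolves to the Item whose legacy id is 123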
     */
    private UUID mapType(int type, int val) throws SQLException {
        if (type == Constants.COMMUNITY) {
            Community comm = communityService.findByLegacyId(context, val);
            return comm == null ? null : comm.getID();
        }
        if (type == Constants.COLLECTION) {
            org.dspace.content.Collection coll = collectionService.findByLegacyId(context, val);
            return coll == null ? null : coll.getID();
        }
        if (type == Constants.ITEM) {
            Item item = itemService.findByLegacyId(context, val);
            checkLastItem(item);
            return item == null ? null : item.getID();
        }
        if (type == Constants.BITSTREAM) {
            Bitstream bit = bitstreamService.findByLegacyId(context, val);
            UUID uuid = bit == null ? null : bit.getID();
            // A bitstream is unlikely to be processed more than once, so clear it immediately
            checkLastBitstream(bit);
            return uuid;
        }
        return null;
    }

    /*
     * Retrieve the UUID corresponding to a legacy owner found in a SOLR statistics
     * record. Legacy owner fields are prefixed in solr with "e" or "g"
     * 
     * @param owntype Identifying type field (e - eperson, g - group)
     * 
     * @param val Value to lookup as a legacy id
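     * 
     * Example: "e42" resolves to the EPerson with legacy id 42; "g7" to the Group with legacy id 7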
     */
    private UUID mapOwner(String owntype, int val) throws SQLException {
        if (owntype.equals("e")) {
            EPerson per = epersonService.findByLegacyId(context, val);
            return per == null ? null : per.getID();
        } else if (owntype.equals("g")) {
            Group perg = groupService.findByLegacyId(context, val);
            return perg == null ? null : perg.getID();
        }
        return null;
    }

}
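
Usage example

The tool is normally launched through the solr-upgrade-statistics-6x command shown in the help text
above. As a minimal sketch, the class can also be driven programmatically; UpgradeStatsRunner below
is a hypothetical harness, and the argument values are illustrative:

import org.dspace.util.SolrUpgradePre6xStatistics;

public class UpgradeStatsRunner {
    public static void main(String[] args) throws Exception {
        // Process up to 1,000,000 legacy records from the "statistics" shard,
        // sending updates to SOLR in batches of 10,000.
        SolrUpgradePre6xStatistics.main(new String[] {
                "-i", "statistics",
                "-n", "1000000",
                "-b", "10000" });
    }
}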