org.apache.jackrabbit.oak.plugins.document.JournalGarbageCollector.java Source code

Introduction

Here is the source code for org.apache.jackrabbit.oak.plugins.document.JournalGarbageCollector.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.jackrabbit.oak.plugins.document;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Stopwatch;

/**
 * The JournalGarbageCollector can clean up JournalEntries that are older than a
 * particular age.
 * <p>
 * It would typically be invoked in conjunction with the VersionGarbageCollector
 * but must not be confused with that one - 'journal' refers to the separate
 * collection that contains changed paths per background writes used for
 * observation.
 */
public class JournalGarbageCollector {

    private final DocumentStore ds;

    private static final Logger log = LoggerFactory.getLogger(JournalGarbageCollector.class);

    public JournalGarbageCollector(DocumentNodeStore nodeStore) {
        this.ds = nodeStore.getDocumentStore();
    }

    /**
     * Deletes entries in the journal that are older than the given
     * maxRevisionAge.
     *
     * @param maxRevisionAge entries older than this age will be removed
     * @param unit           the timeunit for maxRevisionAge
     * @return the number of entries that have been removed
     */
    public int gc(long maxRevisionAge, int batchSize, TimeUnit unit) {
        long maxRevisionAgeInMillis = unit.toMillis(maxRevisionAge);
        if (log.isDebugEnabled()) {
            log.debug("gc: Journal garbage collection starts with maxAge: {} min., batch size: {}.",
                    TimeUnit.MILLISECONDS.toMinutes(maxRevisionAgeInMillis), batchSize);
        }
        Stopwatch sw = Stopwatch.createStarted();

        // the journal has ids of the following format:
        // 1-0000014db9aaf710-00000001
        // whereas the first number is the cluster node id.
        // now, this format prevents from doing a generic
        // query to get all 'old' entries, as the documentstore
        // can only query for a sequential list of entries.
        // (and the cluster node id here partitions the set
        // of entries that we have to delete)
        // To account for that, we simply iterate over all 
        // cluster node ids and clean them up individually.
        // Note that there are possible alternatives, such
        // as: let each node clean up its own old entries
        // but the chosen path is also quite simple: it can
        // be started on any instance - but best on only one.
        // if it's run on multiple concurrently, then they
        // will compete at deletion, which is not optimal
        // due to performance, but does not harm.

        // 1. get the list of cluster node ids
        final List<ClusterNodeInfoDocument> clusterNodeInfos = ClusterNodeInfoDocument.all(ds);
        int numDeleted = 0;
        for (ClusterNodeInfoDocument clusterNodeInfoDocument : clusterNodeInfos) {
            // current algorithm is to simply look at all cluster nodes
            // irrespective of whether they are active or inactive etc.
            // this could be optimized for inactive ones: at some point, all
            // journal entries of inactive ones would have been cleaned up
            // and at that point we could stop including those long-time-inactive ones.
            // that 'long time' aspect would have to be tracked though, to be sure
            // we don't leave garbage.
            // so simpler is to quickly do a query even for long-time inactive ones
            final int clusterNodeId = clusterNodeInfoDocument.getClusterId();

            // 2. iterate over that list and do a query with
            //    a limit of 'batch size'
            boolean branch = false;
            long startPointer = 0;
            while (true) {
                String fromKey = JournalEntry.asId(new Revision(startPointer, 0, clusterNodeId, branch));
                String toKey = JournalEntry.asId(new Revision(System.currentTimeMillis() - maxRevisionAgeInMillis,
                        Integer.MAX_VALUE, clusterNodeId, branch));
                List<JournalEntry> deletionBatch = ds.query(Collection.JOURNAL, fromKey, toKey, batchSize);
                if (deletionBatch.size() > 0) {
                    ds.remove(Collection.JOURNAL, asKeys(deletionBatch));
                    numDeleted += deletionBatch.size();
                }
                if (deletionBatch.size() < batchSize) {
                    if (!branch) {
                        // do the same for branches:
                        // this will start at the beginning again with branch set to true
                        // and eventually finish too
                        startPointer = 0;
                        branch = true;
                        continue;
                    }
                    break;
                }
                startPointer = deletionBatch.get(deletionBatch.size() - 1).getRevisionTimestamp();
            }
        }

        sw.stop();

        if (numDeleted > 0) {
            log.info("gc: Journal garbage collection took {}, deleted {} entries that were older than {} min.", sw,
                    numDeleted, TimeUnit.MILLISECONDS.toMinutes(maxRevisionAgeInMillis));
        }
        return numDeleted;
    }

    private List<String> asKeys(List<JournalEntry> deletionBatch) {
        final List<String> keys = new ArrayList<String>(deletionBatch.size());
        for (JournalEntry e : deletionBatch) {
            keys.add(e.getId());
        }
        return keys;
    }

}