dk.netarkivet.harvester.indexserver.CrawlLogIndexCache.java Source code

Introduction

Here is the source code for dk.netarkivet.harvester.indexserver.CrawlLogIndexCache.java.

Source

/* File:        $Id$
 * Revision:    $Revision$
 * Author:      $Author$
 * Date:        $Date$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 
 *  USA
 */

package dk.netarkivet.harvester.indexserver;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import is.hi.bok.deduplicator.CrawlDataIterator;
import is.hi.bok.deduplicator.DigestIndexer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;

import dk.netarkivet.common.distribute.indexserver.JobIndexCache;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.TimeUtils;
import dk.netarkivet.common.utils.ZipUtils;
import dk.netarkivet.harvester.HarvesterSettings;

/**
 * A cache that serves Lucene indices of crawl logs for given job IDs.
 * Uses the DigestIndexer in the deduplicator software:
 * http://deduplicator.sourceforge.net/apidocs/is/hi/bok/deduplicator/DigestIndexer.html
 * When the underlying files are combined, each file in the resulting Lucene
 * index is gzipped and the compressed versions are stored in the directory
 * given by getCacheFile().
 * A subclass must determine, through its constructor call, which mime types
 * are included.
 */
public abstract class CrawlLogIndexCache extends CombiningMultiFileBasedCache<Long> implements JobIndexCache {
    /** Needed to find origin information, which is file+offset from CDX index.
     */
    private final CDXDataCache cdxcache = new CDXDataCache();

    /** If true, documents matching the mimeFilter are ignored (the filter
     * acts as a blacklist); if false, only matching documents are included. */
    private boolean useBlacklist;
    /** A regular expression for the mimetypes to include in or exclude from
     * the index. See useBlacklist.
     */
    private String mimeFilter;
    /** The log. */
    private static Log log = LogFactory.getLog(CrawlLogIndexCache.class.getName());
    /** The time to sleep between each check of completeness.*/
    private final long sleepintervalBetweenCompletenessChecks = Settings
            .getLong(HarvesterSettings.INDEXSERVER_INDEXING_CHECKINTERVAL);
    /** Counter used to distinguish the logs of the different combine tasks. */
    private int indexingJobCount = 0;

    /**
     * Constructor for the CrawlLogIndexCache class.
     * @param name The name of the CrawlLogIndexCache.
     * @param blacklist Whether the mimeFilter should be treated as a
     *  blacklist (true) or a whitelist (false).
     * @param mimeFilter A regular expression for the mimetypes to
     * exclude/include.
     */
    public CrawlLogIndexCache(String name, boolean blacklist, String mimeFilter) {
        super(name, new CrawlLogDataCache());
        useBlacklist = blacklist;
        this.mimeFilter = mimeFilter;
    }

    /** Prepare data for combining.  This method overrides prepareCombine to
     * make sure that CDX data is available for the jobs before combining.
     *
     * @param ids Set of IDs that will be combined.
     * @return Map of ID->File of data to combine for the IDs where we could
     * find data.
     */
    protected Map<Long, File> prepareCombine(Set<Long> ids) {
        log.info("Starting to generate " + getCacheDir().getName() + " for the " + ids.size() + " jobs: " + ids);
        Map<Long, File> returnMap = super.prepareCombine(ids);
        Set<Long> missing = new HashSet<Long>();
        for (Long id : returnMap.keySet()) {
            Long cached = cdxcache.cache(id);
            if (cached == null) {
                missing.add(id);
            }
        }
        if (!missing.isEmpty()) {
            log.warn("Data not found for " + missing.size() + " jobs: " + missing);
        }
        for (Long id : missing) {
            returnMap.remove(id);
        }
        return returnMap;
    }

    /** Combine a number of crawl.log files into one Lucene index.  This index
     * is placed as gzip files under the directory returned by getCacheFile().
     *
     * @param rawfiles The map from job ID into crawl.log contents. No
     * null values are allowed in this map.
     */
    protected void combine(Map<Long, File> rawfiles) {
        indexingJobCount++;
        long datasetSize = rawfiles.values().size();
        log.info("Starting combine task #" + indexingJobCount + ". This combines a dataset with " + datasetSize
                + " crawl logs (thread = " + Thread.currentThread().getName() + ")");

        File resultDir = getCacheFile(rawfiles.keySet());
        Set<File> tmpfiles = new HashSet<File>();
        String indexLocation = resultDir.getAbsolutePath() + ".luceneDir";
        ThreadPoolExecutor executor = null;
        try {
            DigestIndexer indexer = createStandardIndexer(indexLocation);
            final boolean verboseIndexing = false;
            DigestOptions indexingOptions = new DigestOptions(this.useBlacklist, verboseIndexing, this.mimeFilter);
            long count = 0;
            Set<IndexingState> outstandingJobs = new HashSet<IndexingState>();
            final int maxThreads = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAXTHREADS);
            executor = new ThreadPoolExecutor(maxThreads, maxThreads, 0L, TimeUnit.MILLISECONDS,
                    new LinkedBlockingQueue<Runnable>());

            executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
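            // With CallerRunsPolicy, a task rejected by the executor (e.g. one submitted
            // after shutdown) is run in the submitting thread instead of being dropped.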

            for (Map.Entry<Long, File> entry : rawfiles.entrySet()) {
                Long jobId = entry.getKey();
                File crawlLog = entry.getValue();
                // Generate UUID to ensure a unique filedir for the index.
                File tmpFile = new File(FileUtils.getTempDir(), UUID.randomUUID().toString());
                tmpfiles.add(tmpFile);
                String localindexLocation = tmpFile.getAbsolutePath();
                Long cached = cdxcache.cache(jobId);
                if (cached == null) {
                    log.warn("Skipping the ingest of logs for job " + entry.getKey()
                            + ". Unable to retrieve cdx-file for job.");
                    continue;
                }
                File cachedCDXFile = cdxcache.getCacheFile(cached);

                // Dispatch this indexing task to a separate thread that 
                // handles the sorting of the logfiles and the generation
                // of a lucene index for this crawllog and cdxfile.
                count++;
                String taskID = count + " out of " + datasetSize;
                log.debug("Making subthread for indexing job " + jobId + " - task " + taskID);
                Callable<Boolean> task = new DigestIndexerWorker(localindexLocation, jobId, crawlLog, cachedCDXFile,
                        indexingOptions, taskID);
                Future<Boolean> result = executor.submit(task);
                outstandingJobs.add(new IndexingState(jobId, localindexLocation, result));
            }

            // wait for all the outstanding subtasks to complete.
            Set<Directory> subindices = new HashSet<Directory>();

            // Deadline for the combine-task
            long combineTimeout = Settings.getLong(HarvesterSettings.INDEXSERVER_INDEXING_TIMEOUT);
            long timeOutTime = System.currentTimeMillis() + combineTimeout;

            // The indexwriter for the totalindex.
            IndexWriter totalIndex = indexer.getIndex();
            int subindicesInTotalIndex = 0;
            // Max number of segments in totalindex.
            int maxSegments = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAX_SEGMENTS);

            final int ACCUMULATED_SUBINDICES_BEFORE_MERGING = 200;
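            // The loop below collects finished subindices and merges them into the total
            // index in batches of at most ACCUMULATED_SUBINDICES_BEFORE_MERGING, sleeping
            // between completeness checks while tasks are still outstanding.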

            while (outstandingJobs.size() > 0) {
                log.info("Outstanding jobs in combine task #" + indexingJobCount + " is now "
                        + outstandingJobs.size());
                Iterator<IndexingState> iterator = outstandingJobs.iterator();
                if (timeOutTime < System.currentTimeMillis()) {
                    log.warn("Max indexing time exceeded for one index ("
                            + TimeUtils.readableTimeInterval(combineTimeout) + "). Indexing stops here, although"
                            + " missing subindices for " + outstandingJobs.size() + " jobs");
                    break;
                }
                while (iterator.hasNext() && subindices.size() < ACCUMULATED_SUBINDICES_BEFORE_MERGING) {
                    Future<Boolean> nextResult;
                    IndexingState next = iterator.next();
                    if (next.getResultObject().isDone()) {
                        nextResult = next.getResultObject();
                        try {
                            // Check whether the indexing failed.
                            if (nextResult.get()) {
                                subindices.add(new SimpleFSDirectory(new File(next.getIndex())));
                            } else {
                                log.warn("Indexing of job " + next.getJobIdentifier() + " failed.");
                            }

                        } catch (InterruptedException e) {
                            log.warn("Unable to get Result back from " + "indexing thread", e);
                        } catch (ExecutionException e) {
                            log.warn("Unable to get Result back from " + "indexing thread", e);
                        }
                        //remove the done object from the set
                        iterator.remove();
                    }
                }

                if (subindices.size() >= ACCUMULATED_SUBINDICES_BEFORE_MERGING) {

                    log.info("Adding " + subindices.size()
                            + " subindices to main index. Forcing index to contain max " + maxSegments
                            + " files (related to combine task # " + indexingJobCount + ")");
                    totalIndex.addIndexes(subindices.toArray(new Directory[0]));
                    totalIndex.forceMerge(maxSegments);
                    totalIndex.commit();
                    for (Directory luceneDir : subindices) {
                        luceneDir.close();
                    }
                    subindicesInTotalIndex += subindices.size();
                    log.info("Completed adding " + subindices.size() + " subindices to main index, now containing "
                            + subindicesInTotalIndex + " subindices" + "(related to combine task # "
                            + indexingJobCount + ")");
                    subindices.clear();
                } else {
                    sleepAwhile();
                }
            }

            log.info("Adding the final " + subindices.size()
                    + " subindices to main index. Forcing index to contain max " + maxSegments + " files "
                    + "(related to combine task # " + indexingJobCount + ")");

            totalIndex.addIndexes(subindices.toArray(new Directory[0]));
            totalIndex.forceMerge(maxSegments);
            totalIndex.commit();
            for (Directory luceneDir : subindices) {
                luceneDir.close();
            }
            subindices.clear();

            log.info("Adding operation completed (combine task # " + indexingJobCount + ")!");
            long docsInIndex = totalIndex.numDocs();

            indexer.close();
            log.info("Closed index (related to combine task # " + indexingJobCount);

            // Now the index is made, gzip it up.
            File totalIndexDir = new File(indexLocation);
            log.info("Gzip-compressing the individual " + totalIndexDir.list().length
                    + " index files of combine task # " + indexingJobCount);
            ZipUtils.gzipFiles(totalIndexDir, resultDir);
            log.info("Completed combine task # " + indexingJobCount + " that combined a dataset with " + datasetSize
                    + " crawl logs (entries in combined index: " + docsInIndex + ") - compressed index has size "
                    + FileUtils.getHumanReadableFileSize(resultDir));
        } catch (IOException e) {
            throw new IOFailure("Error setting up craw.log index framework for " + resultDir.getAbsolutePath(), e);
        } finally {
            // close down Threadpool-executor
            closeDownThreadpoolQuietly(executor);
            FileUtils.removeRecursively(new File(indexLocation));
            for (File temporaryFile : tmpfiles) {
                FileUtils.removeRecursively(temporaryFile);
            }
        }
    }

    /**
     * Try to release all resources connected to the given ThreadPoolExecutor.
     * @param executor a ThreadPoolExecutor
     */
    private void closeDownThreadpoolQuietly(ThreadPoolExecutor executor) {
        if (executor == null) {
            return;
        }
        if (!executor.isShutdown()) {
            executor.shutdownNow();
        }
    }

    /**
     * Helper method to sleep a little between completeness checks.
     */
    private void sleepAwhile() {
        try {
            Thread.sleep(sleepintervalBetweenCompletenessChecks);
        } catch (InterruptedException e) {
            log.trace("Was awoken early from sleep: ", e);
        }
    }

    /** Ingest a single crawl.log file using the corresponding CDX file to find
     * offsets.
     *
     * @param id ID of the job to ingest.
     * @param crawllogfile The file containing the crawl.log data for the job.
     * @param cdxfile The file containing the CDX data for the job.
     * @param indexer The indexer to add to.
     * @param options The digesting options used.
     */
    protected static void indexFile(Long id, File crawllogfile, File cdxfile, DigestIndexer indexer,
            DigestOptions options) {
        log.debug("Ingesting the crawl.log file '" + crawllogfile.getAbsolutePath() + "' related to job " + id);
        boolean blacklist = options.getUseBlacklist();
        final String mimefilter = options.getMimeFilter();
        final boolean verbose = options.getVerboseMode();

        CrawlDataIterator crawlLogIterator = null;
        File sortedCdxFile = null;
        File tmpCrawlLog = null;
        BufferedReader cdxBuffer = null;
        try {
            sortedCdxFile = getSortedCDX(cdxfile);
            cdxBuffer = new BufferedReader(new FileReader(sortedCdxFile));
            tmpCrawlLog = getSortedCrawlLog(crawllogfile);
            crawlLogIterator = new CDXOriginCrawlLogIterator(tmpCrawlLog, cdxBuffer);
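            // The "ERROR" argument appears to serve as the default origin string for
            // entries whose origin cannot be resolved from the CDX data (assumption).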
            indexer.writeToIndex(crawlLogIterator, mimefilter, blacklist, "ERROR", verbose);
        } catch (IOException e) {
            throw new IOFailure("Fatal error indexing " + id, e);
        } finally {
            try {
                if (crawlLogIterator != null) {
                    crawlLogIterator.close();
                }
                if (tmpCrawlLog != null) {
                    FileUtils.remove(tmpCrawlLog);
                }
                if (cdxBuffer != null) {
                    cdxBuffer.close();
                }
                if (sortedCdxFile != null) {
                    FileUtils.remove(sortedCdxFile);
                }
            } catch (IOException e) {
                log.warn("Error cleaning up after" + " crawl log index cache generation", e);
            }
        }
    }

    /** Get a sorted, temporary CDX file corresponding to the given CDX file.
     *
     * @param cdxFile A CDX file.
     * @return A temporary file with the same CDX info, sorted according
     * to the standard CDX sorting rules.  This file will be removed at the
     * exit of the JVM, but removal should also be attempted as soon as it is
     * no longer used.
     */
    protected static File getSortedCDX(File cdxFile) {
        try {
            final File tmpFile = File.createTempFile("sorted", "cdx", FileUtils.getTempDir());
            // This throws IOFailure, if the sorting operation fails 
            FileUtils.sortCDX(cdxFile, tmpFile);
            tmpFile.deleteOnExit();
            return tmpFile;
        } catch (IOException e) {
            throw new IOFailure("Error while making tmp file for " + cdxFile, e);
        }
    }

    /** Get a sorted, temporary crawl.log file from an unsorted one.
     *
     * @param file The file containing an unsorted crawl.log.
     * @return A temporary file containing the entries sorted according to
     * URL.  The file will be removed upon exit of the JVM, but removal should
     * also be attempted as soon as it is no longer used.
     */
    protected static File getSortedCrawlLog(File file) {
        try {
            File tmpCrawlLog = File.createTempFile("sorted", "crawllog", FileUtils.getTempDir());
            // This throws IOFailure, if the sorting operation fails
            FileUtils.sortCrawlLog(file, tmpCrawlLog);
            tmpCrawlLog.deleteOnExit();
            return tmpCrawlLog;
        } catch (IOException e) {
            throw new IOFailure("Error creating sorted crawl log file for '" + file + "'", e);
        }
    }

    /**
     *  Create standard deduplication indexer.
     * 
     * @param indexLocation The full path to the indexing directory
     * @return the created deduplication indexer.
     * @throws IOException If unable to open the index.
     */
    protected static DigestIndexer createStandardIndexer(String indexLocation) throws IOException {
        // Set up Lucene for indexing our crawl logs.
        // MODE_BOTH: both URLs and hashes are indexed. Alternatives:
        // DigestIndexer.MODE_HASH or DigestIndexer.MODE_URL
        String indexingMode = DigestIndexer.MODE_BOTH;
        // used to be 'equivalent' setting
        boolean includeNormalizedURL = false;
        // used to be 'timestamp' setting
        boolean includeTimestamp = true;
        // used to be 'etag' setting
        boolean includeEtag = true;
        boolean addToExistingIndex = false;
        DigestIndexer indexer = new DigestIndexer(indexLocation, indexingMode, includeNormalizedURL,
                includeTimestamp, includeEtag, addToExistingIndex);
        return indexer;
    }
}
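
Example

CrawlLogIndexCache is abstract: a concrete subclass only needs to choose a cache name, decide whether the mimeFilter acts as a blacklist or a whitelist, and supply the filter regexp through the constructor shown above. Below is a minimal, hypothetical sketch; the class name TextOnlyCrawlLogIndexCache and the chosen filter are illustrative and not part of NetarchiveSuite. Callers would then obtain combined indices through the JobIndexCache interface that the cache implements.

package dk.netarkivet.harvester.indexserver;

/**
 * Hypothetical example subclass: builds a crawl-log index that only
 * covers entries whose mimetype matches "text/.*".
 */
public class TextOnlyCrawlLogIndexCache extends CrawlLogIndexCache {
    public TextOnlyCrawlLogIndexCache() {
        // blacklist = false: the mimeFilter is treated as a whitelist, so only
        // documents whose mimetype matches the regexp are indexed.
        super("textonlycrawllogindex", false, "^text/.*");
    }
}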