Java tutorial
/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.crawler.frontier.precedence; import static org.archive.modules.CoreAttributeConstants.A_PRECALC_PRECEDENCE; import java.util.Map; import org.archive.bdb.BdbModule; import org.archive.modules.CrawlURI; import org.archive.modules.recrawl.PersistProcessor; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.Lifecycle; import com.sleepycat.bind.serial.SerialBinding; import com.sleepycat.bind.serial.StoredClassCatalog; import com.sleepycat.bind.tuple.StringBinding; import com.sleepycat.collections.StoredSortedMap; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseException; /** * UriPrecedencePolicy which assigns URIs a precedence from a value that * was preloaded for them into the uri-history database. * * NOTE: Because this is a Lifecycle bean requiring start and stop, it * should not be instantiated as an anonymous inner bean. Rather, it * should be a top-level named bean, then either autowired or placed-by- * reference into the frontier. */ public class PreloadedUriPrecedencePolicy extends BaseUriPrecedencePolicy implements Lifecycle { private static final long serialVersionUID = -1474685153995064123L; /** Backup URI precedence assignment policy to use. */ { setDefaultUriPrecedencePolicy(new BaseUriPrecedencePolicy()); } public UriPrecedencePolicy getDefaultUriPrecedencePolicy() { return (UriPrecedencePolicy) kp.get("defaultUriPrecedencePolicy"); } public void setDefaultUriPrecedencePolicy(UriPrecedencePolicy policy) { kp.put("defaultUriPrecedencePolicy", policy); } // TODO: refactor to better share code with PersistOnlineProcessor protected BdbModule bdb; @Autowired public void setBdbModule(BdbModule bdb) { this.bdb = bdb; } protected StoredSortedMap<String, ?> store; protected Database historyDb; @SuppressWarnings({ "unchecked", "rawtypes" }) public void start() { if (isRunning()) { return; } store = null; String dbName = PersistProcessor.URI_HISTORY_DBNAME; try { StoredClassCatalog classCatalog = bdb.getClassCatalog(); BdbModule.BdbConfig dbConfig = PersistProcessor.HISTORY_DB_CONFIG; historyDb = bdb.openDatabase(dbName, dbConfig, true); SerialBinding sb = new SerialBinding(classCatalog, Map.class); StoredSortedMap historyMap = new StoredSortedMap(historyDb, new StringBinding(), sb, true); store = historyMap; } catch (DatabaseException e) { throw new RuntimeException(e); } } public boolean isRunning() { return historyDb != null; } public void stop() { if (!isRunning()) { return; } // BdbModule will handle closing of DB // XXX happens at finish; move to teardown? historyDb = null; } /* (non-Javadoc) * @see org.archive.crawler.frontier.precedence.BaseUriPrecedencePolicy#uriScheduled(org.archive.crawler.datamodel.CrawlURI) */ @Override public void uriScheduled(CrawlURI curi) { int precedence = calculatePrecedence(curi); if (precedence == 0) { // fall back to configured default policy getDefaultUriPrecedencePolicy().uriScheduled(curi); return; } curi.setPrecedence(precedence); } /* (non-Javadoc) * @see org.archive.crawler.frontier.precedence.BaseUriPrecedencePolicy#calculatePrecedence(org.archive.crawler.datamodel.CrawlURI) */ @Override protected int calculatePrecedence(CrawlURI curi) { mergePrior(curi); Integer preloadPrecedence = (Integer) curi.getData().get(A_PRECALC_PRECEDENCE); if (preloadPrecedence == null) { return 0; } return super.calculatePrecedence(curi) + preloadPrecedence; } /** * Merge any data from the Map stored in the URI-history store into the * current instance. * * TODO: ensure compatibility with use of PersistLoadProcessor; suppress * double-loading * @param curi CrawlURI to receive prior state data */ protected void mergePrior(CrawlURI curi) { String key = PersistProcessor.persistKeyFor(curi); @SuppressWarnings({ "rawtypes", "unchecked" }) Map<String, Map> prior = (Map<String, Map>) store.get(key); if (prior != null) { // merge in keys curi.getData().putAll(prior); } } }