Java tutorial
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.processor;

import static org.archive.modules.fetcher.FetchStatusCodes.S_BLOCKED_BY_CUSTOM_PROCESSOR;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.deciderules.AcceptDecideRule;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.DecideRule;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.fingerprint.ArrayLongFPCache;
import org.springframework.context.Lifecycle;

import st.ata.util.FPGenerator;

/**
 * A simple crawl splitter/mapper, dividing up CrawlURIs between crawlers
 * by diverting some range of URIs to local log files (which can then be
 * imported to other crawlers).
 *
 * <p>May operate on a CrawlURI (typically early in the processing chain) or
 * its CrawlURI outlinks (late in the processing chain, after
 * LinksScoper), or both (if inserted and configured in both places).
 *
 * <p>Applies a map() method, supplied by a concrete subclass, to
 * classKeys to map URIs to crawlers by name.
 *
 * <p>One crawler name is distinguished as the 'local name'; URIs mapped to
 * this name are not diverted, but continue to be processed normally.
 *
 * <p>If using the JMX importUris operation importing URLs dropped by
 * a {@link CrawlMapper} instance, use <code>recoveryLog</code> style.
 *
 * <p>Log rotation ({@link #updateGeneration(String)}), log writes
 * ({@link #divertLog(CrawlURI, String)}) and {@link #stop()} are
 * synchronized on this instance.
 *
 * @author gojomo
 * @version $Date$, $Revision$
 */
public abstract class CrawlMapper extends Processor implements Lifecycle {

    /** Used to report diversion-log housekeeping problems. */
    private static final Logger LOGGER =
            Logger.getLogger(CrawlMapper.class.getName());

    /**
     * PrintWriter which remembers the File to which it writes.
     */
    private static class FilePrintWriter extends PrintWriter {
        private final File file;

        public FilePrintWriter(File file) throws FileNotFoundException {
            super(new BufferedOutputStream(new FileOutputStream(file)));
            this.file = file;
        }

        public File getFile() {
            return file;
        }
    }

    /**
     * Whether to apply the mapping to a URI being processed itself, for example
     * early in processing (while its status is still 'unattempted').
     */
    protected boolean checkUri = true;

    public boolean getCheckUri() {
        return this.checkUri;
    }

    public void setCheckUri(boolean check) {
        this.checkUri = check;
    }

    /**
     * Whether to apply the mapping to discovered outlinks, for example after
     * extraction has occurred.
     */
    protected boolean checkOutlinks = true;

    public boolean getCheckOutlinks() {
        return this.checkOutlinks;
    }

    public void setCheckOutlinks(boolean check) {
        this.checkOutlinks = check;
    }

    /**
     * Decide rules to determine if an outlink is subject to mapping.
     */
    protected DecideRule outlinkRule = new AcceptDecideRule();

    public DecideRule getOutlinkRule() {
        return this.outlinkRule;
    }

    public void setOutlinkRule(DecideRule rule) {
        this.outlinkRule = rule;
    }

    /**
     * Name of local crawler node; mappings to this name result in normal
     * processing (no diversion).
     */
    protected String localName = ".";

    public String getLocalName() {
        return this.localName;
    }

    public void setLocalName(String name) {
        this.localName = name;
    }

    /**
     * Directory to write diversion logs.
     */
    protected ConfigPath diversionDir =
            new ConfigPath("diverted URIs subdirectory", "diversions");

    public ConfigPath getDiversionDir() {
        return this.diversionDir;
    }

    public void setDiversionDir(ConfigPath path) {
        this.diversionDir = path;
    }

    /**
     * Number of timestamp digits to use as prefix of log names (grouping all
     * diversions from that period in a single log). Default is 10 (hourly log
     * rotation).
     */
    protected int rotationDigits = 10;

    public int getRotationDigits() {
        return this.rotationDigits;
    }

    public void setRotationDigits(int digits) {
        this.rotationDigits = digits;
    }

    /**
     * Mapping of target crawlers to logs (PrintWriters).
     */
    protected HashMap<String, PrintWriter> diversionLogs =
            new HashMap<String, PrintWriter>();

    /**
     * Truncated timestamp prefix for diversion logs; when
     * current time doesn't match, it's time to close all
     * current logs. Volatile because it is read outside the lock in
     * {@link #innerProcessResult(CrawlURI)}.
     */
    protected volatile String logGeneration = "";

    /** Fingerprint cache used to skip URIs that were recently diverted. */
    protected ArrayLongFPCache cache;

    /**
     * Constructor.
     */
    public CrawlMapper() {
        super();
    }

    @Override
    protected boolean shouldProcess(CrawlURI puri) {
        return true;
    }

    @Override
    protected void innerProcess(CrawlURI puri) {
        // all work happens in innerProcessResult
        throw new AssertionError();
    }

    /**
     * Rotate logs if the timestamp prefix changed, then apply the mapping to
     * the URI itself and/or to its outlink candidates, as configured.
     *
     * @param puri CrawlURI being processed
     * @return {@link ProcessResult#FINISH} when the URI itself was diverted,
     *         otherwise {@link ProcessResult#PROCEED}
     */
    @Override
    protected ProcessResult innerProcessResult(CrawlURI puri) {
        CrawlURI curi = puri;
        String nowGeneration =
                ArchiveUtils.get14DigitDate().substring(0, getRotationDigits());
        if (!nowGeneration.equals(logGeneration)) {
            // cheap unlocked check; updateGeneration re-checks under the lock
            updateGeneration(nowGeneration);
        }

        if (curi.getFetchStatus() <= 0 // unfetched/unsuccessful
                && getCheckUri()) {
            // apply mapping to the CrawlURI itself
            String target = map(curi);
            if (!localName.equals(target)) {
                // CrawlURI is mapped to somewhere other than here
                curi.setFetchStatus(S_BLOCKED_BY_CUSTOM_PROCESSOR);
                curi.getAnnotations().add("to:" + target);
                divertLog(curi, target);
                return ProcessResult.FINISH;
            }
            // localName means keep locally; do nothing
        }

        if (getCheckOutlinks()) {
            // consider outlinks for mapping
            Iterator<CrawlURI> iter = curi.getOutCandidates().iterator();
            while (iter.hasNext()) {
                CrawlURI cauri = iter.next();
                if (!decideToMapOutlink(cauri)) {
                    continue;
                }
                // apply mapping to the outlink
                String target = map(cauri);
                if (!localName.equals(target)) {
                    // outlink is mapped to somewhere other than here
                    iter.remove();
                    divertLog(cauri, target);
                }
                // localName means keep locally; do nothing
            }
        }
        return ProcessResult.PROCEED;
    }

    /**
     * Decide whether an outlink is subject to mapping.
     *
     * @param cauri outlink candidate to consider
     * @return false only when the configured outlink rule yields REJECT
     */
    protected boolean decideToMapOutlink(CrawlURI cauri) {
        DecideRule rule = getOutlinkRule();
        // constant-first comparison so a null decision cannot throw
        return !DecideResult.REJECT.equals(rule.decisionFor(cauri));
    }

    /**
     * Close and mark as finished all existing diversion logs, and
     * arrange for new logs to use the new generation prefix.
     *
     * <p>Does nothing when the requested generation is already current, so
     * that several threads noticing the same rollover do not rotate (and
     * close freshly opened logs) more than once.
     *
     * @param nowGeneration new generation (timestamp prefix) to use
     */
    protected synchronized void updateGeneration(String nowGeneration) {
        if (nowGeneration.equals(logGeneration)) {
            // another thread completed this rotation while we waited for the lock
            return;
        }
        // all existing logs are of a previous generation
        for (PrintWriter writer : diversionLogs.values()) {
            writer.close();
            if (writer instanceof FilePrintWriter) {
                markFinished(((FilePrintWriter) writer).getFile());
            }
        }
        diversionLogs.clear();
        logGeneration = nowGeneration;
    }

    /**
     * Rename a closed ".open" diversion log to its ".divert" name, logging a
     * warning if the rename fails.
     */
    private static void markFinished(File openLog) {
        File finished = new File(
                openLog.getAbsolutePath().replaceFirst("\\.open$", ".divert"));
        if (!openLog.renameTo(finished)) {
            LOGGER.log(Level.WARNING, "Unable to rename diversion log "
                    + openLog + " to " + finished);
        }
    }

    /**
     * Look up the crawler node name to which the given CrawlURI
     * should be mapped.
     *
     * @param cauri CrawlURI to consider
     * @return String node name which should handle URI
     */
    protected abstract String map(CrawlURI cauri);

    /**
     * Note the given CrawlURI in the appropriate diversion log, unless it
     * was recently seen.
     *
     * @param cauri CrawlURI to append to a diversion log
     * @param target String node name (log name) to receive URI
     */
    protected synchronized void divertLog(CrawlURI cauri, String target) {
        if (recentlySeen(cauri)) {
            return;
        }
        PrintWriter diversionLog = getDiversionLog(target);
        diversionLog.print(cauri.getClassKey());
        diversionLog.print(" ");
        cauri.shortReportLineTo(diversionLog);
        diversionLog.println();
    }

    /**
     * Consult the cache to determine if the given URI
     * has been recently seen -- entering it if not.
     *
     * @param cauri CrawlURI to test
     * @return true if URI was already in the cache; false otherwise
     */
    private boolean recentlySeen(CrawlURI cauri) {
        long fp = FPGenerator.std64.fp(cauri.toString());
        return !cache.add(fp);
    }

    /**
     * Get the diversion log for a given target crawler node, opening it
     * (and creating the diversion directory) on first use.
     *
     * @param target crawler node name of requested log
     * @return PrintWriter open on an appropriately-named log file
     * @throws RuntimeException if the log file cannot be opened
     */
    protected PrintWriter getDiversionLog(String target) {
        PrintWriter writer = diversionLogs.get(target);
        if (writer == null) {
            File divertDir = getDiversionDir().getFile();
            divertDir.mkdirs();
            File divertLog = new File(divertDir,
                    logGeneration + "-" + localName + "-to-" + target + ".open");
            try {
                writer = new FilePrintWriter(divertLog);
            } catch (FileNotFoundException e) {
                throw new RuntimeException(
                        "Unable to open diversion log " + divertLog, e);
            }
            diversionLogs.put(target, writer);
        }
        return writer;
    }

    public void start() {
        if (isRunning()) {
            return;
        }
        cache = new ArrayLongFPCache();
    }

    public boolean isRunning() {
        return cache != null;
    }

    /**
     * Flush any buffered diversion-log output and drop the cache.
     *
     * <p>The logs are only flushed, not closed or renamed: closing here
     * would let a later restart within the same generation reopen (and
     * truncate) a log file of the same name.
     */
    public synchronized void stop() {
        // XXX this happens at finish; move to teardown?
        for (PrintWriter writer : diversionLogs.values()) {
            writer.flush();
        }
        cache = null;
    }
}