org.archive.modules.deciderules.surt.SurtPrefixedDecideRule.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.modules.deciderules.surt.SurtPrefixedDecideRule.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.deciderules.surt;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.io.ReadSource;
import org.archive.modules.CrawlURI;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.PredicatedDecideRule;
import org.archive.modules.seeds.SeedListener;
import org.archive.modules.seeds.SeedModule;
import org.archive.net.UURI;
import org.archive.spring.ConfigFile;
import org.archive.util.SurtPrefixSet;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.context.event.ContextStartedEvent;

/**
 * Rule applies configured decision to any URIs that, when 
 * expressed in SURT form, begin with one of the prefixes
 * in the configured set. 
 * 
 * The set can be filled with SURT prefixes implied or
 * listed in the seeds file, or another external file. 
 * Concretely, prefixes are collected from three places:
 * <ul>
 * <li>the configured {@link #getSurtsSource() surtsSource}, read when the
 *     application context starts (unless recovering from a checkpoint);</li>
 * <li>seeds reported through {@link #addedSeed(CrawlURI)}, when
 *     {@link #getSeedsAsSurtPrefixes() seedsAsSurtPrefixes} is enabled;</li>
 * <li>directive lines reported through {@link #nonseedLine(String)}.</li>
 * </ul>
 *
 * The "also-check-via" option to implement "one hop off" 
 * scoping derives from a contribution by Shifra Raffel
 * of the California Digital Library. 
 * 
 * @author gojomo
 */
public class SurtPrefixedDecideRule extends PredicatedDecideRule
        implements SeedListener, ApplicationListener<ApplicationEvent>, Checkpointable, BeanNameAware {

    private static final long serialVersionUID = 3L;
    private static final Logger logger = Logger.getLogger(SurtPrefixedDecideRule.class.getName());

    /**
     * Source file from which to infer SURT prefixes. Any URLs in file will be
     * converted to the implied SURT prefix, and literal SURT prefixes may be
     * listed on lines beginning with a '+' character.
     * 
     * @return the configured surts source if it is a {@link ConfigFile},
     *         otherwise {@code null}
     * @deprecated redundant now that we have
     *             {@link SurtPrefixedDecideRule#surtsSource}
     */
    @Deprecated
    public ConfigFile getSurtsSourceFile() {
        ReadSource source = getSurtsSource();
        return (source instanceof ConfigFile) ? (ConfigFile) source : null;
    }

    /**
     * Sets the surts source from a file; simply delegates to
     * {@link #setSurtsSource(ReadSource)}.
     * 
     * @param cp file to use as the surts source
     * @deprecated use {@link #setSurtsSource(ReadSource)}
     */
    @Deprecated
    public void setSurtsSourceFile(ConfigFile cp) {
        setSurtsSource(cp);
    }

    /**
     * Text from which to infer SURT prefixes. Any URLs will be converted to the
     * implied SURT prefix, and literal SURT prefixes may be listed on lines
     * beginning with a '+' character.
     */
    protected ReadSource surtsSource = null;

    /**
     * @return the source of SURT prefix text, or {@code null} if none is set
     */
    public ReadSource getSurtsSource() {
        return surtsSource;
    }

    /**
     * @param surtsSource source of SURT prefix text; may be {@code null}
     */
    public void setSurtsSource(ReadSource surtsSource) {
        this.surtsSource = surtsSource;
    }

    /**
     * Should seeds also be interpreted as SURT prefixes.
     */
    protected boolean seedsAsSurtPrefixes = true;

    /**
     * @return whether added seeds are converted into SURT prefixes
     */
    public boolean getSeedsAsSurtPrefixes() {
        return seedsAsSurtPrefixes;
    }

    /**
     * @param seedsAsSurtPrefixes whether added seeds should be converted
     *        into SURT prefixes
     */
    public void setSeedsAsSurtPrefixes(boolean seedsAsSurtPrefixes) {
        this.seedsAsSurtPrefixes = seedsAsSurtPrefixes;
    }

    /**
     * Dump file to save SURT prefixes actually used: Useful debugging SURTs.
     */
    protected ConfigFile surtsDumpFile = new ConfigFile("surtsDumpFile", "${launchId}/surts.dump");

    /**
     * @return the file to which the prefixes in use are dumped
     */
    public ConfigFile getSurtsDumpFile() {
        return surtsDumpFile;
    }

    /**
     * Merges the given configuration into the existing dump-file setting
     * (the existing {@link ConfigFile} instance is kept, not replaced).
     * 
     * @param cp configuration to merge in
     */
    public void setSurtsDumpFile(ConfigFile cp) {
        this.surtsDumpFile.merge(cp);
    }

    /**
     * Whether to also make the configured decision if a URI's 'via' URI (the
     * URI from which it was discovered) in SURT form begins with any of the
     * established prefixes. For example, can be used to ACCEPT URIs that are
     * 'one hop off' URIs fitting the SURT prefixes. Default is false.
     */
    {
        setAlsoCheckVia(false);
    }

    /**
     * @return whether the 'via' URI is also tested against the prefix set
     */
    public boolean getAlsoCheckVia() {
        return (Boolean) kp.get("alsoCheckVia");
    }

    /**
     * @param checkVia whether the 'via' URI should also be tested against
     *        the prefix set
     */
    public void setAlsoCheckVia(boolean checkVia) {
        kp.put("alsoCheckVia", checkVia);
    }

    protected SeedModule seeds;

    /**
     * @return the seed module this rule is listening to, if any
     */
    public SeedModule getSeeds() {
        return this.seeds;
    }

    /**
     * Sets the seed module and registers this rule as one of its seed
     * listeners.
     * 
     * @param seeds seed module to listen to; may be {@code null}
     */
    @Autowired
    public void setSeeds(SeedModule seeds) {
        this.seeds = seeds;
        if (seeds != null) {
            // in case this bean wasn't autowired to listeners (as if an
            // inner bean)
            seeds.addSeedListener(this);
        }
    }

    /** The set of SURT prefixes candidate URIs are tested against. */
    protected SurtPrefixSet surtPrefixes = new SurtPrefixSet();

    public SurtPrefixedDecideRule() {
    }

    /**
     * Called when a batch of seeds has been fully reported; dumps the
     * prefixes now in use to the configured dump file.
     */
    public void concludedSeedBatch() {
        dumpSurtPrefixSet();
    }

    /**
     * Evaluate whether given URI is covered by the SURT prefix set. When
     * {@link #getAlsoCheckVia()} is enabled, a match on the URI's 'via' is
     * sufficient.
     * 
     * @param uri Item to evaluate.
     * @return true if item, as SURT form URI, is prefixed by an item in the set
     */
    @Override
    protected boolean evaluate(CrawlURI uri) {
        if (getAlsoCheckVia() && innerDecide(uri.getVia())) {
            return true;
        }
        return innerDecide(uri.getUURI());
    }

    /**
     * Tests a single URI against the prefix set.
     * 
     * @param uuri URI to test
     * @return true if a candidate SURT could be derived from the URI and
     *         some prefix in the set is a prefix of it; false otherwise
     */
    private boolean innerDecide(UURI uuri) {
        String candidateSurt = SurtPrefixSet.getCandidateSurt(uuri);
        if (candidateSurt == null) {
            // no SURT form available: cannot match anything
            return false;
        }
        return surtPrefixes.containsPrefixOf(candidateSurt);
    }

    /**
     * Loads the configured prefixes; delegates to
     * {@link #buildSurtPrefixSet()}.
     */
    protected void readPrefixes() {
        buildSurtPrefixSet();
    }

    /**
     * Dump the current prefixes in use to configured dump file (if any).
     * Nothing is written when the configured dump path is empty.
     * 
     * @throws RuntimeException wrapping the underlying {@link IOException}
     *         if the dump file cannot be written
     */
    protected void dumpSurtPrefixSet() {
        String dumpPath = getSurtsDumpFile().getPath();
        if (StringUtils.isEmpty(dumpPath)) {
            // dumping not configured
            return;
        }
        File dump = getSurtsDumpFile().getFile();
        try {
            FileWriter fw = new FileWriter(dump);
            try {
                surtPrefixes.exportTo(fw);
            } finally {
                fw.close();
            }
        } catch (IOException e) {
            // propagate with context; the cause keeps the full stack trace,
            // so there is no need to print it separately here
            throw new RuntimeException("unable to dump SURT prefixes to " + dump, e);
        }
    }

    /**
     * Add to the set of prefixes the ones found in the configured
     * {@link #getSurtsSource() surtsSource} (which may include both URIs
     * and '+'-prefixed directives). Does nothing if no source is configured.
     */
    protected void buildSurtPrefixSet() {
        ReadSource source = getSurtsSource();
        if (source == null) {
            return;
        }
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("reading surt prefixes from " + source);
        }
        Reader reader = source.obtainReader();
        try {
            surtPrefixes.importFromMixed(reader, true);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * If appropriate, convert seed notification into prefix-addition.
     * The seed is only used when {@link #getSeedsAsSurtPrefixes()} is true.
     * 
     * @param curi the seed that was added
     * @see org.archive.modules.seeds.SeedListener#addedSeed(org.archive.modules.CrawlURI)
     */
    @Override
    public void addedSeed(final CrawlURI curi) {
        if (getSeedsAsSurtPrefixes()) {
            surtPrefixes.add(prefixFrom(curi.getURI()));
        }
    }

    /**
     * Derives the SURT prefix implied by a plain URI string.
     * 
     * @param uri plain URI text
     * @return result of {@link SurtPrefixSet#prefixFromPlainForceHttp(String)}
     */
    protected String prefixFrom(String uri) {
        return SurtPrefixSet.prefixFromPlainForceHttp(uri);
    }

    private static final String DEFAULT_ACCEPT_ADD_PREFIX_DIRECTIVE = "+";
    private static final String DEFAULT_REJECT_ADD_PREFIX_DIRECTIVE = "-";

    /** 
     * Consider nonseed lines as possible SURT prefix directives. If we 
     * are ACCEPTing URIs, expect '+' as the add-SURT directive. If we
     * are REJECTing URIs, expect '-' as the add-SURT directive. 
     * 
     * TODO: perhaps, make directive configurable, so many SeedListener
     * SurtPrefixedDecideRules can all listen for directives meant only 
     * for them. 
     * 
     * @param line the non-seed line to consider
     * @return false if the line does not start with this rule's directive;
     *         otherwise whatever
     *         {@link SurtPrefixSet#considerAsAddDirective(String)} returns
     *         for the remainder of the line
     * @throws IllegalArgumentException if the configured decision is
     *         neither ACCEPT nor REJECT
     * @see org.archive.modules.seeds.SeedListener#nonseedLine(java.lang.String)
     */
    @Override
    public boolean nonseedLine(String line) {
        String directive = getEffectiveAddDirective();
        if (!line.startsWith(directive)) {
            // not a line this instance is interested in
            return false;
        }
        return surtPrefixes.considerAsAddDirective(line.substring(directive.length()));
    }

    /**
     * Picks the add-prefix directive matching the configured decision.
     * 
     * @return "+" when the decision is ACCEPT, "-" when it is REJECT
     * @throws IllegalArgumentException for any other decision
     */
    private String getEffectiveAddDirective() {
        DecideResult decision = getDecision();
        if (decision == DecideResult.ACCEPT) {
            return DEFAULT_ACCEPT_ADD_PREFIX_DIRECTIVE;
        }
        if (decision == DecideResult.REJECT) {
            return DEFAULT_REJECT_ADD_PREFIX_DIRECTIVE;
        }
        throw new IllegalArgumentException("decision must be ACCEPT or REJECT");
    }

    /**
     * On {@link ContextStartedEvent}, populates the prefix set: from the
     * recovery checkpoint's saved "surtPrefixes" array when a recovery
     * checkpoint has been set, otherwise from the configured source via
     * {@link #readPrefixes()}. All other events are ignored.
     * 
     * @param event the application event
     * @throws IllegalStateException if the checkpointed JSON cannot be read
     */
    @Override
    public void onApplicationEvent(ApplicationEvent event) {
        if (!(event instanceof ContextStartedEvent)) {
            return;
        }
        if (recoveryCheckpoint == null) {
            readPrefixes();
            return;
        }
        // recovering: restore the prefixes saved by doCheckpoint() instead
        // of re-reading the configured source
        JSONObject json = recoveryCheckpoint.loadJson(beanName);
        try {
            JSONArray saved = json.getJSONArray("surtPrefixes");
            for (int i = 0; i < saved.length(); i++) {
                surtPrefixes.add(saved.getString(i));
            }
        } catch (JSONException e) {
            throw new IllegalStateException(e);
        }
    }

    // BeanNameAware
    /** Bean name; used as the key for this rule's checkpoint data. */
    protected String beanName;

    @Override
    public void setBeanName(String name) {
        this.beanName = name;
    }

    @Override
    public void startCheckpoint(Checkpoint checkpointInProgress) {
    }

    /**
     * Saves the current prefix set into the checkpoint as JSON, under this
     * bean's name and the key "surtPrefixes".
     * 
     * @param checkpointInProgress checkpoint to save into
     * @throws RuntimeException wrapping a {@link JSONException} if the JSON
     *         cannot be built
     */
    @Override
    public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
        try {
            JSONObject json = new JSONObject();
            json.put("surtPrefixes", surtPrefixes);
            checkpointInProgress.saveJson(beanName, json);
        } catch (JSONException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public void finishCheckpoint(Checkpoint checkpointInProgress) {
    }

    /** Checkpoint to recover from, or {@code null} for a fresh start. */
    protected Checkpoint recoveryCheckpoint;

    @Override
    public void setRecoveryCheckpoint(Checkpoint recoveryCheckpoint) {
        this.recoveryCheckpoint = recoveryCheckpoint;
    }

}//EOC