com.digitalpebble.storm.crawler.parse.ParseFilters.java Source code

Java tutorial

Introduction

Here is the source code for com.digitalpebble.storm.crawler.parse.ParseFilters.java

Source

/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.storm.crawler.parse;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;

import com.digitalpebble.storm.crawler.util.ConfUtils;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.NullNode;

/**
 * Wrapper for the ParseFilters defined in a JSON configuration.
 * <p>
 * An instance holds an ordered array of {@link ParseFilter}s and delegates
 * {@link #needsDOM()} and
 * {@link #filter(String, byte[], DocumentFragment, ParseResult)} to them, in
 * the order in which they were declared in the configuration.
 */
public class ParseFilters extends ParseFilter {

    /**
     * Instance holding no filters at all; returned by
     * {@link #fromConf(Map)} when no configuration file is specified.
     */
    public static final ParseFilters emptyParseFilter = new ParseFilters();

    private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(ParseFilters.class);

    /** Filters to apply, in configuration order; never null once constructed. */
    private ParseFilter[] filters;

    /** Builds an instance without any filters. */
    private ParseFilters() {
        filters = new ParseFilter[0];
    }

    /**
     * Loads and configures the ParseFilters based on the storm config if it
     * specifies a non-blank <code>parsefilters.config.file</code>, otherwise
     * returns {@link #emptyParseFilter}.
     *
     * @param stormConf
     *            the storm configuration
     * @return the configured ParseFilters, or the empty one
     * @throws RuntimeException
     *             if the configuration file cannot be loaded
     **/
    @SuppressWarnings("rawtypes")
    public static ParseFilters fromConf(Map stormConf) {
        String parseconfigfile = ConfUtils.getString(stormConf, "parsefilters.config.file");
        if (StringUtils.isNotBlank(parseconfigfile)) {
            try {
                return new ParseFilters(stormConf, parseconfigfile);
            } catch (IOException e) {
                String message = "Exception caught while loading the ParseFilters from " + parseconfigfile;
                // pass the exception along so that the cause ends up in the logs
                LOG.error(message, e);
                throw new RuntimeException(message, e);
            }
        }

        return ParseFilters.emptyParseFilter;
    }

    /**
     * Loads the filters from a JSON configuration file found on the classpath.
     *
     * @param stormConf
     *            the storm configuration, handed over to every filter
     * @param configFile
     *            name of the classpath resource containing the JSON
     *            configuration
     * @throws IOException
     *             if the resource cannot be found, read or parsed as JSON
     */
    @SuppressWarnings("rawtypes")
    public ParseFilters(Map stormConf, String configFile) throws IOException {
        // load the JSON configFile and build a JSON object out of it
        JsonNode confNode;
        try (InputStream confStream = getClass().getClassLoader().getResourceAsStream(configFile)) {
            if (confStream == null) {
                // reported as the cause of the IOException thrown below
                throw new IOException("Resource not found on the classpath: " + configFile);
            }
            ObjectMapper mapper = new ObjectMapper();
            confNode = mapper.readValue(confStream, JsonNode.class);
        } catch (Exception e) {
            throw new IOException("Unable to build JSON object from file", e);
        }

        configure(stormConf, confNode);
    }

    /**
     * (Re)initialises the filters from the JSON configuration. The filters
     * are read from the field of <code>filtersConf</code> named after the
     * canonical name of this class; each element of that field is expected to
     * provide a <code>class</code> and optionally a <code>name</code> and
     * <code>params</code>. Elements without a usable <code>class</code> or
     * whose class does not extend {@link ParseFilter} are logged and skipped.
     *
     * @param stormConf
     *            the storm configuration, handed over to every filter
     * @param filtersConf
     *            root node of the JSON configuration; if null or lacking the
     *            expected field, no filters are set up
     * @throws RuntimeException
     *             if a filter cannot be instantiated or configured
     */
    @SuppressWarnings("rawtypes")
    @Override
    public void configure(Map stormConf, JsonNode filtersConf) {
        // initialises the filters
        List<ParseFilter> filterLists = new ArrayList<>();

        // get the filters part
        String name = getClass().getCanonicalName();
        if (filtersConf != null) {
            filtersConf = filtersConf.get(name);
        }

        if (filtersConf == null) {
            LOG.info("No field {} in JSON config. Skipping", name);
            filters = new ParseFilter[0];
            return;
        }

        // conf node contains a list of objects
        Iterator<JsonNode> filterIter = filtersConf.elements();
        while (filterIter.hasNext()) {
            JsonNode afilterConf = filterIter.next();

            // textValue() returns null for non-textual nodes: keep the
            // default name in that case
            String filterName = "<unnamed>";
            JsonNode nameNode = afilterConf.get("name");
            if (nameNode != null && nameNode.textValue() != null) {
                filterName = nameNode.textValue();
            }

            // a 'class' which is not a string is treated like a missing one
            JsonNode classNode = afilterConf.get("class");
            String className = classNode == null ? null : classNode.textValue();
            if (className == null) {
                LOG.error("Filter {} doesn't specified a 'class' attribute", filterName);
                continue;
            }
            className = className.trim();
            filterName = filterName + '[' + className + ']';

            // check that it is available and implements the interface
            // ParseFilter
            try {
                Class<?> filterClass = Class.forName(className);
                if (!ParseFilter.class.isAssignableFrom(filterClass)) {
                    LOG.error("Filter {} does not extend ParseFilter", filterName);
                    continue;
                }
                ParseFilter filterInstance = (ParseFilter) filterClass.getDeclaredConstructor().newInstance();

                JsonNode paramNode = afilterConf.get("params");
                if (paramNode == null) {
                    // Pass in a nullNode if missing
                    paramNode = NullNode.getInstance();
                }
                filterInstance.configure(stormConf, paramNode);

                filterLists.add(filterInstance);
                LOG.info("Setup {}", filterName);
            } catch (Exception e) {
                // the exception is the last argument and has no placeholder
                // so that its stack trace gets logged
                LOG.error("Can't setup {}", filterName, e);
                throw new RuntimeException("Can't setup " + filterName, e);
            }
        }

        filters = filterLists.toArray(new ParseFilter[filterLists.size()]);
    }

    /**
     * @return true if at least one of the wrapped filters needs the DOM,
     *         false otherwise (including when there are no filters)
     */
    @Override
    public boolean needsDOM() {
        for (ParseFilter filter : filters) {
            if (filter.needsDOM()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Applies the wrapped filters one after the other. A filter which needs
     * the DOM is skipped when <code>doc</code> is null. The time spent in
     * each filter is logged at debug level.
     *
     * @param URL
     *            URL of the document, passed on to the filters
     * @param content
     *            content of the document, passed on to the filters
     * @param doc
     *            DOM of the document, can be null
     * @param parse
     *            parse result, passed on to the filters
     */
    @Override
    public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {

        for (ParseFilter filter : filters) {
            if (doc == null && filter.needsDOM()) {
                LOG.info("ParseFilter {} needs DOM but has none to work on - skip : {}",
                        filter.getClass().getName(), URL);
                continue;
            }
            // time only the filters which actually run
            long start = System.currentTimeMillis();
            filter.filter(URL, content, doc, parse);
            long end = System.currentTimeMillis();
            LOG.debug("ParseFilter {} took {} msec", filter.getClass().getName(), end - start);
        }
    }

}