com.digitalpebble.storm.crawler.parse.filter.XPathFilter.java Source code

Java tutorial

Introduction

Here is the source code for com.digitalpebble.storm.crawler.parse.filter.XPathFilter.java

Source

/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.storm.crawler.parse.filter;

import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import javax.xml.namespace.QName;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.lang.StringUtils;
import org.apache.xml.serialize.Method;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.parse.ParseData;
import com.digitalpebble.storm.crawler.parse.ParseFilter;
import com.digitalpebble.storm.crawler.parse.ParseResult;
import com.fasterxml.jackson.databind.JsonNode;

/**
 * ParseFilter which evaluates XPath expressions against the DOM of a parsed
 * document and stores the resulting values in the metadata of that document.
 * <p>
 * The filter parameters are read field by field: the name of each field is
 * used as the metadata key and its value is either a single XPath expression
 * or an array of expressions. When several expressions are given for a key
 * they are tried in order and the first one yielding at least one value wins.
 * <p>
 * An expression can take one of three forms:
 * <ul>
 * <li><code>string(...)</code>: evaluated as an XPath string; the stripped
 * result is stored as a single value</li>
 * <li><code>serialize(...)</code>: the inner expression is evaluated as a node
 * set and each node is serialized as XHTML (text nodes are stored as is)</li>
 * <li>anything else: evaluated as a node set; the stripped text content of
 * each node is stored</li>
 * </ul>
 */
public class XPathFilter extends ParseFilter {

    /** Determines how the outcome of an expression is turned into values. */
    private enum EvalFunction {

        NONE, STRING, SERIALIZE;

        /**
         * @return the XPath result type to request when evaluating an
         *         expression with this function: a string for
         *         {@link #STRING}, a node set otherwise
         */
        public QName getReturnType() {
            switch (this) {
            case STRING:
                return XPathConstants.STRING;
            default:
                return XPathConstants.NODESET;
            }
        }
    }

    private static final Logger LOG = LoggerFactory.getLogger(XPathFilter.class);

    /** Prefix marking an expression evaluated as an XPath string. */
    private static final String STRING_PREFIX = "string(";

    /** Prefix marking an expression whose matching nodes get serialized. */
    private static final String SERIALIZE_PREFIX = "serialize(";

    private final XPathFactory factory = XPathFactory.newInstance();
    private final XPath xpath = factory.newXPath();

    // insertion-ordered so that the keys are processed in the order in which
    // they were configured
    private final Map<String, List<LabelledExpression>> expressions = new java.util.LinkedHashMap<String, List<LabelledExpression>>();

    /** A compiled XPath expression together with the metadata key it feeds. */
    private class LabelledExpression {

        private final String key;
        private final EvalFunction evalFunction;
        private final XPathExpression expression;

        /**
         * Compiles an expression and works out how its result must be
         * handled.
         *
         * @param key
         *            metadata key under which the values are stored
         * @param expression
         *            XPath expression, optionally wrapped in
         *            <code>serialize(...)</code>
         * @throws XPathExpressionException
         *             if the expression cannot be compiled or if a
         *             <code>serialize(</code> wrapper is not closed
         */
        private LabelledExpression(String key, String expression) throws XPathExpressionException {
            this.key = key;
            String toCompile = expression;
            if (expression.startsWith(STRING_PREFIX)) {
                // string() is a genuine XPath function: compile it untouched
                evalFunction = EvalFunction.STRING;
            } else if (expression.startsWith(SERIALIZE_PREFIX)) {
                // serialize() is not an XPath function: unwrap the inner
                // expression, refusing a wrapper without closing parenthesis
                // instead of silently dropping the last character
                if (!expression.endsWith(")")) {
                    throw new XPathExpressionException("Missing closing parenthesis in : " + expression);
                }
                toCompile = expression.substring(SERIALIZE_PREFIX.length(), expression.length() - 1);
                evalFunction = EvalFunction.SERIALIZE;
            } else {
                evalFunction = EvalFunction.NONE;
            }
            this.expression = xpath.compile(toCompile);
        }

        /**
         * Evaluates the expression against a document.
         *
         * @param doc
         *            DOM of the document
         * @return the values extracted, possibly empty but never null
         * @throws XPathExpressionException
         *             if the expression cannot be evaluated
         * @throws IOException
         *             if a node cannot be serialized
         */
        private List<String> evaluate(DocumentFragment doc) throws XPathExpressionException, IOException {
            Object evalResult = expression.evaluate(doc, evalFunction.getReturnType());
            List<String> values = new ArrayList<String>();
            switch (evalFunction) {
            case STRING:
                if (evalResult != null) {
                    values.add(StringUtils.strip((String) evalResult));
                }
                break;
            case SERIALIZE:
                serializeNodes((NodeList) evalResult, values);
                break;
            default:
                NodeList nodes = (NodeList) evalResult;
                for (int i = 0; i < nodes.getLength(); i++) {
                    // the text content is null for some node types (e.g.
                    // documents): do not store a null value
                    String text = nodes.item(i).getTextContent();
                    if (text != null) {
                        values.add(StringUtils.strip(text));
                    }
                }
            }
            return values;
        }

        /**
         * Appends one value per node: the XHTML serialization of elements,
         * documents and document fragments, or the raw text of text nodes.
         * Empty results and nodes of any other type add nothing.
         *
         * @param nodes
         *            nodes to serialize
         * @param values
         *            list to which the non-empty results are appended
         * @throws IOException
         *             if the serializer fails
         */
        private void serializeNodes(NodeList nodes, List<String> values) throws IOException {
            StringWriter out = new StringWriter();
            OutputFormat format = new OutputFormat(Method.XHTML, null, false);
            format.setOmitXMLDeclaration(true);
            XMLSerializer serializer = new XMLSerializer(out, format);
            for (int i = 0; i < nodes.getLength(); i++) {
                Node node = nodes.item(i);
                switch (node.getNodeType()) {
                case Node.ELEMENT_NODE:
                    serializer.serialize((Element) node);
                    break;
                case Node.DOCUMENT_NODE:
                    serializer.serialize((Document) node);
                    break;
                case Node.DOCUMENT_FRAGMENT_NODE:
                    serializer.serialize((DocumentFragment) node);
                    break;
                case Node.TEXT_NODE:
                    String text = node.getTextContent();
                    if (text.length() > 0) {
                        values.add(text);
                    }
                    // nothing went through the serializer, so there is no
                    // buffer to read or reset
                    continue;
                default:
                    // other node types are not serialized; the buffer stays
                    // empty and no value is added below
                    break;
                }
                String serializedValue = out.toString();
                if (serializedValue.length() > 0) {
                    values.add(serializedValue);
                }
                // the writer is shared by all the nodes: empty it before the
                // next one
                out.getBuffer().setLength(0);
            }
        }
    }

    /**
     * Applies the configured expressions to the DOM and adds the values found
     * to the metadata of the document. For a given key, the expressions are
     * tried in order until one of them returns at least one value. Evaluation
     * errors are logged and the next expression is tried.
     *
     * @param URL
     *            URL of the document, used to look up its parse data
     * @param content
     *            raw content of the document (not used by this filter)
     * @param doc
     *            DOM of the document
     * @param parse
     *            parse result holding the metadata to enrich
     */
    @Override
    public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {

        ParseData parseData = parse.get(URL);
        Metadata metadata = parseData.getMetadata();

        // applies the XPATH expressions in the order in which they were
        // configured
        for (List<LabelledExpression> leList : expressions.values()) {
            for (LabelledExpression le : leList) {
                try {
                    List<String> values = le.evaluate(doc);
                    if (values != null && !values.isEmpty()) {
                        metadata.addValues(le.key, values);
                        // first expression with values wins for this key
                        break;
                    }
                } catch (XPathExpressionException | IOException e) {
                    // the exception is passed last, without a placeholder, so
                    // that it gets logged with its stack trace
                    LOG.error("Error evaluating {}", le.key, e);
                }
            }
        }
    }

    /**
     * Reads the expressions from the filter parameters. Each field maps a
     * metadata key to either a single expression or an array of expressions.
     *
     * @param stormConf
     *            Storm configuration (not used by this filter)
     * @param filterParams
     *            JSON object containing the expressions
     * @throws RuntimeException
     *             if an expression cannot be compiled
     */
    @SuppressWarnings("rawtypes")
    @Override
    public void configure(Map stormConf, JsonNode filterParams) {
        java.util.Iterator<Entry<String, JsonNode>> iter = filterParams.fields();
        while (iter.hasNext()) {
            Entry<String, JsonNode> entry = iter.next();
            String key = entry.getKey();
            JsonNode node = entry.getValue();
            if (node.isArray()) {
                for (JsonNode expression : node) {
                    addExpression(key, expression);
                }
            } else {
                addExpression(key, node);
            }
        }
    }

    /**
     * Compiles an expression and appends it to the list kept for its key.
     *
     * @param key
     *            metadata key fed by the expression
     * @param expression
     *            JSON node whose text value is the expression
     * @throws RuntimeException
     *             if the expression cannot be compiled
     */
    private void addExpression(String key, JsonNode expression) {
        String xpathvalue = expression.asText();
        try {
            LabelledExpression lexpression = new LabelledExpression(key, xpathvalue);
            List<LabelledExpression> lexpressionList = expressions.get(key);
            if (lexpressionList == null) {
                lexpressionList = new ArrayList<>();
                expressions.put(key, lexpressionList);
            }
            lexpressionList.add(lexpression);
        } catch (XPathExpressionException e) {
            throw new RuntimeException("Can't compile expression : " + xpathvalue, e);
        }
    }

    /** @return true, as the expressions are evaluated against the DOM */
    @Override
    public boolean needsDOM() {
        return true;
    }

}