bixo.parser.DOMParser.java Source code

Introduction

Here is the source code for bixo.parser.DOMParser.java
Source

/*
 * Copyright 2009-2013 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package bixo.parser;

import java.io.InputStream;
import java.io.StringReader;
import java.util.Locale;

import javax.xml.XMLConstants;

import org.ccil.cowan.tagsoup.Parser;
import org.dom4j.Document;
import org.dom4j.io.SAXReader;
import org.hsqldb.lib.StringInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.XMLFilterImpl;

import com.scaleunlimited.cascading.NullContext;

import bixo.datum.ParsedDatum;
import bixo.utils.IoUtils;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntryCollector;

/**
 * Base Cascading function that parses the text of each incoming {@link ParsedDatum}
 * with TagSoup into a Dom4J {@link Document}, then hands the document to the subclass
 * via {@link #process(ParsedDatum, Document, TupleEntryCollector, FlowProcess)}.
 * Any exception raised while parsing or processing is passed to
 * {@link #handleException(ParsedDatum, Exception, TupleEntryCollector)}.
 */
@SuppressWarnings({ "serial", "rawtypes" })
public abstract class DOMParser extends BaseOperation<NullContext> implements Function<NullContext> {

    /**
     * Lowercase element names, and optionally strip out XML namespace, so that XPath can be easily 
     * used to extract elements.
     *
     */
    private static class DowngradeXmlFilter extends XMLFilterImpl {

        private final boolean _removeNamespaces;

        /**
         * @param removeNamespaces true to report elements with an empty namespace URI and
         *            to drop namespace declaration attributes.
         */
        public DowngradeXmlFilter(boolean removeNamespaces) {
            super();

            _removeNamespaces = removeNamespaces;
        }

        @Override
        public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
            // Always lower-case element names, for easier XPath matching. Locale.ROOT keeps
            // the result independent of the JVM's default locale (e.g. Turkish dotless i).
            String lower = localName.toLowerCase(Locale.ROOT);

            if (_removeNamespaces) {
                // Copy every attribute except namespace declarations (xmlns / xmlns:prefix).
                AttributesImpl attributes = new AttributesImpl();
                for (int i = 0; i < atts.getLength(); i++) {
                    String local = atts.getLocalName(i);
                    String qname = atts.getQName(i);
                    boolean isNamespaceDeclaration = XMLConstants.XMLNS_ATTRIBUTE.equals(local)
                            || qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":");
                    if (!isNamespaceDeclaration) {
                        attributes.addAttribute(atts.getURI(i), local, qname, atts.getType(i), atts.getValue(i));
                    }
                }

                super.startElement(XMLConstants.NULL_NS_URI, lower, lower, attributes);
            } else {
                super.startElement(uri, lower, lower, atts);
            }
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            String lower = localName.toLowerCase(Locale.ROOT);

            // Report the same namespace URI that startElement() reported for this element.
            String reportedUri = _removeNamespaces ? XMLConstants.NULL_NS_URI : uri;
            super.endElement(reportedUri, lower, lower);
        }
    }

    private boolean _removeNamespaces;

    // Created in prepare(); transient so they are not serialized with the operation.
    private transient SAXReader _reader = null;
    private transient ParsedDatum _input;

    /**
     * Create a parser that strips namespaces from the parsed document.
     *
     * @param outputFields Fields declared as the output of this operation.
     */
    public DOMParser(Fields outputFields) {
        this(outputFields, true);
    }

    /**
     * @param outputFields Fields declared as the output of this operation.
     * @param removeNamespaces true to strip namespaces from elements while parsing.
     */
    public DOMParser(Fields outputFields, boolean removeNamespaces) {
        super(outputFields);

        _removeNamespaces = removeNamespaces;
    }

    /**
     * Set up the TagSoup-backed SAX reader (with the lower-casing filter installed) and
     * the reusable input datum.
     */
    @Override
    public void prepare(FlowProcess process, OperationCall<NullContext> opCall) {
        super.prepare(process, opCall);

        _reader = new SAXReader(new Parser());
        _reader.setXMLFilter(new DowngradeXmlFilter(_removeNamespaces));
        _reader.setEncoding("UTF-8");
        _input = new ParsedDatum();
    }

    @Override
    public boolean isSafe() {
        // Parsing is computationally intensive, so we don't want to get run
        // multiple times.
        return false;
    }

    /**
     * Parse the parsed text of the incoming tuple and pass the resulting document to
     * {@link #process(ParsedDatum, Document, TupleEntryCollector, FlowProcess)}. Any
     * exception, including one caused by missing text, goes to
     * {@link #handleException(ParsedDatum, Exception, TupleEntryCollector)}.
     */
    @Override
    public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
        _input.setTupleEntry(funcCall.getArguments());

        try {
            // The text is already a String, so parse it as characters rather than
            // round-tripping through bytes and a charset. A StringReader holds no
            // system resources, so there is nothing to close afterwards.
            Document parsedContent = _reader.read(new StringReader(_input.getParsedText()));
            process(_input, parsedContent, funcCall.getOutputCollector(), process);
        } catch (Exception e) {
            handleException(_input, e, funcCall.getOutputCollector());
        }

    }

    /**
     * The _input ParsedDatum was successfully converted into a Dom4J Document.
     * At this point you would typically emit one or more output tuples (with
     * appropriate fields), using the collector.
     * 
     * @param datum Input datum, which wraps a Cascading Tuple.
     * @param doc Result of converting incoming XML document to a Dom4J Document
     * @param collector Collector to use if you want to emit tuples.
     * @param process The FlowProcess for this operation.
     * @throws Exception on any failure; it is passed to
     *             {@link #handleException(ParsedDatum, Exception, TupleEntryCollector)}.
     */
    protected abstract void process(ParsedDatum datum, Document doc, TupleEntryCollector collector,
            FlowProcess process) throws Exception;

    /**
     * An exception occurred while parsing or processing the _input ParsedDatum. Options are to
     * ignore it, emit a tuple (with appropriate fields), or throw a RuntimeException
     * to kill the job.
     * 
     * @param datum Input datum, which wraps a Cascading Tuple.
     * @param e Exception while parsing or processing document
     * @param collector Collector to use if you want to emit a tuple.
     */
    protected abstract void handleException(ParsedDatum datum, Exception e, TupleEntryCollector collector);
}