Java tutorial
/* * Copyright 2009-2013 Scale Unlimited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package bixo.parser; import java.io.InputStream; import javax.xml.XMLConstants; import org.ccil.cowan.tagsoup.Parser; import org.dom4j.Document; import org.dom4j.io.SAXReader; import org.hsqldb.lib.StringInputStream; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import org.xml.sax.helpers.XMLFilterImpl; import com.scaleunlimited.cascading.NullContext; import bixo.datum.ParsedDatum; import bixo.utils.IoUtils; import cascading.flow.FlowProcess; import cascading.operation.BaseOperation; import cascading.operation.Function; import cascading.operation.FunctionCall; import cascading.operation.OperationCall; import cascading.tuple.Fields; import cascading.tuple.TupleEntryCollector; @SuppressWarnings({ "serial", "rawtypes" }) public abstract class DOMParser extends BaseOperation<NullContext> implements Function<NullContext> { /** * Lowercase element names, and optionally strip out XML namespace, so that XPath can be easily * used to extract elements. * */ private static class DowngradeXmlFilter extends XMLFilterImpl { private boolean _removeNamespaces; public DowngradeXmlFilter(boolean removeNamespaces) { super(); _removeNamespaces = removeNamespaces; } @Override public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException { // Always lower-case element names, for easier XPath matching String lower = localName.toLowerCase(); if (_removeNamespaces) { AttributesImpl attributes = new AttributesImpl(); for (int i = 0; i < atts.getLength(); i++) { String local = atts.getLocalName(i); String qname = atts.getQName(i); if (!XMLConstants.NULL_NS_URI.equals(atts.getURI(i).length()) && !local.equals(XMLConstants.XMLNS_ATTRIBUTE) && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) { attributes.addAttribute(atts.getURI(i), local, qname, atts.getType(i), atts.getValue(i)); } } super.startElement(XMLConstants.NULL_NS_URI, lower, lower, attributes); } else { super.startElement(uri, lower, lower, atts); } } @Override public void endElement(String uri, String localName, String name) throws SAXException { String lower = localName.toLowerCase(); super.endElement(XMLConstants.NULL_NS_URI, lower, lower); } } private boolean _removeNamespaces; private transient SAXReader _reader = null; private transient ParsedDatum _input; public DOMParser(Fields outputFields) { this(outputFields, true); } public DOMParser(Fields outputFields, boolean removeNamespaces) { super(outputFields); _removeNamespaces = removeNamespaces; } @Override public void prepare(FlowProcess process, OperationCall<NullContext> opCall) { super.prepare(process, opCall); _reader = new SAXReader(new Parser()); _reader.setXMLFilter(new DowngradeXmlFilter(_removeNamespaces)); _reader.setEncoding("UTF-8"); _input = new ParsedDatum(); } @Override public boolean isSafe() { // Parsing is computationally intensive, so we don't want to get run // multiple times. return false; } @Override public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) { _input.setTupleEntry(funcCall.getArguments()); InputStream is = new StringInputStream(_input.getParsedText()); try { Document parsedContent = _reader.read(is); process(_input, parsedContent, funcCall.getOutputCollector(), process); } catch (Exception e) { handleException(_input, e, funcCall.getOutputCollector()); } finally { IoUtils.safeClose(is); } } /** * The _input ParsedDatum was successfully converted into a Dom4J Document. * at this point you would typically emit one or more output tuples (with * appropriate fields), using the collector. * * @param datum Input datum, which wraps a Cascading Tuple. * @param doc Result of converting incoming XML document to a Dom4J Document * @param collector Collector to use if you want to emit tuples. * @param process The FlowProcess for this operation. */ protected abstract void process(ParsedDatum datum, Document doc, TupleEntryCollector collector, FlowProcess process) throws Exception; /** * An exception occurred while parsing or processing the _input ParsedDatum. Options are to * ignore it, emit a tuple (with appropriate fields), or throw a RuntimeException * to kill the job. * * @param datum Input datum, which wraps a Cascading Tuple. * @param e Exception while parsing or processing document * @param collector Collector to use if you want to emit a tuple. */ protected abstract void handleException(ParsedDatum datum, Exception e, TupleEntryCollector collector); }