importer.handler.post.stages.SAXSplitter.java Source code

Java tutorial

Introduction

Here is the source code for importer.handler.post.stages.SAXSplitter.java

Source

/*
 * This file is part of Importer.
 *
 *  Importer is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  Importer is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with Importer.  If not, see <http://www.gnu.org/licenses/>.
 *  (c) copyright Desmond Schmidt 2015
 */

package importer.handler.post.stages;

import importer.exception.ImporterException;
import java.io.CharArrayReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.PrintStream;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import javax.xml.parsers.*;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.xml.sax.*;
import org.xml.sax.helpers.*;

/**
 * SAX splitter is a simplified version-splitter for TEI-XML.
 * Instead of DOM we use a linear SAX parse and record the <em>paths</em>
 * leading into each variable bit. We maintain an index in which each such
 * path, e.g. /app/rdg@wit=X or /subst/del, is mapped to the line number at
 * which non-whitespace text was first seen inside it.
 * <p>
 * All parse state lives in instance fields that {@link #scan(String)}
 * resets and then mutates, so a single instance must not run two scans at
 * the same time.
 * @author desmond
 */
public class SAXSplitter extends DefaultHandler {
    /** the parser created by the most recent scan */
    SAXParser parser;
    /** the reader obtained from {@link #parser} */
    XMLReader xmlReader;
    /** 1-based line counter, advanced for each newline seen in text */
    int lineNo;
    /** path of the currently open split elements, "/"-separated */
    StringBuilder path;
    /** names of the elements that contribute a path component */
    HashSet<String> splits;
    /** equivalent elements */
    HashMap<String, String> siblings;
    /** discriminators on siblings*/
    HashMap<String, String> attributes;
    /** map of layer path to the line where its text was first seen */
    HashMap<String, Integer> layers;
    /** the last popped element name */
    String last;
    /** number of sibling in current set */
    int siblingCount;
    /** sibling counts saved while a split element is open */
    Stack<Integer> states;
    /**
     * Offset in {@link #path} at which each open component starts. Kept so
     * that popping never has to search for "/" (attribute values copied
     * into a component may themselves contain that character).
     */
    private Stack<Integer> segmentStarts;

    /**
     * Split a TEI-XML file into versions of XML
     * @param tei the TEI content containing versions
     * @return an analysis of the variant markup in a file
     * @throws ImporterException if something went wrong
     */
    public JSONArray scan(String tei) throws ImporterException {
        this.layers = new HashMap<String, Integer>();
        this.lineNo = 1;
        this.splits = new HashSet<String>();
        this.attributes = new HashMap<String, String>();
        this.siblings = new HashMap<String, String>();
        this.states = new Stack<Integer>();
        this.states.push(0);
        this.segmentStarts = new Stack<Integer>();
        this.siblingCount = 0;
        // forget the element popped last in any earlier scan, otherwise
        // the first split element of this document could be mistaken for
        // the continuation of a sibling run from the previous one
        this.last = null;
        this.path = new StringBuilder();
        // hard-wire config for now
        attributes.put("add", "n");
        attributes.put("rdg", "wit");
        attributes.put("lem", "wit");
        siblings.put("add", "del");
        siblings.put("del", "add");
        siblings.put("lem", "rdg");
        siblings.put("rdg", "lem");
        splits.add("add");
        splits.add("del");
        splits.add("sic");
        splits.add("corr");
        splits.add("abbrev");
        splits.add("expan");
        splits.add("rdg");
        splits.add("lem");
        splits.add("app");
        splits.add("mod");
        splits.add("choice");
        splits.add("subst");
        try {
            // NOTE(review): no feature restricting DTDs or external
            // entities is set on the factory; if tei can come from an
            // untrusted source this should be hardened - confirm.
            SAXParserFactory spf = SAXParserFactory.newInstance();
            spf.setNamespaceAware(true);
            parser = spf.newSAXParser();
            xmlReader = parser.getXMLReader();
            xmlReader.setContentHandler(this);
            xmlReader.setErrorHandler(new MyErrorHandler(System.err));
            CharArrayReader car = new CharArrayReader(tei.toCharArray());
            xmlReader.parse(new InputSource(car));
            return layersToJson();
        } catch (Exception e) {
            // a null tei, parser set-up problems and parse errors all end
            // up here and are reported uniformly
            throw new ImporterException(e);
        }
    }

    /**
     * Order two path objects by the natural order of their "path" strings
     * @param a the first path object
     * @param b the second path object
     * @return negative, zero or positive as for String.compareTo
     */
    int compare(JSONObject a, JSONObject b) {
        String stra = (String) a.get("path");
        String strb = (String) b.get("path");
        return stra.compareTo(strb);
    }

    /**
     * Sort an array of path objects
     * @param jArr the array of path json objects
     * @return the sorted array
     */
    @SuppressWarnings("unchecked") // JSONArray is used as a raw list
    JSONArray sort(JSONArray jArr) {
        Collections.sort(jArr, new Comparator<Object>() {
            @Override
            public int compare(Object a, Object b) {
                // qualify the call: an unqualified compare() would resolve
                // to this comparator's own method and recurse for ever
                return SAXSplitter.this.compare((JSONObject) a, (JSONObject) b);
            }
        });
        return jArr;
    }

    /**
     * Convert the layers hashmap to JSON
     * @return a JSONArray of path+line values
     */
    @SuppressWarnings("unchecked") // JSONObject/JSONArray are raw collections
    private JSONArray layersToJson() {
        JSONArray arr = new JSONArray();
        for (Map.Entry<String, Integer> entry : layers.entrySet()) {
            JSONObject jObj = new JSONObject();
            jObj.put("path", entry.getKey());
            jObj.put("line", entry.getValue());
            arr.add(jObj);
        }
        return sort(arr);
    }

    /**
     * Do we have a new element that forms part of a series?
     * @param localName the new element name
     * @return true if it is part of a series
     */
    private boolean isSibling(String localName) {
        // nothing relevant was popped before: any element that can have
        // siblings starts a new series
        if (last == null || !siblings.containsKey(last)) {
            return siblings.containsKey(localName);
        }
        // otherwise the series continues with the same element ...
        if (localName.equals(last)) {
            return true;
        }
        // ... or with its configured counterpart
        String lastSib = siblings.get(last);
        return lastSib != null && lastSib.equals(localName);
    }

    /**
     * Echo the data of a processing instruction to standard output
     * @param target the processing instruction target (unused)
     * @param data the processing instruction data
     */
    @Override
    public void processingInstruction(String target, String data) throws SAXException {
        // NOTE(review): looks like diagnostic output - confirm it is wanted
        System.out.println(data);
    }

    /**
     * On a split element build its path component (name, optional sibling
     * index, optional discriminating attribute) and push it on the path;
     * any other element ends the current sibling run.
     * @param namespaceURI the namespace of the element (unused)
     * @param localName the element name without prefix
     * @param qName the qualified element name (unused)
     * @param atts the attributes of the element
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
            throws SAXException {
        if (splits.contains(localName)) {
            StringBuilder component = new StringBuilder(localName);
            if (isSibling(localName)) {
                this.siblingCount++;
            } else {
                siblingCount = 0;
            }
            // only the second and later members of a run are numbered
            if (siblingCount > 1) {
                component.append(":");
                component.append(siblingCount);
            }
            if (attributes.containsKey(localName)) {
                String attr = attributes.get(localName);
                String value = atts.getValue(attr);
                if (value != null) {
                    component.append("@");
                    component.append(attr);
                    component.append("=");
                    component.append(value);
                }
            }
            pushPath(component.toString());
        } else {
            siblingCount = 0;
        }
    }

    /**
     * Pop the path when a split element closes; any other closing element
     * ends the current sibling run.
     * @param uri the namespace of the element (unused)
     * @param localName the element name without prefix
     * @param qName the qualified element name (unused)
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (splits.contains(localName)) {
            popPath();
        } else {
            siblingCount = 0;
        }
    }

    /**
     * Test if a string is all whitespace, counting its newlines on the way
     * @param str the text to examine
     * @return true if str contains no non-whitespace character
     */
    private boolean isWhitespace(String str) {
        boolean answer = true;
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (!Character.isWhitespace(c)) {
                answer = false;
            } else if (c == '\n') {
                // keep scanning after a non-blank so no newline is missed
                lineNo++;
            }
        }
        return answer;
    }

    /**
     * Record the current line against the current path the first time
     * non-whitespace text is seen inside it. Non-whitespace text outside
     * any split element, or in a path already recorded, ends the current
     * sibling run instead.
     * @param ch characters data from parser
     * @param start offset into ch
     * @param length number of characters to use
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        String str = new String(ch, start, length);
        // also advances lineNo, so the line stored is the one reached at
        // the end of this chunk
        boolean ws = isWhitespace(str);
        String pStr = path.toString();
        if (path.length() > 0 && !ws && !layers.containsKey(pStr)) {
            layers.put(pStr, lineNo);
        } else if (!ws) {
            siblingCount = 0;
        }
    }

    /**
     * Keep track of ALL newlines
     * @param ch characters data from parser
     * @param start offset into ch
     * @param length length of whitespace 
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        String str = new String(ch, start, length);
        isWhitespace(str);
    }

    /**
     * A new split element has been read
     * @param segment the path component
     */
    private void pushPath(String segment) {
        // remember where this component starts so popPath can cut it off
        // exactly, whatever characters the component contains
        segmentStarts.push(path.length());
        path.append("/");
        path.append(segment);
        states.push(siblingCount);
        siblingCount = 0;
    }

    /**
     * Reduce a path component to its bare element name
     * @param comp a component such as rdg:2@wit=X
     * @return the component without sibling index and attribute part
     */
    private String cleanComponent(String comp) {
        int index = comp.indexOf(":");
        if (index != -1)
            comp = comp.substring(0, index);
        index = comp.indexOf("@");
        if (index != -1)
            comp = comp.substring(0, index);
        return comp;
    }

    /**
     * At split-element end pop the current path
     */
    private void popPath() {
        if (!segmentStarts.isEmpty()) {
            int start = segmentStarts.pop();
            // start is the offset of the "/" that introduces the component
            this.last = cleanComponent(path.substring(start + 1));
            path.setLength(start);
        }
        if (states.size() > 0)
            siblingCount = states.pop();
    }

    /**
     * Basic error handler with line-number counter
     */
    private static class MyErrorHandler implements ErrorHandler {
        /** where warnings are written */
        private final PrintStream out;

        MyErrorHandler(PrintStream out) {
            this.out = out;
        }

        /**
         * Format the location and message of a parse exception
         * @param spe the exception raised by the parser
         * @return a one-line description of the problem
         */
        private String getParseExceptionInfo(SAXParseException spe) {
            String systemId = spe.getSystemId();

            if (systemId == null) {
                systemId = "null";
            }

            String info = "URI=" + systemId + " Line=" + spe.getLineNumber() + ": " + spe.getMessage();

            return info;
        }

        /** Warnings are only reported, parsing goes on */
        @Override
        public void warning(SAXParseException spe) throws SAXException {
            out.println("Warning: " + getParseExceptionInfo(spe));
        }

        /** Errors abort the parse */
        @Override
        public void error(SAXParseException spe) throws SAXException {
            String message = "Error: " + getParseExceptionInfo(spe);
            throw new SAXException(message);
        }

        /** Fatal errors abort the parse */
        @Override
        public void fatalError(SAXParseException spe) throws SAXException {
            String message = "Fatal Error: " + getParseExceptionInfo(spe);
            throw new SAXException(message);
        }
    }

    /**
     * Scan the file named by the single argument, read as UTF-8, and print
     * the resulting JSON to standard output. Does nothing unless exactly
     * one argument is given.
     * @param args the path of a TEI-XML file
     */
    public static void main(String[] args) {
        if (args.length == 1) {
            File f = new File(args[0]);
            byte[] data = new byte[(int) f.length()];
            try {
                int filled = 0;
                FileInputStream fis = new FileInputStream(f);
                try {
                    // a single read() may return fewer bytes than asked
                    // for, so keep reading until the buffer is full or
                    // the stream ends
                    while (filled < data.length) {
                        int n = fis.read(data, filled, data.length - filled);
                        if (n < 0)
                            break;
                        filled += n;
                    }
                } finally {
                    fis.close();
                }
                SAXSplitter ss = new SAXSplitter();
                JSONArray jArr = ss.scan(new String(data, 0, filled, "UTF-8"));
                String jStr = jArr.toJSONString();
                // the JSON writer escapes "/" as "\/"; undo that for display
                System.out.println(jStr.replaceAll("\\\\/", "/"));
            } catch (Exception e) {
                System.out.println(e.getMessage());
            }
        }
    }
}