org.rdswicthboard.utils.rdf.oai.App.java Source code

Java tutorial

Introduction

Here is the source code for org.rdswicthboard.utils.rdf.oai.App.java

Source

/*******************************************************************************
 * Copyright (c) 2016 RD-Switchboard and others.
 *
 * This file is part of RD-Switchboard.
 * 
 * RD-Switchboard is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * Contributors:
 *     Dima Kudriavcev - https://github.com/wizman777
 *******************************************************************************/

package org.rdswicthboard.utils.rdf.oai;

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.configuration.CompositeConfiguration;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.lang.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

public class App {

    private static final String PROPERTIES_FILE = "rdf2oai.cfg";

    private static final String PROPERTY_INPUT_FILE = "input-file";
    private static final String PROPERTY_OUTPUT_FILE = "output-file";
    private static final String PROPERTY_CONFIG_FILE = "config-file";
    private static final String PROPERTY_INPUT_ENCODING = "input-encoding";
    private static final String PROPERTY_OUTPUT_ENCODING = "output-encoding";
    private static final String PROPERTY_SET_SPEC = "set-spec";
    private static final String PROPERTY_FORMAT_OUTPUT = "format-output";
    private static final String PROPERTY_HELP = "help";

    private static final String DEFAULT_OUTPUT_FILE = "output.xml";
    private static final String DEFAULT_ENCODING = StandardCharsets.UTF_8.toString();
    private static final String DEFAULT_SET_SPEC = "RDF";
    private static final String DEFAULT_FORMAT_OUTPUT = "no";

    private static final String TIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'";

    private static final String RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
    private static final String RNF_NAMESPACE = "http://rns.nii.ac.jp/ns#";

    public static void main(String[] args) {
        // create the command line parser
        CommandLineParser parser = new DefaultParser();

        // create the Options
        Options options = new Options();
        options.addOption("i", PROPERTY_INPUT_FILE, true, "input RDF file");
        options.addOption("o", PROPERTY_OUTPUT_FILE, true,
                "output OAI-PMH XML file (default is " + DEFAULT_OUTPUT_FILE + ")");
        options.addOption("c", PROPERTY_CONFIG_FILE, true, "configuration file (" + PROPERTIES_FILE + ")");
        options.addOption("s", PROPERTY_SET_SPEC, true, "set spec value (default is " + DEFAULT_SET_SPEC + ")");
        options.addOption("I", PROPERTY_INPUT_ENCODING, true,
                "input file encoding (default is " + DEFAULT_ENCODING + ")");
        options.addOption("O", PROPERTY_OUTPUT_ENCODING, true,
                "output file encoding (default is " + DEFAULT_ENCODING + ")");
        options.addOption("f", PROPERTY_FORMAT_OUTPUT, false, "format output encoding");
        options.addOption("h", PROPERTY_HELP, false, "print this message");

        try {
            // parse the command line arguments
            CommandLine line = parser.parse(options, args);

            if (line.hasOption(PROPERTY_HELP)) {
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp("java -jar rdf2oai-[verion].jar [PARAMETERS] [INPUT FILE] [OUTPUT FILE]",
                        options);

                System.exit(0);
            }

            // variables to store program properties
            CompositeConfiguration config = new CompositeConfiguration();
            config.setProperty(PROPERTY_OUTPUT_FILE, DEFAULT_OUTPUT_FILE);
            config.setProperty(PROPERTY_INPUT_ENCODING, DEFAULT_ENCODING);
            config.setProperty(PROPERTY_OUTPUT_ENCODING, DEFAULT_ENCODING);
            config.setProperty(PROPERTY_SET_SPEC, DEFAULT_SET_SPEC);
            config.setProperty(PROPERTY_FORMAT_OUTPUT, DEFAULT_FORMAT_OUTPUT);

            // check if arguments has input file properties 
            if (line.hasOption(PROPERTY_CONFIG_FILE)) {
                // if it does, load the specified configuration file
                Path defaultConfig = Paths.get(line.getOptionValue(PROPERTY_CONFIG_FILE));
                if (Files.isRegularFile(defaultConfig) && Files.isReadable(defaultConfig)) {
                    config.addConfiguration(new PropertiesConfiguration(defaultConfig.toFile()));
                } else
                    throw new Exception("Invalid configuration file: " + defaultConfig.toString());
            } else {
                // if it not, try to load default configurationfile
                Path defaultConfig = Paths.get(PROPERTIES_FILE);
                if (Files.isRegularFile(defaultConfig) && Files.isReadable(defaultConfig)) {
                    config.addConfiguration(new PropertiesConfiguration(defaultConfig.toFile()));
                }
            }

            // check if arguments has input file 
            if (line.hasOption(PROPERTY_INPUT_FILE))
                config.setProperty(PROPERTY_INPUT_FILE, line.getOptionValue(PROPERTY_INPUT_FILE));

            // check if arguments has output file
            if (line.hasOption(PROPERTY_OUTPUT_FILE))
                config.setProperty(PROPERTY_OUTPUT_FILE, line.getOptionValue(PROPERTY_OUTPUT_FILE));

            // check if arguments has set spec name
            if (line.hasOption(PROPERTY_SET_SPEC))
                config.setProperty(PROPERTY_SET_SPEC, line.getOptionValue(PROPERTY_SET_SPEC));

            // check if arguments has input encoding
            if (line.hasOption(PROPERTY_INPUT_ENCODING))
                config.setProperty(PROPERTY_INPUT_ENCODING, line.getOptionValue(PROPERTY_INPUT_ENCODING));

            // check if arguments has output encoding
            if (line.hasOption(PROPERTY_OUTPUT_ENCODING))
                config.setProperty(PROPERTY_OUTPUT_ENCODING, line.getOptionValue(PROPERTY_OUTPUT_ENCODING));

            // check if arguments has output encoding
            if (line.hasOption(PROPERTY_FORMAT_OUTPUT))
                config.setProperty(PROPERTY_FORMAT_OUTPUT, "yes");

            // check if arguments has input file without a key
            if (line.getArgs().length > 0) {
                config.setProperty(PROPERTY_INPUT_FILE, line.getArgs()[0]);

                // check if arguments has output file without a key
                if (line.getArgs().length > 1) {
                    config.setProperty(PROPERTY_OUTPUT_FILE, line.getArgs()[1]);

                    // check if there is too many arguments
                    if (line.getArgs().length > 2)
                        throw new Exception("Too many arguments");
                }
            }

            // The program has default output file, but input file must be presented
            if (!config.containsKey(PROPERTY_INPUT_FILE))
                throw new Exception("Please specify input file");

            // extract input file
            String inputFile = config.getString(PROPERTY_INPUT_FILE);

            // extract output file
            String outputFile = config.getString(PROPERTY_OUTPUT_FILE);

            // extract set spec
            String setSpecName = config.getString(PROPERTY_SET_SPEC);

            // extract encoding
            String inputEncoding = config.getString(PROPERTY_INPUT_ENCODING);
            String outputEncoding = config.getString(PROPERTY_OUTPUT_ENCODING);

            boolean formatOutput = config.getBoolean(PROPERTY_FORMAT_OUTPUT);

            // test if source is an regular file and it is readable
            Path source = Paths.get(inputFile);
            if (!Files.isRegularFile(source))
                throw new Exception("The input file: " + source.toString() + " is not an regular file");
            if (!Files.isReadable(source))
                throw new Exception("The input file: " + source.toString() + " is not readable");

            Path target = Paths.get(outputFile);
            if (Files.exists(target)) {
                if (!Files.isRegularFile(target))
                    throw new Exception("The output file: " + target.toString() + " is not an regular file");
                if (!Files.isWritable(target))
                    throw new Exception("The output file: " + target.toString() + " is not writable");
            }

            // create and setup document builder factory
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            factory.setNamespaceAware(true);

            // create new document builder
            DocumentBuilder builder = factory.newDocumentBuilder();

            // create oai document
            Document oai = builder.newDocument();

            // set document version
            oai.setXmlVersion("1.0");
            oai.setXmlStandalone(true);

            // create root OAI-PMH element
            Element oaiPmh = oai.createElement("OAI-PMH");

            // set document namespaces
            oaiPmh.setAttribute("xmlns", "http://www.openarchives.org/OAI/2.0/");
            oaiPmh.setAttributeNS("http://www.w3.org/2000/xmlns/", "xmlns:xsi",
                    "http://www.w3.org/2001/XMLSchema-instance");
            oaiPmh.setAttribute("xsi:schemaLocation",
                    "http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd");

            // append root node
            oai.appendChild(oaiPmh);

            // create responseDate element
            Element responseDate = oai.createElement("responseDate");

            // create simple date format
            DateFormat dateFormat = new SimpleDateFormat(TIME_FORMAT);

            // generate date
            String date = dateFormat.format(new Date());

            // set current date and time
            responseDate.setTextContent(date);

            oaiPmh.appendChild(responseDate);

            Element listRecords = oai.createElement("ListRecords");
            oaiPmh.appendChild(listRecords);

            // create xpath factory
            XPathFactory xPathfactory = XPathFactory.newInstance();

            // create namespace context
            NamespaceContext namespaceContext = new NamespaceContext() {
                public String getNamespaceURI(String prefix) {
                    if (prefix.equals("rdf"))
                        return RDF_NAMESPACE;
                    else if (prefix.equals("rns"))
                        return RNF_NAMESPACE;
                    else
                        return null;
                }

                @Override
                public Iterator<?> getPrefixes(String val) {
                    throw new IllegalAccessError("Not implemented!");
                }

                @Override
                public String getPrefix(String uri) {
                    throw new IllegalAccessError("Not implemented!");
                }
            };

            // create xpath object
            XPath xpath = xPathfactory.newXPath();
            // set namespace contex
            xpath.setNamespaceContext(namespaceContext);

            // create XPath expressions
            XPathExpression idExpr = xpath.compile("/rdf:RDF/rns:Researcher/@rdf:about");
            XPathExpression emptyExpr = xpath.compile("//text()[normalize-space(.) = '']");

            // create RegEx patterns  
            Pattern pattern = Pattern.compile(
                    "<\\?xml\\s+version=\"[\\d\\.]+\"\\s*\\?>\\s*<\\s*rdf:RDF[^>]*>[\\s\\S]*?<\\s*\\/\\s*rdf:RDF\\s*>");

            // read file into a string
            String content = new String(Files.readAllBytes(source), inputEncoding);

            Matcher matcher = pattern.matcher(content);
            // process all records
            while (matcher.find()) {
                // convert string to input stream
                ByteArrayInputStream input = new ByteArrayInputStream(
                        matcher.group().getBytes(StandardCharsets.UTF_8.toString()));

                // parse the xml document
                Document doc = builder.parse(input);

                // remove all spaces
                NodeList emptyNodes = (NodeList) emptyExpr.evaluate(doc, XPathConstants.NODESET);
                // Remove each empty text node from document.
                for (int i = 0; i < emptyNodes.getLength(); i++) {
                    Node emptyTextNode = emptyNodes.item(i);
                    emptyTextNode.getParentNode().removeChild(emptyTextNode);
                }

                // obtain researcher id
                String id = (String) idExpr.evaluate(doc, XPathConstants.STRING);
                if (StringUtils.isEmpty(id))
                    throw new Exception("The record identifier can not be empty");

                // create record element
                Element record = oai.createElement("record");
                listRecords.appendChild(record);

                // create header element
                Element header = oai.createElement("header");
                record.appendChild(header);

                // create identifier element
                Element identifier = oai.createElement("identifier");
                identifier.setTextContent(id);
                header.appendChild(identifier);

                // create datestamp element
                Element datestamp = oai.createElement("datestamp");
                datestamp.setTextContent(date);
                header.appendChild(datestamp);

                // create set spec element if it exists
                if (!StringUtils.isEmpty(setSpecName)) {
                    Element setSpec = oai.createElement("setSpec");
                    setSpec.setTextContent(setSpecName);
                    header.appendChild(setSpec);
                }

                // create metadata element
                Element metadata = oai.createElement("metadata");
                record.appendChild(metadata);

                // import the record
                metadata.appendChild(oai.importNode(doc.getDocumentElement(), true));
            }

            // create transformer factory
            TransformerFactory transformerFactory = TransformerFactory.newInstance();

            // create transformer
            Transformer transformer = transformerFactory.newTransformer();
            transformer.setOutputProperty(OutputKeys.ENCODING, outputEncoding);

            if (formatOutput) {
                transformer.setOutputProperty(OutputKeys.INDENT, "yes");
                transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
            } else
                transformer.setOutputProperty(OutputKeys.INDENT, "no");

            // create dom source
            DOMSource oaiSource = new DOMSource(oai);

            // create stream result
            StreamResult result = new StreamResult(target.toFile());

            // stream xml to file
            transformer.transform(oaiSource, result);

            // optional stream xml to console for testing
            //StreamResult consoleResult = new StreamResult(System.out);
            //transformer.transform(oaiSource, consoleResult);

        } catch (Exception e) {
            System.err.println("Error: " + e.getMessage());

            //e.printStackTrace();

            System.exit(1);
        }
    }
}