org.n52.lod.csw.CSWLoDEnabler.java Source code


Introduction

Here is the source code for org.n52.lod.csw.CSWLoDEnabler.java. The class drives the harvesting process: it reads metadata records from a CSW catalogue, maps them to RDF, and writes the resulting triples to a Virtuoso triple store and/or a local file.

Source

/**
 * Copyright (C) 2013-2014 52°North Initiative for Geospatial Open Source
 * Software GmbH
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * If the program is linked with libraries which are licensed under one of
 * the following licenses, the combination of the program with the linked
 * library is not considered a "derivative work" of the program:
 *
 *     - Apache License, version 2.0
 *     - Apache Software License, version 1.0
 *     - GNU Lesser General Public License, version 3
 *     - Mozilla Public License, versions 1.0, 1.1 and 2.0
 *     - Common Development and Distribution License (CDDL), version 1.0
 *
 * Therefore the distribution of the program linked with libraries licensed
 * under the aforementioned licenses, is permitted by the copyright holders
 * if the distribution is compliant with both the GNU General Public
 * License version 2 and the aforementioned licenses.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 * Public License for more details.
 */
package org.n52.lod.csw;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import net.opengis.cat.csw.x202.AbstractRecordType;
import net.opengis.cat.csw.x202.BriefRecordType;
import net.opengis.cat.csw.x202.GetRecordByIdResponseDocument;
import net.opengis.cat.csw.x202.GetRecordsResponseDocument;
import net.opengis.cat.csw.x202.SearchResultsType;

import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlOptions;
import org.n52.lod.Configuration;
import org.n52.lod.Report;
import org.n52.lod.csw.mapping.GluesMapper;
import org.n52.lod.csw.mapping.WKTLiteralType;
import org.n52.lod.csw.mapping.XmlToRdfMapper;
import org.n52.lod.triplestore.FileTripleSink;
import org.n52.lod.triplestore.TripleSink;
import org.n52.lod.triplestore.VirtuosoServer;
import org.n52.oxf.OXFException;
import org.n52.oxf.ows.ExceptionReport;
import org.n52.oxf.util.web.HttpClientException;
import org.purl.dc.elements.x11.SimpleLiteral;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Stopwatch;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Queues;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import com.hp.hpl.jena.datatypes.TypeMapper;

/**
 * This is the main class. It starts the application and executes the process of
 * feeding all records of a CSW into an LoD triple store.
 * 
 * @author <a href="mailto:broering@52north.org">Arne Broering</a>
 */
public class CSWLoDEnabler {

    private static final Logger log = LoggerFactory.getLogger(CSWLoDEnabler.class);

    private static final int NUMBER_OF_RECORDS_PER_ITERATION = 25;

    private static final long FALLBACK_RECORDS_TOTAL = 10000;

    static XmlOptions xmlOptions;

    static {
        xmlOptions = new XmlOptions();
        xmlOptions.setCharacterEncoding("UTF-8");
    }

    protected boolean addToServer = false;

    protected boolean saveToFile = false;

    Configuration config = null;

    private CatalogInteractor csw;

    protected Report report = new Report();

    /** Sentinel batch that the download producer adds to the queue to signal that no more record batches will follow. */
    protected final Map<String, GetRecordByIdResponseDocument> POISON_PILL = Maps.newHashMap();

    public CSWLoDEnabler(Configuration config) {

        // register the custom WKT literal datatype with Jena's TypeMapper
        WKTLiteralType type = WKTLiteralType.theWKTLiteralType;
        TypeMapper.getInstance().registerDatatype(type);

        POISON_PILL.put("poison", null);

        addToServer = config.isAddToServer();
        saveToFile = config.isSaveToFile();

        this.config = config;
        this.csw = new CatalogInteractor(config);
        log.info("NEW {}", this);
    }

    public static void main(String[] args) {
        try {
            CSWLoDEnabler enabler = new CSWLoDEnabler(new Configuration(Configuration.DEFAULT_CONFIG_FILE));
            enabler.runOverAll();
        } catch (RuntimeException | IOException e) {
            log.error("Error running CSW to LOD", e);
        }
    }

    public void runOverAll() throws IOException {
        runStartingFrom(1);
    }

    /**
     * Executes the program:
     * <ol>
     * <li>retrieves the record descriptions from the CSW</li>
     * <li>transforms the descriptions to RDF</li>
     * <li>inserts the produced RDF into the triple store</li>
     * </ol>
     * 
     * @param startPos
     *            position of the first record to process
     * @throws IOException
     */
    public void runStartingFrom(int startPos) throws IOException {
        log.info("STARTING CSW to LOD..");

        if (!(addToServer || saveToFile)) {
            log.warn("Neither triple store nor file output are activated.");
            return;
        }

        final Stopwatch overallTimer = new Stopwatch();
        overallTimer.start();

        final Stopwatch retrievingTimer = new Stopwatch();
        final Stopwatch mappingTimer = new Stopwatch();
        final Stopwatch otherTimer = new Stopwatch();

        otherTimer.start();
        XmlToRdfMapper mapper = new GluesMapper(config);

        TripleSink serverSink = null;
        if (addToServer) {
            try {
                serverSink = new VirtuosoServer(config, mapper);
            } catch (RuntimeException e) {
                log.error("Could not connect to graph", e);
            }
        }

        TripleSink fileSink = null;
        if (saveToFile) {
            fileSink = new FileTripleSink(mapper);
        }

        long recordsInTotal = FALLBACK_RECORDS_TOTAL;
        try {
            recordsInTotal = csw.getNumberOfRecords();
            log.debug("Retrieved number of records from server: {}", recordsInTotal);
        } catch (IllegalStateException | HttpClientException | XmlException e) {
            log.error("Could not retrieve number of records from catalog {}, falling back to {}", csw,
                    FALLBACK_RECORDS_TOTAL, e);
        }
        report.startIndex = startPos;
        report.recordNumber = recordsInTotal;
        otherTimer.stop();

        // main loop
        while (startPos < recordsInTotal) {
            retrievingTimer.start();
            Map<String, GetRecordByIdResponseDocument> records = retrieveRecords(startPos,
                    NUMBER_OF_RECORDS_PER_ITERATION, recordsInTotal);
            retrievingTimer.stop();

            mappingTimer.start();
            if (addToServer && serverSink != null)
                serverSink.addRecords(records, report);
            if (saveToFile && fileSink != null)
                fileSink.addRecords(records, report);
            mappingTimer.stop();

            startPos = startPos + NUMBER_OF_RECORDS_PER_ITERATION;

            log.debug("Finished intermediate run at {}", overallTimer.toString());
        } // end of main loop

        otherTimer.start();
        if (fileSink != null)
            try {
                fileSink.close();
            } catch (Exception e) {
                log.error("Could not close file sink {}", fileSink, e);
            }

        if (serverSink != null)
            try {
                serverSink.close();
            } catch (Exception e) {
                log.error("Could not close server sink {}", serverSink, e);
            }

        if (!report.issues.isEmpty())
            log.error(report.extendedToString());

        overallTimer.stop();
        otherTimer.stop();

        log.info("DONE with CSW to LOD.. duration = {} (retrieving: {}, mapping = {}, other = {})", overallTimer,
                retrievingTimer, mappingTimer, otherTimer);
        log.info("Results: {}", report);
        log.info("Sinks: server = {}, file = {}", addToServer, saveToFile);
        log.info("Server: {} | File: {}", serverSink, fileSink);
    }

    public void asyncRunStartingFrom(final int startPos) throws IOException {
        log.info("STARTING CSW to LOD..");

        if (!(addToServer || saveToFile)) {
            log.warn("Neither triple store nor file output are activated.");
            return;
        }

        final Stopwatch overallTimer = new Stopwatch();
        overallTimer.start();

        final Stopwatch retrievingTimer = new Stopwatch();
        final Stopwatch mappingTimer = new Stopwatch();
        final Stopwatch otherTimer = new Stopwatch();

        otherTimer.start();
        XmlToRdfMapper mapper = new GluesMapper(config);

        TripleSink serverSink = null;
        if (addToServer) {
            try {
                serverSink = new VirtuosoServer(config, mapper);
            } catch (RuntimeException e) {
                log.error("Could not connect to graph", e);
            }
        }

        TripleSink fileSink = null;
        if (saveToFile) {
            fileSink = new FileTripleSink(mapper);
        }

        long recordsInTotal;
        try {
            recordsInTotal = csw.getNumberOfRecords();
            log.debug("Retrieved number of records from server: {}", recordsInTotal);
        } catch (IllegalStateException | HttpClientException | XmlException e) {
            log.error("Could not retrieve number of records from catalog {}, falling back to {}", csw,
                    FALLBACK_RECORDS_TOTAL, e);
            recordsInTotal = FALLBACK_RECORDS_TOTAL;
        }
        report.startIndex = startPos;
        report.recordNumber = recordsInTotal;
        otherTimer.stop();

        async(startPos, recordsInTotal, overallTimer, retrievingTimer, mappingTimer, serverSink, fileSink);

        otherTimer.start();
        if (fileSink != null)
            try {
                fileSink.close();
            } catch (Exception e) {
                log.error("Could not close file sink {}", fileSink, e);
            }

        if (serverSink != null)
            try {
                serverSink.close();
            } catch (Exception e) {
                log.error("Could not close server sink {}", serverSink, e);
            }

        if (!report.issues.isEmpty())
            log.error(report.extendedToString());

        overallTimer.stop();
        otherTimer.stop();

        log.info("DONE with CSW to LOD.. duration = {} (retrieving: {}, mapping = {}, other = {})", overallTimer,
                retrievingTimer, mappingTimer, otherTimer);
        log.info("Results: {}", report);
        log.info("Sinks: server = {}, file = {}", addToServer, saveToFile);
        log.info("Server: {} | File: {}", serverSink, fileSink);
    }

    private void async(final int startPos, final long recordCount, final Stopwatch overallTimer,
            final Stopwatch retrievingTimer, final Stopwatch mappingTimer, final TripleSink serverSink,
            final TripleSink fileSink) {
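        // Producer/consumer pipeline: a single-threaded download executor retrieves
        // batches of GetRecordById responses and enqueues them, while a single-threaded
        // mapping executor drains the queue into the configured sinks. The POISON_PILL
        // batch marks the end of the stream once the producer is done.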
        // processing queue
        final ConcurrentLinkedQueue<Map<String, GetRecordByIdResponseDocument>> queue = Queues
                .newConcurrentLinkedQueue();

        // main loop download - producer
        ExecutorService downloadExecutor = Executors.newSingleThreadExecutor();
        downloadExecutor.submit(new Runnable() {

            private final Logger logger = LoggerFactory.getLogger("Download Runnable");

            @Override
            public void run() {
                int i = startPos;
                while (i < recordCount) {
                    retrievingTimer.start();
                    Map<String, GetRecordByIdResponseDocument> records = retrieveRecordsThreaded(i,
                            NUMBER_OF_RECORDS_PER_ITERATION, recordCount);
                    queue.add(records);
                    retrievingTimer.stop();

                    i = i + NUMBER_OF_RECORDS_PER_ITERATION;
                    logger.debug("Finished intermediate download run at {}", overallTimer.toString());
                    logger.info("Retrieved {} records, queue size is now {}", records.size(), queue.size());
                } // end of main loop

                logger.trace("Done - adding the poison pill!");
                queue.add(POISON_PILL);
            }
        });

        // consumer
        ExecutorService mapExecutor = Executors.newSingleThreadExecutor();
        mapExecutor.submit(new Runnable() {

            private final Logger logger = LoggerFactory.getLogger("Map Runnable");

            private boolean isRunning = true;

            @Override
            public void run() {
                while (isRunning) {
                    try {
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                        logger.error("Error sleeping in mapping runnable", e);
                    }

                    try {
                        Map<String, GetRecordByIdResponseDocument> records = queue.poll();

                        if (records == null)
                            continue;

                        if (records == POISON_PILL) {
                            queue.add(POISON_PILL); // notify other threads to
                                                    // stop
                            isRunning = false;
                            logger.trace("Got the poison pill!");
                            return;
                        }

                        // process queueElement
                        mappingTimer.start();
                        if (addToServer && serverSink != null)
                            serverSink.addRecords(records, report);
                        if (saveToFile && fileSink != null)
                            fileSink.addRecords(records, report);
                        mappingTimer.stop();

                        logger.debug("Finished intermediate run at {}", overallTimer.toString());

                    } catch (RuntimeException e) {
                        logger.error("Error in mapping runnable", e);
                    }
                } // end of main loop
            }
        });

        downloadExecutor.shutdown();
        try {
            downloadExecutor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            log.error("during shut down of download executor", e);
        }
        mapExecutor.shutdown();
        try {
            mapExecutor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            log.error("during shut down of map executor", e);
        }
    }

    protected Map<String, GetRecordByIdResponseDocument> retrieveRecords(int startPos, int maxRecords,
            long recordsInTotal) {
        log.info("Retrieve {} records, starting from {} of {}", maxRecords, startPos, recordsInTotal);

        List<String> recordIdList = getRecordIds(startPos, maxRecords);

        Map<String, GetRecordByIdResponseDocument> recordDescriptions = getRecordDescriptions(startPos, maxRecords,
                recordsInTotal, recordIdList);

        log.info("Done with requests and parsing, have {} GetRecordById documents.", recordDescriptions.size());
        return recordDescriptions;
    }

    private Map<String, GetRecordByIdResponseDocument> getRecordDescriptions(int startPos, int maxRecords,
            long recordsInTotal, List<String> recordIdList) {
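        // Sequentially request the full description of every record id via GetRecordById;
        // parsed responses are keyed by record id and failures are noted in the report.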
        Map<String, GetRecordByIdResponseDocument> recordDescriptions = Maps.newHashMap();

        int i = 0;
        for (String id : recordIdList) {
            log.debug("Retrieving details of record {}/{}  (over all {}/{}): {}", i, maxRecords, startPos + i,
                    recordsInTotal, id);

            try {
                String recordDescription = csw.executeGetRecordsById(id);
                GetRecordByIdResponseDocument xb_getRecordByIdResponse = GetRecordByIdResponseDocument.Factory
                        .parse(recordDescription, xmlOptions);

                recordDescriptions.put(id, xb_getRecordByIdResponse);
            } catch (OXFException | RuntimeException | ExceptionReport | XmlException e) {
                log.error("Error retrieving and parsing record {} with id {}", i, id, e);
                report.retrievalIssues.put(id, e);
            }

            i++;
        }
        return recordDescriptions;
    }

    private List<String> getRecordIds(int startPos, int maxRecords) {
        ArrayList<String> recordIdList = Lists.newArrayList();

        SearchResultsType searchResults = null;
        try {
            String result = csw.executeGetRecords(maxRecords, startPos);
            GetRecordsResponseDocument responseDoc = GetRecordsResponseDocument.Factory.parse(result);
            searchResults = responseDoc.getGetRecordsResponse().getSearchResults();

        } catch (OXFException | ExceptionReport | XmlException e) {
            log.error("Could not retrieving and parsing records {} to {}", startPos, startPos + maxRecords, e);
            report.retrievalIssues.put("Request for " + startPos + " - " + startPos + maxRecords + " failed.", e);
            return Lists.newArrayList();
        }

        AbstractRecordType[] abstractRecordArray = searchResults.getAbstractRecordArray();
        for (AbstractRecordType abstractRecordType : abstractRecordArray) {
            try {
                BriefRecordType abstractRecord = BriefRecordType.Factory.parse(abstractRecordType.xmlText());
                if (abstractRecord.getIdentifierArray() != null
                        && abstractRecord.getIdentifierArray().length >= 1) {
                    SimpleLiteral identifierLiteral = SimpleLiteral.Factory
                            .parse(abstractRecord.getIdentifierArray(0).getDomNode());
                    String recordId = identifierLiteral.getDomNode().getChildNodes().item(0).getChildNodes().item(0)
                            .getNodeValue();

                    recordIdList.add(recordId);
                }
            } catch (XmlException e) {
                log.error("Could not parse record {}", abstractRecordType.xmlText(), e);
                report.retrievalIssues.put("Parsing records response", e);
                return Lists.newArrayList();
            }
        }
        log.debug("Found {} record ids based on catalog response with {} matched and {} returned",
                recordIdList.size(), searchResults.getNumberOfRecordsMatched(),
                searchResults.getNumberOfRecordsReturned());
        return recordIdList;
    }

    /** Callable that downloads and parses a single GetRecordById response for the given record id. */
    private class CallableRecordDescription implements Callable<GetRecordByIdResponseDocument> {

        private final Logger logger = LoggerFactory.getLogger(CallableRecordDescription.class);

        private String id;

        private CatalogInteractor csw;

        public CallableRecordDescription(String id, CatalogInteractor csw) {
            this.id = id;
            this.csw = csw;
        }

        @Override
        public GetRecordByIdResponseDocument call() throws Exception {
            logger.debug("Retrieving {} using {}", this.id, this.csw);
            String recordDescription = this.csw.executeGetRecordsById(this.id);
            GetRecordByIdResponseDocument xb_getRecordByIdResponse = GetRecordByIdResponseDocument.Factory
                    .parse(recordDescription, xmlOptions);
            return xb_getRecordByIdResponse;
        }

    }

    protected Map<String, GetRecordByIdResponseDocument> retrieveRecordsThreaded(int startPos, int maxRecords,
            long recordsInTotal) {
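        // Same as retrieveRecords(..), but the GetRecordById requests are fanned out to a
        // thread pool (one thread per requested record) and collected via Guava futures.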
        log.info("Retrieve {} records, starting from {} of {}", maxRecords, startPos, recordsInTotal);

        // one thread for getting ids
        List<String> recordIdList = getRecordIds(startPos, maxRecords);

        // many threads getting records descriptions
        final Map<String, GetRecordByIdResponseDocument> recordDescriptions = Maps.newConcurrentMap();

        ListeningExecutorService executorService = MoreExecutors
                .listeningDecorator(Executors.newFixedThreadPool(maxRecords));

        for (String id : recordIdList) {
            final String recordId = id;
            log.debug("Adding {} to the model", recordId);

            CallableRecordDescription c = new CallableRecordDescription(id, csw);
            ListenableFuture<GetRecordByIdResponseDocument> responseFuture = executorService.submit(c);

            Futures.addCallback(responseFuture, new FutureCallback<GetRecordByIdResponseDocument>() {

                private final Logger logger = LoggerFactory.getLogger("Record Downloader");

                @Override
                public void onFailure(Throwable t) {
                    logger.error("Error retrieving and parsing record {}", t);
                    report.retrievalIssues.put(recordId, t);
                }

                @Override
                public void onSuccess(GetRecordByIdResponseDocument result) {
                    logger.trace("SUCCESS with {}", result);
                    recordDescriptions.put(recordId, result);

                    report.added++;
                    report.addedIds.add(recordId);
                }

            });
        }

        executorService.shutdown();
        while (!executorService.isTerminated()) {
            try {
                executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
            } catch (InterruptedException e) {
                log.error("Could not await termination", e);
            }
        }

        log.info("Done with requests and parsing, have {} GetRecordById documents.", recordDescriptions.size());
        return recordDescriptions;
    }

    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder();
        builder.append("CSWLoDEnabler [addToTripleStore=").append(addToServer).append(", saveToFile=")
                .append(saveToFile).append("]");
        return builder.toString();
    }

}
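
Example

A minimal usage sketch: the harvester can also be driven from custom code instead of the provided main method. The wrapper class HarvestAll below is hypothetical and only illustrates the public API of CSWLoDEnabler (runOverAll() and runStartingFrom(int)); it assumes a configuration reachable via Configuration.DEFAULT_CONFIG_FILE, exactly as main(String[]) does.

import java.io.IOException;

import org.n52.lod.Configuration;
import org.n52.lod.csw.CSWLoDEnabler;

/** Hypothetical driver class: harvest all catalogue records, or resume from a given position. */
public class HarvestAll {

    public static void main(String[] args) throws IOException {
        // the configuration decides whether triples go to the Virtuoso server, to a file, or both
        Configuration config = new Configuration(Configuration.DEFAULT_CONFIG_FILE);
        CSWLoDEnabler enabler = new CSWLoDEnabler(config);

        if (args.length > 0) {
            // resume a previous run at the given start position
            enabler.runStartingFrom(Integer.parseInt(args[0]));
        } else {
            enabler.runOverAll(); // equivalent to runStartingFrom(1)
        }
    }
}

For large catalogues, asyncRunStartingFrom(int) performs the same work but pipelines downloading and mapping in separate threads.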