org.apache.marmotta.kiwi.loader.generic.KiWiHandler.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.marmotta.kiwi.loader.generic.KiWiHandler.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.marmotta.kiwi.loader.generic;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import org.apache.marmotta.commons.sesame.model.Namespaces;
import org.apache.marmotta.commons.sesame.tripletable.IntArray;
import org.apache.marmotta.commons.vocabulary.XSD;
import org.apache.marmotta.kiwi.loader.KiWiLoaderConfiguration;
import org.apache.marmotta.kiwi.model.rdf.*;
import org.apache.marmotta.kiwi.persistence.KiWiConnection;
import org.apache.marmotta.kiwi.persistence.registry.CacheTripleRegistry;
import org.apache.marmotta.kiwi.persistence.registry.DBTripleRegistry;
import org.apache.marmotta.kiwi.persistence.registry.KiWiTripleRegistry;
import org.apache.marmotta.kiwi.persistence.registry.LocalTripleRegistry;
import org.apache.marmotta.kiwi.sail.KiWiStore;
import org.joda.time.DateTime;
import org.joda.time.format.ISODateTimeFormat;
import org.openrdf.model.*;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.rio.RDFHandler;
import org.openrdf.rio.RDFHandlerException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.util.Date;
import java.util.IllformedLocaleException;
import java.util.Locale;
import java.util.concurrent.ExecutionException;

/**
 * A fast-lane RDF import handler that allows bulk-importing triples into a KiWi triplestore. It directly accesses
 * the database using a KiWiConnection. Note that certain configuration options will make the import "unsafe"
 * because they turn off expensive existence checks. If you are not careful and import the same data twice, this
 * might mean duplicate entries in the database.
 *
 * @author Sebastian Schaffert (sschaffert@apache.org)
 */
public class KiWiHandler implements RDFHandler {

    private static Logger log = LoggerFactory.getLogger(KiWiHandler.class);

    protected KiWiConnection connection;
    protected KiWiStore store;

    protected long triples = 0;
    protected long nodes = 0;
    protected long nodesLoaded = 0;

    protected long start = 0;
    protected long previous = 0;

    protected KiWiLoaderConfiguration config;

    protected LoadingCache<String, Locale> localeCache;

    // if non-null, all imported statements will have this context (regardless whether they specified a different context)
    private KiWiResource overrideContext;

    // only used when statement existance check is enabled
    protected KiWiTripleRegistry registry;

    protected Date importDate;

    protected boolean initialised = false;

    public KiWiHandler(KiWiStore store, KiWiLoaderConfiguration config) {
        this.config = config;
        this.store = store;

        this.localeCache = CacheBuilder.newBuilder().maximumSize(100).build(new CacheLoader<String, Locale>() {
            @Override
            public Locale load(String lang) throws Exception {
                try {
                    Locale.Builder builder = new Locale.Builder();
                    builder.setLanguageTag(lang);
                    return builder.build();
                } catch (IllformedLocaleException ex) {
                    log.warn("malformed language literal (language: {})", lang);
                    return null;
                }
            }
        });

        if (config.isStatementExistanceCheck()) {
            switch (store.getPersistence().getConfiguration().getRegistryStrategy()) {
            case DATABASE:
                log.info("KiWi Loader: database registry");
                registry = new DBTripleRegistry(store);
                break;
            case CACHE:
                log.info("KiWi Loader: cache registry");
                registry = new CacheTripleRegistry(store.getPersistence().getCacheManager());
                break;
            case LOCAL:
                log.info("KiWi Loader: in-memory registry");
                registry = new LocalTripleRegistry();
                break;
            default:
                log.info("KiWi Loader: in-memory registry");
                registry = new LocalTripleRegistry();
            }
        }

        log.info("KiWi Loader: namespaces {}", config.isIgnoreNamespaces() ? "ignored" : "enabled");
    }

    /**
     * Perform initialisation, e.g. dropping indexes or other preparations.
     */
    public void initialise() throws RDFHandlerException {
        log.info("KiWiLoader: initialising RDF handler");
        try {
            this.connection = store.getPersistence().getConnection();
        } catch (SQLException e) {
            throw new RDFHandlerException(e);
        }

        initialised = true;
    }

    /**
     * Peform cleanup on shutdown, e.g. re-creating indexes after import completed
     */
    public void shutdown() throws RDFHandlerException {
        log.info("KiWiLoader: shutting down RDF handler");
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                throw new RDFHandlerException(e);
            }
        }

        initialised = false;

    }

    /**
     * Signals the end of the RDF data. This method is called when all data has
     * been reported.
     *
     * @throws org.openrdf.rio.RDFHandlerException
     *          If the RDF handler has encountered an unrecoverable error.
     */
    @Override
    public void endRDF() throws RDFHandlerException {

        if (registry != null) {
            registry.releaseTransaction(connection.getTransactionId());
        }

        try {
            connection.commit();
        } catch (SQLException e) {
            throw new RDFHandlerException(e);
        }

        log.debug("KiWiLoader: RDF bulk import of {} triples finished after {} ms", triples,
                System.currentTimeMillis() - start);
    }

    /**
     * Signals the start of the RDF data. This method is called before any data
     * is reported.
     *
     * @throws org.openrdf.rio.RDFHandlerException
     *          If the RDF handler has encountered an unrecoverable error.
     */
    @Override
    public void startRDF() throws RDFHandlerException {
        if (!initialised) {
            initialise();
        }
        log.debug("KiWiLoader: starting RDF bulk import");

        this.start = System.currentTimeMillis();
        this.previous = System.currentTimeMillis();

        this.importDate = new Date(this.start);

        if (config.getContext() != null) {
            try {
                this.overrideContext = (KiWiResource) convertNode(new URIImpl(config.getContext()));
            } catch (ExecutionException e) {
                log.error("could not create/load resource", e);
            }
        }

    }

    /**
     * Handles a namespace declaration/definition. A namespace declaration
     * associates a (short) prefix string with the namespace's URI. The prefix
     * for default namespaces, which do not have an associated prefix, are
     * represented as empty strings.
     *
     * @param prefix The prefix for the namespace, or an empty string in case of a
     *               default namespace.
     * @param uri    The URI that the prefix maps to.
     * @throws org.openrdf.rio.RDFHandlerException
     *          If the RDF handler has encountered an unrecoverable error.
     */
    @Override
    public void handleNamespace(String prefix, String uri) throws RDFHandlerException {
        if (!config.isIgnoreNamespaces()) {
            try {
                KiWiNamespace result = connection.loadNamespaceByPrefix(prefix);
                if (result != null) {
                    if (!result.getUri().equals(uri)) {
                        connection.deleteNamespace(result);
                        connection.storeNamespace(new KiWiNamespace(prefix, uri));
                    }
                } else {
                    connection.storeNamespace(new KiWiNamespace(prefix, uri));
                }
            } catch (SQLException e) {
                throw new RDFHandlerException(e);
            }
        }
    }

    /**
     * Handles a statement.
     *
     * @param st The statement.
     * @throws org.openrdf.rio.RDFHandlerException
     *          If the RDF handler has encountered an unrecoverable error.
     */
    @Override
    public void handleStatement(Statement st) throws RDFHandlerException {
        try {
            KiWiResource subject = (KiWiResource) convertNode(st.getSubject());
            KiWiUriResource predicate = (KiWiUriResource) convertNode(st.getPredicate());
            KiWiNode object = convertNode(st.getObject());
            KiWiResource context;

            if (this.overrideContext != null) {
                context = this.overrideContext;
            } else {
                context = (KiWiResource) convertNode(st.getContext());
            }

            KiWiTriple result = new KiWiTriple(subject, predicate, object, context, importDate);

            // statement existance check; use the triple registry to lookup if there are any concurrent triple creations
            if (config.isStatementExistanceCheck()) {
                IntArray cacheKey = IntArray.createSPOCKey(subject, predicate, object, context);
                long tripleId = registry.lookupKey(cacheKey);

                if (tripleId >= 0) {
                    // try getting id from registry
                    result.setId(tripleId);

                    registry.registerKey(cacheKey, connection.getTransactionId(), result.getId());
                } else {
                    // not found in registry, try loading from database
                    result.setId(connection.getTripleId(subject, predicate, object, context));
                }

                // triple has no id from registry or database, so we create one and flag it for reasoning
                if (result.getId() < 0) {
                    result.setId(connection.getNextSequence());
                    result.setNewTriple(true);

                    registry.registerKey(cacheKey, connection.getTransactionId(), result.getId());

                    storeTriple(result);
                }
            } else {
                result.setId(connection.getNextSequence());

                storeTriple(result);
            }

        } catch (SQLException | ExecutionException e) {
            throw new RDFHandlerException(e);
        }

    }

    private KiWiNode convertNode(Value value) throws ExecutionException {
        if (value == null) {
            return null;
        } else if (value instanceof KiWiNode) {
            return (KiWiNode) value;
        } else if (value instanceof URI) {
            return createURI(((URI) value).stringValue());
        } else if (value instanceof BNode) {
            return createBNode(((BNode) value).stringValue());
        } else if (value instanceof Literal) {
            return createLiteral((Literal) value);
        } else {
            throw new IllegalArgumentException("the value passed as argument does not have the correct type");
        }
    }

    protected KiWiLiteral createLiteral(Literal l) throws ExecutionException {
        String value = l.getLabel();
        String lang = l.getLanguage() != null ? l.getLanguage().intern() : null;
        URI type = l.getDatatype();

        Locale locale;
        if (lang != null) {
            locale = localeCache.get(lang);
        } else {
            locale = null;
        }
        if (locale == null) {
            lang = null;
        }

        KiWiLiteral result;
        final KiWiUriResource rtype = type == null ? null : (KiWiUriResource) convertNode(type);

        try {

            try {
                // differentiate between the different types of the value
                if (type == null) {
                    // FIXME: MARMOTTA-39 (this is to avoid a NullPointerException in the following if-clauses)
                    result = connection.loadLiteral(sanitizeString(value), lang, null);

                    if (result == null) {
                        result = new KiWiStringLiteral(sanitizeString(value), locale, null, importDate);
                    } else {
                        nodesLoaded++;
                    }
                } else if (type.equals(XSD.DateTime) || type.equals(XSD.Date) || type.equals(XSD.Time)) {
                    // parse if necessary
                    final DateTime dvalue = ISODateTimeFormat.dateTimeParser().withOffsetParsed()
                            .parseDateTime(value);

                    result = connection.loadLiteral(dvalue);

                    if (result == null) {
                        result = new KiWiDateLiteral(dvalue, rtype, importDate);
                    } else {
                        nodesLoaded++;
                    }
                } else if (type.equals(XSD.Integer) || type.equals(XSD.Long)) {
                    long ivalue = Long.parseLong(value);

                    result = connection.loadLiteral(ivalue);

                    if (result == null) {
                        result = new KiWiIntLiteral(ivalue, rtype, importDate);
                    } else {
                        nodesLoaded++;
                    }
                } else if (type.equals(XSD.Double) || type.equals(XSD.Float) || type.equals(XSD.Decimal)) {
                    double dvalue = Double.parseDouble(value);

                    result = connection.loadLiteral(dvalue);

                    if (result == null) {
                        result = new KiWiDoubleLiteral(dvalue, rtype, importDate);
                    } else {
                        nodesLoaded++;
                    }
                } else if (type.equals(XSD.Boolean)) {
                    boolean bvalue = Boolean.parseBoolean(value);

                    result = connection.loadLiteral(bvalue);

                    if (result == null) {
                        result = new KiWiBooleanLiteral(bvalue, rtype, importDate);
                    } else {
                        nodesLoaded++;
                    }
                } else {
                    result = connection.loadLiteral(sanitizeString(value), lang, rtype);

                    if (result == null) {
                        result = new KiWiStringLiteral(sanitizeString(value), locale, rtype, importDate);
                    } else {
                        nodesLoaded++;
                    }
                }
            } catch (IllegalArgumentException ex) {
                // malformed number or date
                log.warn("malformed argument for typed literal of type {}: {}", rtype, value);
                KiWiUriResource mytype = createURI(Namespaces.NS_XSD + "string");

                result = connection.loadLiteral(sanitizeString(value), lang, mytype);

                if (result == null) {
                    result = new KiWiStringLiteral(sanitizeString(value), locale, mytype, importDate);
                } else {
                    nodesLoaded++;
                }

            }

            if (result.getId() < 0) {
                storeNode(result);
            }

            return result;

        } catch (SQLException e) {
            log.error("database error, could not load literal", e);
            throw new IllegalStateException("database error, could not load literal", e);
        }
    }

    protected KiWiUriResource createURI(String uri) {
        try {
            // first look in the registry for newly created resources if the resource has already been created and
            // is still volatile
            KiWiUriResource result = connection.loadUriResource(uri);

            if (result == null) {
                result = new KiWiUriResource(uri, importDate);

                storeNode(result);

            } else {
                nodesLoaded++;
            }
            if (result.getId() < 0) {
                log.error("node ID is null!");
            }

            return result;
        } catch (SQLException e) {
            log.error("database error, could not load URI resource", e);
            throw new IllegalStateException("database error, could not load URI resource", e);
        }
    }

    protected KiWiAnonResource createBNode(String nodeID) {
        try {
            // first look in the registry for newly created resources if the resource has already been created and
            // is still volatile
            KiWiAnonResource result = connection.loadAnonResource(nodeID);

            if (result == null) {
                result = new KiWiAnonResource(nodeID, importDate);
                storeNode(result);
            } else {
                nodesLoaded++;
            }
            if (result.getId() < 0) {
                log.error("node ID is null!");
            }

            return result;
        } catch (SQLException e) {
            log.error("database error, could not load anonymous resource", e);
            throw new IllegalStateException("database error, could not load anonymous resource", e);
        }
    }

    protected void storeNode(KiWiNode node) throws SQLException {
        connection.storeNode(node);

        nodes++;
    }

    protected void storeTriple(KiWiTriple result) throws SQLException {
        connection.storeTriple(result);

        triples++;

        if (triples % config.getCommitBatchSize() == 0) {
            if (registry != null) {
                registry.releaseTransaction(connection.getTransactionId());
            }

            connection.commit();
        }
    }

    /**
     * Handles a comment.
     *
     * @param comment The comment.
     * @throws org.openrdf.rio.RDFHandlerException
     *          If the RDF handler has encountered an unrecoverable error.
     */
    @Override
    public void handleComment(String comment) throws RDFHandlerException {
    }

    private static String sanitizeString(String in) {
        // clean up illegal characters
        return in.replaceAll("[\\00]", "");
    }
}