org.entrystore.repository.util.SolrSupport.java Source code

Introduction

Here is the source code for org.entrystore.repository.util.SolrSupport.java. The class provides EntryStore's Solr integration: it builds Solr documents from repository entries, submits them to the server asynchronously in batches, and executes queries with access-control filtering of the results.

Source

/*
 * Copyright (c) 2007-2014 MetaSolutions AB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.entrystore.repository.util;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;

import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.entrystore.repository.Context;
import org.entrystore.repository.ContextManager;
import org.entrystore.repository.Data;
import org.entrystore.repository.Entry;
import org.entrystore.repository.EntryType;
import org.entrystore.repository.PrincipalManager;
import org.entrystore.repository.PrincipalManager.AccessProperty;
import org.entrystore.repository.RepositoryManager;
import org.entrystore.repository.ResourceType;
import org.entrystore.repository.config.Settings;
import org.entrystore.repository.impl.LocalMetadataWrapper;
import org.entrystore.repository.security.AuthorizationException;
import org.openrdf.model.Graph;
import org.openrdf.model.Literal;
import org.openrdf.model.Statement;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.model.vocabulary.RDF;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author Hannes Ebner
 */
public class SolrSupport {

    private static Logger log = LoggerFactory.getLogger(SolrSupport.class);

    private static int BATCH_SIZE = 1000;

    private boolean reindexing = false;

    private boolean extractFulltext = false;

    private RepositoryManager rm;

    private SolrServer solrServer;

    private Thread documentSubmitter;

    private final ConcurrentLinkedQueue<SolrInputDocument> postQueue = new ConcurrentLinkedQueue<SolrInputDocument>();

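    /**
     * Background thread that drains the post queue: it takes up to
     * BATCH_SIZE documents at a time, sends them to Solr in a single
     * update request with a commit, and sleeps for five seconds whenever
     * the queue is empty. Interrupting the thread shuts it down.
     */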
    public class SolrInputDocumentSubmitter extends Thread {

        @Override
        public void run() {
            while (!interrupted()) {
                if (!postQueue.isEmpty()) {
                    UpdateRequest req = new UpdateRequest();
                    req.setAction(AbstractUpdateRequest.ACTION.COMMIT, false, false);

                    for (int i = 0; i < BATCH_SIZE; i++) {
                        SolrInputDocument doc = postQueue.poll();
                        if (doc == null) {
                            break;
                        }
                        req.add(doc);
                    }

                    try {
                        log.info("Sending commit with " + req.getDocuments().size() + " entries to Solr, "
                                + postQueue.size() + " documents remaining in post queue");
                        req.process(solrServer);
                    } catch (SolrServerException sse) {
                        log.error(sse.getMessage(), sse);
                    } catch (IOException ioe) {
                        log.error(ioe.getMessage(), ioe);
                    }
                } else {
                    try {
                        Thread.sleep(5000);
                    } catch (InterruptedException ie) {
                        log.info("Solr document submitter got interrupted, shutting down submitter thread");
                        return;
                    }
                }
            }
        }

    }

    public SolrSupport(RepositoryManager rm, SolrServer solrServer) {
        this.rm = rm;
        this.solrServer = solrServer;
        this.extractFulltext = "on"
                .equalsIgnoreCase(rm.getConfiguration().getString(Settings.SOLR_EXTRACT_FULLTEXT, "off"));
        documentSubmitter = new SolrInputDocumentSubmitter();
        documentSubmitter.start();
    }

    public void shutdown() {
        if (documentSubmitter != null) {
            documentSubmitter.interrupt();
        }
    }

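    /**
     * Removes all documents from the Solr index by issuing a
     * delete-by-query for "*:*" followed by a commit.
     */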
    public void clearSolrIndex(SolrServer solrServer) {
        UpdateRequest req = new UpdateRequest();
        req.setAction(AbstractUpdateRequest.ACTION.COMMIT, false, false);
        req.deleteByQuery("*:*");
        try {
            req.process(solrServer);
        } catch (SolrServerException sse) {
            log.error(sse.getMessage(), sse);
        } catch (IOException ioe) {
            log.error(ioe.getMessage(), ioe);
        }
    }

    /**
     * Reindexes the Solr index. Does not return until the process is
     * completed. The method is thread-safe: calls made while a reindexing
     * process is already running are ignored.
     */
    public void reindexLiterals() {
        if (solrServer == null) {
            log.warn("Ignoring request as Solr is not used by this instance");
            return;
        }

        synchronized (solrServer) {
            if (reindexing) {
                log.warn("Solr is already being reindexed: ignoring additional reindexing request");
                return;
            } else {
                reindexing = true;
            }
        }

        try {
            clearSolrIndex(solrServer);

            PrincipalManager pm = rm.getPrincipalManager();
            URI currentUser = pm.getAuthenticatedUserURI();
            try {
                pm.setAuthenticatedUserURI(pm.getAdminUser().getURI());
                ContextManager cm = rm.getContextManager();
                Set<URI> contexts = cm.getEntries();

                for (URI contextURI : contexts) {
                    String id = contextURI.toString().substring(contextURI.toString().lastIndexOf("/") + 1);
                    Context context = cm.getContext(id);
                    if (context != null) {
                        Set<URI> entries = context.getEntries();
                        for (URI entryURI : entries) {
                            if (entryURI != null) {
                                Entry entry = cm.getEntry(entryURI);
                                if (entry == null) {
                                    continue;
                                }
                                log.info("Adding document to Solr post queue: " + entryURI);
                                postQueue.add(constructSolrInputDocument(entry, extractFulltext));
                            }
                        }
                    }
                }
            } finally {
                pm.setAuthenticatedUserURI(currentUser);
            }
        } finally {
            reindexing = false;
        }
    }

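    /**
     * Builds a Solr document from an entry. Indexed fields include the
     * entry, resource and context URIs, RDF types, creation and
     * modification dates, creator and contributors, referring lists,
     * ACLs, titles, descriptions, keywords, language, tags, e-mail,
     * public visibility, all plain literals and all predicates, and
     * optionally fulltext extracted from a local data file.
     */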
    public static SolrInputDocument constructSolrInputDocument(Entry entry, boolean extractFulltext) {
        Graph mdGraph = entry.getMetadataGraph();
        URI resourceURI = entry.getResourceURI();

        SolrInputDocument doc = new SolrInputDocument();

        // URI
        doc.setField("uri", entry.getEntryURI().toString());

        // resource URI
        doc.setField("resource", resourceURI.toString());

        // resource URI of the surrounding context
        doc.setField("context", entry.getContext().getEntry().getResourceURI().toString());

        // RDF type
        String rdfTypeE = EntryUtil.getResource(entry.getGraph(), resourceURI, RDF.TYPE);
        if (rdfTypeE != null) {
            doc.addField("rdfType", rdfTypeE);
        }
        String rdfTypeM = EntryUtil.getResource(mdGraph, resourceURI, RDF.TYPE);
        if (rdfTypeM != null) {
            doc.addField("rdfType", rdfTypeM);
        }

        // creation date
        Date creationDate = entry.getCreationDate();
        if (creationDate != null) {
            doc.setField("created", creationDate);
        }

        // modification date
        Date modificationDate = entry.getModifiedDate();
        if (modificationDate != null) {
            doc.setField("modified", modificationDate);
        }

        // types
        doc.setField("graphType", entry.getGraphType().name());
        doc.setField("entryType", entry.getEntryType().name());
        doc.setField("resourceType", entry.getResourceType().name());

        // creator
        URI creator = entry.getCreator();
        if (creator != null) {
            doc.setField("creator", creator.toString());
        }

        // contributors
        doc.addField("contributors", entry.getContributors());

        // lists
        doc.addField("lists", entry.getReferringListsInSameContext());

        // ACL: admin, metadata r/w, resource r/w

        doc.addField("admin", entry.getAllowedPrincipalsFor(AccessProperty.Administer));
        doc.addField("metadata.r", entry.getAllowedPrincipalsFor(AccessProperty.ReadMetadata));
        doc.addField("metadata.rw", entry.getAllowedPrincipalsFor(AccessProperty.WriteMetadata));
        doc.addField("resource.r", entry.getAllowedPrincipalsFor(AccessProperty.ReadResource));
        doc.addField("resource.rw", entry.getAllowedPrincipalsFor(AccessProperty.WriteResource));

        // titles
        Map<String, String> titles = EntryUtil.getTitles(entry);
        if (titles != null && titles.size() > 0) {
            Set<String> langs = new HashSet<String>();
            for (String title : titles.keySet()) {
                doc.addField("title", title, 10);
                // we also store title.{lang} as a dynamic field to be able
                // to sort by title in a specific language
                String lang = titles.get(title);
                // we only want one title per language, otherwise sorting will not work
                if (lang != null && !langs.contains(lang)) {
                    doc.addField("title." + lang, title, 10);
                    langs.add(lang);
                }
            }
        }
        String firstName = EntryUtil.getFirstName(entry);
        String lastName = EntryUtil.getLastName(entry);
        String name = "";
        if (firstName != null) {
            name += firstName;
        }
        if (lastName != null) {
            name += " " + lastName;
        }
        if (name.length() > 0) {
            doc.addField("title", name, 10);
        }

        // description
        Map<String, String> descriptions = EntryUtil.getDescriptions(entry);
        if (descriptions != null && descriptions.size() > 0) {
            for (String description : descriptions.keySet()) {
                doc.addField("description", description);
                String lang = descriptions.get(description);
                if (lang != null) {
                    doc.addField("description." + lang, description);
                }
            }
        }

        // keywords
        Map<String, String> keywords = EntryUtil.getKeywords(entry);
        if (keywords != null && keywords.size() > 0) {
            for (String keyword : keywords.keySet()) {
                doc.addField("keyword", keyword, 20);
                String lang = keywords.get(keyword);
                if (lang != null) {
                    doc.addField("keyword." + lang, keyword, 20);
                }
            }
        }

        // language of the resource
        String dcLang = EntryUtil.getLabel(mdGraph, resourceURI, new URIImpl(NS.dc + "language"), null);
        if (dcLang != null) {
            doc.addField("lang", dcLang);
        }
        String dctLang = EntryUtil.getLabel(mdGraph, resourceURI, new URIImpl(NS.dcterms + "language"), null);
        if (dctLang != null) {
            doc.addField("lang", dctLang);
        }

        // tags (dc:subject)
        Iterator<Statement> tags = mdGraph.match(null, new URIImpl(NS.dc + "subject"), null);
        while (tags.hasNext()) {
            doc.addField("tag", tags.next().getObject().stringValue());
        }

        // email (foaf:mbox)
        String email = EntryUtil.getEmail(entry);
        if (email != null) {
            doc.addField("email", email);
        }

        // publicly viewable metadata?
        boolean guestReadable = false;
        PrincipalManager pm = entry.getRepositoryManager().getPrincipalManager();
        pm.setAuthenticatedUserURI(pm.getGuestUser().getURI());
        try {
            pm.checkAuthenticatedUserAuthorized(entry, AccessProperty.ReadMetadata);
            guestReadable = true;
        } catch (AuthorizationException ae) {
            // guest is not allowed to read the metadata; guestReadable stays false
        }
        pm.setAuthenticatedUserURI(pm.getAdminUser().getURI());
        doc.setField("public", guestReadable);

        // validated resource?
        // doc.setField("validated", ConverterUtil.isValidated(mdGraph, resourceURI));

        // all literal values
        Graph metadata = entry.getMetadataGraph();
        if (metadata != null) {
            for (Statement s : metadata) {
                if (s.getObject() instanceof Literal) {
                    Literal l = (Literal) s.getObject();
                    // we only index plain literals (human-readable text)
                    if (l.getDatatype() == null) {
                        doc.addField("literal", l.getLabel());
                    }
                }
            }
        }

        // all predicates
        metadata = entry.getMetadataGraph();
        if (metadata != null) {
            for (Statement s : metadata) {
                doc.addField("predicate", s.getPredicate().stringValue());
            }
        }

        // Full text extraction using Apache Tika
        if (extractFulltext && EntryType.Local.equals(entry.getEntryType())
                && ResourceType.InformationResource.equals(entry.getResourceType())
                && entry.getResource() instanceof Data) {
            Data d = (Data) entry.getResource();
            File f = d.getDataFile();
            if (f != null && f.exists()) {
                String textContent = extractFulltext(f);
                if (textContent != null) {
                    doc.addField("fulltext", textContent);
                }
            }
        }

        return doc;
    }

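    /**
     * Queues an entry for asynchronous indexing. Runs as the admin user
     * so that the full metadata graph can be read, and restores the
     * previously authenticated user afterwards.
     */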
    public void postEntry(Entry entry, SolrServer solrServer) {
        PrincipalManager pm = entry.getRepositoryManager().getPrincipalManager();
        URI currentUser = pm.getAuthenticatedUserURI();
        pm.setAuthenticatedUserURI(pm.getAdminUser().getURI());
        try {
            log.info("Adding document to Solr post queue: " + entry.getEntryURI());
            postQueue.add(constructSolrInputDocument(entry, extractFulltext));
        } finally {
            pm.setAuthenticatedUserURI(currentUser);
        }
    }

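    /**
     * Removes an entry's document from the Solr index by deleting on the
     * (colon-escaped) entry URI and committing.
     */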
    public void removeEntry(Entry entry, SolrServer solrServer) {
        UpdateRequest req = new UpdateRequest();
        req.setAction(AbstractUpdateRequest.ACTION.COMMIT, false, false);
        String escapedURI = StringUtils.replace(entry.getEntryURI().toString(), ":", "\\:");
        req.deleteByQuery("uri:" + escapedURI);
        try {
            log.info("Removing document from Solr: " + entry.getEntryURI());
            UpdateResponse res = req.process(solrServer);
            if (res.getStatus() > 0) {
                log.error("Removal request was unsuccessful with status " + res.getStatus());
            }
        } catch (SolrServerException sse) {
            log.error(sse.getMessage(), sse);
        } catch (IOException ioe) {
            log.error(ioe.getMessage(), ioe);
        }
    }

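    /**
     * Executes a Solr query and collects the entry URIs of all matching
     * documents into the result set. Returns the total number of hits,
     * or -1 if the query failed.
     */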
    private long sendQueryForEntryURIs(SolrQuery query, Set<URI> result, SolrServer solrServer, int offset,
            int limit) {
        if (offset > -1) {
            query.setStart(offset);
        }
        if (limit > -1) {
            query.setRows(limit);
        }

        long hits = -1;

        Date before = new Date();
        QueryResponse r = null;
        try {
            r = solrServer.query(query);
            SolrDocumentList docs = r.getResults();
            hits = docs.getNumFound();
            for (SolrDocument solrDocument : docs) {
                if (solrDocument.containsKey("uri")) {
                    String uri = (String) solrDocument.getFieldValue("uri");
                    if (uri != null) {
                        result.add(URI.create(uri));
                    }
                }
            }
        } catch (SolrServerException e) {
            log.error(e.getMessage());
        }
        log.info("Solr query took " + (new Date().getTime() - before.getTime()) + " ms");

        return hits;
    }

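    /**
     * Executes a query and resolves the matching URIs to entries,
     * filtering out entries whose metadata the current user is not
     * allowed to read. If access control removes too many results, the
     * query is re-run with an increased offset (at most 10 times) to
     * fill up the requested limit.
     */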
    public QueryResult sendQuery(SolrQuery query) {
        Set<URI> entries = new LinkedHashSet<URI>();
        Set<Entry> result = new LinkedHashSet<Entry>();
        long hits = -1;
        int limit = query.getRows();
        int offset = query.getStart();
        query.setIncludeScore(true);
        int resultFillIteration = 0;
        do {
            if (resultFillIteration++ > 0) {
                if (resultFillIteration > 10) {
                    log.warn("Breaking after 10 result fill interations to prevent too many loops");
                    break;
                }
                offset += 10;
                log.warn("Increasing offset to fill the result limit");
            }
            hits = sendQueryForEntryURIs(query, entries, solrServer, offset, -1);
            Date before = new Date();
            for (URI uri : entries) {
                try {
                    Entry entry = rm.getContextManager().getEntry(uri);
                    if (entry != null) {
                        PrincipalManager pm = entry.getRepositoryManager().getPrincipalManager();
                        // If the entry is a link-reference or reference to an
                        // entry in the same repository, check that the
                        // referenced metadata is accessible.
                        if ((entry.getEntryType() == EntryType.Reference
                                || entry.getEntryType() == EntryType.LinkReference)
                                && entry.getCachedExternalMetadata() instanceof LocalMetadataWrapper) {
                            Entry refEntry = entry.getRepositoryManager().getContextManager()
                                    .getEntry(entry.getExternalMetadataURI());
                            pm.checkAuthenticatedUserAuthorized(refEntry, AccessProperty.ReadMetadata);
                        } else {
                            // Check that the local metadata is accessible.
                            pm.checkAuthenticatedUserAuthorized(entry, AccessProperty.ReadMetadata);
                        }
                        result.add(entry);
                        if (result.size() == limit) {
                            // we have enough results
                            break;
                        }
                    }
                } catch (AuthorizationException ae) {
                    hits--;
                    continue;
                }
            }
            log.info("Entry fetching took " + (new Date().getTime() - before.getTime()) + " ms");
        } while ((limit > result.size()) && (hits > (offset + limit)));

        return new QueryResult(result, hits);
    }

    public static String extractFulltext(File f) {
        return null;

        // FIXME This method works but is deactivated. The required Apache Tika
        // framework has many dependencies, so it is not activated in the pom.xml
        // of scam-core for now. Please activate it there before enabling the
        // following code again.

        /*
        InputStream stream = null;
        String textContent = null;
        String mimeType = null;
        try {
            TikaConfig tc = TikaConfig.getDefaultConfig();
            InputStream mimeIS = null;
            try {
                mimeIS = new FileInputStream(f);
                mimeType = tc.getMimeRepository().getMimeType(mimeIS).getName();
            } finally {
                if (mimeIS != null) { mimeIS.close(); }
            }
            if (mimeType != null) {
                stream = new BufferedInputStream(new FileInputStream(f));
                Parser parser = tc.getParser(mimeType);
                if (parser != null) {
                    ContentHandler handler = new BodyContentHandler();
                    try {
                        log.info("Parsing document with MIME type " + mimeType + ": " + f.toString());
                        parser.parse(stream, handler, new Metadata(), new ParseContext());
                        textContent = handler.toString();
                    } catch (Exception e) {
                        log.error("Unable to parse document: " + e.getMessage());
                    }
                } else {
                    log.warn("Unable to detect parser for MIME type " + mimeType);
                }
            } else {
                log.warn("Unable to detect the MIME type");
            }
        } catch (IOException e) {
            log.error(e.getMessage());
        } finally {
            try {
                if (stream != null) { stream.close(); }
            } catch (IOException e) {
                log.error(e.getMessage());
            }
        }
        return textContent;
        */
    }

}
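
Usage

A minimal sketch of how the class above might be used, assuming a SolrJ 4.x HttpSolrServer (the core URL below is hypothetical), a RepositoryManager and Entry obtained from a running EntryStore instance, and that the QueryResult class returned by sendQuery lives in the same org.entrystore.repository.util package.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.entrystore.repository.Entry;
import org.entrystore.repository.RepositoryManager;
import org.entrystore.repository.util.QueryResult;
import org.entrystore.repository.util.SolrSupport;

public class SolrSupportExample {

    // rm and entry are assumed to come from a running EntryStore instance
    public static void indexAndSearch(RepositoryManager rm, Entry entry) {
        // Hypothetical Solr core URL; adjust to the actual deployment
        HttpSolrServer solr = new HttpSolrServer("http://localhost:8983/solr/entrystore");

        // The constructor starts the background submitter thread
        SolrSupport support = new SolrSupport(rm, solr);
        try {
            // Queue the entry for asynchronous indexing; the submitter
            // thread batches and commits it to Solr
            support.postEntry(entry, solr);

            // Query by title; sendQuery filters out entries whose metadata
            // the currently authenticated user may not read
            SolrQuery query = new SolrQuery("title:example");
            query.setStart(0);
            query.setRows(10);
            // Holds the matching entries and the total hit count
            QueryResult result = support.sendQuery(query);
        } finally {
            // Interrupt the submitter thread
            support.shutdown();
        }
    }
}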