de.slub.fedora.oai.OaiHarvester.java Source code

Java tutorial

Introduction

Here is the source code for de.slub.fedora.oai.OaiHarvester.java

Source

/*
 * Copyright 2014 SLUB Dresden
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.slub.fedora.oai;

import de.slub.index.IndexJob;
import de.slub.index.ObjectIndexJob;
import de.slub.util.TerminateableRunnable;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.river.RiverName;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import javax.ws.rs.core.UriBuilder;
import javax.xml.bind.DatatypeConverter;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.*;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Locale;
import java.util.Map;
import java.util.Queue;
import java.util.TimeZone;
import java.util.concurrent.TimeUnit;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

/**
 * Polls an OAI endpoint with {@code ListIdentifiers} requests and adds an
 * {@link ObjectIndexJob} of type {@code CREATE} to the job queue for every
 * {@code header/identifier} element found in a response.
 *
 * <p>The harvester state (time of the last successful request, resumption token and the
 * token's expiration date) is read from and written to a document with the ID
 * {@code _last}, addressed via the river name and river type. While a resumption token
 * is present the next request is issued after one second, otherwise after the
 * configured poll interval.
 */
public class OaiHarvester extends TerminateableRunnable {

    /** ID of the document holding the harvester state between requests. */
    private static final String LAST_RUN_DOCUMENT_ID = "_last";

    /**
     * Day granularity pattern for the {@code from} request argument. Lower case
     * {@code yyyy} (calendar year) and {@code dd} (day of month) are required; the upper
     * case letters mean week year and day of year in {@link SimpleDateFormat}.
     */
    private static final String OAI_DAY_PATTERN = "yyyy-MM-dd";

    private final ESLogger logger;
    private final URL url;
    private final TimeValue interval;
    private final Client client;
    private final RiverName riverName;
    private final Queue<IndexJob> jobQueue;
    /** Time of the last successful request as stored in the state document, or null. */
    private Date lastrun;
    /** Resumption token of the most recent response; null or empty if there is none. */
    private String resumptionToken;
    /** Expiration date of {@link #resumptionToken}, or null if none was given. */
    private Date expirationDate;

    /**
     * Creates a harvester and logs the harvesting URL and interval.
     *
     * @param harvestingUrl base URL of the OAI endpoint
     * @param pollInterval  time to wait between requests when no resumption token is pending
     * @param esClient      client used to read and write the harvester state document
     * @param riverName     river name and type used to address the state document
     * @param indexJobQueue queue receiving one index job per harvested identifier
     * @param logger        logger for status and error messages
     * @throws MalformedURLException not thrown by this constructor; kept in the signature
     *                               so that existing callers continue to compile
     * @throws URISyntaxException    not thrown by this constructor; kept in the signature
     *                               so that existing callers continue to compile
     */
    protected OaiHarvester(URL harvestingUrl, TimeValue pollInterval, Client esClient, RiverName riverName,
            Queue<IndexJob> indexJobQueue, ESLogger logger) throws MalformedURLException, URISyntaxException {

        url = harvestingUrl;
        interval = pollInterval;
        client = esClient;
        jobQueue = indexJobQueue;
        this.riverName = riverName;
        this.logger = logger;

        logger.info("Harvesting URL: {} every {}", url.toExternalForm(), interval.format());
    }

    /**
     * Runs the harvest loop until {@code isRunning()} returns false or the loop fails.
     * Failures are logged together with their cause; an interruption additionally
     * restores the thread's interrupt flag.
     */
    @Override
    public void run() {
        try {
            harvestLoop();
        } catch (InterruptedException e) {
            // Restore the flag so whoever runs this Runnable can still observe the interrupt.
            Thread.currentThread().interrupt();
            logger.warn("OAI harvester interrupted: {}", e.getMessage());
        } catch (Exception e) {
            // Pass the exception itself so type and stack trace are not lost.
            logger.error("OAI harvester terminated: {}", e, e.getMessage());
        }
    }

    /**
     * Repeatedly loads the stored state, issues one OAI request, processes the response
     * and stores the new state, sleeping between iterations.
     *
     * @throws URISyntaxException   if the request URI cannot be built from the harvesting URL
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    private void harvestLoop() throws URISyntaxException, InterruptedException {
        while (isRunning()) {
            Date timestamp = now();
            getLastrunParameters();

            // A stored last run that lies in the future: wait for one interval and check again.
            if (lastrun != null && timestamp.before(lastrun)) {
                TimeUnit.MILLISECONDS.sleep(interval.getMillis());
                continue;
            }

            HttpGet httpGet = new HttpGet(buildOaiRequestURI());

            // Both the client and the response are closed before catch/finally run, so no
            // client instance is leaked per iteration and nothing is held open while sleeping.
            try (CloseableHttpClient httpClient = HttpClients.createMinimal();
                 CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
                if (httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    HttpEntity httpEntity = httpResponse.getEntity();
                    if (httpEntity != null) {
                        handleXmlResult(httpEntity.getContent());
                        // The state is only persisted after the response was processed.
                        writeLastrun(timestamp);
                    } else {
                        logger.warn("Got empty response from OAI service.");
                    }
                } else {
                    logger.error("Unexpected OAI service response: {} {}",
                            httpResponse.getStatusLine().getStatusCode(),
                            httpResponse.getStatusLine().getReasonPhrase());
                }
            } catch (Exception ex) {
                // Best effort: log with cause and retry in the next iteration.
                logger.error("OAI harvesting request failed: {}", ex, ex.getMessage());
            } finally {
                if (resumptionTokenIsPresent()) {
                    // More results are pending: continue quickly with the next chunk.
                    TimeUnit.SECONDS.sleep(1);
                } else {
                    TimeUnit.MILLISECONDS.sleep(interval.getMillis());
                }
            }
        }
    }

    /**
     * @return true if a non-empty resumption token is currently held
     */
    private boolean resumptionTokenIsPresent() {
        return resumptionToken != null && !resumptionToken.isEmpty();
    }

    /**
     * @return the current time
     */
    private Date now() {
        return Calendar.getInstance().getTime();
    }

    /**
     * Loads last run time, expiration date and resumption token from the state document.
     * If the document does not exist the fields keep their current values.
     */
    private void getLastrunParameters() {
        GetResponse response = client.prepareGet(riverName.name(), riverName.type(), LAST_RUN_DOCUMENT_ID)
                .execute().actionGet();
        if (response.isExists()) {
            Map<String, Object> src = response.getSourceAsMap();
            lastrun = getDate(src, "timestamp");
            expirationDate = getDate(src, "expiration_date");
            resumptionToken = (String) src.get("resumption_token");
        }
    }

    /**
     * Reads a date/time value from a document source map.
     *
     * @param src   document source
     * @param param key of the date value
     * @return the parsed date, or null if the key is missing or its value is null or empty
     */
    private Date getDate(Map<String, Object> src, String param) {
        String s = (String) src.get(param);
        // A missing key and an explicit null value are treated alike.
        if (s == null || s.isEmpty()) {
            return null;
        }
        return DatatypeConverter.parseDateTime(s).getTime();
    }

    /**
     * Builds the {@code ListIdentifiers} request URI. With a valid resumption token only
     * the token is sent; otherwise {@code metadataPrefix=oai_dc} and, if a last run is
     * known, a {@code from} date in day granularity are sent.
     *
     * @return the request URI
     * @throws URISyntaxException if the harvesting URL cannot be converted to a URI
     */
    private URI buildOaiRequestURI() throws URISyntaxException {
        UriBuilder builder = UriBuilder.fromUri(url.toURI()).queryParam("verb", "ListIdentifiers");

        if (isValidResumptionToken(resumptionToken, expirationDate)) {
            builder.queryParam("resumptionToken", resumptionToken);
        } else {
            builder.queryParam("metadataPrefix", "oai_dc");
            if (lastrun != null) {
                // SimpleDateFormat is not thread-safe, hence a fresh instance per call.
                // OAI-PMH datestamps are expressed in UTC; Locale.ROOT keeps ASCII digits.
                SimpleDateFormat dayFormat = new SimpleDateFormat(OAI_DAY_PATTERN, Locale.ROOT);
                dayFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
                builder.queryParam("from", dayFormat.format(lastrun));
            }
        }

        return builder.build();
    }

    /**
     * @param token   resumption token, may be null
     * @param expires expiration date of the token, null meaning no expiration is known
     * @return true if the token is non-empty and not expired
     */
    private boolean isValidResumptionToken(String token, Date expires) {
        return token != null && !token.isEmpty() && (expires == null || expires.after(now()));
    }

    /**
     * Writes the harvester state document. The resumption token and expiration date are
     * only included when present.
     *
     * @param timestamp time to store as the last run
     * @throws IOException if the JSON content cannot be built
     */
    private void writeLastrun(final Date timestamp) throws IOException {
        XContentBuilder jb = jsonBuilder().startObject();
        jb.field("timestamp", timestamp);
        if (resumptionTokenIsPresent()) {
            jb.field("resumption_token", resumptionToken);
        }
        if (expirationDate != null) {
            jb.field("expiration_date", expirationDate);
        }
        jb.endObject();
        client.prepareIndex(riverName.getName(), riverName.type(), LAST_RUN_DOCUMENT_ID)
                .setSource(jb).execute().actionGet();
    }

    /**
     * Parses an OAI response, queues an index job for every {@code header/identifier}
     * element and updates the resumption token and its expiration date from the
     * {@code resumptionToken} element.
     *
     * @param content XML response body
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws IOException                  if reading the content fails
     * @throws SAXException                 if the content is not well-formed XML
     * @throws XPathExpressionException     if an XPath expression cannot be evaluated
     */
    private void handleXmlResult(InputStream content)
            throws ParserConfigurationException, IOException, SAXException, XPathExpressionException {

        // NOTE(review): the parser runs with default settings, i.e. DTDs and external
        // entities are not disabled. Consider hardening it if the endpoint is not trusted.
        // The factory is also left at its non-namespace-aware default; presumably that is
        // what lets the unprefixed XPath expressions below match - verify before changing.
        DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
        Document document = documentBuilderFactory.newDocumentBuilder().parse(content);

        XPath xPath = XPathFactory.newInstance().newXPath();

        XPathExpression xSelectIdentifier = xPath.compile("//header/identifier");
        NodeList nodes = (NodeList) xSelectIdentifier.evaluate(document, XPathConstants.NODESET);
        for (int i = 0; i < nodes.getLength(); i++) {
            Node n = nodes.item(i);
            jobQueue.add(new ObjectIndexJob(IndexJob.Type.CREATE, getLocalIdentifier(n.getTextContent())));
        }

        // Evaluates to the empty string when the response has no resumptionToken element.
        XPathExpression xSelectResumptionToken = xPath.compile("//resumptionToken");
        resumptionToken = (String) xSelectResumptionToken.evaluate(document, XPathConstants.STRING);

        XPathExpression xSelectExpirationDate = xPath.compile("//resumptionToken/@expirationDate");
        String s = (String) xSelectExpirationDate.evaluate(document, XPathConstants.STRING);
        if (s == null || s.isEmpty()) {
            expirationDate = null;
        } else {
            expirationDate = DatatypeConverter.parseDateTime(s).getTime();
        }
    }

    /**
     * Strips everything up to and including the first colon found at or after index 4
     * (the length of {@code "oai:"}) from an identifier. If there is no such colon the
     * identifier is returned unchanged.
     *
     * @param oaiId identifier as found in the OAI response
     * @return the part after that colon
     */
    private String getLocalIdentifier(String oaiId) {
        return oaiId.substring(oaiId.indexOf(':', "oai:".length()) + 1);
    }

}