edu.unc.lib.dl.cdr.services.techmd.TechnicalMetadataEnhancement.java Source code

Java tutorial

Introduction

Here is the source code for edu.unc.lib.dl.cdr.services.techmd.TechnicalMetadataEnhancement.java

Source

/**
 * Copyright 2008 The University of North Carolina at Chapel Hill
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.unc.lib.dl.cdr.services.techmd;

import static edu.unc.lib.dl.xml.JDOMNamespaceUtil.FITS_NS;
import static edu.unc.lib.dl.xml.JDOMNamespaceUtil.PREMIS_V2_NS;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.httpclient.util.URIUtil;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.Namespace;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.XMLOutputter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.unc.lib.dl.cdr.services.AbstractFedoraEnhancement;
import edu.unc.lib.dl.cdr.services.AbstractFedoraEnhancementService;
import edu.unc.lib.dl.cdr.services.AbstractIrodsObjectEnhancementService;
import edu.unc.lib.dl.cdr.services.exception.EnhancementException;
import edu.unc.lib.dl.cdr.services.exception.EnhancementException.Severity;
import edu.unc.lib.dl.cdr.services.model.EnhancementMessage;
import edu.unc.lib.dl.fedora.FedoraException;
import edu.unc.lib.dl.fedora.FileSystemException;
import edu.unc.lib.dl.fedora.NotFoundException;
import edu.unc.lib.dl.fedora.PID;
import edu.unc.lib.dl.util.ContentModelHelper;
import edu.unc.lib.dl.xml.FOXMLJDOMUtil;
import edu.unc.lib.dl.xml.JDOMNamespaceUtil;

/**
 * Executes irods script which uses FITS to extract technical metadata features of objects with data file datastreams.
 * 
 * @author Gregory Jansen
 * 
 */
public class TechnicalMetadataEnhancement extends AbstractFedoraEnhancement {

    Namespace ns = JDOMNamespaceUtil.FITS_NS;

    private static final Logger LOG = LoggerFactory.getLogger(TechnicalMetadataEnhancement.class);
    private static final int MAX_EXTENSION_LENGTH = 8;

    /*
     * (non-Javadoc)
     * 
     * @see java.lang.Runnable#run()
     */
    @Override
    public Element call() throws EnhancementException {
        Element result = null;
        // check to see if the service is still active
        if (!this.service.isActive()) {
            LOG.debug("{} call method exited, service is not active.", this.getClass().getName());
            return null;
        }

        Map<String, Document> ds2FitsDoc = new HashMap<String, Document>();
        try {
            Document foxml = this.retrieveFoxml();
            // get sourceData data stream IDs
            List<String> srcDSURIs = this.getSourceData(foxml);
            Map<String, String> sourceMimetype = new HashMap<String, String>(srcDSURIs.size());

            for (String srcURI : srcDSURIs) { // for each source datastream
                LOG.debug("source data URI: {}", srcURI);
                String dsid = srcURI.substring(srcURI.lastIndexOf("/") + 1);
                LOG.debug("datastream ID: {}", dsid);

                // get current datastream version ID
                String dsLocation = null;
                String dsIrodsPath = null;
                String dsAltIds = null;

                Element newestSourceDS = FOXMLJDOMUtil
                        .getMostRecentDatastream(ContentModelHelper.Datastream.getDatastream(dsid), foxml);
                if (newestSourceDS != null) {
                    sourceMimetype.put(dsid, newestSourceDS.getAttributeValue("MIMETYPE"));
                    dsLocation = newestSourceDS.getChild("contentLocation", JDOMNamespaceUtil.FOXML_NS)
                            .getAttributeValue("REF");
                    dsAltIds = newestSourceDS.getAttributeValue("ALT_IDS");
                } else {
                    throw new EnhancementException("Specified source datastream " + srcURI
                            + " was not found, the object " + this.pid.getPid() + " is most likely invalid",
                            Severity.UNRECOVERABLE);
                }

                // get logical iRODS path for datastream version
                dsIrodsPath = service.getManagementClient().getIrodsPath(dsLocation);

                // call fits via irods rule for the locations
                Document fits = null;
                try {
                    fits = runFITS(dsIrodsPath, dsAltIds);
                } catch (JDOMException e) {
                    // Rethrow JDOM exception as an unrecoverable enhancement exception
                    throw new EnhancementException(e, Severity.UNRECOVERABLE);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }

                // put the FITS document in DS map
                ds2FitsDoc.put(dsid, fits);
            }

            // build a PREMIS document
            Document premisTech = new Document();
            Element p = new Element("premis", PREMIS_V2_NS);
            premisTech.addContent(p);
            for (String dsid : ds2FitsDoc.keySet()) {
                // get key PREMIS data
                Document fits = ds2FitsDoc.get(dsid);
                String md5checksum = fits.getRootElement().getChild("fileinfo", FITS_NS).getChildText("md5checksum",
                        FITS_NS);
                String size = fits.getRootElement().getChild("fileinfo", FITS_NS).getChildText("size", FITS_NS);

                // IDENTIFICATION LOGIC
                // get mimetype out of FITS XML
                Element trustedIdentity = null;
                Element idn = fits.getRootElement().getChild("identification", ns);
                for (Object child : idn.getChildren("identity", ns)) {
                    Element el = (Element) child;
                    if (idn.getAttributeValue("status") == null || el.getChildren("tool", ns).size() > 1
                            || (!"Exiftool".equals(el.getChild("tool", ns).getAttributeValue("toolname"))
                                    && !"application/x-symlink".equals(el.getAttributeValue("mimetype")))) {
                        trustedIdentity = el;
                        break;
                    }
                }

                String fitsMimetype = null;
                String format = null;
                if (trustedIdentity != null) {
                    fitsMimetype = trustedIdentity.getAttributeValue("mimetype");
                    format = trustedIdentity.getAttributeValue("format");
                } else {
                    format = "Unknown";
                    LOG.warn("FITS unable to conclusively identify file: {}/{}", pid, dsid);
                    LOG.info(new XMLOutputter().outputString(fits));
                }

                // If fedora has a meaningful mimetype already, then override the fits generate one.
                String fedoraMimetype = sourceMimetype.get(dsid);
                if (fedoraMimetype != null && fedoraMimetype.trim().length() > 0
                        && !fedoraMimetype.contains("octet-stream")) {
                    fitsMimetype = fedoraMimetype;
                }

                if ("DATA_FILE".equals(dsid)) {
                    if (fitsMimetype != null) {
                        setExclusiveTripleValue(pid,
                                ContentModelHelper.CDRProperty.hasSourceMimeType.getPredicate(),
                                ContentModelHelper.CDRProperty.hasSourceMimeType.getNamespace(), fitsMimetype, null,
                                foxml);
                    } else { // application/octet-stream
                        setExclusiveTripleValue(pid,
                                ContentModelHelper.CDRProperty.hasSourceMimeType.getPredicate(),
                                ContentModelHelper.CDRProperty.hasSourceMimeType.getNamespace(),
                                "application/octet-stream", null, foxml);
                    }

                    try {
                        Long.parseLong(size);
                        setExclusiveTripleValue(pid,
                                ContentModelHelper.CDRProperty.hasSourceFileSize.getPredicate(),
                                ContentModelHelper.CDRProperty.hasSourceFileSize.getNamespace(), size,
                                "http://www.w3.org/2001/XMLSchema#long", foxml);
                    } catch (NumberFormatException e) {
                        LOG.error("FITS produced a non-integer value for size: " + size);
                    }
                }

                p.addContent(new Element("object", PREMIS_V2_NS)
                        .addContent(new Element("objectIdentifier", PREMIS_V2_NS)
                                .addContent(new Element("objectIdentifierType", PREMIS_V2_NS)
                                        .setText("Fedora Datastream PID"))
                                .addContent(new Element("objectIdentifierValue", PREMIS_V2_NS).setText(dsid)))
                        .addContent(new Element("objectCharacteristics", PREMIS_V2_NS)
                                .addContent(new Element("compositionLevel", PREMIS_V2_NS).setText("0"))
                                .addContent(new Element("fixity", PREMIS_V2_NS)
                                        .addContent(
                                                new Element("messageDigestAlgorithm", PREMIS_V2_NS).setText("MD5"))
                                        .addContent(
                                                new Element("messageDigest", PREMIS_V2_NS).setText(md5checksum)))
                                .addContent(new Element("size", PREMIS_V2_NS).setText(size))
                                .addContent(new Element("format", PREMIS_V2_NS)
                                        .addContent(new Element("formatDesignation", PREMIS_V2_NS).addContent(
                                                new Element("formatName", PREMIS_V2_NS).setText(format))))
                                .addContent(new Element("objectCharacteristicsExtension", PREMIS_V2_NS)
                                        .addContent(ds2FitsDoc.get(dsid).detachRootElement())))
                        .setAttribute("type", PREMIS_V2_NS.getPrefix() + ":file", JDOMNamespaceUtil.XSI_NS));
            }

            // upload tech MD PREMIS XML
            String premisTechURL = service.getManagementClient().upload(premisTech);

            // Add or replace the MD_TECHNICAL datastream for the object
            if (FOXMLJDOMUtil.getDatastream(foxml, ContentModelHelper.Datastream.MD_TECHNICAL.getName()) == null) {
                LOG.debug("Adding FITS output to MD_TECHNICAL");
                String message = "Adding technical metadata derived by FITS";
                service.getManagementClient().addManagedDatastream(pid,
                        ContentModelHelper.Datastream.MD_TECHNICAL.getName(), false, message,
                        new ArrayList<String>(), "PREMIS Technical Metadata", false, "text/xml", premisTechURL);
            } else {
                LOG.debug("Replacing MD_TECHNICAL with new FITS output");
                String message = "Replacing technical metadata derived by FITS";
                service.getManagementClient().modifyDatastreamByReference(pid,
                        ContentModelHelper.Datastream.MD_TECHNICAL.getName(), false, message,
                        new ArrayList<String>(), "PREMIS Technical Metadata", "text/xml", null, null,
                        premisTechURL);
            }

            LOG.debug("Adding techData relationship");
            PID newDSPID = new PID(pid.getPid() + "/" + ContentModelHelper.Datastream.MD_TECHNICAL.getName());
            Map<String, List<String>> rels = service.getTripleStoreQueryService().fetchAllTriples(pid);

            List<String> techrel = rels.get(ContentModelHelper.CDRProperty.techData.toString());
            if (techrel == null || !techrel.contains(newDSPID.getURI())) {
                service.getManagementClient().addObjectRelationship(pid,
                        ContentModelHelper.CDRProperty.techData.toString(), newDSPID);
            }

            LOG.debug("Finished MD_TECHNICAL updating for {}", pid.getPid());
        } catch (FileSystemException e) {
            throw new EnhancementException(e, Severity.FATAL);
        } catch (NotFoundException e) {
            throw new EnhancementException(e, Severity.UNRECOVERABLE);
        } catch (FedoraException e) {
            throw new EnhancementException(e, Severity.RECOVERABLE);
        }
        return result;
    }

    /**
     * Executes fits extract irods script
     * 
     * @param dsIrodsPath
     * @return FITS output XML Document
     */
    private Document runFITS(String dsIrodsPath, String altIds) throws Exception {
        Document result = null;

        // try to extract file name from ALT_ID
        String filename = null;
        if (altIds != null) {
            for (String altid : altIds.split(" ")) {
                if (altid.length() > 0) {
                    String rawPath = altid;
                    // Narrow file name down to after the last /
                    int lastSlash = rawPath.lastIndexOf("/");
                    if (lastSlash > 0)
                        rawPath = rawPath.substring(lastSlash + 1);
                    int ind = rawPath.lastIndexOf(".");
                    // Use text after last . as extension if its length is 0 > len >= MAX_EXTENSION_LENGTH
                    if (ind > 0 && rawPath.length() - 1 > ind && (rawPath.length() - ind <= MAX_EXTENSION_LENGTH)) {
                        filename = rawPath.substring(ind + 1);
                        filename = URIUtil.decode("linkedfile." + filename);
                        break;
                    }
                }
            }
        }

        // execute FITS
        LOG.debug("Run fits for {}", dsIrodsPath);
        BufferedReader reader = null;
        String xmlstr = null;
        String errstr = null;
        try {
            if (filename == null) {
                reader = new BufferedReader(new InputStreamReader(((AbstractIrodsObjectEnhancementService) service)
                        .remoteExecuteWithPhysicalLocation("fitsextract", dsIrodsPath)));
            } else {
                reader = new BufferedReader(new InputStreamReader(((AbstractIrodsObjectEnhancementService) service)
                        .remoteExecuteWithPhysicalLocation("fitsextract", "'" + filename + "'", dsIrodsPath)));
            }
            StringBuilder xml = new StringBuilder();
            StringBuilder err = new StringBuilder();
            boolean blankReached = false;
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.trim().length() == 0) {
                    blankReached = true;
                    continue;
                } else {
                    if (blankReached) {
                        err.append(line).append("\n");
                    } else {
                        xml.append(line).append("\n");
                    }
                }
            }
            xmlstr = xml.toString();
            errstr = err.toString();
            if (errstr.length() > 0) {
                LOG.warn("FITS is warning for path: " + dsIrodsPath);
                LOG.info(errstr);
            }
            result = new SAXBuilder().build(new StringReader(xmlstr));
            return result;
        } catch (JDOMException e) {
            LOG.warn("Failed to parse FITS output for path: " + dsIrodsPath);
            LOG.info("FITS returned: \n" + xmlstr + "\n\n" + errstr);
            throw e;
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    public TechnicalMetadataEnhancement(AbstractFedoraEnhancementService service, EnhancementMessage message) {
        super(service, message);
    }
}