ddf.catalog.transformer.input.pdf.PdfInputTransformer.java Source code

Java tutorial

Introduction

Here is the source code for ddf.catalog.transformer.input.pdf.PdfInputTransformer.java

Source

/**
 * Copyright (c) Codice Foundation
 * <p/>
 * This is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License as published by the Free Software Foundation, either version 3 of the
 * License, or any later version.
 * <p/>
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details. A copy of the GNU Lesser General Public License
 * is distributed along with this program and can be found at
 * <http://www.gnu.org/licenses/lgpl.html>.
 */
package ddf.catalog.transformer.input.pdf;

import static org.apache.commons.lang3.Validate.notNull;

import java.io.IOException;
import java.io.InputStream;
import java.util.Calendar;
import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.ToTextContentHandler;
import org.codice.ddf.platform.util.TemporaryFileBackedOutputStream;
import org.osgi.framework.Bundle;
import org.osgi.framework.FrameworkUtil;
import org.osgi.framework.ServiceReference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;

import com.google.common.net.MediaType;

import ddf.catalog.content.operation.ContentMetadataExtractor;
import ddf.catalog.data.AttributeDescriptor;
import ddf.catalog.data.Metacard;
import ddf.catalog.data.MetacardType;
import ddf.catalog.data.impl.MetacardImpl;
import ddf.catalog.data.impl.MetacardTypeImpl;
import ddf.catalog.data.types.Contact;
import ddf.catalog.data.types.Core;
import ddf.catalog.data.types.Media;
import ddf.catalog.data.types.Topic;
import ddf.catalog.data.types.constants.core.DataType;
import ddf.catalog.transform.CatalogTransformerException;
import ddf.catalog.transform.InputTransformer;
import ddf.catalog.transformer.common.tika.TikaMetadataExtractor;
import ddf.catalog.util.impl.ServiceComparator;

/**
 * {@link InputTransformer} that turns a PDF input stream into a {@link Metacard}.
 * <p/>
 * The resulting metacard carries the PDF media type and document data type, the fields read from
 * the PDF document information (creation/modification dates, title, author, subject, keywords),
 * a thumbnail when the thumbnail generator produces one and a location when the geo parser
 * returns one. When at least one {@link ContentMetadataExtractor} is registered, the text content
 * of the PDF is extracted and handed to every registered extractor as well.
 * <p/>
 * Extractors are registered and removed through {@link #addContentMetadataExtractors} and
 * {@link #removeContentMetadataExtractor}; the backing map is synchronized and is only traversed
 * through a snapshot taken while holding its lock.
 */
public class PdfInputTransformer implements InputTransformer {

    private static final Logger LOGGER = LoggerFactory.getLogger(PdfInputTransformer.class);

    private final PDDocumentGenerator pdDocumentGenerator;

    private final GeoPdfParser geoParser;

    private final PdfThumbnailGenerator pdfThumbnailGenerator;

    // Ordered by ServiceComparator. Also used as the lock object when taking a snapshot of the
    // registered extractors, hence final.
    private final Map<ServiceReference, ContentMetadataExtractor> contentMetadataExtractors =
            Collections.synchronizedMap(new TreeMap<>(new ServiceComparator()));

    private final MetacardType metacardType;

    private boolean usePdfTitleAsTitle;

    /**
     * @param metacardType          base metacard type of the produced metacards, must be non-null
     * @param usePdfTitleAsTitle    whether the PDF title is copied to the metacard title, must be
     *                              non-null
     * @param pdDocumentGenerator   creates the {@link PDDocument} from an input stream, must be
     *                              non-null
     * @param geoParser             derives the metacard location from the document, must be
     *                              non-null
     * @param pdfThumbnailGenerator produces the optional thumbnail, must be non-null
     * @throws NullPointerException if any argument is null
     */
    public PdfInputTransformer(MetacardType metacardType, Boolean usePdfTitleAsTitle,
            PDDocumentGenerator pdDocumentGenerator, GeoPdfParser geoParser,
            PdfThumbnailGenerator pdfThumbnailGenerator) {
        notNull(metacardType, "metacardType must be non-null");
        notNull(usePdfTitleAsTitle, "usePdfTitleAsTitle must be non-null");
        notNull(pdDocumentGenerator, "pdDocumentGenerator must be non-null");
        notNull(geoParser, "geoParser must be non-null");
        notNull(pdfThumbnailGenerator, "pdfThumbnailGenerator must be non-null");

        this.metacardType = metacardType;
        this.usePdfTitleAsTitle = usePdfTitleAsTitle;
        this.pdDocumentGenerator = pdDocumentGenerator;
        this.geoParser = geoParser;
        this.pdfThumbnailGenerator = pdfThumbnailGenerator;
    }

    /**
     * @return {@code true} if the PDF title is copied to the metacard title
     */
    @SuppressWarnings("unused")
    public boolean isUsePdfTitleAsTitle() {
        return usePdfTitleAsTitle;
    }

    /**
     * @param usePdfTitleAsTitle must be non-null
     * @throws NullPointerException if {@code usePdfTitleAsTitle} is null
     */
    public void setUsePdfTitleAsTitle(Boolean usePdfTitleAsTitle) {
        notNull(usePdfTitleAsTitle, "usePdfTitleAsTitle must be non-null");
        this.usePdfTitleAsTitle = usePdfTitleAsTitle;
    }

    /**
     * Registers the {@link ContentMetadataExtractor} behind the given service reference.
     * <p/>
     * Nothing is registered when the reference is null, when this class has no bundle or bundle
     * context, or when the service can no longer be obtained. Storing a null service would make
     * every later transformation fail while collecting the extractor attributes.
     *
     * @param contentMetadataExtractorRef reference to the extractor service, may be null
     */
    public void addContentMetadataExtractors(
            ServiceReference<ContentMetadataExtractor> contentMetadataExtractorRef) {
        if (contentMetadataExtractorRef == null) {
            return;
        }

        Bundle bundle = getBundle();
        if (bundle == null || bundle.getBundleContext() == null) {
            return;
        }

        ContentMetadataExtractor cme = bundle.getBundleContext()
                .getService(contentMetadataExtractorRef);
        if (cme == null) {
            // getService returns null when the service is no longer registered
            LOGGER.debug("ContentMetadataExtractor service unavailable, skipping registration");
            return;
        }

        contentMetadataExtractors.put(contentMetadataExtractorRef, cme);
    }

    /**
     * @return the bundle that loaded this class; package-private so it can be overridden
     */
    Bundle getBundle() {
        return FrameworkUtil.getBundle(PdfInputTransformer.class);
    }

    /**
     * Removes the extractor registered for the given service reference, if any.
     *
     * @param contentMetadataExtractorRef reference to the extractor service, may be null
     */
    public void removeContentMetadataExtractor(
            ServiceReference<ContentMetadataExtractor> contentMetadataExtractorRef) {
        if (contentMetadataExtractorRef != null) {
            contentMetadataExtractors.remove(contentMetadataExtractorRef);
        }
    }

    /**
     * Transforms the PDF without assigning a metacard id.
     *
     * @param input stream containing the PDF
     * @return the metacard built from the PDF
     * @throws IOException                 if the PDF cannot be read
     * @throws CatalogTransformerException if the content cannot be copied for extraction
     */
    @Override
    public Metacard transform(InputStream input) throws IOException, CatalogTransformerException {
        return transform(input, null);
    }

    /**
     * Transforms the PDF into a metacard with the given id. The text content is only extracted
     * when at least one {@link ContentMetadataExtractor} is registered.
     *
     * @param input stream containing the PDF
     * @param id    id to set on the metacard, may be null
     * @return the metacard built from the PDF
     * @throws IOException                 if the PDF cannot be read
     * @throws CatalogTransformerException if the content cannot be copied for extraction
     */
    @Override
    public Metacard transform(InputStream input, String id) throws IOException, CatalogTransformerException {
        if (contentMetadataExtractors.isEmpty()) {
            return transformWithoutExtractors(input, id);
        }
        return transformWithExtractors(input, id);
    }

    /**
     * Loads the document straight from the input stream; no text content is extracted. A PDF
     * that cannot be opened because of its password yields a metacard with only the basic
     * attributes.
     */
    private Metacard transformWithoutExtractors(InputStream input, String id) throws IOException {
        try (PDDocument pdfDocument = pdDocumentGenerator.apply(input)) {
            return transformPdf(id, pdfDocument);
        } catch (InvalidPasswordException e) {
            LOGGER.debug("Cannot transform encrypted pdf", e);
            return initializeMetacard(id);
        }
    }

    /**
     * Buffers the input so it can be read twice: once to extract the plain text and once to load
     * the PDF document. A failure of the text extraction is logged and the transformation goes on
     * without text content.
     */
    private Metacard transformWithExtractors(InputStream input, String id)
            throws IOException, CatalogTransformerException {
        try (TemporaryFileBackedOutputStream fbos = new TemporaryFileBackedOutputStream()) {
            try {
                IOUtils.copy(input, fbos);
            } catch (IOException e) {
                throw new CatalogTransformerException("Could not copy bytes of content message.", e);
            }

            String plainText = null;
            try (InputStream isCopy = fbos.asByteSource().openStream()) {
                Parser parser = new AutoDetectParser();
                ContentHandler contentHandler = new ToTextContentHandler();
                TikaMetadataExtractor tikaMetadataExtractor = new TikaMetadataExtractor(parser, contentHandler);
                tikaMetadataExtractor.parseMetadata(isCopy, new ParseContext());
                plainText = contentHandler.toString();
            } catch (CatalogTransformerException e) {
                LOGGER.warn("Cannot extract metadata from pdf", e);
            }

            try (InputStream isCopy = fbos.asByteSource().openStream();
                    PDDocument pdfDocument = pdDocumentGenerator.apply(isCopy)) {

                return transformPdf(id, pdfDocument, plainText);
            } catch (InvalidPasswordException e) {
                LOGGER.debug("Cannot transform encrypted pdf", e);
                return initializeMetacard(id);
            }
        }
    }

    /**
     * @return a copy of the currently registered extractors in map order, taken while holding the
     * map lock so that concurrent registration changes cannot break the traversal
     */
    private Collection<ContentMetadataExtractor> snapshotExtractors() {
        // Views of a synchronized map must be traversed under the map's own lock.
        synchronized (contentMetadataExtractors) {
            return contentMetadataExtractors.values().stream().collect(Collectors.toList());
        }
    }

    private MetacardImpl initializeMetacard(String id) {
        return initializeMetacard(id, null);
    }

    /**
     * Creates the metacard and sets its id, content type, media type and data type. When
     * {@code contentInput} is not blank the metacard type is extended with the attributes of the
     * registered extractors and each extractor processes the content.
     *
     * @param id           id to set on the metacard, may be null
     * @param contentInput text content of the PDF, may be null or blank
     */
    private MetacardImpl initializeMetacard(String id, String contentInput) {
        MetacardImpl metacard;

        if (StringUtils.isNotBlank(contentInput)) {
            // One snapshot for both passes: the declared attributes and the processing step
            // always refer to the same set of extractors.
            Collection<ContentMetadataExtractor> extractors = snapshotExtractors();

            Set<AttributeDescriptor> attributes = extractors.stream()
                    .map(ContentMetadataExtractor::getMetacardAttributes)
                    .flatMap(Collection::stream)
                    .collect(Collectors.toSet());

            metacard = new MetacardImpl(new MetacardTypeImpl(metacardType.getName(), metacardType, attributes));

            for (ContentMetadataExtractor contentMetadataExtractor : extractors) {
                contentMetadataExtractor.process(contentInput, metacard);
            }
        } else {
            metacard = new MetacardImpl(metacardType);
        }

        metacard.setId(id);
        metacard.setContentTypeName(MediaType.PDF.toString());
        metacard.setAttribute(Media.TYPE, MediaType.PDF.toString());
        metacard.setAttribute(Core.DATATYPE, DataType.DOCUMENT.toString());

        return metacard;
    }

    private Metacard transformPdf(String id, PDDocument pdfDocument) throws IOException {
        return transformPdf(id, pdfDocument, null);
    }

    /**
     * Builds the metacard for an opened document. For an encrypted document only the initialized
     * metacard is returned; otherwise document information, thumbnail and location are added.
     *
     * @param id           id to set on the metacard, may be null
     * @param pdfDocument  the opened PDF document
     * @param contentInput text content of the PDF, may be null
     */
    private Metacard transformPdf(String id, PDDocument pdfDocument, String contentInput) throws IOException {
        MetacardImpl metacard = initializeMetacard(id, contentInput);

        if (pdfDocument.isEncrypted()) {
            LOGGER.debug("Cannot transform encrypted pdf");
            return metacard;
        }

        extractPdfMetadata(pdfDocument, metacard);

        pdfThumbnailGenerator.apply(pdfDocument).ifPresent(metacard::setThumbnail);

        Optional.ofNullable(geoParser.apply(pdfDocument)).ifPresent(metacard::setLocation);

        return metacard;
    }

    /**
     * Copies the document information fields to the metacard. The title is only copied when
     * {@code usePdfTitleAsTitle} is set; null dates and blank strings are skipped.
     *
     * @param pdfDocument PDF document
     * @param metacard    A mutable metacard to add the extracted data to
     */
    private void extractPdfMetadata(PDDocument pdfDocument, MetacardImpl metacard) {

        PDDocumentInformation documentInformation = pdfDocument.getDocumentInformation();

        setDateIfNotNull(documentInformation.getCreationDate(), metacard, Metacard.CREATED);

        setDateIfNotNull(documentInformation.getModificationDate(), metacard, Metacard.MODIFIED);

        if (usePdfTitleAsTitle) {
            setIfNotBlank(documentInformation.getTitle(), metacard, Metacard.TITLE);
        }

        setIfNotBlank(documentInformation.getAuthor(), metacard, Contact.CREATOR_NAME);

        setIfNotBlank(documentInformation.getSubject(), metacard, Metacard.DESCRIPTION);

        setIfNotBlank(documentInformation.getKeywords(), metacard, Topic.KEYWORD);

    }

    /**
     * Sets the attribute to the calendar's time unless the calendar is null.
     */
    private void setDateIfNotNull(Calendar calendar, MetacardImpl metacard, String attributeName) {
        if (calendar != null) {
            metacard.setAttribute(attributeName, calendar.getTime());
        }
    }

    /**
     * Sets the attribute to the given value unless the value is null, empty or whitespace only.
     */
    private void setIfNotBlank(String pdfDocumentValue, MetacardImpl metacard, String attributeName) {
        if (StringUtils.isNotBlank(pdfDocumentValue)) {
            metacard.setAttribute(attributeName, pdfDocumentValue);
        }
    }

}