de.unigoettingen.sub.commons.contentlib.pdflib.PDFCreator.java Source code

Introduction

Here is the source code for de.unigoettingen.sub.commons.contentlib.pdflib.PDFCreator.java
Source

/*
 * This file is part of the ContentServer project.
 * Visit the websites for more information. 
 *       - http://gdz.sub.uni-goettingen.de 
 *       - http://www.intranda.com 
 *       - http://www.digiverso.com
 * 
 * Copyright 2009, Center for Retrospective Digitization, Gttingen (GDZ),
 * intranda software
 *
 * This is the extended version updated by intranda
 * Copyright 2012, intranda GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.unigoettingen.sub.commons.contentlib.pdflib;

import java.awt.color.ICC_Profile;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.io.StringReader;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.lowagie.text.pdf.PdfPageLabels;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.SimpleBookmark;

import de.unigoettingen.sub.commons.contentlib.exceptions.ImageInterpreterException;
import de.unigoettingen.sub.commons.contentlib.exceptions.ImageManagerException;
import de.unigoettingen.sub.commons.contentlib.exceptions.PDFManagerException;
import de.unigoettingen.sub.commons.contentlib.imagelib.Watermark;
import de.unigoettingen.sub.commons.contentlib.pdflib.DocumentPart.DocumentPartType;
import de.unigoettingen.sub.commons.simplemets.METSParser;
import de.unigoettingen.sub.commons.simplemets.MetadataExtractor;
import de.unigoettingen.sub.commons.simplemets.StructureMetadataExtractor;
import de.unigoettingen.sub.commons.simplemets.exceptions.MetsException;
import de.unigoettingen.sub.commons.util.datasource.UrlImage;

public class PDFCreator {

    private static final Logger LOGGER = Logger.getLogger(PDFCreator.class);

    public void createPDF(OutputStream out, List<DocumentPart> metsparts, PDFConfiguration pdfconfig,
            MetadataExtractor inMetadataExtractor, StructureMetadataExtractor inBookmarkExtractor,
            Watermark myWatermark) throws ImageManagerException, FileNotFoundException, IOException,
            PDFManagerException, ImageInterpreterException, URISyntaxException, MetsException {
        PDFManager pdfmanager = null;

        String creator = "";
        String title = "";
        String keywords = "";

        int documentpartcounter = 0;
        LinkedList<METSParser> allMetsParser = new LinkedList<METSParser>();
        HashMap<Integer, UrlImage> allPages = new HashMap<Integer, UrlImage>();
        HashMap<Integer, String> allPageNames = new HashMap<Integer, String>();
        HashMap<Integer, PDFTitlePage> allTitlePages = new HashMap<Integer, PDFTitlePage>();
        List<PDFBookmark> allRootBookmarks = new LinkedList<PDFBookmark>();

        // iterate over all DocumentParts
        for (DocumentPart dp : metsparts) {

            documentpartcounter++;
            String pdfdivid = null;
            Map<Integer, UrlImage> documentpartPages = null;

            if (dp.getType() == DocumentPartType.METS) {
                // read the METS file and handle all the images
                // TODO: Do not invoke the METS Parser direct, use the interface instead
                METSParser metsparser = new METSParser(dp.getUrl(), true);

                // set METSParser configuration
                metsparser.setMetadataextractor(inMetadataExtractor);
                metsparser.setStructureMetadataExtractor(inBookmarkExtractor);
                if (dp.getMetsFileGroup() != null) {
                    metsparser.setFilegroupsuseattributevalue(dp.getMetsFileGroup());
                }

                if (dp.getDivid() == null) {
                    pdfdivid = metsparser.getUppermostDivIDForPDF();
                } else {
                    pdfdivid = dp.getDivid();
                }

                // calculate metadata
                inMetadataExtractor.calculatePDFMetadata(pdfdivid, metsparser);

                String title1 = inMetadataExtractor.getPdftitle();
                String creator1 = inMetadataExtractor.getPdfcreator();
                String keywords1 = inMetadataExtractor.getPdfkeywords();

                if (title1 != null) {
                    if (title.equals("")) {
                        title = title1;
                    } else {
                        title = title + "; " + title1;
                    }
                }

                if (creator1 != null) {
                    if (creator.equals("")) {
                        creator = creator1;
                    } else {
                        creator = creator + "; " + creator1;
                    }
                }

                if (keywords1 != null) {
                    if (keywords.equals("")) {
                        keywords = keywords1;
                    } else {
                        keywords = keywords + "; " + keywords1;
                    }
                }

                LOGGER.debug("Title1: " + title1);
                LOGGER.debug("Creator1: " + creator1);
                LOGGER.debug("Keywords1: " + keywords1);

                metsparser.getAllFilesForRelatedDivs(pdfdivid); // get page

                // names
                Map<Integer, String> myPageNames = metsparser.getPageNames();

                // get list of files and pagenames
                documentpartPages = metsparser.getImageMap();

                if (documentpartPages.isEmpty()) {
                    // nothing in here; probably METS file has no pages
                    // don't add METS file to list
                    LOGGER.error("No page files / page urls available!");
                } else {
                    // change page names to make them unique
                    // within the PDF, different METSparsers
                    // will have same pageName

                    for (Integer i : documentpartPages.keySet()) {
                        UrlImage page = documentpartPages.get(i);
                        String pagename = myPageNames.get(i);
                        // calculate new integer
                        int dpc = (documentpartcounter * 1000) + i;

                        LOGGER.debug("adding page " + dpc + " to list");

                        // add to new HashMaps
                        allPages.put(dpc, page);
                        allPageNames.put(dpc, pagename);
                    }

                    // handle all bookmarks
                    // need to change page number as well
                    List<PDFBookmark> bookmarks;
                    bookmarks = PDFBookmark.convertList(metsparser.getStructureList());
                    for (PDFBookmark b : bookmarks) {
                        // change page numbers
                        changeBookmarksPagenumber(b, documentpartcounter);
                        allRootBookmarks.add(b);
                    }

                    // add METSParser to list
                    allMetsParser.add(metsparser);
                }
            } else if (dp.getType() == DocumentPartType.PDF) {
                // handle the PDF part
                PdfReader pdfreader = new PdfReader(dp.getUrl());

                int numberofpages = pdfreader.getNumberOfPages();
                for (Integer i = 1; i < numberofpages + 1; i++) {
                    PDFPage pdfpage = new PDFPage();
                    pdfpage.setPdfreader(pdfreader);
                    pdfpage.setPageNumber(i);

                    int dpc = (documentpartcounter * 1000) + (i);
                    LOGGER.debug("adding page " + dpc + " to list");

                    // add page to allPages
                    allPages.put(dpc, pdfpage);

                    // adding page labales
                    String labels[] = PdfPageLabels.getPageLabels(pdfreader);
                    if ((labels != null) && (i < labels.length)) {
                        LOGGER.debug("adding Page label (" + i + "):" + labels[i - 1]);
                        allPageNames.put(dpc, labels[i - 1].substring(0, labels[i - 1].length() - 1));
                    }
                }

                // add Bookmarks
                allRootBookmarks = extractBookmarksFromPDF(pdfreader, allRootBookmarks, documentpartcounter);
            }
            // handle the title page of this DocumentPart
            if (dp.getTitlepage() != null) {
                // title page is available
                // set the layout of the content file

                // set structType
                if (inMetadataExtractor.getStructType() != null) {
                    dp.getTitlepage().setStructuretype(inMetadataExtractor.getStructType());
                }

                dp.getTitlepage().deleteTitleLines();

                // set Lines
                if (inMetadataExtractor.getPdfTitlepageLine1() != null) {
                    PDFTitlePageLine ptl = new PDFTitlePageLine(inMetadataExtractor.getPdfTitlepageLine1());
                    ptl.setContent(inMetadataExtractor.getPdfTitlepageLine1());
                    ptl.setLinetype(2);
                    ptl.setFontsize(14);
                    dp.getTitlepage().addPDFTitlePageLine(ptl);
                }

                if (inMetadataExtractor.getPdfTitlepageLine2() != null) {
                    PDFTitlePageLine ptl = new PDFTitlePageLine(inMetadataExtractor.getPdfTitlepageLine2());
                    ptl.setLinetype(2);
                    ptl.setFontsize(10);
                    dp.getTitlepage().addPDFTitlePageLine(ptl);
                }

                if (inMetadataExtractor.getPdfTitlepageLine3() != null) {
                    PDFTitlePageLine ptl = new PDFTitlePageLine(inMetadataExtractor.getPdfTitlepageLine3());
                    ptl.setLinetype(2);
                    ptl.setFontsize(10);
                    dp.getTitlepage().addPDFTitlePageLine(ptl);
                }

                if (inMetadataExtractor.getPdfTitlepageLine4() != null) {
                    PDFTitlePageLine ptl = new PDFTitlePageLine(inMetadataExtractor.getPdfTitlepageLine4());
                    ptl.setLinetype(2);
                    ptl.setFontsize(10);
                    dp.getTitlepage().addPDFTitlePageLine(ptl);
                }

                // get name of the first page
                if (documentpartPages != null) {
                    Map<Integer, UrlImage> sortedMap = new TreeMap<Integer, UrlImage>(documentpartPages);
                    Iterator<Integer> it2 = sortedMap.keySet().iterator();
                    Integer firstpagename = 0;
                    // TODO: GDZ: Should this just get the first element? - yes
                    // I tried to find a more elegant way but my google didn't work #googleneverlikedme
                    while (it2.hasNext()) {
                        firstpagename = it2.next();
                        firstpagename = (documentpartcounter * 1000) + firstpagename;
                        LOGGER.debug("Adding PDFTitlePage at page " + firstpagename);
                        break;
                    }
                    allTitlePages.put(firstpagename, dp.getTitlepage());
                }
            }
        } // end of while over all document parts

        // setting for PDFManager
        pdfmanager = new PDFManager(allPages);
        pdfmanager.setAlwaysUseRenderedImage(pdfconfig.isPdfDefaultAlwaysUseRenderedImage());
        pdfmanager.setAlwaysCompressToJPEG(pdfconfig.isPdfDefaultAlwaysCompressToJPEG());
        pdfmanager.setPdfa(pdfconfig.isWriteAsPdfA());

        // set pages
        LOGGER.debug(allPages.size() + " pages for PDFManager set");
        pdfmanager.setImageURLs(allPages);
        pdfmanager.setImageNames(allPageNames);
        pdfmanager.setStructureList(allRootBookmarks);
        pdfmanager.setPdftitlepages(allTitlePages);

        // set metadata
        if (!title.equals("")) {
            pdfmanager.setTitle(title);
        }
        if (!creator.equals("")) {
            pdfmanager.setCreator(creator);
            pdfmanager.setAuthor(creator);
        }
        if (!keywords.equals("")) {
            pdfmanager.setKeyword(keywords);
        }

        // set an ICC profile
        if (pdfconfig.getIccinputfilename() != null) {
            ICC_Profile iccprofile = ICC_Profile.getInstance(pdfconfig.getIccinputfilename());
            pdfmanager.setIccprofile(iccprofile);
        }

        pdfmanager.createPDF(out, pdfconfig.getPagesize(), myWatermark);
    }

    private void changeBookmarksPagenumber(PDFBookmark b, Integer i) {
        int imagenumber = b.getImageNumber();

        int newimagenumber = (i * 1000) + imagenumber;
        b.setImageNumber(newimagenumber);

        List<PDFBookmark> children = b.getChildren();
        if ((children != null) && !children.isEmpty()) {
            changeBookmarksPagenumber(children.get(0), i);
        }
    }

    private List<PDFBookmark> extractBookmarksFromPDF(PdfReader pdfreader, List<PDFBookmark> pDFBookmarks,
            int documentpartnumber) {
        // add Bookmarks
        List<?> list = SimpleBookmark.getBookmark(pdfreader);
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        if (list != null) {
            try {
                // TODO: GDZ: Do we really need this XML step here?
                SimpleBookmark.exportToXML(list, baos, "UTF8", false);
                String bms = baos.toString();
                LOGGER.debug(bms);
                // parse the xml, find Title elements
                DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
                DocumentBuilder builder = factory.newDocumentBuilder();
                Document doc = builder.parse(new InputSource(new StringReader(bms)));

                Element e = doc.getDocumentElement();
                NodeList nl = e.getChildNodes();
                for (int i = 0; i < nl.getLength(); i++) {
                    Node n = nl.item(i);
                    if ((n.getNodeType() == Node.ELEMENT_NODE) && (n.getNodeName().equals("Title"))) {
                        // found the right node, get title and pagenumber
                        PDFBookmark bm = parseTitleNode(n, documentpartnumber);
                        // this is the uppermost bookmark, add it to rootbookmark list
                        pDFBookmarks.add(bm);
                    }
                }
            } catch (IOException e) {
                LOGGER.error("IOExeption occured", e);
            } catch (ParserConfigurationException e) {
                LOGGER.error("ParserConfigurationException occured", e);
            } catch (SAXException e) {
                LOGGER.error("SAXException occured", e);
            }
        }
        return pDFBookmarks;
    }

    /**
     * parses a single <title> element and its child elements
     * 
     * @param inTitleNode
     * @param documentpartcounter
     * @return a Bookmark element with content and pagenumber
     */
    private PDFBookmark parseTitleNode(Node inTitleNode, int documentpartcounter) {
        String bookmarktitle = getElementValue(inTitleNode);
        NamedNodeMap nnm = inTitleNode.getAttributes();
        Node pageattribute = nnm.getNamedItem("Page");
        String pagenumber[] = pageattribute.getNodeValue().split(" ");

        PDFBookmark bm = new PDFBookmark();
        bm.setContent(bookmarktitle);
        try {
            int pagenumber_int = Integer.parseInt(pagenumber[0]);
            // calculate new pagenumber
            int dpc = (documentpartcounter * 1000) + pagenumber_int;
            bm.setImageNumber(dpc);
        } catch (Exception e) {
            LOGGER.error("Bookmark does not contain a page number");
        }

        // check if this bookmark has further children
        NodeList nl = inTitleNode.getChildNodes();
        for (int i = 0; i < nl.getLength(); i++) {
            Node childnode = nl.item(i);
            if ((childnode.getNodeType() == Node.ELEMENT_NODE) && (childnode.getNodeName().equals("Title"))) {
                PDFBookmark childbookmark = parseTitleNode(childnode, documentpartcounter);
                bm.addChildBookmark(childbookmark);
            }
        }

        return bm;
    }

    private String getElementValue(Node inNode) {
        NodeList nl = inNode.getChildNodes();
        for (int i = 0; i < nl.getLength(); i++) {
            Node n = nl.item(i);
            if (n.getNodeType() == Node.TEXT_NODE) {
                String value = n.getNodeValue();
                return value.trim();
            }
        }
        return null;
    }

}