org.exoplatform.services.document.impl.PDFDocumentReader.java Source code

Java tutorial

Introduction

Here is the source code for org.exoplatform.services.document.impl.PDFDocumentReader.java

Source

/*
 * Copyright (C) 2009 eXo Platform SAS.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.exoplatform.services.document.impl;

import org.apache.jempbox.xmp.XMPMetadata;
import org.apache.jempbox.xmp.XMPSchemaBasic;
import org.apache.jempbox.xmp.XMPSchemaDublinCore;
import org.apache.jempbox.xmp.XMPSchemaPDF;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.util.PDFTextStripper;
import org.exoplatform.commons.utils.SecurityHelper;
import org.exoplatform.services.document.DCMetaData;
import org.exoplatform.services.document.DocumentReadException;
import org.exoplatform.services.log.ExoLogger;
import org.exoplatform.services.log.Log;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.security.PrivilegedActionException;
import java.security.PrivilegedExceptionAction;
import java.util.Calendar;
import java.util.Properties;

/**
 * A parser of Adobe PDF files: extracts the plain text and the Dublin Core
 * style metadata of a PDF document.
 * <p>
 * Created by The eXo Platform SAS.
 * 
 * @author Phung Hai Nam
 * @author Gennady Azarenkov
 * @version Oct 19, 2005
 */
public class PDFDocumentReader extends BaseDocumentReader {

    protected static final Log LOG = ExoLogger.getLogger("exo.core.component.document.PDFDocumentReader");

    /**
     * Charset that maps the chars 0..255 one-to-one onto the bytes 0..255. It is
     * used to turn decoded octal escapes back into the raw bytes they stand for.
     */
    private static final String RAW_BYTE_CHARSET = "ISO-8859-1";

    /**
     * Get the application/pdf mime type.
     * 
     * @return The application/pdf mime type.
     */
    public String[] getMimeTypes() {
        return new String[] { "application/pdf" };
    }

    /**
     * Returns only a text from pdf file content. The stream is always closed
     * before this method returns.
     * 
     * @param is an input stream with .pdf file content.
     * @return The string only with text from file content, or an empty string
     *         when the stream reports no available bytes.
     * @throws IOException if the text extraction fails with an I/O error.
     * @throws DocumentReadException if the PDF document can not be loaded.
     * @throws IllegalArgumentException if <code>is</code> is null.
     */
    public String getContentAsText(final InputStream is) throws IOException, DocumentReadException {
        try {
            return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<String>() {
                public String run() throws Exception {
                    if (is == null) {
                        throw new IllegalArgumentException("InputStream is null.");
                    }
                    PDDocument pdDocument = null;
                    StringWriter sw = new StringWriter();
                    try {
                        if (is.available() == 0) {
                            return "";
                        }

                        try {
                            pdDocument = PDDocument.load(is);
                        } catch (IOException e) {
                            throw new DocumentReadException("Can not load PDF document.", e);
                        }

                        // Extract the text of every page.
                        PDFTextStripper stripper = new PDFTextStripper();
                        stripper.setStartPage(1);
                        stripper.setEndPage(Integer.MAX_VALUE);
                        stripper.writeText(pdDocument, sw);
                    } finally {
                        closeQuietly(pdDocument);
                        closeQuietly(is);
                    }
                    return sw.toString();
                }
            });
        } catch (PrivilegedActionException pae) {
            throw unwrap(pae);
        }
    }

    /**
     * Same as {@link #getContentAsText(InputStream)}; the encoding argument is
     * ignored.
     * 
     * @param is an input stream with .pdf file content.
     * @param encoding ignored.
     * @return The string only with text from file content.
     */
    public String getContentAsText(InputStream is, String encoding) throws IOException, DocumentReadException {
        // Ignore encoding
        return getContentAsText(is);
    }

    /**
     * Reads the metadata of a PDF document. The XMP metadata stream is used when
     * it yields at least one property; otherwise the document information
     * dictionary is used instead. The stream is always closed before this method
     * returns.
     * 
     * @param is an input stream with .pdf file content.
     * @return the properties found, keyed by the {@link DCMetaData} constants;
     *         possibly empty.
     * @throws IOException if the document can not be loaded or read.
     * @throws DocumentReadException if the document is encrypted and can not be
     *           decrypted with an empty password.
     * @throws IllegalArgumentException if <code>is</code> is null.
     * @see org.exoplatform.services.document.DocumentReader#getProperties(java.io.
     *      InputStream)
     */
    public Properties getProperties(final InputStream is) throws IOException, DocumentReadException {
        try {
            return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<Properties>() {
                public Properties run() throws Exception {
                    if (is == null) {
                        throw new IllegalArgumentException("InputStream is null.");
                    }

                    PDDocument pdDocument = null;
                    Properties props = new Properties();
                    try {
                        // Loaded inside the try block so the stream is closed even
                        // when loading fails.
                        pdDocument = PDDocument.load(is);

                        if (pdDocument.isEncrypted()) {
                            try {
                                pdDocument.decrypt("");
                            } catch (InvalidPasswordException e) {
                                throw new DocumentReadException("The pdf document is encrypted.", e);
                            } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
                                throw new DocumentReadException(e.getMessage(), e);
                            }
                        }

                        PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
                        PDMetadata meta = catalog.getMetadata();
                        if (meta != null) {
                            readXmpMetadata(meta, props);
                        }

                        if (props.isEmpty()) {
                            // The pdf doesn't contain any metadata, try to use the document
                            // information instead
                            PDDocumentInformation docInfo = pdDocument.getDocumentInformation();
                            if (docInfo != null) {
                                readDocumentInformation(docInfo, props);
                            }
                        }
                    } finally {
                        // Neither close may throw: that would hide the primary failure
                        // and, for the document, leave the stream open.
                        closeQuietly(pdDocument);
                        closeQuietly(is);
                    }
                    return props;
                }
            });
        } catch (PrivilegedActionException pae) {
            throw unwrap(pae);
        }
    }

    /**
     * Copies the properties found in the XMP metadata stream (Dublin Core, PDF and
     * Basic schemas) into <code>props</code>. A failure to read a single property
     * is logged and does not prevent the remaining ones from being read. Keys
     * written more than once (creator, date) keep the last value written.
     * 
     * @param meta the non-null XMP metadata stream of the document.
     * @param props the properties to fill.
     * @throws IOException if the XMP metadata or one of its schemas can not be
     *           parsed.
     */
    private void readXmpMetadata(PDMetadata meta, Properties props) throws IOException {
        XMPMetadata metadata = meta.exportXMPMetadata();

        XMPSchemaDublinCore dc = metadata.getDublinCoreSchema();
        if (dc != null) {
            try {
                if (dc.getTitle() != null) {
                    props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle()));
                }
            } catch (Exception e) {
                LOG.warn("getTitle failed: " + e.getMessage());
            }
            try {
                if (dc.getDescription() != null) {
                    props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription()));
                }
            } catch (Exception e) {
                LOG.warn("getDescription failed: " + e.getMessage());
            }

            try {
                if (dc.getCreators() != null) {
                    for (String creator : dc.getCreators()) {
                        props.put(DCMetaData.CREATOR, fixEncoding(creator));
                    }
                }
            } catch (Exception e) {
                LOG.warn("getCreator failed: " + e.getMessage());
            }

            try {
                if (dc.getDates() != null) {
                    for (Calendar date : dc.getDates()) {
                        props.put(DCMetaData.DATE, date);
                    }
                }
            } catch (Exception e) {
                LOG.warn("getDate failed: " + e.getMessage());
            }
        }

        XMPSchemaPDF pdf = metadata.getPDFSchema();
        if (pdf != null) {
            try {
                if (pdf.getKeywords() != null) {
                    props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords()));
                }
            } catch (Exception e) {
                LOG.warn("getKeywords failed: " + e.getMessage());
            }

            try {
                if (pdf.getProducer() != null) {
                    props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer()));
                }
            } catch (Exception e) {
                LOG.warn("getProducer failed: " + e.getMessage());
            }
        }

        XMPSchemaBasic basic = metadata.getBasicSchema();
        if (basic != null) {
            try {
                if (basic.getCreateDate() != null) {
                    props.put(DCMetaData.DATE, basic.getCreateDate());
                }
            } catch (Exception e) {
                LOG.warn("getCreationDate failed: " + e.getMessage());
            }
            try {
                if (basic.getModifyDate() != null) {
                    props.put(DCMetaData.DATE, basic.getModifyDate());
                }
            } catch (Exception e) {
                LOG.warn("getModificationDate failed: " + e.getMessage());
            }

            // DCMetaData.PUBLISHER - basic.getCreatorTool()
        }
    }

    /**
     * Copies the entries of the document information dictionary into
     * <code>props</code>. A failure to read a single entry is logged and does not
     * prevent the remaining ones from being read. The date key is written for
     * both the creation and the modification date, so the last one written wins.
     * 
     * @param docInfo the non-null document information of the document.
     * @param props the properties to fill.
     */
    private void readDocumentInformation(PDDocumentInformation docInfo, Properties props) {
        try {
            if (docInfo.getAuthor() != null) {
                props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor());
            }
        } catch (Exception e) {
            LOG.warn("getAuthor failed: " + e.getMessage());
        }
        try {
            if (docInfo.getCreationDate() != null) {
                props.put(DCMetaData.DATE, docInfo.getCreationDate());
            }
        } catch (Exception e) {
            LOG.warn("getCreationDate failed: " + e.getMessage());
        }
        try {
            if (docInfo.getCreator() != null) {
                props.put(DCMetaData.CREATOR, docInfo.getCreator());
            }
        } catch (Exception e) {
            LOG.warn("getCreator failed: " + e.getMessage());
        }
        try {
            if (docInfo.getKeywords() != null) {
                props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
            }
        } catch (Exception e) {
            LOG.warn("getKeywords failed: " + e.getMessage());
        }
        try {
            if (docInfo.getModificationDate() != null) {
                props.put(DCMetaData.DATE, docInfo.getModificationDate());
            }
        } catch (Exception e) {
            LOG.warn("getModificationDate failed: " + e.getMessage());
        }
        try {
            if (docInfo.getProducer() != null) {
                props.put(DCMetaData.PUBLISHER, docInfo.getProducer());
            }
        } catch (Exception e) {
            LOG.warn("getProducer failed: " + e.getMessage());
        }
        try {
            if (docInfo.getSubject() != null) {
                props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
            }
        } catch (Exception e) {
            LOG.warn("getSubject failed: " + e.getMessage());
        }
        try {
            if (docInfo.getTitle() != null) {
                props.put(DCMetaData.TITLE, docInfo.getTitle());
            }
        } catch (Exception e) {
            LOG.warn("getTitle failed: " + e.getMessage());
        }

        // docInfo.getTrapped();
    }

    /**
     * Translates the cause of a failed privileged action into the exception the
     * public methods declare: checked {@link DocumentReadException} and
     * {@link IOException} causes are thrown as they are, anything else is
     * returned as a runtime exception for the caller to throw.
     * 
     * @param pae the exception raised by the privileged action.
     * @return the runtime exception to throw; never returns normally for a
     *         checked cause of a declared type.
     * @throws IOException if the cause is an I/O error.
     * @throws DocumentReadException if the cause is a document read failure.
     */
    private static RuntimeException unwrap(PrivilegedActionException pae)
            throws IOException, DocumentReadException {
        Throwable cause = pae.getCause();
        if (cause instanceof DocumentReadException) {
            // Declared by the public methods, so it must not be hidden inside a
            // RuntimeException.
            throw (DocumentReadException) cause;
        }
        if (cause instanceof IOException) {
            throw (IOException) cause;
        }
        if (cause instanceof RuntimeException) {
            return (RuntimeException) cause;
        }
        return new RuntimeException(cause);
    }

    /**
     * Closes the document, logging (at trace level) instead of propagating a
     * failure.
     * 
     * @param pdDocument the document to close; may be null.
     */
    private static void closeQuietly(PDDocument pdDocument) {
        if (pdDocument == null) {
            return;
        }
        try {
            pdDocument.close();
        } catch (IOException e) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("An exception occurred: " + e.getMessage());
            }
        }
    }

    /**
     * Closes the stream, logging (at trace level) instead of propagating a
     * failure.
     * 
     * @param is the stream to close; may be null.
     */
    private static void closeQuietly(InputStream is) {
        if (is == null) {
            return;
        }
        try {
            is.close();
        } catch (IOException e) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("An exception occurred: " + e.getMessage());
            }
        }
    }

    /**
     * Decodes a metadata string whose bytes are written as backslash-octal
     * escapes (for example <code>\376\377\000A</code>). The encoding is detected
     * from the escaped byte order mark at the start of the string; a string
     * without such a mark is returned unchanged.
     * 
     * @param str the non-null metadata string.
     * @return the decoded string, <code>str</code> itself when no escaped byte
     *         order mark is present, or an empty string when the detected
     *         encoding is not supported.
     */
    private String fixEncoding(String str) {
        try {
            String encoding = null;
            // Length, in chars, of the escaped byte order mark to skip: every
            // escaped byte takes four chars (backslash plus three octal digits).
            int orderMaskOffset = 0;

            if (str.startsWith("\\000\\000\\376\\377")) {
                encoding = "UTF-32BE";
                orderMaskOffset = 16;
            } else if (str.startsWith("\\377\\376\\000\\000")) {
                encoding = "UTF-32LE";
                orderMaskOffset = 16;
            } else if (str.startsWith("\\376\\377")) {
                encoding = "UTF-16BE";
                orderMaskOffset = 8;
            } else if (str.startsWith("\\377\\376")) {
                encoding = "UTF-16LE";
                orderMaskOffset = 8;
            }

            if (encoding == null) {
                // return default
                return str;
            }

            int i = orderMaskOffset;
            int len = str.length();
            StringBuilder sb = new StringBuilder(len);
            while (i < len) {
                char c = str.charAt(i++);
                if (c == '\\') {
                    if (i + 3 <= len) {
                        //extract octal-code
                        try {
                            c = (char) Integer.parseInt(str.substring(i, i + 3), 8);
                            i += 3;
                        } catch (NumberFormatException e) {
                            // Not an octal code: the backslash is kept as a plain char.
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("PDF metadata exctraction warning: can not decode octal code - "
                                        + str.substring(i - 1, i + 3) + ".", e);
                            }
                        }
                    } else {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("PDF metadata exctraction warning: octal code is not complete - "
                                    + str.substring(i - 1, len));
                        }
                    }
                }
                sb.append(c);
            }

            // Each char now holds one raw byte value. The platform default charset
            // (e.g. UTF-8) would expand values above 0x7F into several bytes and
            // corrupt the data, so a one-to-one charset is required here.
            byte[] bytes = sb.toString().getBytes(RAW_BYTE_CHARSET);
            return new String(bytes, encoding);
        } catch (UnsupportedEncodingException e) {
            LOG.warn("PDF metadata exctraction warning: can not convert metadata string " + str, e);
            return "";
        }
    }
}