Java tutorial
/* * Copyright (C) 2009 eXo Platform SAS. * * This is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this software; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA, or see the FSF site: http://www.fsf.org. */ package org.exoplatform.services.document.impl; import org.apache.jempbox.xmp.XMPMetadata; import org.apache.jempbox.xmp.XMPSchemaBasic; import org.apache.jempbox.xmp.XMPSchemaDublinCore; import org.apache.jempbox.xmp.XMPSchemaPDF; import org.apache.pdfbox.exceptions.InvalidPasswordException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.common.PDMetadata; import org.apache.pdfbox.util.PDFTextStripper; import org.exoplatform.commons.utils.SecurityHelper; import org.exoplatform.services.document.DCMetaData; import org.exoplatform.services.document.DocumentReadException; import org.exoplatform.services.log.ExoLogger; import org.exoplatform.services.log.Log; import java.io.IOException; import java.io.InputStream; import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.security.PrivilegedActionException; import java.security.PrivilegedExceptionAction; import java.util.Calendar; import java.util.Properties; /** * Created by The eXo Platform SAS A parser of Adobe PDF files. * * @author Phung Hai Nam * @author Gennady Azarenkov * @version Oct 19, 2005 */ public class PDFDocumentReader extends BaseDocumentReader { protected static final Log LOG = ExoLogger.getLogger("exo.core.component.document.PDFDocumentReader"); /** * Get the application/pdf mime type. * * @return The application/pdf mime type. */ public String[] getMimeTypes() { return new String[] { "application/pdf" }; } /** * Returns only a text from pdf file content. * * @param is an input stream with .pdf file content. * @return The string only with text from file content. */ public String getContentAsText(final InputStream is) throws IOException, DocumentReadException { try { return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<String>() { public String run() throws Exception { if (is == null) { throw new IllegalArgumentException("InputStream is null."); } PDDocument pdDocument = null; StringWriter sw = new StringWriter(); try { if (is.available() == 0) return ""; try { pdDocument = PDDocument.load(is); } catch (IOException e) { throw new DocumentReadException("Can not load PDF document.", e); } PDFTextStripper stripper = new PDFTextStripper(); stripper.setStartPage(1); stripper.setEndPage(Integer.MAX_VALUE); stripper.writeText(pdDocument, sw); } finally { if (pdDocument != null) try { pdDocument.close(); } catch (IOException e) { if (LOG.isTraceEnabled()) { LOG.trace("An exception occurred: " + e.getMessage()); } } if (is != null) try { is.close(); } catch (IOException e) { if (LOG.isTraceEnabled()) { LOG.trace("An exception occurred: " + e.getMessage()); } } } return sw.toString(); } }); } catch (PrivilegedActionException pae) { Throwable cause = pae.getCause(); if (cause instanceof IOException) { throw (IOException) cause; } else if (cause instanceof RuntimeException) { throw (RuntimeException) cause; } else { throw new RuntimeException(cause); } } } public String getContentAsText(InputStream is, String encoding) throws IOException, DocumentReadException { // Ignore encoding return getContentAsText(is); } /* * (non-Javadoc) * * @see org.exoplatform.services.document.DocumentReader#getProperties(java.io. * InputStream) */ public Properties getProperties(final InputStream is) throws IOException, DocumentReadException { try { return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<Properties>() { public Properties run() throws Exception { if (is == null) { throw new IllegalArgumentException("InputStream is null."); } PDDocument pdDocument = PDDocument.load(is); Properties props = new Properties(); try { if (pdDocument.isEncrypted()) { try { pdDocument.decrypt(""); } catch (InvalidPasswordException e) { throw new DocumentReadException("The pdf document is encrypted.", e); } catch (org.apache.pdfbox.exceptions.CryptographyException e) { throw new DocumentReadException(e.getMessage(), e); } } PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); PDMetadata meta = catalog.getMetadata(); if (meta != null) { XMPMetadata metadata = meta.exportXMPMetadata(); XMPSchemaDublinCore dc = metadata.getDublinCoreSchema(); if (dc != null) { try { if (dc.getTitle() != null) props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle())); } catch (Exception e) { LOG.warn("getTitle failed: " + e.getMessage()); } try { if (dc.getDescription() != null) props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription())); } catch (Exception e) { LOG.warn("getSubject failed: " + e.getMessage()); } try { if (dc.getCreators() != null) { for (String creator : dc.getCreators()) { props.put(DCMetaData.CREATOR, fixEncoding(creator)); } } } catch (Exception e) { LOG.warn("getCreator failed: " + e.getMessage()); } try { if (dc.getDates() != null) { for (Calendar date : dc.getDates()) { props.put(DCMetaData.DATE, date); } } } catch (Exception e) { LOG.warn("getDate failed: " + e.getMessage()); } } XMPSchemaPDF pdf = metadata.getPDFSchema(); if (pdf != null) { try { if (pdf.getKeywords() != null) props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords())); } catch (Exception e) { LOG.warn("getKeywords failed: " + e.getMessage()); } try { if (pdf.getProducer() != null) props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer())); } catch (Exception e) { LOG.warn("getProducer failed: " + e.getMessage()); } } XMPSchemaBasic basic = metadata.getBasicSchema(); if (basic != null) { try { if (basic.getCreateDate() != null) props.put(DCMetaData.DATE, basic.getCreateDate()); } catch (Exception e) { LOG.warn("getCreationDate failed: " + e.getMessage()); } try { if (basic.getModifyDate() != null) props.put(DCMetaData.DATE, basic.getModifyDate()); } catch (Exception e) { LOG.warn("getModificationDate failed: " + e.getMessage()); } // DCMetaData.PUBLISHER - basic.getCreatorTool() } } if (props.isEmpty()) { // The pdf doesn't contain any metadata, try to use the document // information instead PDDocumentInformation docInfo = pdDocument.getDocumentInformation(); if (docInfo != null) { try { if (docInfo.getAuthor() != null) props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor()); } catch (Exception e) { LOG.warn("getAuthor failed: " + e.getMessage()); } try { if (docInfo.getCreationDate() != null) props.put(DCMetaData.DATE, docInfo.getCreationDate()); } catch (Exception e) { LOG.warn("getCreationDate failed: " + e.getMessage()); } try { if (docInfo.getCreator() != null) props.put(DCMetaData.CREATOR, docInfo.getCreator()); } catch (Exception e) { LOG.warn("getCreator failed: " + e.getMessage()); } try { if (docInfo.getKeywords() != null) props.put(DCMetaData.SUBJECT, docInfo.getKeywords()); } catch (Exception e) { LOG.warn("getKeywords failed: " + e.getMessage()); } try { if (docInfo.getModificationDate() != null) props.put(DCMetaData.DATE, docInfo.getModificationDate()); } catch (Exception e) { LOG.warn("getModificationDate failed: " + e.getMessage()); } try { if (docInfo.getProducer() != null) props.put(DCMetaData.PUBLISHER, docInfo.getProducer()); } catch (Exception e) { LOG.warn("getProducer failed: " + e.getMessage()); } try { if (docInfo.getSubject() != null) props.put(DCMetaData.DESCRIPTION, docInfo.getSubject()); } catch (Exception e) { LOG.warn("getSubject failed: " + e.getMessage()); } try { if (docInfo.getTitle() != null) props.put(DCMetaData.TITLE, docInfo.getTitle()); } catch (Exception e) { LOG.warn("getTitle failed: " + e.getMessage()); } // docInfo.getTrapped(); } } } finally { if (pdDocument != null) { pdDocument.close(); } if (is != null) { try { is.close(); } catch (IOException e) { if (LOG.isTraceEnabled()) { LOG.trace("An exception occurred: " + e.getMessage()); } } } } return props; } }); } catch (PrivilegedActionException pae) { Throwable cause = pae.getCause(); if (cause instanceof IOException) { throw (IOException) cause; } else if (cause instanceof RuntimeException) { throw (RuntimeException) cause; } else { throw new RuntimeException(cause); } } } private String fixEncoding(String str) { try { String encoding = null; int orderMaskOffset = 0; if (str.startsWith("\\000\\000\\376\\377")) { encoding = "UTF-32BE"; orderMaskOffset = 16; } else if (str.startsWith("\\377\\376\\000\\000")) { encoding = "UTF-32LE"; orderMaskOffset = 16; } else if (str.startsWith("\\376\\377")) { encoding = "UTF-16BE"; orderMaskOffset = 8; } else if (str.startsWith("\\377\\376")) { encoding = "UTF-16LE"; orderMaskOffset = 8; } if (encoding == null) { // return default return str; } else { int i = orderMaskOffset, len = str.length(); char c; StringBuilder sb = new StringBuilder(len); while (i < len) { c = str.charAt(i++); if (c == '\\') { if (i + 3 <= len) { //extract octal-code try { c = (char) Integer.parseInt(str.substring(i, i + 3), 8); i += 3; } catch (NumberFormatException e) { if (LOG.isDebugEnabled()) { LOG.debug("PDF metadata exctraction warning: can not decode octal code - " + str.substring(i - 1, i + 3) + ".", e); } } } else { if (LOG.isDebugEnabled()) { LOG.debug("PDF metadata exctraction warning: octal code is not complete - " + str.substring(i - 1, len)); } } } sb.append(c); } byte[] bytes = sb.toString().getBytes(); return new String(bytes, encoding); } } catch (UnsupportedEncodingException e) { LOG.warn("PDF metadata exctraction warning: can not convert metadata string " + str, e); return ""; } } }