PDFConverter.java Source code

Introduction

Here is the source code for PDFConverter.java
Source

/**
 *
 * Copyright 2009-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * **************************************************************************
 * NOTICE This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 */
//package org.opensextant.xtext.converters;

import java.io.IOException;
import java.io.StringWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;
import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.iConvert;

/**
 * Retired PDF converter built directly on PDFBox.
 *
 * <p>Extracts the text of a PDF file, plus a handful of document-information
 * fields (author, creation date, creator tool, keywords, subject, title),
 * into a {@link ConvertedDocument}.
 *
 * <p>One {@link PDFTextStripper} is reused across calls. Both {@code convert}
 * methods are {@code synchronized}, so conversions on a single instance run
 * one at a time.
 *
 * @deprecated Tika's PDF parser does a better job. The extra metadata fields
 * grabbed here are not that useful, and handling of encrypted PDF documents
 * is still suspect.
 *
 * @author Marc C. Ubaldino, MITRE, ubaldino at mitre dot org
 */
@Deprecated
public class PDFConverter implements iConvert {

    /** Reusable text-extraction engine; reset before every document. */
    private final PDFTextStripper stripper;

    /**
     * Initialize a reusable PDF engine.
     *
     * @throws java.io.IOException if the PDFBox text stripper cannot be created
     */
    public PDFConverter() throws IOException {
        stripper = new PDFTextStripper();
    }

    /**
     * Not supported: a PDF cannot be converted from a text blob.
     *
     * @param data ignored
     * @return never returns normally
     * @throws IOException always; callers must use {@link #convert(java.io.File)}
     */
    @Override
    public synchronized ConvertedDocument convert(String data) throws IOException {
        throw new IOException("PDF conversion as text blob is not supported here.  Send a File obj");
    }

    /**
     * Converts a PDF file to text and captures basic document metadata.
     *
     * <p>Implementation is informed by PDFBox authors (see the attribution
     * inside the method body).
     *
     * <p>The title falls back to the converted document's file name when the
     * PDF has no title or the title is "untitled" (case-insensitive). The
     * encoding is always reported as UTF-8.
     *
     * @param doc the PDF file to convert
     * @return the converted document holding the extracted text and metadata
     * @throws IOException if the file cannot be loaded or its text cannot be extracted
     */
    @Override
    public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {

        /*
         * Licensed to the Apache Software Foundation (ASF) under one or more
         * contributor license agreements.  See the NOTICE file distributed with
         * this work for additional information regarding copyright ownership.
         * The ASF licenses this file to You under the Apache License, Version 2.0
         * (the "License"); you may not use this file except in compliance with
         * the License.  You may obtain a copy of the License at
         *
         *      http://www.apache.org/licenses/LICENSE-2.0
         *
         * Unless required by applicable law or agreed to in writing, software
         * distributed under the License is distributed on an "AS IS" BASIS,
         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         * See the License for the specific language governing permissions and
         * limitations under the License.
         */
        /*
         * Adapted from LucenePDFDocument.java from the PDFBox lucene project
         * (author Ben Litchfield <ben@benlitchfield.com>, revision 1.23).
         * The original class built a Lucene document; only the text extraction
         * and a subset of the PDF meta-data handling are retained here.
         */
        PDDocument pdfDocument = null;
        ConvertedDocument textdoc = new ConvertedDocument(doc);

        try {
            pdfDocument = PDDocument.load(doc);

            if (pdfDocument.isEncrypted()) {
                // No explicit decrypt call is made here: the document is only
                // flagged, and extraction is attempted anyway. If that fails the
                // exception propagates to the caller.
                // NOTE(review): presumably PDFBox tries the default (empty)
                // password during extraction -- confirm for the version in use.
                textdoc.addProperty("encrypted", "YES");
            }

            // Collect the extracted text in memory.
            StringWriter writer = new StringWriter();
            stripper.resetEngine();
            stripper.writeText(pdfDocument, writer);

            PDDocumentInformation info = pdfDocument.getDocumentInformation();
            String title = null;
            if (info != null) {
                copyMetadata(info, textdoc);
                title = info.getTitle();
            }

            // Title and encoding do not depend on the information dictionary
            // being present, so they are set unconditionally.
            if (title == null || "untitled".equalsIgnoreCase(title)) {
                title = textdoc.filename;
            }
            textdoc.addTitle(title);

            // TODO: Character set is what?
            textdoc.setEncoding("UTF-8");

            textdoc.setText(writer.toString());

            return textdoc;

        } finally {
            if (pdfDocument != null) {
                pdfDocument.close();
            }
        }
    }

    /**
     * Copies author, creation date, creator tool, keywords and subject from the
     * PDF information dictionary onto the converted document.
     *
     * <p>ModificationDate, Producer and Trapped are intentionally not captured.
     *
     * @param info    PDF document information; must not be null
     * @param textdoc target document receiving the metadata
     * @throws IOException declared so that any I/O failure raised outside the
     *         creation-date handling propagates to the caller, as it did when
     *         this logic was inline
     */
    private static void copyMetadata(PDDocumentInformation info, ConvertedDocument textdoc)
            throws IOException {
        textdoc.addAuthor(info.getAuthor());
        try {
            textdoc.addCreateDate(info.getCreationDate());
        } catch (IOException ignored) {
            // Deliberate best-effort: a bad date must not abort the conversion.
        }
        textdoc.addProperty("creator_tool", info.getCreator());
        textdoc.addProperty("keywords", info.getKeywords());
        textdoc.addProperty("subject", info.getSubject());
    }
}