Example usage for org.apache.pdfbox.pdmodel PDDocument close

List of usage examples for org.apache.pdfbox.pdmodel PDDocument close

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocument close.

Prototype

@Override
public void close() throws IOException 

Source Link

Document

This will close the underlying COSDocument object.

Usage

From source file:org.mabb.fontverter.pdf.TestPdfFontExtractor.java

License:Open Source License

/**
 * Extracts the fonts of the {@code pdf/brno30.pdf} fixture into the root of the
 * {@code folder} fixture and expects exactly three files, each with a {@code ttf} extension.
 *
 * @throws IOException if the PDF cannot be loaded or the extraction fails
 */
@Test
public void givenPdfWith2Fonts_extractFontsToDir_thenDirectoryHasThreeTtfFiles() throws IOException {
    // try-with-resources closes the document even when the extraction or an assertion fails.
    try (PDDocument doc = PDDocument.load(TestUtils.readTestFile("pdf/brno30.pdf"))) {
        PdfFontExtractor extractor = new PdfFontExtractor();

        File extractDir = folder.getRoot();
        extractor.extractFontsToDir(doc, extractDir);
        File[] fontFiles = extractDir.listFiles();

        Assert.assertEquals(3, fontFiles.length);
        for (File fileOn : fontFiles) {
            Assert.assertEquals("ttf", FilenameUtils.getExtension(fileOn.getPath()));
        }
    }
}

From source file:org.mabb.fontverter.pdf.TestPdfFontExtractor.java

License:Open Source License

/**
 * Extracts the fonts of the {@code pdf/brno30.pdf} fixture with the extractor's format set to
 * {@code WOFF1} and expects exactly three files, each with a {@code woff} extension.
 *
 * @throws IOException if the PDF cannot be loaded or the extraction fails
 */
@Test
public void givenPdfWith2Fonts_extractFontsToDirWithWoff1FormatSet_thenDirectoryHasThreeWoffFiles()
        throws IOException {
    File extractDir = folder.getRoot();

    // try-with-resources closes the document even when the extraction or an assertion fails.
    try (PDDocument doc = PDDocument.load(TestUtils.readTestFile("pdf/brno30.pdf"))) {
        PdfFontExtractor extractor = new PdfFontExtractor();
        extractor.setExtractFormat(FontVerter.FontFormat.WOFF1);

        extractor.extractFontsToDir(doc, extractDir);
        File[] fontFiles = extractDir.listFiles();

        Assert.assertEquals(3, fontFiles.length);
        for (File fileOn : fontFiles) {
            Assert.assertEquals("woff", FilenameUtils.getExtension(fileOn.getPath()));
        }
    }
}

From source file:org.mitre.xtext.converters.PDFConverter.java

License:Apache License

/**
 * Converts a PDF file into a {@link ConvertedDocument}: the extracted text becomes the payload
 * and selected document-information fields are copied over as metadata. Encrypted documents are
 * not decrypted; they are only flagged with the {@code encrypted} property.
 * Implementation is informed by PDFBox authors.
 *
 * @param doc the PDF file to convert
 * @return the converted document; always the instance created for {@code doc}
 * @throws IOException if there is an error loading, parsing or closing the document
 */
@Override
public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {

    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *      http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /*
     * Adapted from LucenePDFDocument.java from the PDFBox lucene project
     * (author: Ben Litchfield <ben@benlitchfield.com>, revision 1.23).
     *
     * The original class built a Lucene document with the fields path, url, contents,
     * summary, modified, uid, CreationDate, Creator, Keywords, ModificationDate, Producer,
     * Subject, Trapped and Encrypted. This adaptation copies only the subset set below.
     */
    final ConvertedDocument converted = new ConvertedDocument(doc);
    PDDocument pdf = null;

    try {
        pdf = PDDocument.load(doc);

        if (pdf.isEncrypted()) {
            // Decryption with the default "" password is deliberately not attempted: the
            // earlier attempt (pdfDocument.decrypt("")) failed with NoClassDefFoundError for
            // org/bouncycastle/jce/provider/BouncyCastleProvider, i.e. it needs the
            // BouncyCastle crypto JARs. The document is flagged and returned without text.
            converted.addProperty("encrypted", "YES");
            return converted;
        }

        // Collect the text content of the whole document.
        final StringWriter extracted = new StringWriter();
        stripper.resetEngine();
        stripper.writeText(pdf, extracted);

        final PDDocumentInformation meta = pdf.getDocumentInformation();
        if (meta != null) {
            converted.addAuthor(meta.getAuthor());
            try {
                converted.addCreateDate(meta.getCreationDate());
            } catch (IOException io) {
                // ignore, bad date but continue with indexing
            }
            converted.addProperty("creator_tool", meta.getCreator());
            converted.addProperty("keywords", meta.getKeywords());
            // ModificationDate, Producer and Trapped are intentionally not copied.
            converted.addProperty("subject", meta.getSubject());

            // Fall back to the file name when the PDF has no title or a placeholder one.
            final String rawTitle = meta.getTitle();
            final boolean placeholderTitle = rawTitle == null || "untitled".equalsIgnoreCase(rawTitle);
            converted.addTitle(placeholderTitle ? converted.filename : rawTitle);

            // TODO: Character set is what?
            converted.setEncoding("UTF-8");
        }

        // The writer is not touched again, so its buffer can be turned into the payload as is.
        converted.setPayload(extracted.getBuffer().toString());
        return converted;

    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:org.mycore.iview2.frontend.MCRPDFTools.java

License:Open Source License

/**
 * Renders the first page of a PDF and, when {@code centered} is set, scales it onto a square
 * {@code thumbnailSize} x {@code thumbnailSize} ARGB canvas, preserving the aspect ratio and
 * centering the page along the shorter side.
 *
 * <p>NOTE(review): when {@code centered} is {@code false} the rendered page is returned at its
 * rendered size and {@code thumbnailSize} is ignored - confirm this is intended.
 *
 * @param pdfFile       path of the PDF to render
 * @param thumbnailSize edge length in pixels of the square result (only used when centered)
 * @param centered      whether to scale and center the page on a square canvas
 * @return the rendered first page, or the centered square thumbnail
 * @throws IOException if the file cannot be read, parsed or rendered
 */
static BufferedImage getThumbnail(Path pdfFile, int thumbnailSize, boolean centered) throws IOException {
    // Both resources are closed in reverse order (document first, then the stream), also on failure.
    // The original never closed the input stream.
    try (InputStream fileIS = Files.newInputStream(pdfFile); PDDocument pdf = PDDocument.load(fileIS)) {
        PDFRenderer pdfRenderer = new PDFRenderer(pdf);
        BufferedImage level1Image = pdfRenderer.renderImage(0);

        if (!centered) {
            return level1Image;
        }

        final double width = level1Image.getWidth();
        final double height = level1Image.getHeight();
        LOGGER.info("new PDFBox: " + width + "x" + height);
        LOGGER.info("temporary image dimensions: " + width + "x" + height);

        // Scale so that the longer side becomes exactly thumbnailSize.
        final int newWidth = width < height ? (int) Math.ceil(thumbnailSize * width / height) : thumbnailSize;
        final int newHeight = width < height ? thumbnailSize : (int) Math.ceil(thumbnailSize * height / width);

        // From here on 'centered' is always true, so the canvas is always square.
        final BufferedImage bicubic = new BufferedImage(thumbnailSize, thumbnailSize,
                BufferedImage.TYPE_INT_ARGB);
        LOGGER.info("target image dimensions: " + bicubic.getWidth() + "x" + bicubic.getHeight());

        final Graphics2D bg = bicubic.createGraphics();
        try {
            bg.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC);
            int x = (thumbnailSize - newWidth) / 2;
            int y = (thumbnailSize - newHeight) / 2;
            // NOTE(review): one of newWidth/newHeight always equals thumbnailSize, so x and y
            // cannot both be non-zero; this warning looks unreachable - confirm the intent.
            if (x != 0 && y != 0) {
                LOGGER.warn("Writing at position " + x + "," + y);
            }
            bg.drawImage(level1Image, x, y, x + newWidth, y + newHeight, 0, 0, (int) Math.ceil(width),
                    (int) Math.ceil(height), null);
        } finally {
            // Release the graphics context even if drawing fails.
            bg.dispose();
        }
        return bicubic;
    }
}

From source file:org.mycore.media.MCRMediaPDFParser.java

License:Open Source License

/**
 * Parses a PDF file and stores its metadata in a new {@link MCRPDFObject}: file name, size and
 * folder, page count, the first page's media box dimensions (rounded) and, when present, the
 * document information tags.
 *
 * @param file the PDF file to parse
 * @return MCRMediaObject
 *              can be held any MCRMediaObject
 * @throws Exception an {@link IOException} if the file does not exist or cannot be loaded;
 *              otherwise an {@code Exception} wrapping (as its cause) whatever failed while
 *              reading the metadata
 * @see MCRMediaObject#clone()
 */
@SuppressWarnings("unchecked")
public synchronized MCRMediaObject parse(File file) throws Exception {
    if (!file.exists()) {
        throw new IOException("File \"" + file.getName() + "\" doesn't exists!");
    }

    MCRPDFObject media = new MCRPDFObject();

    LOGGER.info("parse " + file.getName() + "...");

    PDDocument pdf = PDDocument.load(file);
    try {
        final String fileName = file.getName();
        final String absolutePath = file.getAbsolutePath();

        media.fileName = fileName;
        media.fileSize = file.length();
        // Cut off only the trailing file name. String.replace would also remove earlier
        // occurrences of the name inside the path (e.g. "/data/data" -> "//").
        media.folderName = absolutePath.substring(0, absolutePath.length() - fileName.length());

        PDPageTree pages = pdf.getDocumentCatalog().getPages();

        media.numPages = pdf.getNumberOfPages();

        // Dimensions are taken from the first page only.
        PDPage page = pages.get(0);
        PDRectangle rect = page.getMediaBox();

        media.width = Math.round(rect.getWidth());
        media.height = Math.round(rect.getHeight());

        PDDocumentInformation info = pdf.getDocumentInformation();
        if (info != null) {
            media.tags = new MCRMediaTagObject();
            media.tags.author = info.getAuthor();
            media.tags.creator = info.getCreator();
            media.tags.producer = info.getProducer();
            media.tags.title = info.getTitle();
            media.tags.subject = info.getSubject();
            media.tags.keywords = info.getKeywords();
        }
    } catch (Exception e) {
        // Keep the original exception as the cause so the stack trace is not lost.
        LOGGER.error(e.getMessage(), e);
        throw new Exception(e.getMessage(), e);
    } finally {
        pdf.close();
    }

    return media;
}

From source file:org.nines.RdfTextSpider.java

License:Apache License

/**
 * Extract the text from the PDF specified by the URI.
 * The PDF is fetched with an HTTP GET; any status other than 200 is treated as an error.
 *
 * @param uri the URL of the PDF to download
 * @return the extracted text as bytes in the platform default charset
 *         (NOTE(review): {@code getBytes()} without a charset is platform dependent - confirm
 *         what the callers decode with before pinning it to UTF-8)
 * @throws IOException if the request does not return 200, or the PDF cannot be read or parsed
 */
private byte[] scrapeExternalPDF(final String uri) throws IOException {
    final GetMethod get = new GetMethod(uri);
    InputStream is = null;
    PDDocument pdfDoc = null;
    try {
        final int result = httpClient.executeMethod(get);
        if (result != 200) {
            throw new IOException(result + " code returned for URL: " + uri);
        }
        is = get.getResponseBodyAsStream();
        pdfDoc = PDDocument.load(is);
        PDFTextStripper pdfStrip = new PDFTextStripper();
        return pdfStrip.getText(pdfDoc).getBytes();
    } finally {
        // Cleanup is best effort, and each step is isolated so that a failure in one does not
        // skip the others. Order: document, then its stream, then the connection.
        try {
            if (pdfDoc != null) {
                pdfDoc.close();
            }
        } catch (Exception ignored) {
            // best effort: the text has already been extracted (or a primary error is in flight)
        }
        IOUtils.closeQuietly(is);
        try {
            get.releaseConnection();
        } catch (Exception ignored) {
            // best effort: nothing useful can be done if the connection cannot be released
        }
    }
}

From source file:org.nuxeo.ecm.core.convert.plugins.text.extractors.PDF2TextConverter.java

License:Apache License

/**
 * Extracts the plain text of the PDF held by {@code blobHolder} with PDFBox and returns it as a
 * {@code text/plain} UTF-8 blob. If the document's current access permission does not allow
 * content extraction, an empty blob is returned instead.
 *
 * @param blobHolder holder of the PDF blob to convert
 * @param parameters conversion parameters (not used by this converter)
 * @return a holder for the extracted text, or for an empty blob when extraction is not permitted
 * @throws ConversionException if reading the PDF or writing the extracted text fails
 */
@Override
public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters)
        throws ConversionException {

    java.io.InputStream pdfStream = null;
    PDDocument document = null;
    File f = null;
    try {
        // Keep a handle on the blob stream so it can be closed in the finally block below.
        pdfStream = blobHolder.getBlob().getStream();
        document = PDDocument.load(pdfStream);
        // NXP-1556: if document is protected an IOException will be raised
        // Instead of catching the exception based on its message string
        // lets avoid sending messages that will generate this error
        // code taken from PDFTextStripper.writeText source.
        // only care about standard encryption and if it was decrypted with
        // the user password
        AccessPermission permission = document.getCurrentAccessPermission();
        if (permission.canExtractContent()) {
            PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper();

            // use the position information to heuristically organize the
            // extracted paragraphs. This is also important for
            // right-to-left languages.
            textStripper.setSortByPosition(true);

            String text = textStripper.getText(document);
            // replace non breaking space by regular spaces (why?)
            // text = text.replace("\u00a0", " ");
            f = Framework.createTempFile("pdfboplugin", ".txt");
            // Close the writer before the file is read back.
            try (OutputStream fas = new FileOutputStream(f)) {
                fas.write(text.getBytes("UTF-8"));
            }
            try (FileInputStream is = new FileInputStream(f)) {
                Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8");
                return new SimpleCachableBlobHolder(blob);
            }
        } else {
            return new SimpleCachableBlobHolder(Blobs.createBlob(""));
        }
    } catch (IOException e) {
        throw new ConversionException("Error during text extraction with PDFBox", e);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                log.error("Error while closing PDFBox document", e);
            }
        }
        if (pdfStream != null) {
            try {
                pdfStream.close();
            } catch (IOException e) {
                log.error("Error while closing PDF input stream", e);
            }
        }
        if (f != null) {
            // The temp file is only a staging area; it is removed once this method is done with it.
            f.delete();
        }
    }
}

From source file:org.nuxeo.ecm.platform.convert.tests.BaseConverterTest.java

License:Apache License

/**
 * Extracts the text from a PDF file.
 *
 * @param pdfFile the PDF file to read
 * @return the document content as plain text, with leading and trailing whitespace trimmed
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
public static String readPdfText(File pdfFile) throws IOException {
    PDFTextStripper textStripper = new PDFTextStripper();
    // try-with-resources closes the document even when text extraction fails.
    try (PDDocument document = PDDocument.load(pdfFile)) {
        return textStripper.getText(document).trim();
    }
}

From source file:org.nuxeo.ecm.platform.convert.tests.DocumentUTUtils.java

License:Open Source License

/**
 * Extracts the text from a PDF file.
 *
 * @param pdfFile the PDF file to read
 * @return the document content as plain text, with leading and trailing whitespace trimmed
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
public static String readPdfText(File pdfFile) throws IOException {
    PDFTextStripper textStripper = new PDFTextStripper();
    // try-with-resources closes the document even when text extraction fails.
    try (PDDocument document = PDDocument.load(pdfFile)) {
        return textStripper.getText(document).trim();
    }
}

From source file:org.nuxeo.pdf.PDFMerge.java

License:Open Source License

/**
 * Merge the PDFs. Optionally, can set the title, subject and author of the
 * resulting PDF.
 * <p>
 * <b>Notice</b> for title, author and subject: If the value is null or "",
 * it is just ignored
 * <p>
 * With no blob the result is {@code null}; with a single blob that blob is returned as is
 * (no merge, no info update, file name and mime type untouched).
 *
 * @param inFileName file name of the merged blob; when null or empty, the file name of the
 *            first blob is used
 * @param inTitle title to set on the merged PDF
 * @param inSubject subject to set on the merged PDF
 * @param inAuthor author to set on the merged PDF
 * @return the merged PDF as a blob, the single input blob, or {@code null} when there is
 *         nothing to merge
 * @throws IOException
 * @throws COSVisitorException
 *
 * @since 5.9.5
 */
public Blob merge(String inFileName, String inTitle, String inSubject, String inAuthor)
        throws IOException, COSVisitorException {

    Blob finalBlob = null;

    switch (blobs.size()) {
    case 0:
        finalBlob = null;
        break;

    case 1:
        finalBlob = blobs.get(0);
        break;

    default:
        PDFMergerUtility ut = new PDFMergerUtility();
        for (Blob b : blobs) {
            ut.addSource(b.getStream());
        }

        File tempFile = File.createTempFile("mergepdf", ".pdf");
        // Until the temp file has been handed to Framework.trackFile it is owned by this
        // method; delete it if anything fails before that point so it is not orphaned.
        boolean handedOff = false;
        try {
            ut.setDestinationFileName(tempFile.getAbsolutePath());

            ut.mergeDocuments();

            if (inTitle != null || inAuthor != null || inSubject != null) {
                PDDocument finalDoc = PDDocument.load(tempFile);
                try {
                    PDFUtils.setInfos(finalDoc, inTitle, inSubject, inAuthor);
                    finalDoc.save(tempFile);
                } finally {
                    // Close the document even when setting the infos or saving fails.
                    finalDoc.close();
                }
            }

            finalBlob = new FileBlob(tempFile);
            Framework.trackFile(tempFile, finalBlob);
            handedOff = true;
        } finally {
            if (!handedOff) {
                tempFile.delete();
            }
        }

        if (inFileName != null && !inFileName.isEmpty()) {
            finalBlob.setFilename(inFileName);
        } else {
            finalBlob.setFilename(blobs.get(0).getFilename());
        }
        finalBlob.setMimeType("application/pdf");
        break;

    }

    return finalBlob;
}