List of usage examples for org.apache.pdfbox.pdmodel PDDocument close
@Override public void close() throws IOException
From source file:org.mabb.fontverter.pdf.TestPdfFontExtractor.java
License:Open Source License
/**
 * Extracts the fonts of {@code pdf/brno30.pdf} into the root of the test's
 * {@code folder} and checks that exactly three files were written, each with
 * a {@code ttf} extension.
 *
 * @throws IOException if the test PDF cannot be loaded, processed or closed
 */
@Test
public void givenPdfWith2Fonts_extractFontsToDir_thenDirectoryHasThreeTtfFiles() throws IOException {
    PDDocument doc = PDDocument.load(TestUtils.readTestFile("pdf/brno30.pdf"));
    // Release the document even when extraction or one of the assertions fails.
    try {
        PdfFontExtractor extractor = new PdfFontExtractor();
        File extractDir = folder.getRoot();
        extractor.extractFontsToDir(doc, extractDir);

        File[] fontFiles = extractDir.listFiles();
        // listFiles() returns null when the directory cannot be read; fail clearly instead of with an NPE.
        Assert.assertNotNull(fontFiles);
        Assert.assertEquals(3, fontFiles.length);
        for (File fileOn : fontFiles) {
            Assert.assertEquals("ttf", FilenameUtils.getExtension(fileOn.getPath()));
        }
    } finally {
        doc.close();
    }
}
From source file:org.mabb.fontverter.pdf.TestPdfFontExtractor.java
License:Open Source License
/**
 * Extracts the fonts of {@code pdf/brno30.pdf} with the extract format set to
 * {@code WOFF1} and checks that exactly three files were written into the root
 * of the test's {@code folder}, each with a {@code woff} extension.
 *
 * @throws IOException if the test PDF cannot be loaded, processed or closed
 */
@Test
public void givenPdfWith2Fonts_extractFontsToDirWithWoff1FormatSet_thenDirectoryHasThreeWoffFiles()
        throws IOException {
    File extractDir = folder.getRoot();
    PDDocument doc = PDDocument.load(TestUtils.readTestFile("pdf/brno30.pdf"));
    // Release the document even when extraction or one of the assertions fails.
    try {
        PdfFontExtractor extractor = new PdfFontExtractor();
        extractor.setExtractFormat(FontVerter.FontFormat.WOFF1);
        extractor.extractFontsToDir(doc, extractDir);

        File[] fontFiles = extractDir.listFiles();
        // listFiles() returns null when the directory cannot be read; fail clearly instead of with an NPE.
        Assert.assertNotNull(fontFiles);
        Assert.assertEquals(3, fontFiles.length);
        for (File fileOn : fontFiles) {
            Assert.assertEquals("woff", FilenameUtils.getExtension(fileOn.getPath()));
        }
    } finally {
        doc.close();
    }
}
From source file:org.mitre.xtext.converters.PDFConverter.java
License:Apache License
/** Implementation is informed by PDFBox authors. *///w w w . j a v a 2s .c o m @Override public synchronized ConvertedDocument convert(java.io.File doc) throws IOException { /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Adapted from LucenePDFDocument.java from PDFBox lucene project * * This class is used to create a document for the lucene search engine. * This should easily plug into the IndexHTML or IndexFiles that comes with * the lucene project. This class will populate the following fields. 
* <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr> <tr> * <td>path</td> <td>File system path if loaded from a file</td> </tr> <tr> * <td>url</td> <td>URL to PDF document</td> </tr> <tr> <td>contents</td> * <td>Entire contents of PDF document, indexed but not stored</td> </tr> * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr> <tr> * <td>modified</td> <td>The modified date/time according to the url or * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the Lucene * document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>Creator</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>Keywords</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>ModificationDate</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Producer</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Subject</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Trapped</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Encrypted</td> <td>From PDF * meta-data if available</td> </tr> </table> * * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> * @version $Revision: 1.23 $ * * @throws IOException If there is an error parsing the document. */ PDDocument pdfDocument = null; ConvertedDocument textdoc = new ConvertedDocument(doc); try { pdfDocument = PDDocument.load(doc); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on /** * * Exception in thread "main" java.lang.NoClassDefFoundError: * org/bouncycastle/jce/provider/BouncyCastleProvider at * org.apache.pdfbox.pdmodel.PDDocument.openProtection(PDDocument.java:1090) * at * org.apache.pdfbox.pdmodel.PDDocument.decrypt(PDDocument.java:594) * * CRYPTO stuff -- load BouncyCastle crypto JAR files. 
try { * pdfDocument.decrypt(""); } catch (CryptographyException e) { * throw new IOException("Error decrypting document(" + pdf_file * + "): " + e); } catch (InvalidPasswordException e) { //they * didn't suppply a password and the default of "" was wrong. * throw new IOException( "Error: The document(" + pdf_file + ") * is encrypted "); } finally { if (pdfDocument != null) { * pdfDocument.close();} } */ textdoc.addProperty("encrypted", "YES"); } else { //create a writer where to append the text content. StringWriter writer = new StringWriter(); stripper.resetEngine(); stripper.writeText(pdfDocument, writer); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { textdoc.addAuthor(info.getAuthor()); try { textdoc.addCreateDate(info.getCreationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } textdoc.addProperty("creator_tool", info.getCreator()); textdoc.addProperty("keywords", info.getKeywords()); /* try { metadata.add("ModificationDate", info.getModificationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } */ //metadata.add("Producer", info.getProducer()); textdoc.addProperty("subject", info.getSubject()); String ttl = info.getTitle(); if (ttl == null || "untitled".equalsIgnoreCase(ttl)) { ttl = textdoc.filename; } textdoc.addTitle(ttl); // metadata.add("Trapped", info.getTrapped()); // TODO: Character set is what? textdoc.setEncoding("UTF-8"); } // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. textdoc.setPayload(writer.getBuffer().toString()); } return textdoc; } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:org.mycore.iview2.frontend.MCRPDFTools.java
License:Open Source License
static BufferedImage getThumbnail(Path pdfFile, int thumbnailSize, boolean centered) throws IOException { InputStream fileIS = Files.newInputStream(pdfFile); PDDocument pdf = PDDocument.load(fileIS); try {/*from w ww .ja v a 2 s . c o m*/ PDFRenderer pdfRenderer = new PDFRenderer(pdf); BufferedImage level1Image = pdfRenderer.renderImage(0); int imageType = BufferedImage.TYPE_INT_ARGB; if (!centered) { return level1Image; } final double width = level1Image.getWidth(); final double height = level1Image.getHeight(); LOGGER.info("new PDFBox: " + width + "x" + height); LOGGER.info("temporary image dimensions: " + width + "x" + height); final int newWidth = width < height ? (int) Math.ceil(thumbnailSize * width / height) : thumbnailSize; final int newHeight = width < height ? thumbnailSize : (int) Math.ceil(thumbnailSize * height / width); //if centered make thumbnailSize x thumbnailSize image final BufferedImage bicubic = new BufferedImage(centered ? thumbnailSize : newWidth, centered ? thumbnailSize : newHeight, imageType); LOGGER.info("target image dimensions: " + bicubic.getWidth() + "x" + bicubic.getHeight()); final Graphics2D bg = bicubic.createGraphics(); bg.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC); int x = centered ? (thumbnailSize - newWidth) / 2 : 0; int y = centered ? (thumbnailSize - newHeight) / 2 : 0; if (x != 0 && y != 0) { LOGGER.warn("Writing at position " + x + "," + y); } bg.drawImage(level1Image, x, y, x + newWidth, y + newHeight, 0, 0, (int) Math.ceil(width), (int) Math.ceil(height), null); bg.dispose(); return bicubic; } finally { pdf.close(); } }
From source file:org.mycore.media.MCRMediaPDFParser.java
License:Open Source License
/**
 * Parses a PDF file and stores its metadata in a new {@code MCRPDFObject}:
 * file name, size and folder, page count, the media box size of the first
 * page and, when the PDF has a document information dictionary, author,
 * creator, producer, title, subject and keywords.
 *
 * @param file the PDF file to parse
 * @return the populated media object
 * @throws Exception if the file does not exist ({@code IOException}), or if
 *         reading the PDF fails; in the latter case the original exception is
 *         attached as the cause
 * @see MCRMediaObject#clone()
 */
@SuppressWarnings("unchecked")
public synchronized MCRMediaObject parse(File file) throws Exception {
    if (!file.exists())
        throw new IOException("File \"" + file.getName() + "\" doesn't exists!");

    MCRPDFObject media = new MCRPDFObject();

    LOGGER.info("parse " + file.getName() + "...");

    PDDocument pdf = PDDocument.load(file);
    try {
        media.fileName = file.getName();
        media.fileSize = file.length();
        media.folderName = (file.getAbsolutePath()).replace(file.getName(), "");

        PDPageTree pages = pdf.getDocumentCatalog().getPages();

        media.numPages = pdf.getNumberOfPages();

        // Page dimensions are taken from the media box of the first page.
        PDPage page = (PDPage) pages.get(0);
        PDRectangle rect = page.getMediaBox();

        media.width = Math.round(rect.getWidth());
        media.height = Math.round(rect.getHeight());

        PDDocumentInformation info = pdf.getDocumentInformation();
        if (info != null) {
            media.tags = new MCRMediaTagObject();
            media.tags.author = info.getAuthor();
            media.tags.creator = info.getCreator();
            media.tags.producer = info.getProducer();
            media.tags.title = info.getTitle();
            media.tags.subject = info.getSubject();
            media.tags.keywords = info.getKeywords();
        }
    } catch (Exception e) {
        // Keep the stack trace in the log and chain the cause so callers can diagnose the failure.
        LOGGER.error(e.getMessage(), e);
        throw new Exception(e.getMessage(), e);
    } finally {
        pdf.close();
    }

    return media;
}
From source file:org.nines.RdfTextSpider.java
License:Apache License
/** * Extract the text from the PDF specified by the URI * @param uri// w w w . ja v a 2s .c o m * @return * @throws IOException */ private byte[] scrapeExternalPDF(final String uri) throws IOException { InputStream is = null; GetMethod get = new GetMethod(uri); ; PDDocument pdfDoc = null; try { int result; result = httpClient.executeMethod(get); if (result != 200) { throw new IOException(result + " code returned for URL: " + uri); } is = get.getResponseBodyAsStream(); pdfDoc = PDDocument.load(is); PDFTextStripper pdfStrip = new PDFTextStripper(); return pdfStrip.getText(pdfDoc).getBytes(); } catch (IOException e) { throw e; // just rethrow it } finally { try { get.releaseConnection(); IOUtils.closeQuietly(is); if (pdfDoc != null) { pdfDoc.close(); } } catch (Exception e) { } } }
From source file:org.nuxeo.ecm.core.convert.plugins.text.extractors.PDF2TextConverter.java
License:Apache License
@Override public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException { PDDocument document = null; File f = null;/*from w w w .ja va2s . co m*/ OutputStream fas = null; try { document = PDDocument.load(blobHolder.getBlob().getStream()); // NXP-1556: if document is protected an IOException will be raised // Instead of catching the exception based on its message string // lets avoid sending messages that will generate this error // code taken from PDFTextStripper.writeText source. // only care about standard encryption and if it was decrypted with // the user password AccessPermission permission = document.getCurrentAccessPermission(); if (permission.canExtractContent()) { PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper(); // use the position information to heuristically organize the // extracted paragraphs. This is also important for // right-to-left languages. textStripper.setSortByPosition(true); String text = textStripper.getText(document); // replace non breaking space by regular spaces (why?) // text = text.replace("\u00a0", " "); f = Framework.createTempFile("pdfboplugin", ".txt"); fas = new FileOutputStream(f); fas.write(text.getBytes("UTF-8")); try (FileInputStream is = new FileInputStream(f)) { Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8"); return new SimpleCachableBlobHolder(blob); } } else { return new SimpleCachableBlobHolder(Blobs.createBlob("")); } } catch (IOException e) { throw new ConversionException("Error during text extraction with PDFBox", e); } finally { if (document != null) { try { document.close(); } catch (IOException e) { log.error("Error while closing PDFBox document", e); } } if (fas != null) { try { fas.close(); } catch (IOException e) { log.error(e); } } if (f != null) { f.delete(); } } }
From source file:org.nuxeo.ecm.platform.convert.tests.BaseConverterTest.java
License:Apache License
/**
 * Extracts the text from a PDF file.
 *
 * @param pdfFile the PDF file to read
 * @return the document content as plain text, with leading and trailing whitespace trimmed
 * @throws IOException if the file cannot be loaded, parsed or closed
 */
public static String readPdfText(File pdfFile) throws IOException {
    PDFTextStripper textStripper = new PDFTextStripper();
    // try-with-resources closes the document even when text extraction fails.
    try (PDDocument document = PDDocument.load(pdfFile)) {
        return textStripper.getText(document).trim();
    }
}
From source file:org.nuxeo.ecm.platform.convert.tests.DocumentUTUtils.java
License:Open Source License
/**
 * Extracts the text from a PDF file.
 *
 * @param pdfFile the PDF file to read
 * @return the document content as plain text, with leading and trailing whitespace trimmed
 * @throws IOException if the file cannot be loaded, parsed or closed
 */
public static String readPdfText(File pdfFile) throws IOException {
    PDFTextStripper textStripper = new PDFTextStripper();
    // try-with-resources closes the document even when text extraction fails.
    try (PDDocument document = PDDocument.load(pdfFile)) {
        return textStripper.getText(document).trim();
    }
}
From source file:org.nuxeo.pdf.PDFMerge.java
License:Open Source License
/**
 * Merges the PDFs. Optionally sets the title, subject and author of the
 * resulting PDF.
 * <p>
 * With no blob to merge the result is {@code null}; with a single blob that
 * blob is returned unchanged. With two or more blobs they are merged into a
 * temporary file which backs the returned blob.
 * <p>
 * <b>Notice</b> for title, author and subject: If the value is null or "",
 * it is just ignored
 *
 * @param inFileName file name of the merged blob; when null or empty, the
 *        file name of the first source blob is used
 * @param inTitle title to set on the merged PDF, may be null
 * @param inSubject subject to set on the merged PDF, may be null
 * @param inAuthor author to set on the merged PDF, may be null
 * @return the merged blob, the single source blob, or {@code null} when
 *         there is nothing to merge
 * @throws IOException
 * @throws COSVisitorException
 *
 * @since 5.9.5
 */
public Blob merge(String inFileName, String inTitle, String inSubject, String inAuthor)
        throws IOException, COSVisitorException {

    Blob finalBlob = null;

    switch (blobs.size()) {
    case 0:
        finalBlob = null;
        break;

    case 1:
        finalBlob = blobs.get(0);
        break;

    default:
        PDFMergerUtility ut = new PDFMergerUtility();
        for (Blob b : blobs) {
            ut.addSource(b.getStream());
        }

        File tempFile = File.createTempFile("mergepdf", ".pdf");
        ut.setDestinationFileName(tempFile.getAbsolutePath());
        ut.mergeDocuments();

        if (inTitle != null || inAuthor != null || inSubject != null) {
            PDDocument finalDoc = PDDocument.load(tempFile);
            // Close the document even when updating the infos or saving fails.
            try {
                PDFUtils.setInfos(finalDoc, inTitle, inSubject, inAuthor);
                finalDoc.save(tempFile);
            } finally {
                finalDoc.close();
            }
        }

        finalBlob = new FileBlob(tempFile);
        Framework.trackFile(tempFile, finalBlob);

        if (inFileName != null && !inFileName.isEmpty()) {
            finalBlob.setFilename(inFileName);
        } else {
            finalBlob.setFilename(blobs.get(0).getFilename());
        }
        finalBlob.setMimeType("application/pdf");
        break;

    }

    return finalBlob;
}