List of usage examples for org.apache.pdfbox.pdmodel PDDocument load
public static PDDocument load(byte[] input) throws IOException
From source file:org.mycore.iview2.frontend.MCRPDFTools.java
License:Open Source License
static BufferedImage getThumbnail(Path pdfFile, int thumbnailSize, boolean centered) throws IOException { InputStream fileIS = Files.newInputStream(pdfFile); PDDocument pdf = PDDocument.load(fileIS); try {// w ww .j a va 2 s .co m PDFRenderer pdfRenderer = new PDFRenderer(pdf); BufferedImage level1Image = pdfRenderer.renderImage(0); int imageType = BufferedImage.TYPE_INT_ARGB; if (!centered) { return level1Image; } final double width = level1Image.getWidth(); final double height = level1Image.getHeight(); LOGGER.info("new PDFBox: " + width + "x" + height); LOGGER.info("temporary image dimensions: " + width + "x" + height); final int newWidth = width < height ? (int) Math.ceil(thumbnailSize * width / height) : thumbnailSize; final int newHeight = width < height ? thumbnailSize : (int) Math.ceil(thumbnailSize * height / width); //if centered make thumbnailSize x thumbnailSize image final BufferedImage bicubic = new BufferedImage(centered ? thumbnailSize : newWidth, centered ? thumbnailSize : newHeight, imageType); LOGGER.info("target image dimensions: " + bicubic.getWidth() + "x" + bicubic.getHeight()); final Graphics2D bg = bicubic.createGraphics(); bg.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC); int x = centered ? (thumbnailSize - newWidth) / 2 : 0; int y = centered ? (thumbnailSize - newHeight) / 2 : 0; if (x != 0 && y != 0) { LOGGER.warn("Writing at position " + x + "," + y); } bg.drawImage(level1Image, x, y, x + newWidth, y + newHeight, 0, 0, (int) Math.ceil(width), (int) Math.ceil(height), null); bg.dispose(); return bicubic; } finally { pdf.close(); } }
From source file:org.mycore.media.MCRMediaPDFParser.java
License:Open Source License
/**
 * Parses a PDF file and stores its metadata in a new {@code MCRPDFObject}.
 *
 * <p>The file name, size, containing folder, page count, the media box dimensions of the first
 * page and, if present, the document information entries are recorded.
 *
 * @param file the PDF file to parse
 * @return the populated media object
 * @throws IOException if the file does not exist or cannot be loaded
 * @throws Exception if reading the metadata fails; the original failure is attached as cause
 * @see MCRMediaObject#clone()
 */
public synchronized MCRMediaObject parse(File file) throws Exception {
    if (!file.exists()) {
        throw new IOException("File \"" + file.getName() + "\" doesn't exists!");
    }

    MCRPDFObject media = new MCRPDFObject();
    LOGGER.info("parse " + file.getName() + "...");

    // Loading stays outside the catch below so that load failures still surface as IOException.
    try (PDDocument pdf = PDDocument.load(file)) {
        try {
            media.fileName = file.getName();
            media.fileSize = file.length();
            // Strip only the trailing file name; String.replace would also remove the name
            // from any directory component that happens to contain it.
            String absolutePath = file.getAbsolutePath();
            media.folderName = absolutePath.substring(0, absolutePath.length() - file.getName().length());

            PDPageTree pages = pdf.getDocumentCatalog().getPages();
            media.numPages = pdf.getNumberOfPages();

            // Dimensions are taken from the first page only.
            PDPage page = pages.get(0);
            PDRectangle rect = page.getMediaBox();
            media.width = Math.round(rect.getWidth());
            media.height = Math.round(rect.getHeight());

            PDDocumentInformation info = pdf.getDocumentInformation();
            if (info != null) {
                media.tags = new MCRMediaTagObject();
                media.tags.author = info.getAuthor();
                media.tags.creator = info.getCreator();
                media.tags.producer = info.getProducer();
                media.tags.title = info.getTitle();
                media.tags.subject = info.getSubject();
                media.tags.keywords = info.getKeywords();
            }
        } catch (Exception e) {
            LOGGER.error(e.getMessage(), e);
            // Keep the cause so callers can see the original stack trace.
            throw new Exception(e.getMessage(), e);
        }
    }
    return media;
}
From source file:org.nines.NinesStatementHandlerTest.java
License:Apache License
@Test public void testPdfStrip() { try {// w w w. ja v a2s . co m FileInputStream is = new FileInputStream(new File("test_data/sample.pdf")); PDDocument pdfDoc = PDDocument.load(is); assertEquals(2, pdfDoc.getNumberOfPages()); PDFTextStripper pdfStrip = new PDFTextStripper(); String text = pdfStrip.getText(pdfDoc); assertNotNull(text); System.out.println(text); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
From source file:org.nines.RdfTextSpider.java
License:Apache License
/** * Extract the text from the PDF specified by the URI * @param uri//ww w. j a va 2 s .c om * @return * @throws IOException */ private byte[] scrapeExternalPDF(final String uri) throws IOException { InputStream is = null; GetMethod get = new GetMethod(uri); ; PDDocument pdfDoc = null; try { int result; result = httpClient.executeMethod(get); if (result != 200) { throw new IOException(result + " code returned for URL: " + uri); } is = get.getResponseBodyAsStream(); pdfDoc = PDDocument.load(is); PDFTextStripper pdfStrip = new PDFTextStripper(); return pdfStrip.getText(pdfDoc).getBytes(); } catch (IOException e) { throw e; // just rethrow it } finally { try { get.releaseConnection(); IOUtils.closeQuietly(is); if (pdfDoc != null) { pdfDoc.close(); } } catch (Exception e) { } } }
From source file:org.nuxeo.ecm.core.convert.plugins.tests.PDFEncodingWarn.java
License:Apache License
/**
 * Extracts the text of {@code /test-docs/nutcracker.pdf} with the patched text stripper.
 *
 * <p>The text is extracted twice from the same document, as in the original test.
 * NOTE(review): presumably this checks that repeated extraction works without errors; confirm.
 */
@Test
public void extract() throws IOException {
    URL url = getClass().getResource("/test-docs/nutcracker.pdf");
    PDDocument doc = PDDocument.load(url);
    try {
        PDFTextStripper stripper = new PDF2TextConverter.PatchedPDFTextStripper();
        stripper.getText(doc);
        stripper.getText(doc);
    } finally {
        // The document was previously left open.
        doc.close();
    }
}
From source file:org.nuxeo.ecm.core.convert.plugins.text.extractors.PDF2TextConverter.java
License:Apache License
@Override public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException { PDDocument document = null;/*from w w w . ja v a 2 s . co m*/ File f = null; OutputStream fas = null; try { document = PDDocument.load(blobHolder.getBlob().getStream()); // NXP-1556: if document is protected an IOException will be raised // Instead of catching the exception based on its message string // lets avoid sending messages that will generate this error // code taken from PDFTextStripper.writeText source. // only care about standard encryption and if it was decrypted with // the user password AccessPermission permission = document.getCurrentAccessPermission(); if (permission.canExtractContent()) { PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper(); // use the position information to heuristically organize the // extracted paragraphs. This is also important for // right-to-left languages. textStripper.setSortByPosition(true); String text = textStripper.getText(document); // replace non breaking space by regular spaces (why?) // text = text.replace("\u00a0", " "); f = Framework.createTempFile("pdfboplugin", ".txt"); fas = new FileOutputStream(f); fas.write(text.getBytes("UTF-8")); try (FileInputStream is = new FileInputStream(f)) { Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8"); return new SimpleCachableBlobHolder(blob); } } else { return new SimpleCachableBlobHolder(Blobs.createBlob("")); } } catch (IOException e) { throw new ConversionException("Error during text extraction with PDFBox", e); } finally { if (document != null) { try { document.close(); } catch (IOException e) { log.error("Error while closing PDFBox document", e); } } if (fas != null) { try { fas.close(); } catch (IOException e) { log.error(e); } } if (f != null) { f.delete(); } } }
From source file:org.nuxeo.ecm.platform.convert.tests.BaseConverterTest.java
License:Apache License
public static String readPdfText(File pdfFile) throws IOException { PDFTextStripper textStripper = new PDFTextStripper(); PDDocument document = PDDocument.load(pdfFile); String text = textStripper.getText(document); document.close();// w w w . j av a 2s . com return text.trim(); }
From source file:org.nuxeo.ecm.platform.convert.tests.BaseConverterTest.java
License:Apache License
public static boolean isPDFA(File pdfFile) throws Exception { PDDocument pddoc = PDDocument.load(pdfFile); XMPMetadata xmp = pddoc.getDocumentCatalog().getMetadata().exportXMPMetadata(); Document doc = xmp.getXMPDocument(); // <rdf:Description xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/" // rdf:about=""> // <pdfaid:part>1</pdfaid:part> // <pdfaid:conformance>A</pdfaid:conformance> // </rdf:Description> NodeList list = doc.getElementsByTagName("pdfaid:conformance"); return list != null && "A".equals(list.item(0).getTextContent()); }
From source file:org.nuxeo.ecm.platform.convert.tests.DocumentUTUtils.java
License:Open Source License
/** * Extracts the text from a PDF file./*from w ww . j a va2s. c om*/ * * @return the document content as plain text */ public static String readPdfText(File pdfFile) throws IOException { PDFTextStripper textStripper = new PDFTextStripper(); PDDocument document = PDDocument.load(pdfFile); String text = textStripper.getText(document); document.close(); return text.trim(); }
From source file:org.nuxeo.ecm.platform.picture.core.test.TestMagickExecutors.java
License:Apache License
/**
 * Converts {@code images/test.jpg} to a temporary PDF file and checks that the result has a
 * {@code pdf} extension and can be loaded by PDFBox.
 *
 * <p>The loaded document is closed and the temporary file is deleted even when an assertion
 * or the conversion fails.
 */
@Test
public void testConverterToPDF() throws Exception {
    File file = FileUtils.getResourceFileFromContext("images/test.jpg");
    File out = Framework.createTempFile(TMP_FILE_PREFIX, ".document.pdf");
    try {
        ImageConverter.convert(file.getAbsolutePath(), out.getAbsolutePath());
        assertEquals("pdf", FilenameUtils.getExtension(out.getAbsolutePath()));

        PDDocument doc = PDDocument.load(out);
        try {
            assertNotNull(doc);
        } finally {
            // Guarded so that a failing assertNotNull is not masked by a NullPointerException.
            if (doc != null) {
                doc.close();
            }
        }
    } finally {
        out.delete();
    }
}