List of usage examples for org.apache.pdfbox.pdmodel PDResources getXObject
public PDXObject getXObject(COSName name) throws IOException
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/** * This method extracts all the embedded images of the pdf document */// w ww. ja v a2 s .c o m private List<String> extractimages(String file, int startPage, int endPage) { logger.info("file : " + file); logger.info("startPage : " + startPage); logger.info("endPage : " + endPage); ArrayList<String> imgNames = new ArrayList<String>(); boolean bImageFound = false; try { this.createImageDestinationDirectory(file); String fileName = this.getFileName(file).replace(".pdf", "_resource"); PDDocument document = PDDocument.load(new File(file)); PDPageTree list = document.getPages(); this.updateStartAndEndPages(file, startPage, endPage); int totalImages = 1; for (int iPage = this.startPage - 1; iPage < this.endPage; iPage++) { logger.info("Page No : " + (iPage + 1)); PDResources pdResources = list.get(iPage).getResources(); for (COSName c : pdResources.getXObjectNames()) { PDXObject o = pdResources.getXObject(c); if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) { bImageFound = true; String fname = this.imageDestinationPath + "/" + fileName + "_" + totalImages + ".png"; ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(), "png", new File(fname)); imgNames.add(fname); totalImages++; } } } document.close(); if (bImageFound) logger.info("Images are saved @ " + this.imageDestinationPath); else logger.info("No images were found in the PDF"); } catch (Exception e) { e.printStackTrace(); } return imgNames; }
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
private void extractImages(PDResources resources) throws SAXException, IOException { if (resources == null || config.getExtractInlineImages() == false) { return;/*from w w w.ja v a 2 s . c o m*/ } Iterable<COSName> cosIterable = resources.getXObjectNames(); if (cosIterable == null) { return; } for (COSName name : cosIterable) { PDXObject object = resources.getXObject(name); if (object instanceof PDFormXObject) { extractImages(((PDFormXObject) object).getResources()); } else if (object instanceof PDImageXObject) { PDImageXObject image = (PDImageXObject) object; Metadata metadata = new Metadata(); String extension = ""; if ("jpg".equalsIgnoreCase(image.getSuffix())) { metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); extension = ".jpg"; } else if ("tiff".equalsIgnoreCase(image.getSuffix())) { metadata.set(Metadata.CONTENT_TYPE, "image/tiff"); extension = ".tif"; } else if ("jpx".equalsIgnoreCase(image.getSuffix())) { metadata.set(Metadata.CONTENT_TYPE, "image/jpx"); extension = ".jpx"; } else if ("png".equalsIgnoreCase(image.getSuffix())) { metadata.set(Metadata.CONTENT_TYPE, "image/png"); extension = ".png"; } Integer imageNumber = processedInlineImages.get(name.getName()); if (imageNumber == null) { imageNumber = inlineImageCounter++; } String fileName = "image" + imageNumber + extension; metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); // Output the img tag AttributesImpl attr = new AttributesImpl(); attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName); attr.addAttribute("", "alt", "alt", "CDATA", fileName); handler.startElement("img", attr); handler.endElement("img"); //Do we only want to process unique COSObject ids? //If so, have we already processed this one? 
if (config.getExtractUniqueInlineImagesOnly() == true) { String cosObjectId = name.getName(); if (processedInlineImages.containsKey(cosObjectId)) { continue; } processedInlineImages.put(cosObjectId, imageNumber); } metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); if (extractor.shouldParseEmbedded(metadata)) { try { extractor.parseEmbedded(image.getCOSStream().getFilteredStream(), new EmbeddedContentHandler(handler), metadata, false); } catch (IOException e) { // could not extract this image, so just skip it... } } } } }
From source file:org.apache.tika.parser.pdf.PDF2XHTMLPureJava.java
License:Apache License
private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException { if (resources == null || config.getExtractInlineImages() == false) { return;/*from w w w . j a v a 2 s . c om*/ } for (COSName name : resources.getXObjectNames()) { PDXObject object = null; try { object = resources.getXObject(name); } catch (MissingImageReaderException e) { EmbeddedDocumentUtil.recordException(e, metadata); continue; } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); continue; } if (object == null) { continue; } COSStream cosStream = object.getCOSObject(); if (seenThisPage.contains(cosStream)) { //avoid infinite recursion TIKA-1742 continue; } seenThisPage.add(cosStream); if (object instanceof PDFormXObject) { extractImages(((PDFormXObject) object).getResources(), seenThisPage); } else if (object instanceof PDImageXObject) { PDImageXObject image = (PDImageXObject) object; Metadata embeddedMetadata = new Metadata(); String extension = image.getSuffix(); //TODO remove this next block when upgrading to PDFBox 2.0.5. 
//See: https://issues.apache.org/jira/browse/PDFBOX-3634 if (extension == null) { extension = getJBIG2Suffix(image); } if (extension == null || extension.equals("png")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png"); extension = "png"; } else if (extension.equals("jpg")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); } else if (extension.equals("tiff")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff"); extension = "tif"; } else if (extension.equals("jpx")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2"); } else if (extension.equals("jb2")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/x-jbig2"); } else { //TODO: determine if we need to add more image types // throw new RuntimeException("EXTEN:" + extension); } Integer imageNumber = processedInlineImages.get(cosStream); if (imageNumber == null) { imageNumber = inlineImageCounter++; } String fileName = "image" + imageNumber + "." + extension; embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName); // Output the img tag AttributesImpl attr = new AttributesImpl(); attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName); attr.addAttribute("", "alt", "alt", "CDATA", fileName); xhtml.startElement("img", attr); xhtml.endElement("img"); //Do we only want to process unique COSObject ids? //If so, have we already processed this one? if (config.getExtractUniqueInlineImagesOnly() == true) { if (processedInlineImages.containsKey(cosStream)) { continue; } processedInlineImages.put(cosStream, imageNumber); } embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); try { //TODO: handle image.getMetadata()? 
try { writeToBuffer(image, extension, buffer); } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); continue; } try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) { embeddedDocumentExtractor.parseEmbedded(embeddedIs, new EmbeddedContentHandler(xhtml), embeddedMetadata, false); } } catch (IOException e) { handleCatchableIOE(e); } } } } }
From source file:org.example.extractimagesfrompdfpages.ExtractImagesFromPDFPagesMain.java
public static void main(String[] args) { try {// w ww . ja v a 2 s. c om File thePDFFile = new File(args[0]); PDDocument document = PDDocument.load(thePDFFile); PDPageTree list = document.getPages(); int i = 1; for (PDPage page : list) { Boolean alreadyCreatedFolderForThisPage = false; File thePDFFileDirectory = thePDFFile.getParentFile(); File thePDFPageFolder = new File(thePDFFileDirectory.getAbsolutePath() + "/temp_images" + "/" + i); PDResources pdResources = page.getResources(); int j = 1; for (COSName c : pdResources.getXObjectNames()) { PDXObject o = pdResources.getXObject(c); if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) { if (alreadyCreatedFolderForThisPage == false) { thePDFPageFolder.mkdirs(); alreadyCreatedFolderForThisPage = true; } File file = new File(thePDFPageFolder.getAbsolutePath() + "/" + j + ".png"); ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(), "png", file); System.out.println(thePDFPageFolder.getAbsolutePath() + "/" + j + ".png"); j++; } } i++; } } catch (IOException ex) { Logger.getLogger(ExtractImagesFromPDFPagesMain.class.getName()).log(Level.SEVERE, null, ex); throw new RuntimeException(ex); } }
From source file:org.fit.pdfdom.PDFBoxTree.java
License:Open Source License
private void processFontResources(PDResources resources, FontTable table) throws IOException { String fontNotSupportedMessage = "Font: {} skipped because type '{}' is not supported."; for (COSName key : resources.getFontNames()) { PDFont font = resources.getFont(key); if (font instanceof PDTrueTypeFont) { table.addEntry(font.getName(), font.getFontDescriptor()); log.debug("Font: " + font.getName() + " TTF"); } else if (font instanceof PDType0Font) { PDCIDFont descendantFont = ((PDType0Font) font).getDescendantFont(); if (descendantFont instanceof PDCIDFontType2) table.addEntry(font.getName(), descendantFont.getFontDescriptor()); else//w w w . j a v a2s .c o m log.warn(fontNotSupportedMessage, font.getName(), font.getClass().getSimpleName()); } else if (font instanceof PDType1CFont) table.addEntry(font.getName(), font.getFontDescriptor()); else log.warn(fontNotSupportedMessage, font.getName(), font.getClass().getSimpleName()); } for (COSName name : resources.getXObjectNames()) { PDXObject xobject = resources.getXObject(name); if (xobject instanceof PDFormXObject) { PDFormXObject xObjectForm = (PDFormXObject) xobject; PDResources formResources = xObjectForm.getResources(); if (formResources != null) processFontResources(formResources, table); } } }
From source file:org.mabb.fontverter.pdf.PdfFontExtractor.java
License:Open Source License
/**
 * Passes every font declared in {@code resources} to the extract strategy, then does the same
 * for the resources of each form XObject found in {@code resources}.
 *
 * @param resources the resource dictionary to scan
 * @throws IOException if a font or XObject cannot be read
 */
private void extractFontResources(PDResources resources) throws IOException {
    for (COSName fontKey : resources.getFontNames()) {
        PDFont declaredFont = resources.getFont(fontKey);
        extractStrategy.extract(declaredFont);
    }

    for (COSName xObjectKey : resources.getXObjectNames()) {
        PDXObject candidate = resources.getXObject(xObjectKey);
        if (!(candidate instanceof PDFormXObject)) {
            continue;
        }
        // Only descend when the form actually carries its own resources.
        PDResources nestedResources = ((PDFormXObject) candidate).getResources();
        if (nestedResources != null) {
            extractFontResources(nestedResources);
        }
    }
}
From source file:org.xwiki.test.misc.PDFTest.java
License:Open Source License
/**
 * Loads the PDF found at {@code url} and collects the image XObjects declared in the resources
 * of its pages.
 *
 * @param url location of the PDF document
 * @return the images keyed by XObject name; an image on a later page replaces an earlier one
 *         that has the same name
 * @throws Exception if the document cannot be downloaded, parsed or closed
 */
private Map<String, PDImageXObject> getImages(URL url) throws Exception {
    Map<String, PDImageXObject> imagesByName = new HashMap<>();
    PDDocument pdf = PDDocument.load(IOUtils.toByteArray(url));
    try {
        for (PDPage currentPage : pdf.getDocumentCatalog().getPages()) {
            PDResources pageResources = currentPage.getResources();
            for (COSName xObjectName : pageResources.getXObjectNames()) {
                if (!pageResources.isImageXObject(xObjectName)) {
                    continue;
                }
                PDImageXObject image = (PDImageXObject) pageResources.getXObject(xObjectName);
                imagesByName.put(xObjectName.getName(), image);
            }
        }
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
    return imagesByName;
}