List of usage examples for org.apache.pdfbox.pdmodel.graphics.PDXObject#getCOSObject
@Override public final COSStream getCOSObject()
From source file:org.apache.fop.render.pdf.pdfbox.PageParentTreeFinder.java
License:Apache License
private int findXObjectStructParent() throws IOException { int position = -1; Iterable<COSName> mapXObject = srcPage.getResources().getXObjectNames(); for (COSName n : mapXObject) { PDXObject t = srcPage.getResources().getXObject(n); COSDictionary xObjectDict = (COSDictionary) t.getCOSObject(); position = xObjectDict.getInt(COSName.STRUCT_PARENTS); if (position != -1) { return position; }//from w w w. j av a2 s .c om } return position; }
From source file:org.apache.tika.parser.pdf.PDF2XHTMLPureJava.java
License:Apache License
private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException { if (resources == null || config.getExtractInlineImages() == false) { return;/* w w w. j a va2s . c om*/ } for (COSName name : resources.getXObjectNames()) { PDXObject object = null; try { object = resources.getXObject(name); } catch (MissingImageReaderException e) { EmbeddedDocumentUtil.recordException(e, metadata); continue; } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); continue; } if (object == null) { continue; } COSStream cosStream = object.getCOSObject(); if (seenThisPage.contains(cosStream)) { //avoid infinite recursion TIKA-1742 continue; } seenThisPage.add(cosStream); if (object instanceof PDFormXObject) { extractImages(((PDFormXObject) object).getResources(), seenThisPage); } else if (object instanceof PDImageXObject) { PDImageXObject image = (PDImageXObject) object; Metadata embeddedMetadata = new Metadata(); String extension = image.getSuffix(); //TODO remove this next block when upgrading to PDFBox 2.0.5. 
//See: https://issues.apache.org/jira/browse/PDFBOX-3634 if (extension == null) { extension = getJBIG2Suffix(image); } if (extension == null || extension.equals("png")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png"); extension = "png"; } else if (extension.equals("jpg")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); } else if (extension.equals("tiff")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff"); extension = "tif"; } else if (extension.equals("jpx")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2"); } else if (extension.equals("jb2")) { embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/x-jbig2"); } else { //TODO: determine if we need to add more image types // throw new RuntimeException("EXTEN:" + extension); } Integer imageNumber = processedInlineImages.get(cosStream); if (imageNumber == null) { imageNumber = inlineImageCounter++; } String fileName = "image" + imageNumber + "." + extension; embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName); // Output the img tag AttributesImpl attr = new AttributesImpl(); attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName); attr.addAttribute("", "alt", "alt", "CDATA", fileName); xhtml.startElement("img", attr); xhtml.endElement("img"); //Do we only want to process unique COSObject ids? //If so, have we already processed this one? if (config.getExtractUniqueInlineImagesOnly() == true) { if (processedInlineImages.containsKey(cosStream)) { continue; } processedInlineImages.put(cosStream, imageNumber); } embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); try { //TODO: handle image.getMetadata()? 
try { writeToBuffer(image, extension, buffer); } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); continue; } try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) { embeddedDocumentExtractor.parseEmbedded(embeddedIs, new EmbeddedContentHandler(xhtml), embeddedMetadata, false); } } catch (IOException e) { handleCatchableIOE(e); } } } } }