List of usage examples for org.apache.pdfbox.pdmodel PDDocument getDocumentCatalog
public PDDocumentCatalog getDocumentCatalog()
From source file:org.ala.harvester.ExtractPubfSciNamesAndImages.java
License:Apache License
private static void extractSciNameAndImages(PDDocument document) throws IOException { PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true);// ww w . j a v a2 s . c o m Rectangle rect = new Rectangle(10, 60, 275, 20); stripper.addRegion("class1", rect); List allPages = document.getDocumentCatalog().getAllPages(); Writer writer = getSiteMapWriter("anic"); writeColumnHeaders(writer); for (int pageNum = 37; pageNum <= 249; pageNum++) { // for (int pageNum = 156; pageNum <= 156; pageNum++) { PDPage page = (PDPage) allPages.get(pageNum); PDResources resources = page.getResources(); Map images = resources.getImages(); stripper.extractRegions(page); String sciName = stripper.getTextForRegion("class1").trim(); System.out.println("Scientific Name: " + sciName); if (images != null) { Iterator imageIter = images.keySet().iterator(); while (imageIter.hasNext()) { String key = (String) imageIter.next(); PDXObjectImage image = (PDXObjectImage) images.get(key); String name = null; if ("jpg".equals(image.getSuffix())) { name = getUniqueFileName(sciName + "_" + key, image.getSuffix()); System.out.println("Writing image:" + name); image.write2file("/data/tmp/" + name); writer.write(sciName); writer.write(","); writer.write(name + "." + image.getSuffix()); writer.write("\n"); } } } } }
From source file:org.apache.fop.render.pdf.DocumentRootModifierTestCase.java
License:Apache License
/**
 * Checks that {@code DocumentRootModifier.structTreeRootEntriesToCopy} carries
 * RoleMap entries into the target structure tree root, keeps RoleMap entries
 * that were already present, and leaves an existing ClassMap entry retrievable
 * after copying from a loaded document.
 *
 * @throws IOException if the CLASSMAP test resource cannot be loaded or closed
 */
@Test
public void testStructTreeRootEntriesToCopy() throws IOException {
    Rectangle2D r = new Rectangle2D.Double();
    PDFDocument pdfDoc = new PDFDocument("");
    PDFPage page = new PDFPage(new PDFResources(pdfDoc), 0, r, r, r, r);
    page.setObjectNumber(1);
    page.setDocument(pdfDoc);
    pdfDoc.makeStructTreeRoot(null);
    PDFStructTreeRoot structTreeRoot = pdfDoc.getRoot().getStructTreeRoot();
    PDFDictionary rootBaseRoleMap = new PDFDictionary();
    PDFBoxAdapter adapter = new PDFBoxAdapter(page, new HashMap(), new HashMap<Integer, PDFArray>());
    DocumentRootModifier modifier = new DocumentRootModifier(adapter, pdfDoc);

    // Source root carrying a single RoleMap entry: Icon -> Figure.
    COSDictionary root = new COSDictionary();
    COSDictionary mapRole = new COSDictionary();
    mapRole.setName("Icon", "Figure");
    root.setItem(COSName.ROLE_MAP, mapRole);

    modifier.structTreeRootEntriesToCopy(root);
    structTreeRoot = pdfDoc.getRoot().getStructTreeRoot();
    PDFDictionary baseRoot = (PDFDictionary) structTreeRoot.get("RoleMap");
    // JUnit's contract is assertEquals(expected, actual).
    Assert.assertEquals("/Figure", baseRoot.get("Icon").toString());

    // Replace the target RoleMap with one holding MyPara, then copy again.
    PDFName para = new PDFName("P");
    rootBaseRoleMap.put("MyPara", para);
    structTreeRoot.put("RoleMap", rootBaseRoleMap);
    modifier.structTreeRootEntriesToCopy(root);
    structTreeRoot = pdfDoc.getRoot().getStructTreeRoot();
    PDFDictionary baseRoot2 = (PDFDictionary) structTreeRoot.get("RoleMap");
    PDFName nameIcon = (PDFName) baseRoot2.get("Icon");
    PDFName myPara = (PDFName) baseRoot2.get("MyPara");
    Assert.assertEquals("Figure", nameIcon.getName());
    Assert.assertEquals("P", myPara.getName());

    PDDocument doc = PDDocument.load(new File(getClass().getResource(CLASSMAP).getFile()));
    try {
        COSDictionary temp = (COSDictionary) doc.getDocumentCatalog().getStructureTreeRoot().getCOSObject();
        PDFDictionary classMap = new PDFDictionary();
        PDFDictionary inner = new PDFDictionary();
        inner.put("StartIndent", 0);
        classMap.put("Normal2", inner);
        structTreeRoot.put("ClassMap", classMap);
        modifier.structTreeRootEntriesToCopy(temp);
        structTreeRoot = pdfDoc.getRoot().getStructTreeRoot();
        PDFDictionary testDict = (PDFDictionary) structTreeRoot.get("ClassMap");
        Assert.assertNotNull(testDict.get("Normal2"));
    } finally {
        // Release the loaded document even when an assertion above fails.
        doc.close();
    }
}
From source file:org.apache.fop.render.pdf.pdfbox.AbstractPDFBoxHandler.java
License:Apache License
/**
 * Builds the content stream that embeds one page of a source PDF into the
 * target page.
 *
 * <p>The target document's PDF version is raised to the source document's
 * version when the source is newer; if that is rejected with an
 * {@link IllegalStateException}, a version-mismatch event is emitted instead.
 * Encrypted source documents are reported and skipped.
 *
 * @param image the PDF image wrapping the source document
 * @param targetPage the page that receives the content
 * @param userAgent the user agent; may be {@code null}, in which case no event
 *        broadcaster is looked up and logical-structure handling is skipped
 * @param at transform handed to the adapter
 * @param fontinfo font information handed to the adapter
 * @param pos placement rectangle handed to the adapter
 * @param pageNumbers page-number map handed to the adapter
 * @param handler logical structure handler; may be {@code null}
 * @param curentSessionElem structure element handed to the tagged-PDF conductor
 * @return the generated stream, or {@code null} if the source PDF is encrypted
 * @throws IOException if the adapter or the conductor fails
 */
protected String createStreamForPDF(ImagePDF image, PDFPage targetPage, FOUserAgent userAgent,
        AffineTransform at, FontInfo fontinfo, Rectangle pos, Map<Integer, PDFArray> pageNumbers,
        PDFLogicalStructureHandler handler, PDFStructElem curentSessionElem) throws IOException {

    EventBroadcaster eventBroadcaster = null;
    if (userAgent != null) {
        eventBroadcaster = userAgent.getEventBroadcaster();
    }
    String originalImageUri = image.getInfo().getOriginalURI();
    final int selectedPage = ImageUtil.needPageIndexFromURI(originalImageUri);

    PDDocument pddoc = image.getPDDocument();
    float pdfVersion = pddoc.getDocument().getVersion();
    Version inputDocVersion = Version.getValueOf(String.valueOf(pdfVersion));
    PDFDocument pdfDoc = targetPage.getDocument();

    if (pdfDoc.getPDFVersion().compareTo(inputDocVersion) < 0) {
        try {
            pdfDoc.setPDFVersion(inputDocVersion);
        } catch (IllegalStateException e) {
            getEventProducer(eventBroadcaster).pdfVersionMismatch(this,
                    pdfDoc.getPDFVersionString(), String.valueOf(pdfVersion));
        }
    }

    // Encryption test
    if (pddoc.isEncrypted()) {
        getEventProducer(eventBroadcaster).encryptedPdf(this);
        return null;
    }

    // Warn about potential problems with PDF/A and PDF/X
    if (pdfDoc.getProfile().isPDFAActive()) {
        getEventProducer(eventBroadcaster).pdfAActive(this);
    }
    if (pdfDoc.getProfile().isPDFXActive()) {
        getEventProducer(eventBroadcaster).pdfXActive(this);
    }

    // NOTE(review): getObjectCache still receives a possibly-null userAgent;
    // confirm that it tolerates null.
    Map<Object, Object> objectCache = getObjectCache(originalImageUri, userAgent);

    PDPage page = pddoc.getDocumentCatalog().getPages().get(selectedPage);

    if (targetPage.getPDFResources().getParentResources() == null) {
        PDFResources res = pdfDoc.getFactory().makeResources();
        res.setParentResources(pdfDoc.getResources());
        res.addContext(targetPage);
        targetPage.put("Resources", res);
    }

    PDFBoxAdapter adapter = new PDFBoxAdapter(targetPage, objectCache, pageNumbers);
    if (handler != null) {
        adapter.setCurrentMCID(handler.getPageParentTree().length());
    }
    String stream = adapter.createStreamFromPDFBoxPage(pddoc, page, originalImageUri, at, fontinfo, pos);

    // userAgent is treated as optional at the top of this method, so it must
    // not be dereferenced unconditionally here.
    if (userAgent != null && userAgent.isAccessibilityEnabled()) {
        TaggedPDFConductor conductor = new TaggedPDFConductor(curentSessionElem, handler, page, adapter);
        conductor.handleLogicalStructure(pddoc);
    }
    return stream;
}
From source file:org.apache.fop.render.pdf.pdfbox.PageParentTreeFinder.java
License:Apache License
/**
 * Looks up the parent-tree array that belongs to the source page.
 *
 * <p>The key is the page's {@code StructParents} integer. When that reads as
 * -1, {@code findXObjectStructParent()} supplies the key instead. If the key is
 * still -1 an empty array is returned.
 *
 * @param srcDoc the document whose structure tree root holds the parent tree
 * @return the result of traversing the parent tree for the key, or an empty
 *         {@link COSArray} when no key could be determined
 * @throws IOException propagated from the lookup helpers
 */
public COSArray getPageParentTreeArray(PDDocument srcDoc) throws IOException {
    int structParentsKey = srcPage.getCOSObject().getInt(COSName.STRUCT_PARENTS);
    if (structParentsKey == -1) {
        structParentsKey = findXObjectStructParent();
    }
    if (structParentsKey == -1) {
        return new COSArray();
    }
    PDNumberTreeNode parentTree = srcDoc.getDocumentCatalog().getStructureTreeRoot().getParentTree();
    return traverseParentTree(parentTree.getCOSObject(), structParentsKey);
}
From source file:org.apache.fop.render.pdf.pdfbox.PDFBoxAdapter.java
License:Apache License
private void handleAnnotations(PDDocument sourceDoc, PDPage page, AffineTransform at) throws IOException { PDDocumentCatalog srcCatalog = sourceDoc.getDocumentCatalog(); PDAcroForm srcAcroForm = srcCatalog.getAcroForm(); List pageAnnotations = page.getAnnotations(); if (srcAcroForm == null && pageAnnotations.isEmpty()) { return;/*from w w w. j a va 2s .c o m*/ } moveAnnotations(page, pageAnnotations, at); //Pseudo-cache the target page in place of the original source page. //This essentially replaces the original page reference with the target page. COSObject cosPage = null; COSDictionary parentDic = (COSDictionary) page.getCOSObject().getDictionaryObject(COSName.PARENT, COSName.P); COSArray kids = (COSArray) parentDic.getDictionaryObject(COSName.KIDS); for (int i = 0; i < kids.size(); i++) { //Hopefully safe to cast, as kids need to be indirect objects COSObject kid = (COSObject) kids.get(i); if (!pageNumbers.containsKey(i)) { PDFArray a = new PDFArray(); a.add(null); pdfDoc.assignObjectNumber(a); pdfDoc.addTrailerObject(a); pageNumbers.put(i, a); } cacheClonedObject(kid, pageNumbers.get(i)); if (kid.getObject() == page.getCOSObject()) { cosPage = kid; } } if (cosPage == null) { throw new IOException("Illegal PDF. Page not part of parent page node."); } Set<COSObject> fields = copyAnnotations(page); boolean formAlreadyCopied = getCachedClone(srcAcroForm) != null; PDFRoot catalog = this.pdfDoc.getRoot(); PDFDictionary destAcroForm = (PDFDictionary) catalog.get(COSName.ACRO_FORM.getName()); if (formAlreadyCopied) { //skip, already copied } else if (destAcroForm == null) { if (srcAcroForm != null) { //With this, only the first PDF's AcroForm is copied over. If later AcroForms have //different properties besides the actual fields, these get lost. Only fields //get merged. 
Collection exclude = Collections.singletonList(COSName.FIELDS); destAcroForm = (PDFDictionary) cloneForNewDocument(srcAcroForm, srcAcroForm, exclude); } else { //Work-around for incorrectly split PDFs which lack an AcroForm but have widgets //on pages. This doesn't handle the case where field dicts have "C" entries //(for the "CO" entry), so this may produce problems, but we have almost no chance //to guess the calculation order. destAcroForm = new PDFDictionary(pdfDoc.getRoot()); } pdfDoc.registerObject(destAcroForm); catalog.put(COSName.ACRO_FORM.getName(), destAcroForm); } PDFArray clonedFields = (PDFArray) destAcroForm.get(COSName.FIELDS.getName()); if (clonedFields == null) { clonedFields = new PDFArray(); destAcroForm.put(COSName.FIELDS.getName(), clonedFields); } for (COSObject field : fields) { PDFDictionary clone = (PDFDictionary) cloneForNewDocument(field, field, Arrays.asList(COSName.KIDS)); clonedFields.add(clone); } }
From source file:org.apache.fop.render.pdf.pdfbox.PreloaderPDF.java
License:Apache License
private ImageInfo loadPDF(String uri, Source src, ImageContext context) throws IOException, ImageException { int selectedPage = ImageUtil.needPageIndexFromURI(uri); URI docURI = deriveDocumentURI(src.getSystemId()); PDDocument pddoc = getDocument(context, docURI, src); pddoc = Interceptors.getInstance().interceptOnLoad(pddoc, docURI); //Disable the warning about a missing close since we rely on the GC to decide when //the cached PDF shall be disposed off. pddoc.getDocument().setWarnMissingClose(false); int pageCount = pddoc.getNumberOfPages(); if (selectedPage < 0 || selectedPage >= pageCount) { throw new ImageException("Selected page (index: " + selectedPage + ") does not exist in the PDF file. The document has " + pddoc.getNumberOfPages() + " pages."); }// w ww . j a v a 2 s . c o m PDPage page = pddoc.getDocumentCatalog().getPages().get(selectedPage); PDRectangle mediaBox = page.getMediaBox(); PDRectangle cropBox = page.getCropBox(); PDRectangle viewBox = cropBox != null ? cropBox : mediaBox; int w = Math.round(viewBox.getWidth() * 1000); int h = Math.round(viewBox.getHeight() * 1000); //Handle the /Rotation entry on the page dict int rotation = PDFUtil.getNormalizedRotation(page); if (rotation == 90 || rotation == 270) { //Swap width and height int exch = w; w = h; h = exch; } ImageSize size = new ImageSize(); size.setSizeInMillipoints(w, h); size.setResolution(context.getSourceResolution()); size.calcPixelsFromSize(); ImageInfo info = new ImageInfo(uri, ImagePDF.MIME_PDF); info.setSize(size); info.getCustomObjects().put(ImageInfo.ORIGINAL_IMAGE, new ImagePDF(info, pddoc)); int lastPageIndex = pddoc.getNumberOfPages() - 1; if (selectedPage < lastPageIndex) { info.getCustomObjects().put(ImageInfo.HAS_MORE_IMAGES, Boolean.TRUE); } return info; }
From source file:org.apache.fop.render.pdf.pdfbox.TaggedPDFConductor.java
License:Apache License
/**
 * Copies the logical structure of the source document into the current session
 * element, then configures that element.
 *
 * <p>Copying only happens when both {@code isInputPDFTagged} and
 * {@code isStructureTreeRootNull} return true for the document.
 * {@code configureCurrentSessionElem} is invoked in every case.
 *
 * @param srcDoc the source document
 * @throws IOException propagated from the structure-copying helpers
 */
public void handleLogicalStructure(PDDocument srcDoc) throws IOException {
    boolean structureAvailable = isInputPDFTagged(srcDoc) && isStructureTreeRootNull(srcDoc);
    if (structureAvailable) {
        merger.setCurrentSessionElem();
        COSDictionary strucRootDict = srcDoc.getDocumentCatalog().getStructureTreeRoot().getCOSObject();
        rootMod.structTreeRootEntriesToCopy(strucRootDict);

        if (isParentTreeIsPresent(strucRootDict)) {
            PageParentTreeFinder parentFinder = new PageParentTreeFinder(srcPage);
            COSArray markedContentsParents = parentFinder.getPageParentTreeArray(srcDoc);
            COSDictionary roleMap = (COSDictionary) strucRootDict.getDictionaryObject(COSName.ROLE_MAP);
            if (roleMap != null) {
                merger.setRoleMap(roleMap);
            }
            merger.copyStructure(markedContentsParents);
        } else {
            // No parent tree: build the descendants directly from the root dictionary.
            merger.createDirectDescendants(strucRootDict, merger.currentSessionElem);
        }
    }
    configureCurrentSessionElem(srcDoc);
}
From source file:org.apache.fop.render.pdf.pdfbox.TaggedPDFConductor.java
License:Apache License
/**
 * Tells whether the source document declares itself as marked.
 *
 * @param srcDoc the source document
 * @return {@code true} if the catalog has mark info and that mark info reports
 *         marked; {@code false} otherwise
 */
private boolean isInputPDFTagged(PDDocument srcDoc) {
    PDMarkInfo markInfo = srcDoc.getDocumentCatalog().getMarkInfo();
    if (markInfo == null) {
        return false;
    }
    return markInfo.isMarked();
}
From source file:org.apache.fop.render.pdf.pdfbox.TaggedPDFConductor.java
License:Apache License
/**
 * Reports whether the source document's catalog has a structure tree root.
 *
 * <p>NOTE: despite the method name, the result is {@code true} when the
 * structure tree root is <em>present</em> (non-null).
 *
 * @param srcDoc the source document
 * @return {@code true} if the structure tree root is not {@code null}
 */
private boolean isStructureTreeRootNull(PDDocument srcDoc) {
    Object structureTreeRoot = srcDoc.getDocumentCatalog().getStructureTreeRoot();
    return structureTreeRoot != null;
}
From source file:org.apache.fop.render.pdf.PDFBoxAdapterTestCase.java
License:Apache License
private String writeText(FontInfo fi, String pdf) throws IOException { PDDocument doc = getResource(pdf); PDPage page = (PDPage) doc.getDocumentCatalog().getPages().get(0); AffineTransform at = new AffineTransform(); String c = getPDFBoxAdapter().createStreamFromPDFBoxPage(doc, page, pdf, at, fi, new Rectangle()); // PDResources sourcePageResources = page.findResources(); // COSDictionary fonts = (COSDictionary)sourcePageResources.getCOSDictionary().getDictionaryObject(COSName.FONT); // PDFBoxAdapter.PDFWriter w = adapter. new MergeFontsPDFWriter(fonts, fi, "", new ArrayList<COSName>()); // String c = w.writeText(page.getContents()); doc.close();//from w w w.j av a 2s.c o m return c; }