List of usage examples for org.apache.pdfbox.pdmodel PDDocumentCatalog getNames
public PDDocumentNameDictionary getNames()
From source file:com.fangxin365.core.utils.PDFMerger.java
License:Apache License
/** * append all pages from source to destination. * /*from w w w .j ava 2 s . c o m*/ * @param destination * the document to receive the pages * @param source * the document originating the new pages * * @throws IOException * If there is an error accessing data from either document. */ public void appendDocument(PDDocument destination, PDDocument source) throws IOException { if (destination.isEncrypted()) { System.out.println("Error: destination PDF is encrypted, can't append encrypted PDF documents."); } if (source.isEncrypted()) { System.out.println("Error: source PDF is encrypted, can't append encrypted PDF documents."); } PDDocumentInformation destInfo = destination.getDocumentInformation(); PDDocumentInformation srcInfo = source.getDocumentInformation(); destInfo.getDictionary().mergeInto(srcInfo.getDictionary()); PDDocumentCatalog destCatalog = destination.getDocumentCatalog(); PDDocumentCatalog srcCatalog = source.getDocumentCatalog(); // use the highest version number for the resulting pdf float destVersion = destination.getDocument().getVersion(); float srcVersion = source.getDocument().getVersion(); if (destVersion < srcVersion) { destination.getDocument().setVersion(srcVersion); } if (destCatalog.getOpenAction() == null) { destCatalog.setOpenAction(srcCatalog.getOpenAction()); } // maybe there are some shared resources for all pages COSDictionary srcPages = (COSDictionary) srcCatalog.getCOSDictionary().getDictionaryObject(COSName.PAGES); COSDictionary srcResources = (COSDictionary) srcPages.getDictionaryObject(COSName.RESOURCES); COSDictionary destPages = (COSDictionary) destCatalog.getCOSDictionary().getDictionaryObject(COSName.PAGES); COSDictionary destResources = (COSDictionary) destPages.getDictionaryObject(COSName.RESOURCES); if (srcResources != null) { if (destResources != null) { destResources.mergeInto(srcResources); } else { destPages.setItem(COSName.RESOURCES, srcResources); } } PDFCloneUtility cloner = new PDFCloneUtility(destination); try { 
PDAcroForm destAcroForm = destCatalog.getAcroForm(); PDAcroForm srcAcroForm = srcCatalog.getAcroForm(); if (destAcroForm == null) { cloner.cloneForNewDocument(srcAcroForm); destCatalog.setAcroForm(srcAcroForm); } else { if (srcAcroForm != null) { mergeAcroForm(cloner, destAcroForm, srcAcroForm); } } } catch (Exception e) { // if we are not ignoring exceptions, we'll re-throw this if (!ignoreAcroFormErrors) { throw (IOException) e; } } COSArray destThreads = (COSArray) destCatalog.getCOSDictionary().getDictionaryObject(COSName.THREADS); COSArray srcThreads = (COSArray) cloner .cloneForNewDocument(destCatalog.getCOSDictionary().getDictionaryObject(COSName.THREADS)); if (destThreads == null) { destCatalog.getCOSDictionary().setItem(COSName.THREADS, srcThreads); } else { destThreads.addAll(srcThreads); } PDDocumentNameDictionary destNames = destCatalog.getNames(); PDDocumentNameDictionary srcNames = srcCatalog.getNames(); if (srcNames != null) { if (destNames == null) { destCatalog.getCOSDictionary().setItem(COSName.NAMES, cloner.cloneForNewDocument(srcNames)); } else { cloner.cloneMerge(srcNames, destNames); } } PDDocumentOutline destOutline = destCatalog.getDocumentOutline(); PDDocumentOutline srcOutline = srcCatalog.getDocumentOutline(); if (srcOutline != null) { if (destOutline == null) { PDDocumentOutline cloned = new PDDocumentOutline( (COSDictionary) cloner.cloneForNewDocument(srcOutline)); destCatalog.setDocumentOutline(cloned); } else { PDOutlineItem first = srcOutline.getFirstChild(); if (first != null) { PDOutlineItem clonedFirst = new PDOutlineItem( (COSDictionary) cloner.cloneForNewDocument(first)); destOutline.appendChild(clonedFirst); } } } String destPageMode = destCatalog.getPageMode(); String srcPageMode = srcCatalog.getPageMode(); if (destPageMode == null) { destCatalog.setPageMode(srcPageMode); } COSDictionary destLabels = (COSDictionary) destCatalog.getCOSDictionary() .getDictionaryObject(COSName.PAGE_LABELS); COSDictionary srcLabels = 
(COSDictionary) srcCatalog.getCOSDictionary() .getDictionaryObject(COSName.PAGE_LABELS); if (srcLabels != null) { int destPageCount = destination.getNumberOfPages(); COSArray destNums = null; if (destLabels == null) { destLabels = new COSDictionary(); destNums = new COSArray(); destLabels.setItem(COSName.NUMS, destNums); destCatalog.getCOSDictionary().setItem(COSName.PAGE_LABELS, destLabels); } else { destNums = (COSArray) destLabels.getDictionaryObject(COSName.NUMS); } COSArray srcNums = (COSArray) srcLabels.getDictionaryObject(COSName.NUMS); if (srcNums != null) { for (int i = 0; i < srcNums.size(); i += 2) { COSNumber labelIndex = (COSNumber) srcNums.getObject(i); long labelIndexValue = labelIndex.intValue(); destNums.add(COSInteger.get(labelIndexValue + destPageCount)); destNums.add(cloner.cloneForNewDocument(srcNums.getObject(i + 1))); } } } COSStream destMetadata = (COSStream) destCatalog.getCOSDictionary().getDictionaryObject(COSName.METADATA); COSStream srcMetadata = (COSStream) srcCatalog.getCOSDictionary().getDictionaryObject(COSName.METADATA); if (destMetadata == null && srcMetadata != null) { PDStream newStream = new PDStream(destination, srcMetadata.getUnfilteredStream(), false); newStream.getStream().mergeInto(srcMetadata); newStream.addCompression(); destCatalog.getCOSDictionary().setItem(COSName.METADATA, newStream); } // finally append the pages @SuppressWarnings("unchecked") List<PDPage> pages = srcCatalog.getAllPages(); Iterator<PDPage> pageIter = pages.iterator(); while (pageIter.hasNext()) { PDPage page = pageIter.next(); PDPage newPage = new PDPage((COSDictionary) cloner.cloneForNewDocument(page.getCOSDictionary())); newPage.setCropBox(page.findCropBox()); newPage.setMediaBox(page.findMediaBox()); newPage.setRotation(page.findRotation()); destination.addPage(newPage); } }
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java
License:Apache License
private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler) throws IOException, SAXException, TikaException { PDDocumentCatalog catalog = document.getDocumentCatalog(); PDDocumentNameDictionary names = catalog.getNames(); if (names == null) { return;/* www. j a va 2s . c om*/ } PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles(); if (embeddedFiles == null) { return; } EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); if (embeddedExtractor == null) { embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); } Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames(); // For now, try to get the embeddedFileNames out of embeddedFiles or its // kids. // This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java // If there is a need we could add a fully recursive search to find a // non-null // Map<String, COSObjectable> that contains the doc info. if (embeddedFileNames != null) { processEmbeddedDocNames(embeddedFileNames, embeddedExtractor); } else { List<PDNameTreeNode> kids = embeddedFiles.getKids(); if (kids == null) { return; } for (PDNameTreeNode n : kids) { Map<String, COSObjectable> childNames = n.getNames(); if (childNames != null) { processEmbeddedDocNames(childNames, embeddedExtractor); } } } }
From source file:net.padaf.preflight.helpers.CatalogValidationHelper.java
License:Apache License
/**
 * A Catalog shall not contain the EmbeddedFiles entry.
 *
 * <p>Appends a {@code ValidationError} to {@code result} when the catalog's
 * names dictionary exposes an embedded files tree; otherwise leaves
 * {@code result} unchanged.
 *
 * @param handler
 *            not used by this method
 * @param catalog
 *            the document catalog whose names dictionary is checked
 * @param result
 *            the list that receives the validation error, if any
 * @throws ValidationException
 */
protected void validateNames(DocumentHandler handler, PDDocumentCatalog catalog,
        List<ValidationError> result) throws ValidationException {
    PDDocumentNameDictionary nameDictionary = catalog.getNames();
    if (nameDictionary == null) {
        return;
    }

    PDEmbeddedFilesNameTreeNode embeddedFilesTree = nameDictionary.getEmbeddedFiles();
    if (embeddedFilesTree == null) {
        return;
    }

    ValidationError embeddedFilesError = new ValidationError(ERROR_SYNTAX_TRAILER_CATALOG_EMBEDDEDFILES,
            "EmbeddedFile entry is present in the Names dictionary");
    result.add(embeddedFilesError);
}
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler) throws IOException, SAXException, TikaException { PDDocumentCatalog catalog = document.getDocumentCatalog(); PDDocumentNameDictionary names = catalog.getNames(); if (names == null) { return;/*from w ww .j ava2 s .c om*/ } PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles(); if (embeddedFiles == null) { return; } Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames(); //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java //If there is a need we could add a fully recursive search to find a non-null //Map<String, COSObjectable> that contains the doc info. if (embeddedFileNames != null) { processEmbeddedDocNames(embeddedFileNames); } else { List<PDNameTreeNode> kids = embeddedFiles.getKids(); if (kids == null) { return; } for (PDNameTreeNode n : kids) { Map<String, COSObjectable> childNames = n.getNames(); if (childNames != null) { processEmbeddedDocNames(childNames); } } } }
From source file:org.paxle.parser.pdf.impl.PdfParser.java
License:Open Source License
/** * A function to extract the content of embedded files from a PDF document. *///from w ww .ja v a2 s . c o m protected void extractEmbeddedFiles(URI location, IParserDocument parserDoc, PDDocument pddDoc) throws IOException { final PDDocumentCatalog pddDocCatalog = pddDoc.getDocumentCatalog(); if (pddDocCatalog == null) return; final PDDocumentNameDictionary nameDic = pddDocCatalog.getNames(); if (nameDic == null) return; final PDEmbeddedFilesNameTreeNode embeddedFiles = nameDic.getEmbeddedFiles(); if (embeddedFiles == null) return; @SuppressWarnings("unchecked") final Map<String, Object> names = embeddedFiles.getNames(); if (names == null || names.isEmpty()) return; final IParserContext context = this.contextLocal.getCurrentContext(); for (Entry<String, Object> name : names.entrySet()) { // final String fileDesc = name.getKey(); final Object fileObj = name.getValue(); if (fileObj == null) continue; if (fileObj instanceof PDComplexFileSpecification) { final PDComplexFileSpecification embeddedFileSpec = (PDComplexFileSpecification) fileObj; final PDEmbeddedFile embeddedFile = embeddedFileSpec.getEmbeddedFile(); // getting the embedded file name and mime-type final String fileName = embeddedFileSpec.getFile(); final String fileMimeType = embeddedFile.getSubtype(); if (fileMimeType == null) { this.logger.warn(String.format("No mime-type specified form embedded file '%s#%s'.", location, fileName)); continue; } // getting a parser to parse the content final ISubParser sp = context.getParser(fileMimeType); if (sp == null) { this.logger.warn(String.format("No parser found to parse embedded file '%s#%s' with type '%s'.", location, fileName, fileMimeType)); continue; } // parsing content InputStream embeddedFileStream = null; try { embeddedFileStream = embeddedFile.createInputStream(); final IParserDocument subParserDoc = sp.parse(location, "UTF-8", embeddedFileStream); if (subParserDoc.getMimeType() == null) { subParserDoc.setMimeType(fileMimeType); } 
parserDoc.addSubDocument(fileName, subParserDoc); } catch (ParserException e) { this.logger.error(String.format( "Unexpected error while parsing parse embedded file '%s#%s' with type '%s': %s", location, fileName, fileMimeType, e.getMessage())); } finally { if (embeddedFileStream != null) try { embeddedFileStream.close(); } catch (Exception e) { this.logger.error(e); } } } } }