List of usage examples for org.apache.pdfbox.pdmodel.common PDNameTreeNode getNames
public Map<String, T> getNames() throws IOException
From source file:dev.ztgnrw.ExtractEmbeddedFiles.java
License:Apache License
/** * This is the main method.//from w w w. j a v a 2 s . co m * * @param args The command line arguments. * * @throws IOException If there is an error parsing the document. */ public static void extractEmbeddedFiles(String file) throws IOException { PDDocument document = null; try { File pdfFile = new File(file); String filePath = pdfFile.getParent() + System.getProperty("file.separator"); document = PDDocument.load(pdfFile); PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog()); PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); if (efTree != null) { Map<String, PDComplexFileSpecification> names = efTree.getNames(); if (names != null) { extractFiles(names, filePath); } else { List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { names = node.getNames(); extractFiles(names, filePath); } } } // extract files from annotations for (PDPage page : document.getPages()) { for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment annotationFileAttachment = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) annotationFileAttachment .getFile(); PDEmbeddedFile embeddedFile = getEmbeddedFile(fileSpec); extractFile(filePath, fileSpec.getFilename(), embeddedFile); } } } } finally { if (document != null) { document.close(); } } }
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java
License:Apache License
private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler) throws IOException, SAXException, TikaException { PDDocumentCatalog catalog = document.getDocumentCatalog(); PDDocumentNameDictionary names = catalog.getNames(); if (names == null) { return;/*from w ww . ja v a2s . co m*/ } PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles(); if (embeddedFiles == null) { return; } EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); if (embeddedExtractor == null) { embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); } Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames(); // For now, try to get the embeddedFileNames out of embeddedFiles or its // kids. // This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java // If there is a need we could add a fully recursive search to find a // non-null // Map<String, COSObjectable> that contains the doc info. if (embeddedFileNames != null) { processEmbeddedDocNames(embeddedFileNames, embeddedExtractor); } else { List<PDNameTreeNode> kids = embeddedFiles.getKids(); if (kids == null) { return; } for (PDNameTreeNode n : kids) { Map<String, COSObjectable> childNames = n.getNames(); if (childNames != null) { processEmbeddedDocNames(childNames, embeddedExtractor); } } } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
License:Apache License
private void extractEmbeddedDocuments(PDDocument document) throws IOException, SAXException, TikaException { PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog()); PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); if (efTree == null) { return;// ww w .j a v a 2 s.c o m } Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames(); //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java //If there is a need we could add a fully recursive search to find a non-null //Map<String, COSObjectable> that contains the doc info. if (embeddedFileNames != null) { processEmbeddedDocNames(embeddedFileNames); } else { List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); if (kids == null) { return; } for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { embeddedFileNames = node.getNames(); if (embeddedFileNames != null) { processEmbeddedDocNames(embeddedFileNames); } } } }
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler) throws IOException, SAXException, TikaException { PDDocumentCatalog catalog = document.getDocumentCatalog(); PDDocumentNameDictionary names = catalog.getNames(); if (names == null) { return;//from w ww . java2 s . co m } PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles(); if (embeddedFiles == null) { return; } Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames(); //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java //If there is a need we could add a fully recursive search to find a non-null //Map<String, COSObjectable> that contains the doc info. if (embeddedFileNames != null) { processEmbeddedDocNames(embeddedFileNames); } else { List<PDNameTreeNode> kids = embeddedFiles.getKids(); if (kids == null) { return; } for (PDNameTreeNode n : kids) { Map<String, COSObjectable> childNames = n.getNames(); if (childNames != null) { processEmbeddedDocNames(childNames); } } } }
From source file:org.mustangproject.ZUGFeRD.ZUGFeRDImporter.java
License:Open Source License
/** * Extracts a ZUGFeRD invoice from a PDF document represented by an input * stream. Errors are reported via exception handling. * * @param pdfStream a inputstream of a pdf file *//*from w ww . j a va 2 s . c o m*/ private void extractLowLevel(InputStream pdfStream) throws IOException { try (PDDocument doc = PDDocument.load(pdfStream)) { // PDDocumentInformation info = doc.getDocumentInformation(); PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog()); //start InputStream XMP = doc.getDocumentCatalog().getMetadata().exportXMPMetadata(); xmpString = convertStreamToString(XMP); PDEmbeddedFilesNameTreeNode etn = names.getEmbeddedFiles(); if (etn == null) { return; } Map<String, PDComplexFileSpecification> efMap = etn.getNames(); // String filePath = "/tmp/"; if (efMap != null) { extractFiles(efMap); // see // https://memorynotfound.com/apache-pdfbox-extract-embedded-file-pdf-document/ } else { List<PDNameTreeNode<PDComplexFileSpecification>> kids = etn.getKids(); for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { Map<String, PDComplexFileSpecification> namesL = node.getNames(); extractFiles(namesL); } } } }
From source file:org.pdfmetamodifier.IOHelper.java
License:Apache License
/** * Save all Attached (embedded) files to some directory. * /*from w w w . j a va 2 s .c o m*/ * @param pdfFile * Source PDF file. * @param outputDir * Target directory. * @throws IOException */ /* * See: * https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java?view=markup */ public static void saveAttachments(final File pdfFile, final File outputDir) throws IOException { PDDocument document = null; try { // Read PDF file. document = PDDocument.load(pdfFile); if (document.isEncrypted()) { throw new IOException("Document is encrypted."); } // Extract Embedded (attached) files. final PDDocumentNameDictionary documentNameDictionary = new PDDocumentNameDictionary( document.getDocumentCatalog()); final PDEmbeddedFilesNameTreeNode embeddedFilesNameTree = documentNameDictionary.getEmbeddedFiles(); if (embeddedFilesNameTree != null) { extractFiles(outputDir, embeddedFilesNameTree.getNames()); final List<PDNameTreeNode<PDComplexFileSpecification>> kids = embeddedFilesNameTree.getKids(); if (kids != null) { for (PDNameTreeNode<PDComplexFileSpecification> nameTreeNode : kids) { extractFiles(outputDir, nameTreeNode.getNames()); } } } // Extract Embedded (attached) from annotations. for (PDPage page : document.getPages()) { for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { final PDAnnotationFileAttachment fileAttach = (PDAnnotationFileAttachment) annotation; final PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fileAttach .getFile(); extractFile(outputDir, fileSpec); } } } } finally { if (document != null) { document.close(); } } }