Example usage for org.apache.pdfbox.pdmodel.common PDNameTreeNode getNames

Introduction

On this page you can find example usages of the method org.apache.pdfbox.pdmodel.common.PDNameTreeNode.getNames().

Prototype

public Map<String, T> getNames() throws IOException

Source Link

Document

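Returns a map of the names stored directly on this level (node) of the name tree. As the examples below show, the result may be null, in which case callers look for names in the node's kids instead.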

Usage

From source file:dev.ztgnrw.ExtractEmbeddedFiles.java

License:Apache License

/**
 * Extracts the files embedded in a PDF document and writes them into the
 * directory that contains the PDF.
 *
 * <p>Two sources are scanned: the document-level embedded-files name tree
 * (the root node and, when the root holds no names, its direct kids) and the
 * file-attachment annotations of every page.
 *
 * @param file Path of the PDF document to read.
 *
 * @throws IOException If there is an error parsing the document.
 */
public static void extractEmbeddedFiles(String file) throws IOException {

    PDDocument document = null;
    try {
        File pdfFile = new File(file);
        // getParent() is null for a bare file name; use an empty (relative)
        // prefix in that case instead of producing the literal path "null/".
        String parent = pdfFile.getParent();
        String filePath = parent == null ? "" : parent + System.getProperty("file.separator");
        document = PDDocument.load(pdfFile);
        PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog());
        PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
        if (efTree != null) {
            Map<String, PDComplexFileSpecification> names = efTree.getNames();
            if (names != null) {
                extractFiles(names, filePath);
            } else {
                // The root holds no names itself: look one level down.
                List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
                if (kids != null) {
                    for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
                        Map<String, PDComplexFileSpecification> kidNames = node.getNames();
                        // An intermediate node may carry no names of its own.
                        if (kidNames != null) {
                            extractFiles(kidNames, filePath);
                        }
                    }
                }
            }
        }

        // extract files from annotations
        for (PDPage page : document.getPages()) {
            for (PDAnnotation annotation : page.getAnnotations()) {
                if (annotation instanceof PDAnnotationFileAttachment) {
                    PDAnnotationFileAttachment annotationFileAttachment = (PDAnnotationFileAttachment) annotation;
                    // The attachment may be missing or be a simple (non-complex)
                    // file specification; only complex ones carry an embedded file.
                    Object attached = annotationFileAttachment.getFile();
                    if (attached instanceof PDComplexFileSpecification) {
                        PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) attached;
                        PDEmbeddedFile embeddedFile = getEmbeddedFile(fileSpec);
                        if (embeddedFile != null) {
                            extractFile(filePath, fileSpec.getFilename(), embeddedFile);
                        }
                    }
                }
            }
        }

    } finally {
        if (document != null) {
            document.close();
        }
    }

}

From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java

License:Apache License

/**
 * Hands the documents embedded in a PDF to an {@link EmbeddedDocumentExtractor}.
 *
 * <p>The names are taken from the root of the embedded-files name tree; when
 * the root has none, each direct kid is inspected instead. Deeper levels of
 * the tree are not searched (this mirrors
 * pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java).
 *
 * @param document the PDF document to inspect
 * @param handler  content handler (not used by this method)
 */
private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler)
        throws IOException, SAXException, TikaException {
    PDDocumentNameDictionary nameDictionary = document.getDocumentCatalog().getNames();
    if (nameDictionary == null) {
        return;
    }

    PDEmbeddedFilesNameTreeNode tree = nameDictionary.getEmbeddedFiles();
    if (tree == null) {
        return;
    }

    // Prefer the extractor registered in the parse context; fall back to the
    // default parsing extractor otherwise.
    EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class);
    if (extractor == null) {
        extractor = new ParsingEmbeddedDocumentExtractor(context);
    }

    Map<String, COSObjectable> rootNames = tree.getNames();
    if (rootNames != null) {
        processEmbeddedDocNames(rootNames, extractor);
        return;
    }

    // No names on the root level: try the direct children.
    List<PDNameTreeNode> children = tree.getKids();
    if (children == null) {
        return;
    }
    for (PDNameTreeNode child : children) {
        Map<String, COSObjectable> namesOfChild = child.getNames();
        if (namesOfChild == null) {
            continue;
        }
        processEmbeddedDocNames(namesOfChild, extractor);
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

License:Apache License

/**
 * Processes the documents embedded in a PDF.
 *
 * <p>Names found on the root of the embedded-files name tree are processed
 * directly; if the root has none, the names of each direct kid are processed.
 * There is no fully recursive search (this mirrors
 * pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java).
 *
 * @param document the PDF document to inspect
 */
private void extractEmbeddedDocuments(PDDocument document) throws IOException, SAXException, TikaException {
    PDEmbeddedFilesNameTreeNode tree =
            new PDDocumentNameDictionary(document.getDocumentCatalog()).getEmbeddedFiles();
    if (tree == null) {
        return;
    }

    Map<String, PDComplexFileSpecification> rootNames = tree.getNames();
    if (rootNames != null) {
        processEmbeddedDocNames(rootNames);
        return;
    }

    // Nothing on the root level: fall back to the direct children.
    List<PDNameTreeNode<PDComplexFileSpecification>> children = tree.getKids();
    if (children == null) {
        return;
    }
    for (PDNameTreeNode<PDComplexFileSpecification> child : children) {
        Map<String, PDComplexFileSpecification> namesOfChild = child.getNames();
        if (namesOfChild == null) {
            continue;
        }
        processEmbeddedDocNames(namesOfChild);
    }
}

From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java

License:Apache License

/**
 * Processes the documents embedded in a PDF.
 *
 * <p>Uses the names on the root of the embedded-files name tree when present,
 * otherwise the names of each direct kid. Deeper levels are not searched
 * (this mirrors pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java).
 *
 * @param document the PDF document to inspect
 * @param handler  content handler (not used by this method)
 */
private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler)
        throws IOException, SAXException, TikaException {
    PDDocumentNameDictionary nameDictionary = document.getDocumentCatalog().getNames();
    if (nameDictionary == null) {
        return;
    }

    PDEmbeddedFilesNameTreeNode tree = nameDictionary.getEmbeddedFiles();
    if (tree == null) {
        return;
    }

    Map<String, COSObjectable> rootNames = tree.getNames();
    if (rootNames != null) {
        processEmbeddedDocNames(rootNames);
        return;
    }

    // The root carries no names: inspect its direct children instead.
    List<PDNameTreeNode> children = tree.getKids();
    if (children == null) {
        return;
    }
    for (PDNameTreeNode child : children) {
        Map<String, COSObjectable> namesOfChild = child.getNames();
        if (namesOfChild == null) {
            continue;
        }
        processEmbeddedDocNames(namesOfChild);
    }
}

From source file:org.mustangproject.ZUGFeRD.ZUGFeRDImporter.java

License:Open Source License

/**
 * Extracts a ZUGFeRD invoice from a PDF document represented by an input
 * stream. Errors are reported via exception handling.
 *
 * <p>The document's XMP metadata, when present, is stored in
 * {@code xmpString}; when the PDF has no metadata stream the field is left
 * untouched. Embedded files are then taken from the root of the
 * embedded-files name tree or, if the root holds no names, from its direct
 * kids.
 *
 * @param pdfStream an input stream of a PDF file
 * @throws IOException if the PDF cannot be loaded or read
 */
private void extractLowLevel(InputStream pdfStream) throws IOException {
    try (PDDocument doc = PDDocument.load(pdfStream)) {
        PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());

        // A PDF without a metadata stream yields null here; guard against the
        // NullPointerException and make sure the XMP stream gets closed.
        if (doc.getDocumentCatalog().getMetadata() != null) {
            try (InputStream xmp = doc.getDocumentCatalog().getMetadata().exportXMPMetadata()) {
                xmpString = convertStreamToString(xmp);
            }
        }

        PDEmbeddedFilesNameTreeNode etn = names.getEmbeddedFiles();
        if (etn == null) {
            return;
        }

        Map<String, PDComplexFileSpecification> efMap = etn.getNames();

        if (efMap != null) {
            extractFiles(efMap); // see
            // https://memorynotfound.com/apache-pdfbox-extract-embedded-file-pdf-document/
        } else {
            // The root holds no names itself: look one level down.
            List<PDNameTreeNode<PDComplexFileSpecification>> kids = etn.getKids();
            if (kids != null) {
                for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
                    Map<String, PDComplexFileSpecification> namesL = node.getNames();
                    // An intermediate node may carry no names of its own.
                    if (namesL != null) {
                        extractFiles(namesL);
                    }
                }
            }
        }
    }
}

From source file:org.pdfmetamodifier.IOHelper.java

License:Apache License

/**
 * Save all Attached (embedded) files to some directory.
 * /*from   w  w  w .  j  a va 2 s .c o  m*/
 * @param pdfFile
 *            Source PDF file.
 * @param outputDir
 *            Target directory.
 * @throws IOException
 */
/*
 * See:
 *      https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java?view=markup
 */
public static void saveAttachments(final File pdfFile, final File outputDir) throws IOException {
    PDDocument document = null;
    try {
        // Read PDF file.
        document = PDDocument.load(pdfFile);
        if (document.isEncrypted()) {
            throw new IOException("Document is encrypted.");
        }

        // Extract Embedded (attached) files.
        final PDDocumentNameDictionary documentNameDictionary = new PDDocumentNameDictionary(
                document.getDocumentCatalog());
        final PDEmbeddedFilesNameTreeNode embeddedFilesNameTree = documentNameDictionary.getEmbeddedFiles();
        if (embeddedFilesNameTree != null) {
            extractFiles(outputDir, embeddedFilesNameTree.getNames());

            final List<PDNameTreeNode<PDComplexFileSpecification>> kids = embeddedFilesNameTree.getKids();
            if (kids != null) {
                for (PDNameTreeNode<PDComplexFileSpecification> nameTreeNode : kids) {
                    extractFiles(outputDir, nameTreeNode.getNames());
                }
            }
        }

        // Extract Embedded (attached) from annotations.
        for (PDPage page : document.getPages()) {
            for (PDAnnotation annotation : page.getAnnotations()) {
                if (annotation instanceof PDAnnotationFileAttachment) {
                    final PDAnnotationFileAttachment fileAttach = (PDAnnotationFileAttachment) annotation;

                    final PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fileAttach
                            .getFile();
                    extractFile(outputDir, fileSpec);
                }
            }
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }
}