Example usage for org.apache.pdfbox.pdmodel.common PDNameTreeNode getNames

List of usage examples for org.apache.pdfbox.pdmodel.common PDNameTreeNode getNames

Introduction

On this page you can find example usage for org.apache.pdfbox.pdmodel.common PDNameTreeNode getNames.

Prototype

public Map<String, T> getNames() throws IOException 

Source Link

Document

This will return a map of names on this level.

Usage

From source file:dev.ztgnrw.ExtractEmbeddedFiles.java

License:Apache License

/**
 * Extracts the files embedded in a PDF document.
 *
 * <p>Two sources are scanned: the document-level embedded-files name tree (its own
 * names, or failing that the names of its direct kids) and the file-attachment
 * annotations of every page. Each hit is handed to the {@code extractFiles} /
 * {@code extractFile} helpers together with the directory prefix of the source file.
 *
 * @param file path of the PDF document to read.
 *
 * @throws IOException If there is an error parsing the document.
 */
public static void extractEmbeddedFiles(String file) throws IOException {

    PDDocument document = null;
    try {
        File pdfFile = new File(file);
        // getParent() is null for a bare file name such as "a.pdf"; concatenating it
        // would produce the literal prefix "null/", so use an empty prefix instead.
        String parent = pdfFile.getParent();
        String filePath = (parent == null) ? "" : parent + System.getProperty("file.separator");
        document = PDDocument.load(pdfFile);
        PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog());
        PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
        if (efTree != null) {
            Map<String, PDComplexFileSpecification> names = efTree.getNames();
            if (names != null) {
                extractFiles(names, filePath);
            } else {
                // The root carries no names of its own: fall back to its direct kids.
                // Both the kid list and a kid's name map may be absent.
                List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
                if (kids != null) {
                    for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
                        names = node.getNames();
                        if (names != null) {
                            extractFiles(names, filePath);
                        }
                    }
                }
            }
        }

        // extract files from annotations
        for (PDPage page : document.getPages()) {
            for (PDAnnotation annotation : page.getAnnotations()) {
                if (annotation instanceof PDAnnotationFileAttachment) {
                    PDAnnotationFileAttachment annotationFileAttachment = (PDAnnotationFileAttachment) annotation;
                    // The attachment's file specification is not necessarily a complex
                    // one (and may be missing); an unconditional cast would throw
                    // ClassCastException, and a null spec would fail on getFilename().
                    Object rawSpec = annotationFileAttachment.getFile();
                    if (rawSpec instanceof PDComplexFileSpecification) {
                        PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) rawSpec;
                        PDEmbeddedFile embeddedFile = getEmbeddedFile(fileSpec);
                        // NOTE(review): presumably getEmbeddedFile can return null when the
                        // spec has no embedded stream - skip those rather than pass null on.
                        if (embeddedFile != null) {
                            extractFile(filePath, fileSpec.getFilename(), embeddedFile);
                        }
                    }
                }
            }
        }

    } finally {
        // Always release the document, even when parsing or extraction failed.
        if (document != null) {
            document.close();
        }
    }

}

From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java

License:Apache License

private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler)
        throws IOException, SAXException, TikaException {
    // Nothing to do unless the catalog has a names dictionary with an
    // embedded-files tree in it.
    PDDocumentNameDictionary nameDictionary = document.getDocumentCatalog().getNames();
    if (nameDictionary == null) {
        return;
    }
    PDEmbeddedFilesNameTreeNode root = nameDictionary.getEmbeddedFiles();
    if (root == null) {
        return;
    }

    // Prefer the extractor registered in the parse context; otherwise build
    // the parsing one on top of that same context.
    EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class);
    if (extractor == null) {
        extractor = new ParsingEmbeddedDocumentExtractor(context);
    }

    // Mirrors pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java: take the names
    // from the root node if it has any, otherwise from its direct kids only.
    // A fully recursive search for a non-null name map could be added if needed.
    Map<String, COSObjectable> rootNames = root.getNames();
    if (rootNames != null) {
        processEmbeddedDocNames(rootNames, extractor);
        return;
    }

    List<PDNameTreeNode> children = root.getKids();
    if (children == null) {
        return;
    }
    for (PDNameTreeNode child : children) {
        Map<String, COSObjectable> namesOfChild = child.getNames();
        if (namesOfChild == null) {
            continue;
        }
        processEmbeddedDocNames(namesOfChild, extractor);
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

License:Apache License

private void extractEmbeddedDocuments(PDDocument document) throws IOException, SAXException, TikaException {
    PDEmbeddedFilesNameTreeNode root =
            new PDDocumentNameDictionary(document.getDocumentCatalog()).getEmbeddedFiles();
    if (root == null) {
        return;
    }

    // Same approach as pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java: use the
    // root node's names when present, else look one level down at its kids.
    // A fully recursive search for a non-null name map could be added if needed.
    Map<String, PDComplexFileSpecification> rootNames = root.getNames();
    if (rootNames != null) {
        processEmbeddedDocNames(rootNames);
        return;
    }

    List<PDNameTreeNode<PDComplexFileSpecification>> children = root.getKids();
    if (children == null) {
        return;
    }
    for (PDNameTreeNode<PDComplexFileSpecification> child : children) {
        Map<String, PDComplexFileSpecification> namesOfChild = child.getNames();
        if (namesOfChild == null) {
            continue;
        }
        processEmbeddedDocNames(namesOfChild);
    }
}

From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java

License:Apache License

private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler)
        throws IOException, SAXException, TikaException {
    // Resolve catalog -> names dictionary -> embedded-files tree; a missing
    // link anywhere in that chain means there is nothing to extract.
    PDDocumentNameDictionary nameDictionary = document.getDocumentCatalog().getNames();
    PDEmbeddedFilesNameTreeNode tree = (nameDictionary != null) ? nameDictionary.getEmbeddedFiles() : null;
    if (tree == null) {
        return;
    }

    // Follows pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java: names come from
    // the tree's top node or, when it has none, from its immediate kids.
    // A fully recursive search for a non-null name map could be added if needed.
    Map<String, COSObjectable> topLevelNames = tree.getNames();
    if (topLevelNames != null) {
        processEmbeddedDocNames(topLevelNames);
        return;
    }

    List<PDNameTreeNode> kidNodes = tree.getKids();
    if (kidNodes != null) {
        for (PDNameTreeNode kid : kidNodes) {
            Map<String, COSObjectable> kidNames = kid.getNames();
            if (kidNames != null) {
                processEmbeddedDocNames(kidNames);
            }
        }
    }
}

From source file:org.mustangproject.ZUGFeRD.ZUGFeRDImporter.java

License:Open Source License

/**
 * Extracts a ZUGFeRD invoice from a PDF document represented by an input
 * stream. Errors are reported via exception handling.
 *
 * <p>The document's XMP metadata, when present, is converted to a string and
 * stored in {@code xmpString}; the embedded files found in the name tree (root
 * names, or else the names of its direct kids) are passed to {@code extractFiles}.
 *
 * @param pdfStream an input stream of a PDF file
 * @throws IOException if the document or its metadata stream cannot be read
 */
private void extractLowLevel(InputStream pdfStream) throws IOException {
    try (PDDocument doc = PDDocument.load(pdfStream)) {
        PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());

        // A catalog without a metadata entry yields null here; dereferencing it
        // unconditionally would fail with a NullPointerException. In that case
        // xmpString is left untouched and the embedded files are still processed.
        if (doc.getDocumentCatalog().getMetadata() != null) {
            // Close the exported stream once it has been read.
            try (InputStream xmpStream = doc.getDocumentCatalog().getMetadata().exportXMPMetadata()) {
                xmpString = convertStreamToString(xmpStream);
            }
        }

        PDEmbeddedFilesNameTreeNode etn = names.getEmbeddedFiles();
        if (etn == null) {
            return;
        }

        Map<String, PDComplexFileSpecification> efMap = etn.getNames();

        if (efMap != null) {
            extractFiles(efMap); // see
            // https://memorynotfound.com/apache-pdfbox-extract-embedded-file-pdf-document/
        } else {
            // No names on the root node: look at its direct kids instead. Both the
            // kid list and an individual kid's name map may be absent.
            List<PDNameTreeNode<PDComplexFileSpecification>> kids = etn.getKids();
            if (kids != null) {
                for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
                    Map<String, PDComplexFileSpecification> namesL = node.getNames();
                    if (namesL != null) {
                        extractFiles(namesL);
                    }
                }
            }
        }
    }
}

From source file:org.pdfmetamodifier.IOHelper.java

License:Apache License

/**
 * Save all Attached (embedded) files to some directory.
 * /*from   w  w  w .  j  a va 2 s .c o  m*/
 * @param pdfFile
 *            Source PDF file.
 * @param outputDir
 *            Target directory.
 * @throws IOException
 */
/*
 * See:
 *      https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java?view=markup
 */
public static void saveAttachments(final File pdfFile, final File outputDir) throws IOException {
    PDDocument document = null;
    try {
        // Read PDF file.
        document = PDDocument.load(pdfFile);
        if (document.isEncrypted()) {
            throw new IOException("Document is encrypted.");
        }

        // Extract Embedded (attached) files.
        final PDDocumentNameDictionary documentNameDictionary = new PDDocumentNameDictionary(
                document.getDocumentCatalog());
        final PDEmbeddedFilesNameTreeNode embeddedFilesNameTree = documentNameDictionary.getEmbeddedFiles();
        if (embeddedFilesNameTree != null) {
            extractFiles(outputDir, embeddedFilesNameTree.getNames());

            final List<PDNameTreeNode<PDComplexFileSpecification>> kids = embeddedFilesNameTree.getKids();
            if (kids != null) {
                for (PDNameTreeNode<PDComplexFileSpecification> nameTreeNode : kids) {
                    extractFiles(outputDir, nameTreeNode.getNames());
                }
            }
        }

        // Extract Embedded (attached) from annotations.
        for (PDPage page : document.getPages()) {
            for (PDAnnotation annotation : page.getAnnotations()) {
                if (annotation instanceof PDAnnotationFileAttachment) {
                    final PDAnnotationFileAttachment fileAttach = (PDAnnotationFileAttachment) annotation;

                    final PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fileAttach
                            .getFile();
                    extractFile(outputDir, fileSpec);
                }
            }
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }
}