Example usage for org.apache.pdfbox.pdmodel.common.filespecification PDComplexFileSpecification getEmbeddedFile

Introduction

In this page you can find the example usage for org.apache.pdfbox.pdmodel.common.filespecification PDComplexFileSpecification getEmbeddedFile.

Prototype

public PDEmbeddedFile getEmbeddedFile()

Source Link

Document

Get the embedded file.

Usage

From source file:algorithm.PDFFileAttacher.java

License:Apache License

@Override
public List<RestoredFile> restore(File originalPdf) throws IOException {
    RestoredFile copiedPdf = getRestoredCarrier(originalPdf);
    List<RestoredFile> restoredFiles = new ArrayList<RestoredFile>();
    PDDocument document = PDDocument.load(copiedPdf);
    PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog());
    PDEmbeddedFilesNameTreeNode filesTree = namesDictionary.getEmbeddedFiles();
    if (filesTree != null) {
        int i = 0;
        while (true) {
            PDComplexFileSpecification fileSpecification = (PDComplexFileSpecification) filesTree
                    .getValue("PericlesMetadata-" + i);
            if (fileSpecification == null) {
                break;
            }//from www  . j  a  v a  2  s . c o  m
            File oldAttachedFile = new File(fileSpecification.getFile());
            RestoredFile restoredPayload = new RestoredFile(RESTORED_DIRECTORY + oldAttachedFile.getName());
            PDEmbeddedFile embeddedFile = fileSpecification.getEmbeddedFile();
            InputStream inputStream = embeddedFile.createInputStream();
            FileOutputStream outputStream = new FileOutputStream(restoredPayload);
            IOUtils.copy(inputStream, outputStream);
            removeBuggyLineEnding(restoredPayload);
            restoredPayload.wasPayload = true;
            restoredPayload.checksumValid = true;
            restoredPayload.restorationNote = "Checksum wasn't calculated, because this algorithm isn't using restoration metadata. The original payload file survives the encapsulation with this algorithm.";
            restoredFiles.add(restoredPayload);
            i++;
        }
    }
    document.close();
    copiedPdf.wasCarrier = true;
    copiedPdf.checksumValid = false;
    copiedPdf.restorationNote = "Checksum can't be valid, because attached payload files can't be removed from carrier.";
    restoredFiles.add(copiedPdf);
    for (RestoredFile file : restoredFiles) {
        file.algorithm = this;
        for (RestoredFile relatedFile : restoredFiles) {
            if (file != relatedFile) {
                file.relatedFiles.add(relatedFile);
            }
        }
    }
    return restoredFiles;
}

From source file:dev.ztgnrw.ExtractEmbeddedFiles.java

License:Apache License

private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpec) {
    // search for the first available alternative of the embedded file
    PDEmbeddedFile embeddedFile = null;//from  ww w.j  a v a 2  s  . co m
    if (fileSpec != null) {
        embeddedFile = fileSpec.getEmbeddedFileUnicode();
        if (embeddedFile == null) {
            embeddedFile = fileSpec.getEmbeddedFileDos();
        }
        if (embeddedFile == null) {
            embeddedFile = fileSpec.getEmbeddedFileMac();
        }
        if (embeddedFile == null) {
            embeddedFile = fileSpec.getEmbeddedFileUnix();
        }
        if (embeddedFile == null) {
            embeddedFile = fileSpec.getEmbeddedFile();
        }
    }
    return embeddedFile;
}

From source file:eu.europa.ejusticeportal.dss.controller.signature.PdfUtils.java

License:EUPL

/**
 * Extracts an attachment from a PDF/*from  w ww  . ja v  a  2  s.co  m*/
 * 
 * @param pdf the PDF; this is unaffected by the method
 * @param name the name of the attachment
 * @return the attachment
 */
public static byte[] extractAttachment(final byte[] pdf, final String name) {
    PDDocument doc = null;
    try {
        InputStream is = new ByteArrayInputStream(pdf);
        doc = PDDocument.load(is);
        PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(doc.getDocumentCatalog());
        PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
        Map<String, COSObjectable> names = efTree.getNames();
        byte[] data = null;
        if (names != null) {
            PDComplexFileSpecification attach = (PDComplexFileSpecification) names.get(name);
            data = attach.getEmbeddedFile().getByteArray();
        }

        // Remove the \r\n added by PdfBox (PDDocument.saveIncremental).
        if (data != null) {
            String newString = new String(data, "UTF-8");
            while (newString.endsWith("\r\n")) {
                newString = newString.replaceAll("\\r\\n$", "");
                data = newString.getBytes(Charset.forName("UTF-8"));
            }
        }

        return data;

    } catch (IOException e) {
        LOGGER.error("Error detaching.", e);
        throw new SigningException(e);
    } finally {
        closeQuietly(doc);
    }
}

From source file:io.konik.carriage.pdfbox.PDFBoxInvoiceExtractor.java

License:Open Source License

private static final InputStream extractZugferdXmlAttachment(PDEmbeddedFilesNameTreeNode embeddedFiles)
        throws IOException {
    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) embeddedFiles.getValue(ZF_FILE_NAME);
    if (fileSpec == null) {
        throw new InvoiceExtractionError(NO_ZF_FILE + ZF_FILE_NAME);
    }/*  w  w  w  . ja va2s .  co  m*/
    return fileSpec.getEmbeddedFile().createInputStream();
}

From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java

License:Apache License

private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames,
        EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException {
    if (embeddedFileNames == null) {
        return;//from  w ww. j a  va2  s.c o m
    }
    for (Map.Entry<String, COSObjectable> ent : embeddedFileNames.entrySet()) {
        PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
        PDEmbeddedFile file = spec.getEmbeddedFile();

        Metadata metadata = new Metadata();
        // TODO: other metadata?
        metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));

        if (embeddedExtractor.shouldParseEmbedded(metadata)) {
            TikaInputStream stream = TikaInputStream.get(file.createInputStream());
            try {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            } finally {
                stream.close();
            }
        }
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

License:Apache License

private void extractMultiOSPDEmbeddedFiles(String displayName, PDComplexFileSpecification spec,
        AttributesImpl attributes) throws IOException, SAXException, TikaException {

    if (spec == null) {
        return;/* ww  w . j a v a2  s. c  om*/
    }
    //current strategy is to pull all, not just first non-null
    extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(),
            attributes);
    extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(),
            attributes);
    extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(),
            attributes);
    extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(),
            attributes);
}

From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java

License:Apache License

private void extractMultiOSPDEmbeddedFiles(String defaultName, PDComplexFileSpecification spec,
        EmbeddedDocumentExtractor extractor) throws IOException, SAXException, TikaException {

    if (spec == null) {
        return;//from ww w. jav a  2  s.com
    }
    //current strategy is to pull all, not just first non-null
    extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
    extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
    extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
    extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
}

From source file:org.modeshape.sequencer.pdf.PdfBasicMetadata.java

License:Apache License

public boolean check() throws Exception {
    try (PDDocument document = PDDocument.load(in)) {
        PDDocumentCatalog catalog = document.getDocumentCatalog();
        PDPageable pageable = new PDPageable(document);
        PageFormat firstPage = pageable.getPageFormat(0);

        encrypted = document.isEncrypted();
        pageCount = document.getNumberOfPages();
        orientation = ORIENTATION_STRINGS[firstPage.getOrientation()];
        version = String.valueOf(document.getDocument().getVersion());
        String catalogVersion = catalog.getVersion();
        if (catalogVersion != null && !catalogVersion.isEmpty()) {
            // According to specs version saved here should be determining instead
            // the version in header. It is barely used, though.
            version = catalogVersion;/*from  www  .j  av a  2  s  .  c  om*/
        }

        if (!encrypted) {
            PDDocumentInformation metadata = document.getDocumentInformation();
            author = metadata.getAuthor();
            creationDate = metadata.getCreationDate();
            creator = metadata.getCreator();
            keywords = metadata.getKeywords();
            modificationDate = metadata.getModificationDate();
            producer = metadata.getProducer();
            subject = metadata.getSubject();
            title = metadata.getTitle();
        }

        // extract all attached files from all pages
        int pageNumber = 0;
        for (Object page : catalog.getAllPages()) {
            pageNumber += 1;
            PdfPageMetadata pageMetadata = new PdfPageMetadata();
            pageMetadata.setPageNumber(pageNumber);
            for (PDAnnotation annotation : ((PDPage) page).getAnnotations()) {
                if (annotation instanceof PDAnnotationFileAttachment) {
                    PdfAttachmentMetadata attachmentMetadata = new PdfAttachmentMetadata();

                    PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
                    PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile();

                    attachmentMetadata.setSubject(fann.getSubject());
                    attachmentMetadata.setName(fileSpec.getFilename());
                    attachmentMetadata.setCreationDate(embeddedFile.getCreationDate());
                    attachmentMetadata.setModificationDate(embeddedFile.getModDate());
                    attachmentMetadata.setMimeType(embeddedFile.getSubtype());
                    attachmentMetadata.setData(embeddedFile.getByteArray());

                    pageMetadata.addAttachment(attachmentMetadata);
                }
            }
            pages.add(pageMetadata);
        }
        return true;
    }
}

From source file:org.mustangproject.ZUGFeRD.ZUGFeRDImporter.java

License:Open Source License

private void extractFiles(Map<String, PDComplexFileSpecification> names) throws IOException {
    for (String filename : names.keySet()) {
        /**//from   w  w  w. j  a va2 s .c  o  m
         * currently (in the release candidate of version 1) only one attached file with
         * the name ZUGFeRD-invoice.xml is allowed
         */
        if ((filename.equals("ZUGFeRD-invoice.xml") || (filename.equals("zugferd-invoice.xml")) //$NON-NLS-1$
                || filename.equals("factur-x.xml"))) {
            containsMeta = true;

            PDComplexFileSpecification fileSpec = names.get(filename);
            PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile();
            // String embeddedFilename = filePath + filename;
            // File file = new File(filePath + filename);
            // System.out.println("Writing " + embeddedFilename);
            // ByteArrayOutputStream fileBytes=new
            // ByteArrayOutputStream();
            // FileOutputStream fos = new FileOutputStream(file);

            rawXML = embeddedFile.toByteArray();
            setMeta(new String(rawXML));

            // fos.write(embeddedFile.getByteArray());
            // fos.close();
        }
        if (filename.startsWith("additional_data")) {

            PDComplexFileSpecification fileSpec = names.get(filename);
            PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile();
            additionalXMLs.put(filename, embeddedFile.toByteArray());

        }
    }
}

From source file:org.paxle.parser.pdf.impl.PdfParser.java

License:Open Source License

/**
 * A function to extract the content of embedded files from a PDF document.
 *///from ww  w  . ja v  a2s .  co  m
protected void extractEmbeddedFiles(URI location, IParserDocument parserDoc, PDDocument pddDoc)
        throws IOException {
    final PDDocumentCatalog pddDocCatalog = pddDoc.getDocumentCatalog();
    if (pddDocCatalog == null)
        return;

    final PDDocumentNameDictionary nameDic = pddDocCatalog.getNames();
    if (nameDic == null)
        return;

    final PDEmbeddedFilesNameTreeNode embeddedFiles = nameDic.getEmbeddedFiles();
    if (embeddedFiles == null)
        return;

    @SuppressWarnings("unchecked")
    final Map<String, Object> names = embeddedFiles.getNames();
    if (names == null || names.isEmpty())
        return;

    final IParserContext context = this.contextLocal.getCurrentContext();

    for (Entry<String, Object> name : names.entrySet()) {
        // final String fileDesc = name.getKey();
        final Object fileObj = name.getValue();
        if (fileObj == null)
            continue;

        if (fileObj instanceof PDComplexFileSpecification) {
            final PDComplexFileSpecification embeddedFileSpec = (PDComplexFileSpecification) fileObj;
            final PDEmbeddedFile embeddedFile = embeddedFileSpec.getEmbeddedFile();

            // getting the embedded file name and mime-type
            final String fileName = embeddedFileSpec.getFile();
            final String fileMimeType = embeddedFile.getSubtype();
            if (fileMimeType == null) {
                this.logger.warn(String.format("No mime-type specified form embedded file '%s#%s'.", location,
                        fileName));
                continue;
            }

            // getting a parser to parse the content
            final ISubParser sp = context.getParser(fileMimeType);
            if (sp == null) {
                this.logger.warn(String.format("No parser found to parse embedded file '%s#%s' with type '%s'.",
                        location, fileName, fileMimeType));
                continue;
            }

            // parsing content
            InputStream embeddedFileStream = null;
            try {
                embeddedFileStream = embeddedFile.createInputStream();
                final IParserDocument subParserDoc = sp.parse(location, "UTF-8", embeddedFileStream);
                if (subParserDoc.getMimeType() == null) {
                    subParserDoc.setMimeType(fileMimeType);
                }

                parserDoc.addSubDocument(fileName, subParserDoc);
            } catch (ParserException e) {
                this.logger.error(String.format(
                        "Unexpected error while parsing parse embedded file '%s#%s' with type '%s': %s",
                        location, fileName, fileMimeType, e.getMessage()));
            } finally {
                if (embeddedFileStream != null)
                    try {
                        embeddedFileStream.close();
                    } catch (Exception e) {
                        this.logger.error(e);
                    }
            }
        }
    }
}