Example usage for org.apache.pdfbox.pdmodel.common.filespecification PDEmbeddedFile createInputStream

Introduction

This page shows example usages of the createInputStream method of org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile.

Prototype

public COSInputStream createInputStream() throws IOException

Source Link

Document

This will get a stream that can be read from.

Usage

From source file:algorithm.PDFFileAttacher.java

License:Apache License

/**
 * Restores the payload files attached to a carrier PDF.
 * <p>
 * The carrier is first copied via {@code getRestoredCarrier}. Attachments are then looked up
 * in the embedded-files name tree under the consecutive keys {@code "PericlesMetadata-0"},
 * {@code "PericlesMetadata-1"}, ... until a key is missing. Each attachment is written below
 * {@code RESTORED_DIRECTORY} using the file name stored in its file specification.
 * <p>
 * All streams and the PDF document are closed even when copying fails, so no file handles
 * are leaked and the restored file is fully closed before it is post-processed.
 *
 * @param originalPdf the carrier PDF to restore from
 * @return the restored payload files followed by the copied carrier; every returned file
 *         references this algorithm and all other returned files as related files
 * @throws IOException if the PDF cannot be loaded or an attachment cannot be written
 */
@Override
public List<RestoredFile> restore(File originalPdf) throws IOException {
    RestoredFile copiedPdf = getRestoredCarrier(originalPdf);
    List<RestoredFile> restoredFiles = new ArrayList<RestoredFile>();
    PDDocument document = PDDocument.load(copiedPdf);
    try {
        PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog());
        PDEmbeddedFilesNameTreeNode filesTree = namesDictionary.getEmbeddedFiles();
        if (filesTree != null) {
            int i = 0;
            while (true) {
                PDComplexFileSpecification fileSpecification = (PDComplexFileSpecification) filesTree
                        .getValue("PericlesMetadata-" + i);
                if (fileSpecification == null) {
                    // first missing index marks the end of the attachment sequence
                    break;
                }
                File oldAttachedFile = new File(fileSpecification.getFile());
                RestoredFile restoredPayload = new RestoredFile(RESTORED_DIRECTORY + oldAttachedFile.getName());
                PDEmbeddedFile embeddedFile = fileSpecification.getEmbeddedFile();
                InputStream inputStream = embeddedFile.createInputStream();
                try {
                    FileOutputStream outputStream = new FileOutputStream(restoredPayload);
                    try {
                        IOUtils.copy(inputStream, outputStream);
                    } finally {
                        outputStream.close();
                    }
                } finally {
                    inputStream.close();
                }
                // the payload file is closed at this point, so it can be rewritten safely
                removeBuggyLineEnding(restoredPayload);
                restoredPayload.wasPayload = true;
                restoredPayload.checksumValid = true;
                restoredPayload.restorationNote = "Checksum wasn't calculated, because this algorithm isn't using restoration metadata. The original payload file survives the encapsulation with this algorithm.";
                restoredFiles.add(restoredPayload);
                i++;
            }
        }
    } finally {
        document.close();
    }
    copiedPdf.wasCarrier = true;
    copiedPdf.checksumValid = false;
    copiedPdf.restorationNote = "Checksum can't be valid, because attached payload files can't be removed from carrier.";
    restoredFiles.add(copiedPdf);
    // cross-link the results: every file knows the algorithm and all of its siblings
    for (RestoredFile file : restoredFiles) {
        file.algorithm = this;
        for (RestoredFile relatedFile : restoredFiles) {
            if (file != relatedFile) {
                file.relatedFiles.add(relatedFile);
            }
        }
    }
    return restoredFiles;
}

From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java

License:Apache License

/**
 * Passes every embedded file in the given name map to the embedded-document extractor.
 * <p>
 * For each entry a {@link Metadata} object is filled with the entry key as resource name and
 * the embedded file's subtype and size; the file is parsed only if the extractor accepts that
 * metadata. Entries without a file specification or without an embedded file are skipped
 * silently instead of failing with a {@link NullPointerException}.
 *
 * @param embeddedFileNames map from attachment name to file specification; may be {@code null}
 * @param embeddedExtractor extractor that decides on and performs the parsing
 * @throws IOException   if an embedded stream cannot be opened or closed
 * @throws SAXException  propagated from the extractor
 * @throws TikaException propagated from the extractor
 */
private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames,
        EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException {
    if (embeddedFileNames == null) {
        return;
    }
    for (Map.Entry<String, COSObjectable> ent : embeddedFileNames.entrySet()) {
        PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
        if (spec == null) {
            continue;
        }
        PDEmbeddedFile file = spec.getEmbeddedFile();
        if (file == null) {
            // file specification without an embedded stream: nothing to extract
            continue;
        }

        Metadata metadata = new Metadata();
        // TODO: other metadata?
        metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));

        if (embeddedExtractor.shouldParseEmbedded(metadata)) {
            TikaInputStream stream = TikaInputStream.get(file.createInputStream());
            try {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            } finally {
                stream.close();
            }
        }
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

License:Apache License

/**
 * Extracts a single embedded file and emits an empty {@code div} marker for it.
 * <p>
 * The resource name is the first non-blank value of {@code fileName},
 * {@code unicodeFileName} and {@code displayName}, in that order. If the embedded stream
 * cannot be opened, the exception is recorded in the parent metadata and the file is skipped.
 *
 * @param displayName     last-resort name for the attachment
 * @param unicodeFileName name used when {@code fileName} is null or blank
 * @param fileName        preferred name for the attachment
 * @param file            the embedded file; {@code null} is skipped silently
 * @param attributes      attribute set that receives the {@code class} and {@code id} entries
 */
private void extractPDEmbeddedFile(String displayName, String unicodeFileName, String fileName,
        PDEmbeddedFile file, AttributesImpl attributes) throws SAXException, IOException, TikaException {

    // nothing to extract: skip silently
    if (file == null) {
        return;
    }

    // fallback chain for the name: fileName -> unicodeFileName -> displayName
    String resolvedName = fileName;
    if (resolvedName == null || resolvedName.trim().isEmpty()) {
        resolvedName = unicodeFileName;
    }
    if (resolvedName == null || resolvedName.trim().isEmpty()) {
        resolvedName = displayName;
    }

    // TODO: other metadata?
    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, resolvedName);
    embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
    embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
    embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
    embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, resolvedName);

    boolean wanted = embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata);
    if (!wanted) {
        return;
    }

    TikaInputStream stream;
    try {
        stream = TikaInputStream.get(file.createInputStream());
    } catch (IOException e) {
        // keep going with the parent document; note the failure in its metadata
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        return;
    }

    try {
        embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata,
                false);

        // empty <div class="embedded" id="..."/> marks where the attachment belongs
        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
        attributes.addAttribute("", "id", "id", "CDATA", resolvedName);
        xhtml.startElement("div", attributes);
        xhtml.endElement("div");
    } finally {
        IOUtils.closeQuietly(stream);
    }
}

From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java

License:Apache License

/**
 * Extracts a single embedded file through the given extractor and emits an empty
 * {@code div} marker for it on the content handler.
 *
 * @param defaultName name used when {@code fileName} is {@code null}
 * @param fileName    preferred name for the attachment
 * @param file        the embedded file; {@code null} is skipped silently
 * @param extractor   extractor that decides on and performs the parsing
 */
private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
        EmbeddedDocumentExtractor extractor) throws SAXException, IOException, TikaException {

    // nothing to extract: skip silently
    if (file == null) {
        return;
    }

    final String resolvedName = (fileName != null) ? fileName : defaultName;

    // TODO: other metadata?
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, resolvedName);
    metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
    metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
    metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());

    if (!extractor.shouldParseEmbedded(metadata)) {
        return;
    }

    TikaInputStream stream = null;
    try {
        stream = TikaInputStream.get(file.createInputStream());
        extractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);

        // empty <div class="embedded" id="..."/> marks where the attachment belongs
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
        divAttributes.addAttribute("", "id", "id", "CDATA", resolvedName);
        handler.startElement("div", divAttributes);
        handler.endElement("div");
    } finally {
        // stream is still null here if opening it failed
        IOUtils.closeQuietly(stream);
    }
}

From source file:org.paxle.parser.pdf.impl.PdfParser.java

License:Open Source License

/**
 * Extracts the content of the files embedded in a PDF document and adds each parsed result
 * as a sub-document of {@code parserDoc}, keyed by the embedded file's name.
 * <p>
 * The method returns without doing anything when the document has no catalog, no name
 * dictionary, no embedded-files tree, or no names. An entry is skipped when it is not a
 * {@link PDComplexFileSpecification}, carries no embedded file, declares no mime-type, or
 * has no sub-parser registered for its mime-type. A {@link ParserException} raised for one
 * entry is logged and does not abort the remaining entries.
 *
 * @param location  location of the PDF document; passed to the sub-parser and used in log messages
 * @param parserDoc parser document that receives the sub-documents
 * @param pddDoc    the PDF document to inspect
 * @throws IOException if an embedded stream cannot be opened or read
 */
protected void extractEmbeddedFiles(URI location, IParserDocument parserDoc, PDDocument pddDoc)
        throws IOException {
    final PDDocumentCatalog pddDocCatalog = pddDoc.getDocumentCatalog();
    if (pddDocCatalog == null)
        return;

    final PDDocumentNameDictionary nameDic = pddDocCatalog.getNames();
    if (nameDic == null)
        return;

    final PDEmbeddedFilesNameTreeNode embeddedFiles = nameDic.getEmbeddedFiles();
    if (embeddedFiles == null)
        return;

    @SuppressWarnings("unchecked")
    final Map<String, Object> names = embeddedFiles.getNames();
    if (names == null || names.isEmpty())
        return;

    final IParserContext context = this.contextLocal.getCurrentContext();

    for (Entry<String, Object> name : names.entrySet()) {
        // final String fileDesc = name.getKey();
        final Object fileObj = name.getValue();
        if (fileObj == null)
            continue;

        if (fileObj instanceof PDComplexFileSpecification) {
            final PDComplexFileSpecification embeddedFileSpec = (PDComplexFileSpecification) fileObj;
            final PDEmbeddedFile embeddedFile = embeddedFileSpec.getEmbeddedFile();
            if (embeddedFile == null) {
                // file specification without an embedded stream: nothing to parse
                continue;
            }

            // getting the embedded file name and mime-type
            final String fileName = embeddedFileSpec.getFile();
            final String fileMimeType = embeddedFile.getSubtype();
            if (fileMimeType == null) {
                this.logger.warn(String.format("No mime-type specified form embedded file '%s#%s'.", location,
                        fileName));
                continue;
            }

            // getting a parser to parse the content
            final ISubParser sp = context.getParser(fileMimeType);
            if (sp == null) {
                this.logger.warn(String.format("No parser found to parse embedded file '%s#%s' with type '%s'.",
                        location, fileName, fileMimeType));
                continue;
            }

            // parsing content
            InputStream embeddedFileStream = null;
            try {
                embeddedFileStream = embeddedFile.createInputStream();
                final IParserDocument subParserDoc = sp.parse(location, "UTF-8", embeddedFileStream);
                if (subParserDoc.getMimeType() == null) {
                    // fall back to the mime-type declared in the PDF
                    subParserDoc.setMimeType(fileMimeType);
                }

                parserDoc.addSubDocument(fileName, subParserDoc);
            } catch (ParserException e) {
                // pass the exception along so the cause and stack trace are not lost
                this.logger.error(String.format(
                        "Unexpected error while parsing parse embedded file '%s#%s' with type '%s': %s",
                        location, fileName, fileMimeType, e.getMessage()), e);
            } finally {
                if (embeddedFileStream != null)
                    try {
                        embeddedFileStream.close();
                    } catch (Exception e) {
                        // best effort: a failed close must not abort the remaining entries
                        this.logger.error(String.format("Unable to close stream of embedded file '%s#%s'.",
                                location, fileName), e);
                    }
            }
        }
    }
}