Usage examples for org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification#getEmbeddedFile()
public PDEmbeddedFile getEmbeddedFile()
From source file:algorithm.PDFFileAttacher.java
License:Apache License
@Override public List<RestoredFile> restore(File originalPdf) throws IOException { RestoredFile copiedPdf = getRestoredCarrier(originalPdf); List<RestoredFile> restoredFiles = new ArrayList<RestoredFile>(); PDDocument document = PDDocument.load(copiedPdf); PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog()); PDEmbeddedFilesNameTreeNode filesTree = namesDictionary.getEmbeddedFiles(); if (filesTree != null) { int i = 0; while (true) { PDComplexFileSpecification fileSpecification = (PDComplexFileSpecification) filesTree .getValue("PericlesMetadata-" + i); if (fileSpecification == null) { break; }//from www . j a v a 2 s . c o m File oldAttachedFile = new File(fileSpecification.getFile()); RestoredFile restoredPayload = new RestoredFile(RESTORED_DIRECTORY + oldAttachedFile.getName()); PDEmbeddedFile embeddedFile = fileSpecification.getEmbeddedFile(); InputStream inputStream = embeddedFile.createInputStream(); FileOutputStream outputStream = new FileOutputStream(restoredPayload); IOUtils.copy(inputStream, outputStream); removeBuggyLineEnding(restoredPayload); restoredPayload.wasPayload = true; restoredPayload.checksumValid = true; restoredPayload.restorationNote = "Checksum wasn't calculated, because this algorithm isn't using restoration metadata. The original payload file survives the encapsulation with this algorithm."; restoredFiles.add(restoredPayload); i++; } } document.close(); copiedPdf.wasCarrier = true; copiedPdf.checksumValid = false; copiedPdf.restorationNote = "Checksum can't be valid, because attached payload files can't be removed from carrier."; restoredFiles.add(copiedPdf); for (RestoredFile file : restoredFiles) { file.algorithm = this; for (RestoredFile relatedFile : restoredFiles) { if (file != relatedFile) { file.relatedFiles.add(relatedFile); } } } return restoredFiles; }
From source file:dev.ztgnrw.ExtractEmbeddedFiles.java
License:Apache License
private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpec) { // search for the first available alternative of the embedded file PDEmbeddedFile embeddedFile = null;//from ww w.j a v a 2 s . co m if (fileSpec != null) { embeddedFile = fileSpec.getEmbeddedFileUnicode(); if (embeddedFile == null) { embeddedFile = fileSpec.getEmbeddedFileDos(); } if (embeddedFile == null) { embeddedFile = fileSpec.getEmbeddedFileMac(); } if (embeddedFile == null) { embeddedFile = fileSpec.getEmbeddedFileUnix(); } if (embeddedFile == null) { embeddedFile = fileSpec.getEmbeddedFile(); } } return embeddedFile; }
From source file:eu.europa.ejusticeportal.dss.controller.signature.PdfUtils.java
License:EUPL
/** * Extracts an attachment from a PDF/*from w ww . ja v a 2 s.co m*/ * * @param pdf the PDF; this is unaffected by the method * @param name the name of the attachment * @return the attachment */ public static byte[] extractAttachment(final byte[] pdf, final String name) { PDDocument doc = null; try { InputStream is = new ByteArrayInputStream(pdf); doc = PDDocument.load(is); PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(doc.getDocumentCatalog()); PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); Map<String, COSObjectable> names = efTree.getNames(); byte[] data = null; if (names != null) { PDComplexFileSpecification attach = (PDComplexFileSpecification) names.get(name); data = attach.getEmbeddedFile().getByteArray(); } // Remove the \r\n added by PdfBox (PDDocument.saveIncremental). if (data != null) { String newString = new String(data, "UTF-8"); while (newString.endsWith("\r\n")) { newString = newString.replaceAll("\\r\\n$", ""); data = newString.getBytes(Charset.forName("UTF-8")); } } return data; } catch (IOException e) { LOGGER.error("Error detaching.", e); throw new SigningException(e); } finally { closeQuietly(doc); } }
From source file:io.konik.carriage.pdfbox.PDFBoxInvoiceExtractor.java
License:Open Source License
/**
 * Opens the embedded file stored under {@code ZF_FILE_NAME} in the given name tree.
 *
 * @param embeddedFiles the embedded-files name tree to search
 * @return an input stream over the embedded file's content
 * @throws InvoiceExtractionError if the tree has no entry for {@code ZF_FILE_NAME}
 * @throws IOException if the stream cannot be created
 */
private static final InputStream extractZugferdXmlAttachment(PDEmbeddedFilesNameTreeNode embeddedFiles)
        throws IOException {
    final Object entry = embeddedFiles.getValue(ZF_FILE_NAME);
    final PDComplexFileSpecification zugferdSpec = (PDComplexFileSpecification) entry;
    if (zugferdSpec != null) {
        final PDEmbeddedFile attachment = zugferdSpec.getEmbeddedFile();
        return attachment.createInputStream();
    }
    throw new InvoiceExtractionError(NO_ZF_FILE + ZF_FILE_NAME);
}
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java
License:Apache License
private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames, EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException { if (embeddedFileNames == null) { return;//from w ww. j a va2 s.c o m } for (Map.Entry<String, COSObjectable> ent : embeddedFileNames.entrySet()) { PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue(); PDEmbeddedFile file = spec.getEmbeddedFile(); Metadata metadata = new Metadata(); // TODO: other metadata? metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey()); metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); if (embeddedExtractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = TikaInputStream.get(file.createInputStream()); try { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } finally { stream.close(); } } } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
License:Apache License
private void extractMultiOSPDEmbeddedFiles(String displayName, PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException, SAXException, TikaException { if (spec == null) { return;/* ww w . j a v a2 s. c om*/ } //current strategy is to pull all, not just first non-null extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), attributes); extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), attributes); extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), attributes); extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes); }
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
private void extractMultiOSPDEmbeddedFiles(String defaultName, PDComplexFileSpecification spec, EmbeddedDocumentExtractor extractor) throws IOException, SAXException, TikaException { if (spec == null) { return;//from ww w. jav a 2 s.com } //current strategy is to pull all, not just first non-null extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor); extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor); extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor); extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor); }
From source file:org.modeshape.sequencer.pdf.PdfBasicMetadata.java
License:Apache License
public boolean check() throws Exception { try (PDDocument document = PDDocument.load(in)) { PDDocumentCatalog catalog = document.getDocumentCatalog(); PDPageable pageable = new PDPageable(document); PageFormat firstPage = pageable.getPageFormat(0); encrypted = document.isEncrypted(); pageCount = document.getNumberOfPages(); orientation = ORIENTATION_STRINGS[firstPage.getOrientation()]; version = String.valueOf(document.getDocument().getVersion()); String catalogVersion = catalog.getVersion(); if (catalogVersion != null && !catalogVersion.isEmpty()) { // According to specs version saved here should be determining instead // the version in header. It is barely used, though. version = catalogVersion;/*from www .j av a 2 s . c om*/ } if (!encrypted) { PDDocumentInformation metadata = document.getDocumentInformation(); author = metadata.getAuthor(); creationDate = metadata.getCreationDate(); creator = metadata.getCreator(); keywords = metadata.getKeywords(); modificationDate = metadata.getModificationDate(); producer = metadata.getProducer(); subject = metadata.getSubject(); title = metadata.getTitle(); } // extract all attached files from all pages int pageNumber = 0; for (Object page : catalog.getAllPages()) { pageNumber += 1; PdfPageMetadata pageMetadata = new PdfPageMetadata(); pageMetadata.setPageNumber(pageNumber); for (PDAnnotation annotation : ((PDPage) page).getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PdfAttachmentMetadata attachmentMetadata = new PdfAttachmentMetadata(); PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile(); attachmentMetadata.setSubject(fann.getSubject()); attachmentMetadata.setName(fileSpec.getFilename()); attachmentMetadata.setCreationDate(embeddedFile.getCreationDate()); attachmentMetadata.setModificationDate(embeddedFile.getModDate()); 
attachmentMetadata.setMimeType(embeddedFile.getSubtype()); attachmentMetadata.setData(embeddedFile.getByteArray()); pageMetadata.addAttachment(attachmentMetadata); } } pages.add(pageMetadata); } return true; } }
From source file:org.mustangproject.ZUGFeRD.ZUGFeRDImporter.java
License:Open Source License
private void extractFiles(Map<String, PDComplexFileSpecification> names) throws IOException { for (String filename : names.keySet()) { /**//from w w w. j a va2 s .c o m * currently (in the release candidate of version 1) only one attached file with * the name ZUGFeRD-invoice.xml is allowed */ if ((filename.equals("ZUGFeRD-invoice.xml") || (filename.equals("zugferd-invoice.xml")) //$NON-NLS-1$ || filename.equals("factur-x.xml"))) { containsMeta = true; PDComplexFileSpecification fileSpec = names.get(filename); PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile(); // String embeddedFilename = filePath + filename; // File file = new File(filePath + filename); // System.out.println("Writing " + embeddedFilename); // ByteArrayOutputStream fileBytes=new // ByteArrayOutputStream(); // FileOutputStream fos = new FileOutputStream(file); rawXML = embeddedFile.toByteArray(); setMeta(new String(rawXML)); // fos.write(embeddedFile.getByteArray()); // fos.close(); } if (filename.startsWith("additional_data")) { PDComplexFileSpecification fileSpec = names.get(filename); PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile(); additionalXMLs.put(filename, embeddedFile.toByteArray()); } } }
From source file:org.paxle.parser.pdf.impl.PdfParser.java
License:Open Source License
/** * A function to extract the content of embedded files from a PDF document. *///from ww w . ja v a2s . co m protected void extractEmbeddedFiles(URI location, IParserDocument parserDoc, PDDocument pddDoc) throws IOException { final PDDocumentCatalog pddDocCatalog = pddDoc.getDocumentCatalog(); if (pddDocCatalog == null) return; final PDDocumentNameDictionary nameDic = pddDocCatalog.getNames(); if (nameDic == null) return; final PDEmbeddedFilesNameTreeNode embeddedFiles = nameDic.getEmbeddedFiles(); if (embeddedFiles == null) return; @SuppressWarnings("unchecked") final Map<String, Object> names = embeddedFiles.getNames(); if (names == null || names.isEmpty()) return; final IParserContext context = this.contextLocal.getCurrentContext(); for (Entry<String, Object> name : names.entrySet()) { // final String fileDesc = name.getKey(); final Object fileObj = name.getValue(); if (fileObj == null) continue; if (fileObj instanceof PDComplexFileSpecification) { final PDComplexFileSpecification embeddedFileSpec = (PDComplexFileSpecification) fileObj; final PDEmbeddedFile embeddedFile = embeddedFileSpec.getEmbeddedFile(); // getting the embedded file name and mime-type final String fileName = embeddedFileSpec.getFile(); final String fileMimeType = embeddedFile.getSubtype(); if (fileMimeType == null) { this.logger.warn(String.format("No mime-type specified form embedded file '%s#%s'.", location, fileName)); continue; } // getting a parser to parse the content final ISubParser sp = context.getParser(fileMimeType); if (sp == null) { this.logger.warn(String.format("No parser found to parse embedded file '%s#%s' with type '%s'.", location, fileName, fileMimeType)); continue; } // parsing content InputStream embeddedFileStream = null; try { embeddedFileStream = embeddedFile.createInputStream(); final IParserDocument subParserDoc = sp.parse(location, "UTF-8", embeddedFileStream); if (subParserDoc.getMimeType() == null) { subParserDoc.setMimeType(fileMimeType); } 
parserDoc.addSubDocument(fileName, subParserDoc); } catch (ParserException e) { this.logger.error(String.format( "Unexpected error while parsing parse embedded file '%s#%s' with type '%s': %s", location, fileName, fileMimeType, e.getMessage())); } finally { if (embeddedFileStream != null) try { embeddedFileStream.close(); } catch (Exception e) { this.logger.error(e); } } } } }