List of usage examples for org.apache.pdfbox.pdmodel.interactive.annotation PDAnnotationFileAttachment getFile
public PDFileSpecification getFile() throws IOException
From source file:dev.ztgnrw.ExtractEmbeddedFiles.java
License:Apache License
/** * This is the main method./*from w w w . j a v a2 s . c o m*/ * * @param args The command line arguments. * * @throws IOException If there is an error parsing the document. */ public static void extractEmbeddedFiles(String file) throws IOException { PDDocument document = null; try { File pdfFile = new File(file); String filePath = pdfFile.getParent() + System.getProperty("file.separator"); document = PDDocument.load(pdfFile); PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog()); PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); if (efTree != null) { Map<String, PDComplexFileSpecification> names = efTree.getNames(); if (names != null) { extractFiles(names, filePath); } else { List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { names = node.getNames(); extractFiles(names, filePath); } } } // extract files from annotations for (PDPage page : document.getPages()) { for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment annotationFileAttachment = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) annotationFileAttachment .getFile(); PDEmbeddedFile embeddedFile = getEmbeddedFile(fileSpec); extractFile(filePath, fileSpec.getFilename(), embeddedFile); } } } } finally { if (document != null) { document.close(); } } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {// w w w . j av a 2 s .co m for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "source", "source", "CDATA", "annotation"); extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); } } else if (annotation instanceof PDAnnotationWidget) { handleWidget((PDAnnotationWidget) annotation); } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { //can't currently associate link to text. //for now, extract link and repeat the link as if it //were the visible text PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null && link.trim().length() > 0) { xhtml.startElement("div", "class", "annotation"); xhtml.startElement("a", "href", link); xhtml.characters(link); xhtml.endElement("a"); xhtml.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? if (title != null || subject != null || contents != null) { xhtml.startElement("div", "class", "annotation"); if (title != null) { xhtml.startElement("div", "class", "annotationTitle"); xhtml.characters(title); xhtml.endElement("div"); } if (subject != null) { xhtml.startElement("div", "class", "annotationSubject"); xhtml.characters(subject); xhtml.endElement("div"); } if (contents != null) { xhtml.startElement("div", "class", "annotationContents"); xhtml.characters(contents); xhtml.endElement("div"); } xhtml.endElement("div"); } } } } if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { doOCROnCurrentPage(); } PDPageAdditionalActions pageActions = page.getActions(); if (pageActions != null) { handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE); handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN); } xhtml.endElement("div"); } catch (SAXException | TikaException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { exceptions.add(e); } finally { pageIndex++; } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTMLPureJava.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {//from w ww. j ava 2s .co m for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "source", "source", "CDATA", "annotation"); extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); } } else if (annotation instanceof PDAnnotationWidget) { handleWidget((PDAnnotationWidget) annotation); } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { //can't currently associate link to text. //for now, extract link and repeat the link as if it //were the visible text PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null && link.trim().length() > 0) { xhtml.startElement("div", "class", "annotation"); xhtml.startElement("a", "href", link); xhtml.characters(link); xhtml.endElement("a"); xhtml.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? if (title != null || subject != null || contents != null) { xhtml.startElement("div", "class", "annotation"); if (title != null) { xhtml.startElement("div", "class", "annotationTitle"); xhtml.characters(title); xhtml.endElement("div"); } if (subject != null) { xhtml.startElement("div", "class", "annotationSubject"); xhtml.characters(subject); xhtml.endElement("div"); } if (contents != null) { xhtml.startElement("div", "class", "annotationContents"); xhtml.characters(contents); xhtml.endElement("div"); } xhtml.endElement("div"); } } } } PDPageAdditionalActions pageActions = page.getActions(); if (pageActions != null) { handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE); handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN); } xhtml.endElement("div"); } catch (SAXException | TikaException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { exceptions.add(e); } finally { pageIndex++; } }
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {// w ww . j a v a2s . c o m writeParagraphEnd(); extractImages(page.getResources()); EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { extractMultiOSPDEmbeddedFiles("", fileSpec, extractor); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null) { handler.startElement("div", "class", "annotation"); handler.startElement("a", "href", link); handler.endElement("a"); handler.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? if (title != null || subject != null || contents != null) { handler.startElement("div", "class", "annotation"); if (title != null) { handler.startElement("div", "class", "annotationTitle"); handler.characters(title); handler.endElement("div"); } if (subject != null) { handler.startElement("div", "class", "annotationSubject"); handler.characters(subject); handler.endElement("div"); } if (contents != null) { handler.startElement("div", "class", "annotationContents"); handler.characters(contents); handler.endElement("div"); } handler.endElement("div"); } } } } handler.endElement("div"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {/* w ww .j av a 2 s .c om*/ writeParagraphEnd(); extractImages(page.getResources(), new HashSet<COSBase>()); EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { extractMultiOSPDEmbeddedFiles("", fileSpec, extractor); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null) { handler.startElement("div", "class", "annotation"); handler.startElement("a", "href", link); handler.endElement("a"); handler.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? if (title != null || subject != null || contents != null) { handler.startElement("div", "class", "annotation"); if (title != null) { handler.startElement("div", "class", "annotationTitle"); handler.characters(title); handler.endElement("div"); } if (subject != null) { handler.startElement("div", "class", "annotationSubject"); handler.characters(subject); handler.endElement("div"); } if (contents != null) { handler.startElement("div", "class", "annotationContents"); handler.characters(contents); handler.endElement("div"); } handler.endElement("div"); } } } } handler.endElement("div"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } page.clear(); }
From source file:org.modeshape.sequencer.pdf.PdfBasicMetadata.java
License:Apache License
public boolean check() throws Exception { try (PDDocument document = PDDocument.load(in)) { PDDocumentCatalog catalog = document.getDocumentCatalog(); PDPageable pageable = new PDPageable(document); PageFormat firstPage = pageable.getPageFormat(0); encrypted = document.isEncrypted(); pageCount = document.getNumberOfPages(); orientation = ORIENTATION_STRINGS[firstPage.getOrientation()]; version = String.valueOf(document.getDocument().getVersion()); String catalogVersion = catalog.getVersion(); if (catalogVersion != null && !catalogVersion.isEmpty()) { // According to specs version saved here should be determining instead // the version in header. It is barely used, though. version = catalogVersion;//from w w w . j a va 2 s . c o m } if (!encrypted) { PDDocumentInformation metadata = document.getDocumentInformation(); author = metadata.getAuthor(); creationDate = metadata.getCreationDate(); creator = metadata.getCreator(); keywords = metadata.getKeywords(); modificationDate = metadata.getModificationDate(); producer = metadata.getProducer(); subject = metadata.getSubject(); title = metadata.getTitle(); } // extract all attached files from all pages int pageNumber = 0; for (Object page : catalog.getAllPages()) { pageNumber += 1; PdfPageMetadata pageMetadata = new PdfPageMetadata(); pageMetadata.setPageNumber(pageNumber); for (PDAnnotation annotation : ((PDPage) page).getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PdfAttachmentMetadata attachmentMetadata = new PdfAttachmentMetadata(); PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile(); attachmentMetadata.setSubject(fann.getSubject()); attachmentMetadata.setName(fileSpec.getFilename()); attachmentMetadata.setCreationDate(embeddedFile.getCreationDate()); attachmentMetadata.setModificationDate(embeddedFile.getModDate()); attachmentMetadata.setMimeType(embeddedFile.getSubtype()); attachmentMetadata.setData(embeddedFile.getByteArray()); pageMetadata.addAttachment(attachmentMetadata); } } pages.add(pageMetadata); } return true; } }
From source file:org.pdfmetamodifier.IOHelper.java
License:Apache License
/** * Save all Attached (embedded) files to some directory. * //from w w w . java 2s .c o m * @param pdfFile * Source PDF file. * @param outputDir * Target directory. * @throws IOException */ /* * See: * https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java?view=markup */ public static void saveAttachments(final File pdfFile, final File outputDir) throws IOException { PDDocument document = null; try { // Read PDF file. document = PDDocument.load(pdfFile); if (document.isEncrypted()) { throw new IOException("Document is encrypted."); } // Extract Embedded (attached) files. final PDDocumentNameDictionary documentNameDictionary = new PDDocumentNameDictionary( document.getDocumentCatalog()); final PDEmbeddedFilesNameTreeNode embeddedFilesNameTree = documentNameDictionary.getEmbeddedFiles(); if (embeddedFilesNameTree != null) { extractFiles(outputDir, embeddedFilesNameTree.getNames()); final List<PDNameTreeNode<PDComplexFileSpecification>> kids = embeddedFilesNameTree.getKids(); if (kids != null) { for (PDNameTreeNode<PDComplexFileSpecification> nameTreeNode : kids) { extractFiles(outputDir, nameTreeNode.getNames()); } } } // Extract Embedded (attached) from annotations. for (PDPage page : document.getPages()) { for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { final PDAnnotationFileAttachment fileAttach = (PDAnnotationFileAttachment) annotation; final PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fileAttach .getFile(); extractFile(outputDir, fileSpec); } } } } finally { if (document != null) { document.close(); } } }