List of usage examples for org.apache.pdfbox.pdmodel.common.filespecification PDEmbeddedFile createInputStream
public COSInputStream createInputStream() throws IOException
From source file:algorithm.PDFFileAttacher.java
License:Apache License
/**
 * Restores the payload files that are attached to a carrier PDF.
 * <p>
 * Attachments are looked up in the document's embedded-files name tree under the keys
 * {@code "PericlesMetadata-0"}, {@code "PericlesMetadata-1"}, ... and the scan stops at the
 * first key without a value. Each attachment is written below {@code RESTORED_DIRECTORY}
 * using the file name recorded in its file specification.
 *
 * @param originalPdf the carrier PDF to restore from
 * @return the restored payload files followed by the restored carrier; every entry has its
 *         {@code algorithm} set to this instance and lists all other entries as related files
 * @throws IOException if the PDF cannot be loaded, an attachment carries no embedded data,
 *         or a payload cannot be written
 */
@Override
public List<RestoredFile> restore(File originalPdf) throws IOException {
    RestoredFile copiedPdf = getRestoredCarrier(originalPdf);
    List<RestoredFile> restoredFiles = new ArrayList<RestoredFile>();

    // try-with-resources: the document is closed even if extracting a payload fails.
    try (PDDocument document = PDDocument.load(copiedPdf)) {
        PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(
                document.getDocumentCatalog());
        PDEmbeddedFilesNameTreeNode filesTree = namesDictionary.getEmbeddedFiles();
        if (filesTree != null) {
            for (int i = 0;; i++) {
                String attachmentKey = "PericlesMetadata-" + i;
                PDComplexFileSpecification fileSpecification =
                        (PDComplexFileSpecification) filesTree.getValue(attachmentKey);
                if (fileSpecification == null) {
                    break;
                }
                // getName() keeps only the last path element of the name stored in the PDF.
                File oldAttachedFile = new File(fileSpecification.getFile());
                RestoredFile restoredPayload = new RestoredFile(
                        RESTORED_DIRECTORY + oldAttachedFile.getName());
                PDEmbeddedFile embeddedFile = fileSpecification.getEmbeddedFile();
                if (embeddedFile == null) {
                    throw new IOException(
                            "Attachment '" + attachmentKey + "' has no embedded file data");
                }
                // Both streams are closed before the payload file is post-processed.
                try (InputStream inputStream = embeddedFile.createInputStream();
                        FileOutputStream outputStream = new FileOutputStream(restoredPayload)) {
                    IOUtils.copy(inputStream, outputStream);
                }
                removeBuggyLineEnding(restoredPayload);
                restoredPayload.wasPayload = true;
                restoredPayload.checksumValid = true;
                restoredPayload.restorationNote = "Checksum wasn't calculated, because this algorithm isn't using restoration metadata. The original payload file survives the encapsulation with this algorithm.";
                restoredFiles.add(restoredPayload);
            }
        }
    }

    copiedPdf.wasCarrier = true;
    copiedPdf.checksumValid = false;
    copiedPdf.restorationNote = "Checksum can't be valid, because attached payload files can't be removed from carrier.";
    restoredFiles.add(copiedPdf);

    // Cross-link: every restored file knows the algorithm and all of its siblings.
    for (RestoredFile file : restoredFiles) {
        file.algorithm = this;
        for (RestoredFile relatedFile : restoredFiles) {
            if (file != relatedFile) {
                file.relatedFiles.add(relatedFile);
            }
        }
    }
    return restoredFiles;
}
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java
License:Apache License
private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames, EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException { if (embeddedFileNames == null) { return;/*from w w w . j a v a2s . c om*/ } for (Map.Entry<String, COSObjectable> ent : embeddedFileNames.entrySet()) { PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue(); PDEmbeddedFile file = spec.getEmbeddedFile(); Metadata metadata = new Metadata(); // TODO: other metadata? metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey()); metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); if (embeddedExtractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = TikaInputStream.get(file.createInputStream()); try { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } finally { stream.close(); } } } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
License:Apache License
private void extractPDEmbeddedFile(String displayName, String unicodeFileName, String fileName, PDEmbeddedFile file, AttributesImpl attributes) throws SAXException, IOException, TikaException { if (file == null) { //skip silently return;/* w w w. j a v a 2s . c o m*/ } fileName = (fileName == null || "".equals(fileName.trim())) ? unicodeFileName : fileName; fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName; // TODO: other metadata? Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName); embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName); if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { return; } TikaInputStream stream = null; try { stream = TikaInputStream.get(file.createInputStream()); } catch (IOException e) { //store this exception in the parent's metadata EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); return; } try { embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata, false); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", fileName); xhtml.startElement("div", attributes); xhtml.endElement("div"); } finally { IOUtils.closeQuietly(stream); } }
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file, EmbeddedDocumentExtractor extractor) throws SAXException, IOException, TikaException { if (file == null) { //skip silently return;/*from ww w . jav a 2s .com*/ } fileName = (fileName == null) ? defaultName : fileName; // TODO: other metadata? Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); if (extractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = null; try { stream = TikaInputStream.get(file.createInputStream()); extractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", fileName); handler.startElement("div", attributes); handler.endElement("div"); } finally { IOUtils.closeQuietly(stream); } } }
From source file:org.paxle.parser.pdf.impl.PdfParser.java
License:Open Source License
/** * A function to extract the content of embedded files from a PDF document. *//*from w ww . jav a 2 s .c o m*/ protected void extractEmbeddedFiles(URI location, IParserDocument parserDoc, PDDocument pddDoc) throws IOException { final PDDocumentCatalog pddDocCatalog = pddDoc.getDocumentCatalog(); if (pddDocCatalog == null) return; final PDDocumentNameDictionary nameDic = pddDocCatalog.getNames(); if (nameDic == null) return; final PDEmbeddedFilesNameTreeNode embeddedFiles = nameDic.getEmbeddedFiles(); if (embeddedFiles == null) return; @SuppressWarnings("unchecked") final Map<String, Object> names = embeddedFiles.getNames(); if (names == null || names.isEmpty()) return; final IParserContext context = this.contextLocal.getCurrentContext(); for (Entry<String, Object> name : names.entrySet()) { // final String fileDesc = name.getKey(); final Object fileObj = name.getValue(); if (fileObj == null) continue; if (fileObj instanceof PDComplexFileSpecification) { final PDComplexFileSpecification embeddedFileSpec = (PDComplexFileSpecification) fileObj; final PDEmbeddedFile embeddedFile = embeddedFileSpec.getEmbeddedFile(); // getting the embedded file name and mime-type final String fileName = embeddedFileSpec.getFile(); final String fileMimeType = embeddedFile.getSubtype(); if (fileMimeType == null) { this.logger.warn(String.format("No mime-type specified form embedded file '%s#%s'.", location, fileName)); continue; } // getting a parser to parse the content final ISubParser sp = context.getParser(fileMimeType); if (sp == null) { this.logger.warn(String.format("No parser found to parse embedded file '%s#%s' with type '%s'.", location, fileName, fileMimeType)); continue; } // parsing content InputStream embeddedFileStream = null; try { embeddedFileStream = embeddedFile.createInputStream(); final IParserDocument subParserDoc = sp.parse(location, "UTF-8", embeddedFileStream); if (subParserDoc.getMimeType() == null) { subParserDoc.setMimeType(fileMimeType); } 
parserDoc.addSubDocument(fileName, subParserDoc); } catch (ParserException e) { this.logger.error(String.format( "Unexpected error while parsing parse embedded file '%s#%s' with type '%s': %s", location, fileName, fileMimeType, e.getMessage())); } finally { if (embeddedFileStream != null) try { embeddedFileStream.close(); } catch (Exception e) { this.logger.error(e); } } } } }