List of usage examples for org.apache.pdfbox.pdmodel.common.filespecification PDEmbeddedFile getSubtype
public String getSubtype()
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java
License:Apache License
private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames, EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException { if (embeddedFileNames == null) { return;//from ww w .j ava 2 s . co m } for (Map.Entry<String, COSObjectable> ent : embeddedFileNames.entrySet()) { PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue(); PDEmbeddedFile file = spec.getEmbeddedFile(); Metadata metadata = new Metadata(); // TODO: other metadata? metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey()); metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); if (embeddedExtractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = TikaInputStream.get(file.createInputStream()); try { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } finally { stream.close(); } } } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
License:Apache License
private void extractPDEmbeddedFile(String displayName, String unicodeFileName, String fileName, PDEmbeddedFile file, AttributesImpl attributes) throws SAXException, IOException, TikaException { if (file == null) { //skip silently return;//from ww w.j ava2 s . com } fileName = (fileName == null || "".equals(fileName.trim())) ? unicodeFileName : fileName; fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName; // TODO: other metadata? Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName); embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName); if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { return; } TikaInputStream stream = null; try { stream = TikaInputStream.get(file.createInputStream()); } catch (IOException e) { //store this exception in the parent's metadata EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); return; } try { embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata, false); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", fileName); xhtml.startElement("div", attributes); xhtml.endElement("div"); } finally { IOUtils.closeQuietly(stream); } }
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file, EmbeddedDocumentExtractor extractor) throws SAXException, IOException, TikaException { if (file == null) { //skip silently return;//from ww w . j a v a 2 s. co m } fileName = (fileName == null) ? defaultName : fileName; // TODO: other metadata? Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); if (extractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = null; try { stream = TikaInputStream.get(file.createInputStream()); extractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", fileName); handler.startElement("div", attributes); handler.endElement("div"); } finally { IOUtils.closeQuietly(stream); } } }
From source file:org.modeshape.sequencer.pdf.PdfBasicMetadata.java
License:Apache License
public boolean check() throws Exception { try (PDDocument document = PDDocument.load(in)) { PDDocumentCatalog catalog = document.getDocumentCatalog(); PDPageable pageable = new PDPageable(document); PageFormat firstPage = pageable.getPageFormat(0); encrypted = document.isEncrypted(); pageCount = document.getNumberOfPages(); orientation = ORIENTATION_STRINGS[firstPage.getOrientation()]; version = String.valueOf(document.getDocument().getVersion()); String catalogVersion = catalog.getVersion(); if (catalogVersion != null && !catalogVersion.isEmpty()) { // According to specs version saved here should be determining instead // the version in header. It is barely used, though. version = catalogVersion;/*from w ww .java2 s .co m*/ } if (!encrypted) { PDDocumentInformation metadata = document.getDocumentInformation(); author = metadata.getAuthor(); creationDate = metadata.getCreationDate(); creator = metadata.getCreator(); keywords = metadata.getKeywords(); modificationDate = metadata.getModificationDate(); producer = metadata.getProducer(); subject = metadata.getSubject(); title = metadata.getTitle(); } // extract all attached files from all pages int pageNumber = 0; for (Object page : catalog.getAllPages()) { pageNumber += 1; PdfPageMetadata pageMetadata = new PdfPageMetadata(); pageMetadata.setPageNumber(pageNumber); for (PDAnnotation annotation : ((PDPage) page).getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PdfAttachmentMetadata attachmentMetadata = new PdfAttachmentMetadata(); PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile(); attachmentMetadata.setSubject(fann.getSubject()); attachmentMetadata.setName(fileSpec.getFilename()); attachmentMetadata.setCreationDate(embeddedFile.getCreationDate()); attachmentMetadata.setModificationDate(embeddedFile.getModDate()); attachmentMetadata.setMimeType(embeddedFile.getSubtype()); attachmentMetadata.setData(embeddedFile.getByteArray()); pageMetadata.addAttachment(attachmentMetadata); } } pages.add(pageMetadata); } return true; } }
From source file:org.paxle.parser.pdf.impl.PdfParser.java
License:Open Source License
/** * A function to extract the content of embedded files from a PDF document. *///www .java2 s.c o m protected void extractEmbeddedFiles(URI location, IParserDocument parserDoc, PDDocument pddDoc) throws IOException { final PDDocumentCatalog pddDocCatalog = pddDoc.getDocumentCatalog(); if (pddDocCatalog == null) return; final PDDocumentNameDictionary nameDic = pddDocCatalog.getNames(); if (nameDic == null) return; final PDEmbeddedFilesNameTreeNode embeddedFiles = nameDic.getEmbeddedFiles(); if (embeddedFiles == null) return; @SuppressWarnings("unchecked") final Map<String, Object> names = embeddedFiles.getNames(); if (names == null || names.isEmpty()) return; final IParserContext context = this.contextLocal.getCurrentContext(); for (Entry<String, Object> name : names.entrySet()) { // final String fileDesc = name.getKey(); final Object fileObj = name.getValue(); if (fileObj == null) continue; if (fileObj instanceof PDComplexFileSpecification) { final PDComplexFileSpecification embeddedFileSpec = (PDComplexFileSpecification) fileObj; final PDEmbeddedFile embeddedFile = embeddedFileSpec.getEmbeddedFile(); // getting the embedded file name and mime-type final String fileName = embeddedFileSpec.getFile(); final String fileMimeType = embeddedFile.getSubtype(); if (fileMimeType == null) { this.logger.warn(String.format("No mime-type specified form embedded file '%s#%s'.", location, fileName)); continue; } // getting a parser to parse the content final ISubParser sp = context.getParser(fileMimeType); if (sp == null) { this.logger.warn(String.format("No parser found to parse embedded file '%s#%s' with type '%s'.", location, fileName, fileMimeType)); continue; } // parsing content InputStream embeddedFileStream = null; try { embeddedFileStream = embeddedFile.createInputStream(); final IParserDocument subParserDoc = sp.parse(location, "UTF-8", embeddedFileStream); if (subParserDoc.getMimeType() == null) { subParserDoc.setMimeType(fileMimeType); } parserDoc.addSubDocument(fileName, subParserDoc); } catch (ParserException e) { this.logger.error(String.format( "Unexpected error while parsing parse embedded file '%s#%s' with type '%s': %s", location, fileName, fileMimeType, e.getMessage())); } finally { if (embeddedFileStream != null) try { embeddedFileStream.close(); } catch (Exception e) { this.logger.error(e); } } } } }