List of usage examples for org.apache.poi.openxml4j.opc.PackagePart#getContentType()
public String getContentType()
From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
private void handleEmbeddedParts(ContentHandler handler) throws TikaException, IOException, SAXException { try {// www. ja v a 2s . c o m for (PackagePart source : getMainDocumentParts()) { for (PackageRelationship rel : source.getRelationships()) { URI sourceURI = rel.getSourceURI(); String sourceDesc; if (sourceURI != null) { sourceDesc = getJustFileName(sourceURI.getPath()); if (sourceDesc.startsWith("slide")) { sourceDesc += "_"; } else { sourceDesc = ""; } } else { sourceDesc = ""; } if (rel.getTargetMode() == TargetMode.INTERNAL) { PackagePart target; try { target = source.getRelatedPart(rel); } catch (IllegalArgumentException ex) { continue; } String type = rel.getRelationshipType(); if (RELATION_OLE_OBJECT.equals(type) && TYPE_OLE_OBJECT.equals(target.getContentType())) { handleEmbeddedOLE(target, handler, sourceDesc + rel.getId()); } else if (RELATION_AUDIO.equals(type) || RELATION_IMAGE.equals(type) || RELATION_PACKAGE.equals(type) || RELATION_OLE_OBJECT.equals(type)) { handleEmbeddedFile(target, handler, sourceDesc + rel.getId()); } } } } } catch (InvalidFormatException e) { throw new TikaException("Broken OOXML file", e); } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
/** * Handles an embedded file in the document *///from w ww. ja va 2 s. co m protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, String rel) throws SAXException, IOException { Metadata metadata = new Metadata(); metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); // Get the name String name = part.getPartName().getName(); metadata.set(Metadata.RESOURCE_NAME_KEY, name.substring(name.lastIndexOf('/') + 1)); // Get the content type metadata.set(Metadata.CONTENT_TYPE, part.getContentType()); // Call the recursing handler if (embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(TikaInputStream.get(part.getInputStream()), new EmbeddedContentHandler(handler), metadata, false); } }
From source file:mj.ocraptor.extraction.tika.parser.pkg.ZipContainerDetector.java
License:Apache License
/** * Detects the type of an OfficeOpenXML (OOXML) file from * opened Package /* w w w .j ava 2 s . c o m*/ */ public static MediaType detectOfficeOpenXML(OPCPackage pkg) { PackageRelationshipCollection core = pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL); if (core.size() != 1) { // Invalid OOXML Package received return null; } // Get the type of the core document part PackagePart corePart = pkg.getPart(core.getRelationship(0)); String coreType = corePart.getContentType(); // Turn that into the type of the overall document String docType = coreType.substring(0, coreType.lastIndexOf('.')); // The Macro Enabled formats are a little special if (docType.toLowerCase().endsWith("macroenabled")) { docType = docType.toLowerCase() + ".12"; } if (docType.toLowerCase().endsWith("macroenabledtemplate")) { docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12"); } // Build the MediaType object and return return MediaType.parse(docType); }
From source file:org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
private void handleThumbnail(ContentHandler handler) { try {// w ww. j a va 2 s.c o m OPCPackage opcPackage = extractor.getPackage(); for (PackageRelationship rel : opcPackage.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) { PackagePart tPart = opcPackage.getPart(rel); InputStream tStream = tPart.getInputStream(); Metadata thumbnailMetadata = new Metadata(); String thumbName = tPart.getPartName().getName(); thumbnailMetadata.set(Metadata.RESOURCE_NAME_KEY, thumbName); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute(XHTML, "class", "class", "CDATA", "embedded"); attributes.addAttribute(XHTML, "id", "id", "CDATA", thumbName); handler.startElement(XHTML, "div", "div", attributes); handler.endElement(XHTML, "div", "div"); thumbnailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, thumbName); thumbnailMetadata.set(Metadata.CONTENT_TYPE, tPart.getContentType()); thumbnailMetadata.set(TikaCoreProperties.TITLE, tPart.getPartName().getName()); if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) { embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new EmbeddedContentHandler(handler), thumbnailMetadata, false); } tStream.close(); } } catch (Exception ex) { } }
From source file:org.apache.tika.parser.pkg.ZipContainerDetector.java
License:Apache License
/** * Detects the type of an OfficeOpenXML (OOXML) file from * opened Package /* ww w. j a v a 2 s . c o m*/ */ public static MediaType detectOfficeOpenXML(OPCPackage pkg) { // Check for the normal Office core document PackageRelationshipCollection core = pkg.getRelationshipsByType(PackageRelationshipTypes.CORE_DOCUMENT); // Otherwise check for some other Office core document types if (core.size() == 0) { core = pkg.getRelationshipsByType(STRICT_CORE_DOCUMENT); } if (core.size() == 0) { core = pkg.getRelationshipsByType(VISIO_DOCUMENT); } // If we didn't find a single core document of any type, skip detection if (core.size() != 1) { // Invalid OOXML Package received return null; } // Get the type of the core document part PackagePart corePart = pkg.getPart(core.getRelationship(0)); String coreType = corePart.getContentType(); // Turn that into the type of the overall document String docType = coreType.substring(0, coreType.lastIndexOf('.')); // The Macro Enabled formats are a little special if (docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) { docType = docType.toLowerCase(Locale.ROOT) + ".12"; } if (docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) { docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12"); } // Build the MediaType object and return return MediaType.parse(docType); }
From source file:poi.xslf.usermodel.DataExtraction.java
License:Apache License
public static void main(String args[]) throws Exception { if (args.length == 0) { System.out.println("Input file is required"); return;/*from w w w . j a v a 2 s . c om*/ } FileInputStream is = new FileInputStream(args[0]); XMLSlideShow ppt = new XMLSlideShow(is); is.close(); // Get the document's embedded files. List<PackagePart> embeds = ppt.getAllEmbedds(); for (PackagePart p : embeds) { String type = p.getContentType(); String name = p.getPartName().getName(); //typically file name InputStream pIs = p.getInputStream(); // make sense of the part data pIs.close(); } // Get the document's embedded files. List<XSLFPictureData> images = ppt.getAllPictures(); for (XSLFPictureData data : images) { PackagePart p = data.getPackagePart(); String type = p.getContentType(); String name = data.getFileName(); InputStream pIs = p.getInputStream(); // make sense of the image data pIs.close(); } Dimension pageSize = ppt.getPageSize(); // size of the canvas in points for (XSLFSlide slide : ppt.getSlides()) { for (XSLFShape shape : slide) { Rectangle2D anchor = shape.getAnchor(); // position on the canvas if (shape instanceof XSLFTextShape) { XSLFTextShape txShape = (XSLFTextShape) shape; System.out.println(txShape.getText()); } else if (shape instanceof XSLFPictureShape) { XSLFPictureShape pShape = (XSLFPictureShape) shape; XSLFPictureData pData = pShape.getPictureData(); System.out.println(pData.getFileName()); } else { System.out.println("Process me: " + shape.getClass()); } } } }
From source file:poi.xssf.usermodel.examples.EmbeddedObjects.java
License:Apache License
public static void main(String[] args) throws Exception { OPCPackage pkg = OPCPackage.open(args[0]); XSSFWorkbook workbook = new XSSFWorkbook(pkg); for (PackagePart pPart : workbook.getAllEmbedds()) { String contentType = pPart.getContentType(); // Excel Workbook - either binary or OpenXML if (contentType.equals("application/vnd.ms-excel")) { HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(pPart.getInputStream()); }// w ww .ja v a 2s . c om // Excel Workbook - OpenXML file format else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) { XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(pPart.getInputStream()); } // Word Document - binary (OLE2CDF) file format else if (contentType.equals("application/msword")) { HWPFDocument document = new HWPFDocument(pPart.getInputStream()); } // Word Document - OpenXML file format else if (contentType .equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) { XWPFDocument document = new XWPFDocument(pPart.getInputStream()); } // PowerPoint Document - binary file format else if (contentType.equals("application/vnd.ms-powerpoint")) { HSLFSlideShow slideShow = new HSLFSlideShow(pPart.getInputStream()); } // PowerPoint Document - OpenXML file format else if (contentType .equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")) { OPCPackage docPackage = OPCPackage.open(pPart.getInputStream()); XSLFSlideShow slideShow = new XSLFSlideShow(docPackage); } // Any other type of embedded object. else { System.out.println("Unknown Embedded Document: " + contentType); InputStream inputStream = pPart.getInputStream(); } } pkg.close(); }
From source file:test.unit.be.fedict.eid.applet.service.signer.OOXMLSignatureVerifierTest.java
License:Open Source License
@Test public void testOPC() throws Exception { // setup//from w ww. ja va2 s . c om InputStream inputStream = OOXMLSignatureVerifierTest.class.getResourceAsStream("/hello-world-signed.docx"); // operate OPCPackage opcPackage = OPCPackage.open(inputStream); ArrayList<PackagePart> parts = opcPackage.getParts(); for (PackagePart part : parts) { LOG.debug("part name: " + part.getPartName().getName()); LOG.debug("part content type: " + part.getContentType()); } ArrayList<PackagePart> signatureParts = opcPackage .getPartsByContentType("application/vnd.openxmlformats-package.digital-signature-xmlsignature+xml"); assertFalse(signatureParts.isEmpty()); PackagePart signaturePart = signatureParts.get(0); LOG.debug("signature part class type: " + signaturePart.getClass().getName()); PackageDigitalSignatureManager packageDigitalSignatureManager = new PackageDigitalSignatureManager(); // yeah... POI implementation still missing }
From source file:uk.ac.liverpool.spreadsheet.ExcelFeatureAnalysis.java
License:Apache License
private static void analyseSpreadsheet(Element da, ExcelFeatureAnalysis efa) { Element s = new Element("spreadsheets", sn); da.addContent(s);/* www . j a va 2 s. co m*/ s.setAttribute("numberOfSheets", "" + efa.wb.getNumberOfSheets()); // workbook wide features List<? extends PictureData> allPictures = efa.wb.getAllPictures(); if (allPictures != null && allPictures.size() > 0) { Element oo = new Element("Pictures", sn); s.addContent(oo); for (PictureData pd : allPictures) { Element ob = new Element("Picture", sn); ob.setAttribute("mimeType", pd.getMimeType()); oo.addContent(ob); } } int numfonts = efa.wb.getNumberOfFonts(); if (numfonts > 0) { Element oo = new Element("Fonts", sn); s.addContent(oo); for (int i = 0; i < numfonts; i++) { Font cs = efa.wb.getFontAt((short) i); Element ob = new Element("Font", sn); ob.setAttribute("Name", cs.getFontName()); ob.setAttribute("Charset", "" + cs.getCharSet()); oo.addContent(ob); } } if (efa.hswb != null) { DocumentSummaryInformation dsi = efa.hswb.getDocumentSummaryInformation(); if (dsi != null) s.setAttribute("OSVersion", "" + dsi.getOSVersion()); // Property[] properties = dsi.getProperties(); // CustomProperties customProperties = dsi.getCustomProperties(); List<HSSFObjectData> eo = efa.hswb.getAllEmbeddedObjects(); if (eo != null && eo.size() > 0) { Element oo = new Element("EmbeddedObjects", sn); s.addContent(oo); for (HSSFObjectData o : eo) { Element ob = new Element("EmbeddedObject", sn); ob.setAttribute("name", o.getOLE2ClassName()); oo.addContent(ob); } } } else if (efa.xswb != null) { try { POIXMLProperties properties = efa.xswb.getProperties(); List<PackagePart> allEmbedds = efa.xswb.getAllEmbedds(); if (allEmbedds != null && allEmbedds.size() > 0) { Element oo = new Element("EmbeddedObjects", sn); s.addContent(oo); for (PackagePart p : allEmbedds) { Element ob = new Element("EmbeddedObject", sn); ob.setAttribute("mimeType", p.getContentType()); ob.setAttribute("name", p.getPartName().getName()); 
oo.addContent(ob); } } } catch (OpenXML4JException e) { // TODO Auto-generated catch block e.printStackTrace(); } } int nn = efa.wb.getNumberOfNames(); if (nn > 0) { Element oo = new Element("NamedCells", sn); s.addContent(oo); } // sheet specific features int total = efa.wb.getNumberOfSheets(); for (int c = 0; c < total; c++) { Sheet sheet = efa.wb.getSheetAt(c); Element single = new Element("sheet", sn); s.addContent(single); analyseSheet(sheet, single, sn, efa); } }