Usage examples for the hasEntry method of org.apache.poi.poifs.filesystem.DirectoryEntry
public boolean hasEntry(final String name);
From source file:com.auxilii.msgparser.MsgParser.java
License:Open Source License
/**
 * Builds an {@link Attachment} from the supplied directory entry and attaches it
 * to the given message.
 *
 * <p>The directory is probed for the entry {@code __substg1.0_3701000D}. When that
 * entry is present the directory is treated as an attached .msg file and handed to
 * {@code parseEmbeddedMessage} (which adds it as a {@link MsgAttachment});
 * otherwise it is treated as an attached file and handed to
 * {@code ParseFileAttachment}.
 *
 * @param dir The directory entry containing the attachment document entry and
 *            the other document entries describing the attachment
 *            (name, extension, mime type, ...)
 * @param msg The {@link Message} object that this attachment should be added to.
 * @throws IOException Thrown if the attachment could not be parsed/read.
 */
protected void parseAttachment(DirectoryEntry dir, Message msg) throws IOException {
    final boolean isEmbeddedMessage = dir.hasEntry("__substg1.0_3701000D");

    // Plain file attachment: nothing else to decide, delegate and leave.
    if (!isEmbeddedMessage) {
        ParseFileAttachment(dir, msg);
        return;
    }

    // Marker entry found: the attachment is itself a .msg file.
    parseEmbeddedMessage(dir, msg);
}
From source file:mj.ocraptor.extraction.tika.parser.microsoft.AbstractPOIFSExtractor.java
License:Apache License
/** * Handle an office document that's embedded at the POIFS level */// ww w. j ava2s . c o m protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { // Is it an embedded OLE2 document, or an embedded OOXML document? if (dir.hasEntry("Package")) { // It's OOXML (has a ZipFile): Entry ooxml = dir.getEntry("Package"); TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); try { ZipContainerDetector detector = new ZipContainerDetector(); MediaType type = detector.detect(stream, new Metadata()); handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true); return; } finally { stream.close(); } } // It's regular OLE2: // What kind of document is it? Metadata metadata = new Metadata(); metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName()); POIFSDocumentType type = POIFSDocumentType.detectType(dir); TikaInputStream embedded = null; try { if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir); metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel()); byte[] data = ole.getDataBuffer(); embedded = TikaInputStream.get(data); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } catch (Exception e) { LOGGER.warn( "Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e); } } else if (type == POIFSDocumentType.COMP_OBJ) { try { // Grab the contents and process DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) dir.getEntry("Contents"); } DocumentInputStream inp = new DocumentInputStream(contentsEntry); byte[] contents = new byte[contentsEntry.getSize()]; inp.readFully(contents); embedded = 
TikaInputStream.get(contents); // Try to work out what it is MediaType mediaType = getDetector().detect(embedded, new Metadata()); String extension = type.getExtension(); try { MimeType mimeType = getMimeTypes().forName(mediaType.toString()); extension = mimeType.getExtension(); } catch (MimeTypeException mte) { // No details on this type are known } // Record what we can do about it metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString()); metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension); } catch (Exception e) { throw new TikaException("Invalid embedded resource", e); } } else { metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension()); } // Should we parse it? if (extractor.shouldParseEmbedded(metadata)) { if (embedded == null) { // Make a TikaInputStream that just // passes the root directory of the // embedded document, and is otherwise // empty (byte[0]): embedded = TikaInputStream.get(new byte[0]); embedded.setOpenContainer(dir); } extractor.parseEmbedded(embedded, xhtml, metadata, true); } } finally { if (embedded != null) { embedded.close(); } } }
From source file:org.apache.tika.parser.microsoft.AbstractPOIFSExtractor.java
License:Apache License
/** * Handle an office document that's embedded at the POIFS level *//*from ww w . j a v a 2 s .co m*/ protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { // Is it an embedded OLE2 document, or an embedded OOXML document? if (dir.hasEntry("Package")) { // It's OOXML (has a ZipFile): Entry ooxml = dir.getEntry("Package"); try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) { ZipContainerDetector detector = new ZipContainerDetector(); MediaType type = detector.detect(stream, new Metadata()); handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true); return; } } // It's regular OLE2: // What kind of document is it? Metadata metadata = new Metadata(); metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName()); if (dir.getStorageClsid() != null) { metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString()); } POIFSDocumentType type = POIFSDocumentType.detectType(dir); TikaInputStream embedded = null; try { if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir); if (ole.getLabel() != null) { metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel()); } byte[] data = ole.getDataBuffer(); embedded = TikaInputStream.get(data); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } catch (Exception e) { logger.warn( "Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e); } } else if (type == POIFSDocumentType.COMP_OBJ) { try { // Grab the contents and process DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) dir.getEntry("Contents"); } 
DocumentInputStream inp = new DocumentInputStream(contentsEntry); byte[] contents = new byte[contentsEntry.getSize()]; inp.readFully(contents); embedded = TikaInputStream.get(contents); // Try to work out what it is MediaType mediaType = getDetector().detect(embedded, new Metadata()); String extension = type.getExtension(); try { MimeType mimeType = getMimeTypes().forName(mediaType.toString()); extension = mimeType.getExtension(); } catch (MimeTypeException mte) { // No details on this type are known } // Record what we can do about it metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString()); metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension); } catch (Exception e) { throw new TikaException("Invalid embedded resource", e); } } else { metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension()); } // Should we parse it? if (extractor.shouldParseEmbedded(metadata)) { if (embedded == null) { // Make a TikaInputStream that just // passes the root directory of the // embedded document, and is otherwise // empty (byte[0]): embedded = TikaInputStream.get(new byte[0]); embedded.setOpenContainer(dir); } extractor.parseEmbedded(embedded, xhtml, metadata, true); } } finally { if (embedded != null) { embedded.close(); } } }