List of usage examples for org.apache.poi.poifs.filesystem Ole10Native getLabel
public String getLabel()
From source file:mj.ocraptor.extraction.tika.parser.microsoft.AbstractPOIFSExtractor.java
License:Apache License
/** * Handle an office document that's embedded at the POIFS level *//*from w ww . j a v a 2s. c om*/ protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { // Is it an embedded OLE2 document, or an embedded OOXML document? if (dir.hasEntry("Package")) { // It's OOXML (has a ZipFile): Entry ooxml = dir.getEntry("Package"); TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); try { ZipContainerDetector detector = new ZipContainerDetector(); MediaType type = detector.detect(stream, new Metadata()); handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true); return; } finally { stream.close(); } } // It's regular OLE2: // What kind of document is it? Metadata metadata = new Metadata(); metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName()); POIFSDocumentType type = POIFSDocumentType.detectType(dir); TikaInputStream embedded = null; try { if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir); metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel()); byte[] data = ole.getDataBuffer(); embedded = TikaInputStream.get(data); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } catch (Exception e) { LOGGER.warn( "Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e); } } else if (type == POIFSDocumentType.COMP_OBJ) { try { // Grab the contents and process DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) dir.getEntry("Contents"); } DocumentInputStream inp = new DocumentInputStream(contentsEntry); byte[] contents = new byte[contentsEntry.getSize()]; inp.readFully(contents); embedded = TikaInputStream.get(contents); // Try to work out what it is MediaType mediaType = getDetector().detect(embedded, new Metadata()); String extension = type.getExtension(); try { MimeType mimeType = getMimeTypes().forName(mediaType.toString()); extension = mimeType.getExtension(); } catch (MimeTypeException mte) { // No details on this type are known } // Record what we can do about it metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString()); metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension); } catch (Exception e) { throw new TikaException("Invalid embedded resource", e); } } else { metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension()); } // Should we parse it? if (extractor.shouldParseEmbedded(metadata)) { if (embedded == null) { // Make a TikaInputStream that just // passes the root directory of the // embedded document, and is otherwise // empty (byte[0]): embedded = TikaInputStream.get(new byte[0]); embedded.setOpenContainer(dir); } extractor.parseEmbedded(embedded, xhtml, metadata, true); } } finally { if (embedded != null) { embedded.close(); } } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
/** * Handles an embedded OLE object in the document *///from w w w. jav a 2s. c o m private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel) throws IOException, SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid // TODO: TIKA-1118 Upgrade to POI 4.0 then enable this block of code // if (part.getSize() >= 0 && part.getSize() < 512*3) { // // Too small, skip // return; // } // Open the POIFS (OLE2) structure and process POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream()); try { Metadata metadata = new Metadata(); TikaInputStream stream = null; metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); DirectoryNode root = fs.getRoot(); POIFSDocumentType type = POIFSDocumentType.detectType(root); if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj") && root.hasEntry("\u0003ObjInfo")) { // TIKA-704: OLE 2.0 embedded non-Office document? stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS")); if (embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else if (POIFSDocumentType.OLE10_NATIVE == type) { // TIKA-704: OLE 1.0 embedded document Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs); metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel()); byte[] data = ole.getDataBuffer(); if (data != null) { stream = TikaInputStream.get(data); } if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else { handleEmbeddedFile(part, handler, rel); } } catch (FileNotFoundException e) { // There was no CONTENTS entry, so skip this part } catch (Ole10NativeException e) { // Could not process an OLE 1.0 entry, so skip this part } }
From source file:org.apache.tika.parser.microsoft.AbstractPOIFSExtractor.java
License:Apache License
/** * Handle an office document that's embedded at the POIFS level *///from ww w. j a v a 2s . c o m protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { // Is it an embedded OLE2 document, or an embedded OOXML document? if (dir.hasEntry("Package")) { // It's OOXML (has a ZipFile): Entry ooxml = dir.getEntry("Package"); try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) { ZipContainerDetector detector = new ZipContainerDetector(); MediaType type = detector.detect(stream, new Metadata()); handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true); return; } } // It's regular OLE2: // What kind of document is it? Metadata metadata = new Metadata(); metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName()); if (dir.getStorageClsid() != null) { metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString()); } POIFSDocumentType type = POIFSDocumentType.detectType(dir); TikaInputStream embedded = null; try { if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir); if (ole.getLabel() != null) { metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel()); } byte[] data = ole.getDataBuffer(); embedded = TikaInputStream.get(data); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } catch (Exception e) { logger.warn( "Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e); } } else if (type == POIFSDocumentType.COMP_OBJ) { try { // Grab the contents and process DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) dir.getEntry("Contents"); } DocumentInputStream inp = new DocumentInputStream(contentsEntry); byte[] contents = new byte[contentsEntry.getSize()]; inp.readFully(contents); embedded = TikaInputStream.get(contents); // Try to work out what it is MediaType mediaType = getDetector().detect(embedded, new Metadata()); String extension = type.getExtension(); try { MimeType mimeType = getMimeTypes().forName(mediaType.toString()); extension = mimeType.getExtension(); } catch (MimeTypeException mte) { // No details on this type are known } // Record what we can do about it metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString()); metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension); } catch (Exception e) { throw new TikaException("Invalid embedded resource", e); } } else { metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension()); } // Should we parse it? if (extractor.shouldParseEmbedded(metadata)) { if (embedded == null) { // Make a TikaInputStream that just // passes the root directory of the // embedded document, and is otherwise // empty (byte[0]): embedded = TikaInputStream.get(new byte[0]); embedded.setOpenContainer(dir); } extractor.parseEmbedded(embedded, xhtml, metadata, true); } } finally { if (embedded != null) { embedded.close(); } } }
From source file:org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
/** * Handles an embedded OLE object in the document *//*from w w w.j a v a2s .co m*/ private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel) throws IOException, SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid if (part.getSize() >= 0 && part.getSize() < 512 * 3) { // Too small, skip return; } // Open the POIFS (OLE2) structure and process POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream()); try { Metadata metadata = new Metadata(); TikaInputStream stream = null; metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); DirectoryNode root = fs.getRoot(); POIFSDocumentType type = POIFSDocumentType.detectType(root); if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj") && root.hasEntry("\u0003ObjInfo")) { // TIKA-704: OLE 2.0 embedded non-Office document? stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS")); if (embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else if (POIFSDocumentType.OLE10_NATIVE == type) { // TIKA-704: OLE 1.0 embedded document Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs); if (ole.getLabel() != null) { metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel()); } byte[] data = ole.getDataBuffer(); if (data != null) { stream = TikaInputStream.get(data); } if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else { handleEmbeddedFile(part, handler, rel); } } catch (FileNotFoundException e) { // There was no CONTENTS entry, so skip this part } catch (Ole10NativeException e) { // Could not process an OLE 1.0 entry, so skip this part } }