List of usage examples for org.apache.poi.poifs.filesystem.DirectoryNode.hasEntry
public boolean hasEntry(String name)
From source file:com.ezdi.rtf.testRTFParser.RTFObjDataParser.java
License:Apache License
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException { byte[] ret = null; try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) { DirectoryNode root = fs.getRoot(); if (root == null) { return ret; }/*from w w w. ja v a 2s.co m*/ if (root.hasEntry("Package")) { Entry ooxml = root.getEntry("Package"); TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); ByteArrayOutputStream out = new ByteArrayOutputStream(); IOUtils.copy(stream, out); ret = out.toByteArray(); } else { // try poifs POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root); ret = ole.getDataBuffer(); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } } else if (type == POIFSDocumentType.COMP_OBJ) { DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) root.getEntry("Contents"); } try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) { ret = new byte[contentsEntry.getSize()]; inp.readFully(ret); } } else { ByteArrayOutputStream out = new ByteArrayOutputStream(); is.reset(); IOUtils.copy(is, out); ret = out.toByteArray(); metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension()); metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); } } } return ret; }
From source file:com.jkoolcloud.tnt4j.streams.inputs.ExcelSXSSFRowStream.java
License:Apache License
/** * Reads HSSF (XLS) format excel file using Apache POI streaming SXSSF API. * * @param xlsFile//w w w .j av a 2s.co m * excel HSSF format file to read * * @throws IOException * if excel file or workbook can't be read */ protected void readXLS(File xlsFile) throws IOException { NPOIFSFileSystem fs = null; InputStream dis = null; boolean passwordSet = false; try { fs = new NPOIFSFileSystem(xlsFile, true); DirectoryNode root = fs.getRoot(); if (root.hasEntry("EncryptedPackage")) { // NON-NLS dis = DocumentFactoryHelper.getDecryptedStream(fs, wbPass); } else { if (wbPass != null) { Biff8EncryptionKey.setCurrentUserPassword(wbPass); passwordSet = true; } dis = fs.createDocumentInputStream("Workbook"); // NON-NLS } HSSFRequest req = new HSSFRequest(); XLSEventListener listener = new XLSEventListener(this); FormatTrackingHSSFListener formatsListener = new FormatTrackingHSSFListener(listener, Locale.getDefault()); listener.setFormatListener(formatsListener); req.addListenerForAllRecords(formatsListener); HSSFEventFactory factory = new HSSFEventFactory(); factory.processEvents(req, dis); } finally { if (passwordSet) { Biff8EncryptionKey.setCurrentUserPassword((String) null); } Utils.close(fs); Utils.close(dis); } }
From source file:com.kplot.web.data.WorkbookFactory.java
License:Apache License
/** * Creates a Workbook from the given NPOIFSFileSystem, which may * be password protected// w w w . j a v a 2 s . co m * * @param fs The {@link NPOIFSFileSystem} to read the document from * @param password The password that should be used or null if no password is necessary. * * @return The created Workbook * * @throws IOException if an error occurs while reading the data * @throws InvalidFormatException if the contents of the file cannot be parsed into a {@link Workbook} */ private static Workbook create(NPOIFSFileSystem fs, String password) throws IOException, InvalidFormatException { DirectoryNode root = fs.getRoot(); // Encrypted OOXML files go inside OLE2 containers, is this one? if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) { EncryptionInfo info = new EncryptionInfo(fs); Decryptor d = Decryptor.getInstance(info); boolean passwordCorrect = false; InputStream stream = null; try { if (password != null && d.verifyPassword(password)) { passwordCorrect = true; } if (!passwordCorrect && d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) { passwordCorrect = true; } if (passwordCorrect) { stream = d.getDataStream(root); } } catch (GeneralSecurityException e) { throw new IOException(e); } if (!passwordCorrect) { if (password != null) throw new EncryptedDocumentException("Password incorrect"); else throw new EncryptedDocumentException( "The supplied spreadsheet is protected, but no password was supplied"); } OPCPackage pkg = OPCPackage.open(stream); return create(pkg); } // If we get here, it isn't an encrypted XLSX file // So, treat it as a regular HSSF XLS one if (password != null) { Biff8EncryptionKey.setCurrentUserPassword(password); } try { return new HSSFWorkbook(root, true); } finally { Biff8EncryptionKey.setCurrentUserPassword(null); } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.ExcelExtractor.java
License:Apache License
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException { if (!root.hasEntry(WORKBOOK_ENTRY)) { // Corrupt file / very old file, just skip return;/*from www . ja va 2 s . c o m*/ } TikaImageHelper tikaHelper = new TikaImageHelper(metadata); try { TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this, tikaHelper); listener.processFile(root, isListenForAllRecords()); listener.throwStoredException(); tikaHelper.addTextToHandler(xhtml); } catch (Exception e) { // TODO: logging e.printStackTrace(); } finally { if (tikaHelper != null) { tikaHelper.close(); } } for (Entry entry : root) { if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) { try { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } catch (TikaException e) { // ignore parse errors from embedded documents } } } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
/** * Handles an embedded OLE object in the document *///from w ww .j a va 2s. c om private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel) throws IOException, SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid // TODO: TIKA-1118 Upgrade to POI 4.0 then enable this block of code // if (part.getSize() >= 0 && part.getSize() < 512*3) { // // Too small, skip // return; // } // Open the POIFS (OLE2) structure and process POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream()); try { Metadata metadata = new Metadata(); TikaInputStream stream = null; metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); DirectoryNode root = fs.getRoot(); POIFSDocumentType type = POIFSDocumentType.detectType(root); if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj") && root.hasEntry("\u0003ObjInfo")) { // TIKA-704: OLE 2.0 embedded non-Office document? stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS")); if (embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else if (POIFSDocumentType.OLE10_NATIVE == type) { // TIKA-704: OLE 1.0 embedded document Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs); metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel()); byte[] data = ole.getDataBuffer(); if (data != null) { stream = TikaInputStream.get(data); } if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else { handleEmbeddedFile(part, handler, rel); } } catch (FileNotFoundException e) { // There was no CONTENTS entry, so skip this part } catch (Ole10NativeException e) { // Could not process an OLE 1.0 entry, so skip this part } }
From source file:org.apache.tika.parser.microsoft.ExcelExtractor.java
License:Apache License
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException { if (!root.hasEntry(WORKBOOK_ENTRY)) { if (root.hasEntry(BOOK_ENTRY)) { // Excel 5 / Excel 95 file // Records are in a different structure so needs a // different parser to process them OldExcelExtractor extractor = new OldExcelExtractor(root); OldExcelParser.parse(extractor, xhtml); return; } else {//from ww w .java2 s. c om // Corrupt file / very old file, just skip text extraction return; } } // If a password was supplied, use it, otherwise the default Biff8EncryptionKey.setCurrentUserPassword(getPassword()); // Have the file processed in event mode TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this); listener.processFile(root, isListenForAllRecords()); listener.throwStoredException(); for (Entry entry : root) { if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) { try { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } catch (TikaException e) { // ignore parse errors from embedded documents } } } }
From source file:org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
/** * Handles an embedded OLE object in the document *//*from www. ja v a2 s.c o m*/ private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel) throws IOException, SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid if (part.getSize() >= 0 && part.getSize() < 512 * 3) { // Too small, skip return; } // Open the POIFS (OLE2) structure and process POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream()); try { Metadata metadata = new Metadata(); TikaInputStream stream = null; metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); DirectoryNode root = fs.getRoot(); POIFSDocumentType type = POIFSDocumentType.detectType(root); if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj") && root.hasEntry("\u0003ObjInfo")) { // TIKA-704: OLE 2.0 embedded non-Office document? stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS")); if (embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else if (POIFSDocumentType.OLE10_NATIVE == type) { // TIKA-704: OLE 1.0 embedded document Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs); if (ole.getLabel() != null) { metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel()); } byte[] data = ole.getDataBuffer(); if (data != null) { stream = TikaInputStream.get(data); } if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else { handleEmbeddedFile(part, handler, rel); } } catch (FileNotFoundException e) { // There was no CONTENTS entry, so skip this part } catch (Ole10NativeException e) { // Could not process an OLE 1.0 entry, so skip this part } }
From source file:org.apache.tika.parser.rtf.RTFObjDataParser.java
License:Apache License
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException { byte[] ret = null; try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) { DirectoryNode root = fs.getRoot(); if (root == null) { return ret; }//from w w w .j a va 2 s . co m if (root.hasEntry("Package")) { Entry ooxml = root.getEntry("Package"); TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); ByteArrayOutputStream out = new ByteArrayOutputStream(); IOUtils.copy(stream, out); ret = out.toByteArray(); } else { //try poifs POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root); ret = ole.getDataBuffer(); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } } else if (type == POIFSDocumentType.COMP_OBJ) { DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) root.getEntry("Contents"); } try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) { ret = new byte[contentsEntry.getSize()]; inp.readFully(ret); } } else { ByteArrayOutputStream out = new ByteArrayOutputStream(); is.reset(); IOUtils.copy(is, out); ret = out.toByteArray(); metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension()); metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); } } } return ret; }
From source file:org.apache.tika.parser.wordperfect.QPWTextExtractor.java
License:Apache License
@SuppressWarnings("resource") public void extract(InputStream input, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException { POIFSFileSystem pfs = new POIFSFileSystem(input); DirectoryNode rootNode = pfs.getRoot(); if (rootNode == null || !rootNode.hasEntry(OLE_DOCUMENT_NAME)) { throw new UnsupportedFormatException("Unsupported QuattroPro file format. " + "Looking for OLE entry \"" + OLE_DOCUMENT_NAME + "\". Found: " + rootNode.getEntryNames()); }/*from w ww. j a v a 2 s. c om*/ //TODO shall we validate and throw warning/error if the file does not //start with a BOF and ends with a EOF? xhtml.startElement("p"); try (WPInputStream in = new WPInputStream(pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) { Context ctx = new Context(in, xhtml, metadata); while (hasNext(in)) { ctx.type = in.readWPShort(); ctx.bodyLength = in.readWPShort(); Extractor extractor = EXTRACTORS.get(ctx.type); if (extractor != null) { extractor.extract(ctx); } else { // Use DEBUG to find out what we are ignoring // Extractor.DEBUG.extract(ctx); Extractor.IGNORE.extract(ctx); } } } xhtml.endElement("p"); }