List of usage examples for the org.apache.poi.hssf.extractor.OldExcelExtractor constructor OldExcelExtractor
public OldExcelExtractor(DirectoryNode directory) throws IOException
From source file: org.apache.tika.parser.microsoft.ExcelExtractor.java
License: Apache License
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException { if (!root.hasEntry(WORKBOOK_ENTRY)) { if (root.hasEntry(BOOK_ENTRY)) { // Excel 5 / Excel 95 file // Records are in a different structure so needs a // different parser to process them OldExcelExtractor extractor = new OldExcelExtractor(root); OldExcelParser.parse(extractor, xhtml); return; } else {/*from w w w .j a v a 2 s .c o m*/ // Corrupt file / very old file, just skip text extraction return; } } // If a password was supplied, use it, otherwise the default Biff8EncryptionKey.setCurrentUserPassword(getPassword()); // Have the file processed in event mode TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this); listener.processFile(root, isListenForAllRecords()); listener.throwStoredException(); for (Entry entry : root) { if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) { try { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } catch (TikaException e) { // ignore parse errors from embedded documents } } } }
From source file: org.apache.tika.parser.microsoft.OldExcelParser.java
License: Apache License
/** * Extracts properties and text from an MS Document input stream *//* www.ja v a 2s.co m*/ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Open the POI provided extractor OldExcelExtractor extractor = new OldExcelExtractor(stream); // We can't do anything about metadata, as these old formats // didn't have any stored with them // Set the content type // TODO Get the version and type, to set as the Content Type // Have the text extracted and given to our Content Handler XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); parse(extractor, xhtml); }