List of usage examples for org.apache.poi.poifs.filesystem POIFSFileSystem createDocumentInputStream
public DocumentInputStream createDocumentInputStream(final String documentName) throws IOException
From source file:org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
/** * Handles an embedded OLE object in the document *///from w w w . j a v a2 s . c o m private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel) throws IOException, SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid if (part.getSize() >= 0 && part.getSize() < 512 * 3) { // Too small, skip return; } // Open the POIFS (OLE2) structure and process POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream()); try { Metadata metadata = new Metadata(); TikaInputStream stream = null; metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel); DirectoryNode root = fs.getRoot(); POIFSDocumentType type = POIFSDocumentType.detectType(root); if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj") && root.hasEntry("\u0003ObjInfo")) { // TIKA-704: OLE 2.0 embedded non-Office document? stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS")); if (embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else if (POIFSDocumentType.OLE10_NATIVE == type) { // TIKA-704: OLE 1.0 embedded document Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs); if (ole.getLabel() != null) { metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel()); } byte[] data = ole.getDataBuffer(); if (data != null) { stream = TikaInputStream.get(data); } if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) { embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false); } } else { handleEmbeddedFile(part, handler, rel); } } catch (FileNotFoundException e) { // There was no CONTENTS entry, so skip this part } catch (Ole10NativeException e) { // Could not process an OLE 1.0 entry, so skip this part } }
From source file:org.apache.tika.parser.wordperfect.QPWTextExtractor.java
License:Apache License
@SuppressWarnings("resource") public void extract(InputStream input, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException { POIFSFileSystem pfs = new POIFSFileSystem(input); DirectoryNode rootNode = pfs.getRoot(); if (rootNode == null || !rootNode.hasEntry(OLE_DOCUMENT_NAME)) { throw new UnsupportedFormatException("Unsupported QuattroPro file format. " + "Looking for OLE entry \"" + OLE_DOCUMENT_NAME + "\". Found: " + rootNode.getEntryNames()); }//from w ww . j a v a2 s. c o m //TODO shall we validate and throw warning/error if the file does not //start with a BOF and ends with a EOF? xhtml.startElement("p"); try (WPInputStream in = new WPInputStream(pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) { Context ctx = new Context(in, xhtml, metadata); while (hasNext(in)) { ctx.type = in.readWPShort(); ctx.bodyLength = in.readWPShort(); Extractor extractor = EXTRACTORS.get(ctx.type); if (extractor != null) { extractor.extract(ctx); } else { // Use DEBUG to find out what we are ignoring // Extractor.DEBUG.extract(ctx); Extractor.IGNORE.extract(ctx); } } } xhtml.endElement("p"); }
From source file:org.jab.docsearch.converters.Excel.java
License:Open Source License
/** * @see ConverterInterface#parse()//w ww . j ava 2s. c o m */ @Override public void parse() throws ConverterException { if (filename == null) { log.error("parse() filename is null"); throw new ConverterException("Word::parse() filename is null"); } // get meta data FileInputStream fin = null; try { fin = new FileInputStream(filename); POIFSReader r = new POIFSReader(); MyPOIFSReaderListener mpfsrl = new MyPOIFSReaderListener(); r.registerListener(mpfsrl, "\005SummaryInformation"); r.read(fin); fin.close(); // get meta data documentTitle = mpfsrl.getTitle(); documentAuthor = mpfsrl.getAuthor(); documentKeywords = mpfsrl.getKeywords(); } catch (IOException ioe) { log.error("parse() failed at Excel file=" + filename, ioe); throw new ConverterException("Excel::parse() failed at Excel file=" + filename, ioe); } catch (Exception e) { log.error("parse() failed at Excel file=" + filename, e); throw new ConverterException("Excel::parse() failed", e); } finally { IOUtils.closeQuietly(fin); } if (log.isDebugEnabled()) { log.debug("parse() Excel file='" + filename + "'" + Layout.LINE_SEP + "title='" + documentTitle + "'" + Layout.LINE_SEP + "author='" + documentAuthor + "'" + Layout.LINE_SEP + "keywords='" + documentKeywords + "'"); } // get text DocumentInputStream din = null; ExcelListener el = new ExcelListener(); try { // proceed to write to file // create a new file input stream with the input file specified // at the command line fin = new FileInputStream(filename); POIFSFileSystem poifs = new POIFSFileSystem(fin); din = poifs.createDocumentInputStream("Workbook"); HSSFRequest req = new HSSFRequest(); req.addListenerForAllRecords(el); HSSFEventFactory factory = new HSSFEventFactory(); factory.processEvents(req, din); fin.close(); // get text documentText = el.getText().toString(); } catch (IOException ioe) { log.error("parse() failed at Excel file=" + filename, ioe); throw new ConverterException("Excel::parse() failed at Excel file=" + filename, ioe); } catch 
(Exception e) { log.error("parse() failed", e); throw new ConverterException("Excel::parse() failed", e); } finally { IOUtils.closeQuietly(din); IOUtils.closeQuietly(fin); } }
From source file:org.jberet.support.io.ExcelEventItemReader.java
License:Open Source License
/**
 * Sets up the HSSF event pipeline for the workbook and starts a background
 * thread that pumps records through it.
 *
 * <p>Records are pushed into {@code queue} by {@code HSSFListenerImpl}
 * (presumably on the background thread, for the reader to drain — TODO
 * confirm against HSSFListenerImpl). The pump thread stops when the
 * listener signals completion via {@link ReadCompletedException}.
 *
 * @param startRowNumber the row to start reading from (unused here;
 *                       presumably honored by the listener — verify)
 * @throws Exception if the OLE2 container or JSON machinery cannot be initialized
 */
@Override
protected void initWorkbookAndSheet(final int startRowNumber) throws Exception {
    // queueCapacity == 0 means "unbounded in practice": fall back to the
    // maximum number of rows a worksheet can hold
    queue = new ArrayBlockingQueue<Object>(queueCapacity == 0 ? MAX_WORKSHEET_ROWS : queueCapacity);
    final POIFSFileSystem poifs = new POIFSFileSystem(inputStream);
    // get the Workbook (excel part) stream in a InputStream; kept as a field
    // because the background thread below reads from it
    documentInputStream = poifs.createDocumentInputStream("Workbook");
    final HSSFRequest req = new HSSFRequest();
    final MissingRecordAwareHSSFListener missingRecordAwareHSSFListener =
            new MissingRecordAwareHSSFListener(new HSSFListenerImpl(this));
    /*
     * Need to use English locale here because Jackson double parsing might
     * break in certain regions where ',' is used as decimal separator
     * instead of '.'.
     */
    formatListener = new FormatTrackingHSSFListener(missingRecordAwareHSSFListener, Locale.ENGLISH);
    req.addListenerForAllRecords(formatListener);
    final HSSFEventFactory factory = new HSSFEventFactory();
    if (objectMapper == null) {
        initJsonFactoryAndObjectMapper();
    }
    // Pump all workbook records on a separate thread; ReadCompletedException
    // is the listener's deliberate "stop reading" signal, not an error
    new Thread(new Runnable() {
        @Override
        public void run() {
            try {
                factory.processEvents(req, documentInputStream);
            } catch (final ReadCompletedException e) {
                SupportLogger.LOGGER.tracef("Completed reading %s%n", resource);
            }
        }
    }).start();
}
From source file:org.jreserve.gui.poi.read.xls.XlsReader.java
License:Open Source License
/**
 * Opens the given file as an OLE2 container and positions a document
 * stream on the {@code DOCUMENT_NAME} entry. Both streams are stored in
 * fields ({@code fin}, {@code din}); their closing is presumably handled
 * elsewhere in this class — NOTE(review): the local POIFSFileSystem itself
 * is never closed here; confirm the reader's close path releases fin.
 *
 * @param file the xls file to open
 * @throws IOException if the file cannot be opened or is not a valid OLE2 container
 */
private void openFile(File file) throws IOException {
    fin = new FileInputStream(file);
    POIFSFileSystem pfs = new POIFSFileSystem(fin);
    din = pfs.createDocumentInputStream(DOCUMENT_NAME);
}
From source file:org.opencrx.kernel.text.ExcelToText.java
License:BSD License
/** * Extract text from XLS./*from w w w. j a v a 2s .c o m*/ * * @param document * @return * @throws ServiceException */ public Reader parse(InputStream document) throws ServiceException { try { this.text.setLength(0); POIFSFileSystem fs = new POIFSFileSystem(document); InputStream workbook = fs.createDocumentInputStream("Workbook"); HSSFRequest request = new HSSFRequest(); request.addListenerForAllRecords(this); HSSFEventFactory eventFactory = new HSSFEventFactory(); try { eventFactory.processEvents(request, workbook); } catch (Exception e) { throw new ServiceException(e); } catch (NoSuchMethodError e) { throw new ServiceException(BasicException.toExceptionStack(e)); } workbook.close(); return new StringReader(this.text.toString()); } catch (IOException e) { throw new ServiceException(e); } }
From source file:org.textmining.extraction.excel.ExcelTextExtractor.java
License:Open Source License
/**
 * Loads the complete "Workbook" record stream of an xls document into
 * {@code _recordStream} for later extraction.
 *
 * @param in stream positioned at the OLE2 container
 * @throws IOException if the container is invalid or the workbook entry
 *         cannot be read in full
 */
public ExcelTextExtractor(InputStream in) throws IOException {
    POIFSFileSystem poifs = new POIFSFileSystem(in);
    DocumentEntry headerProps = (DocumentEntry) poifs.getRoot().getEntry("Workbook");
    // try-with-resources: the original leaked din if the read threw
    try (DocumentInputStream din = poifs.createDocumentInputStream("Workbook")) {
        _recordStream = new byte[headerProps.getSize()];
        // FIX: read() may return fewer bytes than requested; readFully
        // guarantees the buffer is filled (or throws), so the record
        // stream cannot be silently truncated
        din.readFully(_recordStream);
    }
}
From source file:org.tonguetied.datatransfer.importing.ExcelImporter.java
License:Apache License
/** * This method initializes the parser enabling the parser to handle the * excel document./*from www. j a v a2 s. co m*/ * * @param input the byte code representation of the excel document * @throws ImportException if the input data fails to be parsed */ private void loadData(byte[] input) throws ImportException { ByteArrayInputStream bais = null; InputStream dis = null; try { bais = new ByteArrayInputStream(input); // create a new org.apache.poi.poifs.filesystem.Filesystem POIFSFileSystem poifs = new POIFSFileSystem(bais); // get the Workbook (excel part) stream in a InputStream dis = poifs.createDocumentInputStream("Workbook"); // construct out HSSFRequest object HSSFRequest req = new HSSFRequest(); // lazy listen for ALL records with the listener shown above req.addListenerForAllRecords(parser); // create our event factory HSSFEventFactory factory = new HSSFEventFactory(); // process our events based on the document input stream factory.processEvents(req, dis); } catch (IOException ioe) { throw new ImportException(ioe); } finally { // and our document input stream (don't want to leak these!) close(dis); // once all the events are processed close our file input stream close(bais); } }
From source file:org.tonguetied.datatransfer.importing.ExcelLanguageCentricParserTest.java
License:Apache License
/**
 * Test method for {@link org.tonguetied.datatransfer.importing.ExcelLanguageCentricParser#processRecord(org.apache.poi.hssf.record.Record)}.
 *
 * <p>Streams a known fixture workbook through the parser, then verifies the
 * languages and keywords it collected.
 */
@Test
public final void testProcessRecord() throws Exception {
    ExcelLanguageCentricParser parser = new ExcelLanguageCentricParser(keywordService);
    InputStream is = null;
    try {
        // open the fixture workbook from the test data directory
        File input = new File(TEST_DATA_DIR, "LanguageCentricImportData.xls");
        is = new BufferedInputStream(new FileInputStream(input));
        // create a new org.apache.poi.poifs.filesystem.Filesystem
        POIFSFileSystem poifs = new POIFSFileSystem(is);
        // get the Workbook (excel part) stream in a InputStream
        InputStream din = poifs.createDocumentInputStream("Workbook");
        // construct our HSSFRequest object and listen for ALL records
        HSSFRequest req = new HSSFRequest();
        req.addListenerForAllRecords(parser);
        // process our events based on the document input stream
        HSSFEventFactory factory = new HSSFEventFactory();
        factory.processEvents(req, din);
    } finally {
        // once all the events are processed close our file input stream
        if (is != null)
            is.close();
    }
    // the fixture declares exactly these four languages
    List<Language> languages = parser.getLanguages();
    assertEquals(4, languages.size());
    assertTrue(languages.contains(defaultLanguage));
    assertTrue(languages.contains(hebrew));
    assertTrue(languages.contains(simplifiedChinese));
    assertTrue(languages.contains(traditionalChinese));
    // spot-check two of the eight keywords against the expected fixtures
    Map<String, Keyword> keywords = parser.getKeywords();
    assertEquals(8, keywords.size());
    Keyword actual = keywords.get(keyword1.getKeyword());
    assessKeyword(keyword1, actual);
    actual = keywords.get(keyword2.getKeyword());
    assessKeyword(keyword2, actual);
}
From source file:org.tonguetied.datatransfer.importing.KeywordExcelParserTest.java
License:Apache License
/**
 * Test method for {@link org.tonguetied.datatransfer.importing.ExcelKeywordParser#processRecord(org.apache.poi.hssf.record.Record)}.
 * (The original Javadoc linked ExcelLanguageCentricParser — a copy-paste slip;
 * this test exercises the keyword parser.)
 *
 * <p>Streams a known fixture workbook through the parser, then verifies the
 * keywords it collected and the error codes it reported.
 */
@Test
public final void testProcessRecord() throws Exception {
    ExcelParser parser = new ExcelKeywordParser(keywordService);
    InputStream is = null;
    try {
        // open the fixture workbook from the test data directory
        File input = new File(TEST_DATA_DIR, "KeywordExcelParserTest.xls");
        is = new BufferedInputStream(new FileInputStream(input));
        // create a new org.apache.poi.poifs.filesystem.Filesystem
        POIFSFileSystem poifs = new POIFSFileSystem(is);
        // get the Workbook (excel part) stream in a InputStream
        InputStream din = poifs.createDocumentInputStream("Workbook");
        // construct our HSSFRequest object and listen for ALL records
        HSSFRequest req = new HSSFRequest();
        req.addListenerForAllRecords(parser);
        // process our events based on the document input stream
        HSSFEventFactory factory = new HSSFEventFactory();
        factory.processEvents(req, din);
    } finally {
        // once all the events are processed close our file input stream
        if (is != null)
            is.close();
    }
    // the fixture declares exactly seven keywords
    Map<String, Keyword> keywords = parser.getKeywords();
    assertEquals(7, keywords.size());
    Keyword actual = keywords.get(keyword1.getKeyword());
    assessKeyword(keyword1, actual);
    actual = keywords.get(keyword2.getKeyword());
    assessKeyword(keyword2, actual);
    actual = keywords.get(keyword3.getKeyword());
    assessKeyword(keyword3, actual);
    actual = keywords.get(keyword4.getKeyword());
    assessKeyword(keyword4, actual);
    // keyword4 has no translations in the fixture
    assertTrue(actual.getTranslations().isEmpty());
    actual = keywords.get(keyword5.getKeyword());
    assessKeyword(keyword5, actual);
    // the fixture contains exactly these six distinct import errors
    final List<ImportErrorCode> errorCodes = parser.getErrorCodes();
    assertEquals(6, errorCodes.size());
    assertTrue(errorCodes.contains(ImportErrorCode.unknownCountry));
    assertTrue(errorCodes.contains(ImportErrorCode.illegalCountry));
    assertTrue(errorCodes.contains(ImportErrorCode.unknownLanguage));
    assertTrue(errorCodes.contains(ImportErrorCode.illegalLanguage));
    assertTrue(errorCodes.contains(ImportErrorCode.unknownBundle));
    assertTrue(errorCodes.contains(ImportErrorCode.illegalTranslationState));
}