List of usage examples for org.apache.poi.poifs.filesystem POIFSFileSystem createDocumentInputStream
public DocumentInputStream createDocumentInputStream(final String documentName) throws IOException
From source file:com.sonicle.webtop.core.io.input.ExcelFileReader.java
License:Open Source License
public HashMap<String, String> listXlsColumnNames(File file) throws IOException, FileReaderException { POIFSFileSystem pfs = null; InputStream is = null;// w w w . j a v a 2 s. co m try { pfs = new POIFSFileSystem(file); is = pfs.createDocumentInputStream("Workbook"); XlsColumnsProcessor processor = new XlsColumnsProcessor(is, headersRow, firstDataRow, lastDataRow, sheet); processor.process(); return processor.columnNames; } finally { IOUtils.closeQuietly(is); IOUtils.closeQuietly(pfs); } }
From source file:com.sonicle.webtop.core.io.input.ExcelFileReader.java
License:Open Source License
public HashMap<String, Integer> listXlsColumnIndexes(File file) throws IOException, FileReaderException { POIFSFileSystem pfs = null; InputStream is = null;//ww w . j av a 2 s . com try { pfs = new POIFSFileSystem(file); is = pfs.createDocumentInputStream("Workbook"); XlsColumnsProcessor processor = new XlsColumnsProcessor(is, headersRow, firstDataRow, lastDataRow, sheet); processor.process(); return processor.columnIndexes; } finally { IOUtils.closeQuietly(is); IOUtils.closeQuietly(pfs); } }
From source file:com.toolsverse.etl.metadata.excel.ExcelFileMetadata.java
License:Open Source License
@Override public DataSet getTablesByType(InputStream inputSteam, String name, String pattern, String type) throws Exception { DataSet dataSet = new DataSet(); dataSet.setName("tables"); FieldDef fieldDef = new FieldDef(); fieldDef.setName("File"); fieldDef.setSqlDataType(Types.VARCHAR); dataSet.addField(fieldDef);//from www .j av a2s .c o m fieldDef = new FieldDef(); fieldDef.setName("Name"); fieldDef.setSqlDataType(Types.VARCHAR); dataSet.addField(fieldDef); dataSet.setKeyFields("Name"); InputStream din = null; try { POIFSFileSystem poifs = new POIFSFileSystem(inputSteam); din = poifs.createDocumentInputStream("Workbook"); HSSFRequest req = new HSSFRequest(); req.addListenerForAllRecords(new SheetReader(name, dataSet)); HSSFEventFactory factory = new HSSFEventFactory(); try { factory.processEvents(req, din); } catch (Exception ex) { if (!Utils.isParticularException(ex, SHEETS_EXTRACTED_EXCEPTION)) throw ex; } } finally { if (din != null) din.close(); } return dataSet; }
From source file:com.toolsverse.etl.metadata.excel.ExcelFileMetadata.java
License:Open Source License
@Override public DataSet getTablesByType(String catalog, String schema, String pattern, String type) throws Exception { DataSet dataSet = new DataSet(); dataSet.setName(TABLES_DATASET_TYPE); FieldDef fieldDef = new FieldDef(); fieldDef.setName("File"); fieldDef.setSqlDataType(Types.VARCHAR); dataSet.addField(fieldDef);// www . ja v a 2 s .c om fieldDef = new FieldDef(); fieldDef.setName("Name"); fieldDef.setSqlDataType(Types.VARCHAR); dataSet.addField(fieldDef); dataSet.setKeyFields("Name"); FileInputStream fin = null; InputStream din = null; try { fin = new FileInputStream(catalog); POIFSFileSystem poifs = new POIFSFileSystem(fin); din = poifs.createDocumentInputStream("Workbook"); HSSFRequest req = new HSSFRequest(); req.addListenerForAllRecords(new SheetReader(catalog, dataSet)); HSSFEventFactory factory = new HSSFEventFactory(); try { factory.processEvents(req, din); } catch (Exception ex) { if (!Utils.isParticularException(ex, SHEETS_EXTRACTED_EXCEPTION)) throw ex; } } finally { if (fin != null) fin.close(); if (din != null) din.close(); } return dataSet; }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
/**
 * Handles an embedded OLE object in the document.
 *
 * <p>Opens the part as a POIFS (OLE2) file system and dispatches on its
 * contents: a "CONTENTS" stream plus the standard OLE marker entries is
 * treated as an OLE 2.0 embedded non-Office document (TIKA-704); an
 * OLE10_NATIVE type is unpacked via {@link Ole10Native}; anything else falls
 * back to {@code handleEmbeddedFile}. Missing-entry and malformed-OLE1.0
 * conditions are deliberately treated as "skip this part", not errors.
 *
 * @param part the package part containing the embedded object
 * @param handler receives any extracted content
 * @param rel the relationship id recorded in the embedded-document metadata
 * @throws IOException if the part's stream cannot be read
 * @throws SAXException if the content handler fails
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel)
        throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    // TODO: TIKA-1118 Upgrade to POI 4.0 then enable this block of code
    // if (part.getSize() >= 0 && part.getSize() < 512*3) {
    //   // Too small, skip
    //   return;
    // }

    // Open the POIFS (OLE2) structure and process
    // NOTE(review): fs (and the TikaInputStream below) are never closed on the
    // success path — confirm whether the embedded extractor takes ownership.
    POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
    try {
        Metadata metadata = new Metadata();
        TikaInputStream stream = null;
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

        DirectoryNode root = fs.getRoot();
        POIFSDocumentType type = POIFSDocumentType.detectType(root);

        // the \u0001/\u0003-prefixed names are the standard OLE storage markers
        if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj")
                && root.hasEntry("\u0003ObjInfo")) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == type) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }
            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            // not a recognized embedded OLE shape; treat as a plain embedded file
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException e) {
        // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException e) {
        // Could not process an OLE 1.0 entry, so skip this part
    }
}
From source file:net.sf.mmm.content.parser.impl.poi.AbstractContentParserPoi.java
License:Apache License
/**
 * {@inheritDoc}
 *
 * <p>Opens the input as a POIFS (OLE2) file system, copies the title, author
 * and keywords from the document's summary-information stream into the context
 * (only when present), then delegates to {@code extractText} for the body text.
 *
 * @param inputStream stream positioned at the start of an OLE2 document
 * @param filesize forwarded to {@code extractText}
 * @param options forwarded to {@code extractText}
 * @param context receives title / creator / keywords / text variables
 * @throws Exception if the document or its summary stream cannot be parsed
 */
@Override
public void parse(InputStream inputStream, long filesize, ContentParserOptions options,
        MutableGenericContext context) throws Exception {
    POIFSFileSystem poiFs = new POIFSFileSystem(inputStream);
    SummaryInformation summaryInfo;
    // fix: the summary-information stream was never closed in the original
    try (InputStream summaryStream = poiFs
            .createDocumentInputStream(SummaryInformation.DEFAULT_STREAM_NAME)) {
        summaryInfo = (SummaryInformation) PropertySetFactory.create(summaryStream);
    }
    String title = summaryInfo.getTitle();
    if (title != null) {
        context.setVariable(VARIABLE_NAME_TITLE, title);
    }
    String author = summaryInfo.getAuthor();
    if (author != null) {
        context.setVariable(VARIABLE_NAME_CREATOR, author);
    }
    String keywords = summaryInfo.getKeywords();
    if (keywords != null) {
        context.setVariable(VARIABLE_NAME_KEYWORDS, keywords);
    }
    // poiFs is still needed here; subclasses read the document stream from it
    context.setVariable(VARIABLE_NAME_TEXT, extractText(poiFs, filesize, options));
}
From source file:net.sf.mmm.content.parser.impl.poi.ContentParserPpt.java
License:Apache License
/** * {@inheritDoc}// w ww. j a v a 2 s . c o m */ @Override protected String extractText(POIFSFileSystem poiFs, long filesize, ContentParserOptions options) throws Exception { // PowerPointExtractor pptExtractor = new PowerPointExtractor(poiFs); // return pptExtractor.getText(); DocumentInputStream docStream = poiFs.createDocumentInputStream(POIFS_POWERPOINT_DOC); int length = docStream.available(); int maximumBufferSize = options.getMaximumBufferSize(); if (maximumBufferSize < length) { length = maximumBufferSize; } int capacity = length / 10; StringBuffer textBuffer = new StringBuffer(capacity); byte[] buffer = new byte[length]; docStream.read(buffer); docStream.close(); extractRecursive(buffer, 0, length, textBuffer); return textBuffer.toString(); }
From source file:net.sf.mmm.content.parser.impl.poi.ContentParserXls.java
License:Apache License
/** * {@inheritDoc}/* www .j a v a 2s. c om*/ */ @Override protected String extractText(POIFSFileSystem poiFs, long filesize, ContentParserOptions options) throws Exception { int maxBufferSize = options.getMaximumBufferSize(); int maxCharSize = maxBufferSize / 2; InputStream documentInputStream = poiFs.createDocumentInputStream(POIFS_EXCEL_DOC); // actually there seems no smart guess for the initial capacity of // textBuffer // the text length can have any ration to documentInputStream.available() // the only possibility would be to create the string buffer in the listener // from the size of the SSTRecord. In this case stable code is better than // saving a tiny percent of performance... StringBuilder textBuffer = new StringBuilder(1024); try { HSSFRequest req = new HSSFRequest(); req.addListenerForAllRecords(new ExcelListener(textBuffer, maxCharSize)); HSSFEventFactory factory = new HSSFEventFactory(); factory.processEvents(req, documentInputStream); } finally { documentInputStream.close(); } return textBuffer.toString(); }
From source file:no.trank.openpipe.parse.ms.POIUtils.java
License:Apache License
/** * Fetches the \005SummaryInformation and \005DocumentSummaryInformation streams from the poi * file system and exctracts all properties of primitive type, String or Date. * /* w ww . j a v a 2 s . co m*/ * @param fs the poi filesystem * @return the properties */ public static Map<String, String> getProperties(POIFSFileSystem fs) { Map<String, String> map = new HashMap<String, String>(); try { InputStream stream = fs.createDocumentInputStream(SummaryInformation.DEFAULT_STREAM_NAME); addProperties(map, PropertySetFactory.create(stream)); } catch (Exception e) { // ignore } try { InputStream stream = fs.createDocumentInputStream(DocumentSummaryInformation.DEFAULT_STREAM_NAME); addProperties(map, PropertySetFactory.create(stream)); } catch (Exception e) { // ignore } return map; }
From source file:org.apache.nutch.parse.msword.WordExtractor.java
License:Apache License
/**
 * Gets the text from a Word document.
 *
 * <p>Reads the raw OLE2 "WordDocument" stream, rejects fast-saved and
 * password-protected files, dispatches Word 6.0 documents (nFib 101-104) to
 * {@link Word6Extractor}, and otherwise walks the piece table and character
 * runs to collect all text whose runs are not marked deleted.
 *
 * @param in The InputStream representing the Word file.
 * @return the extracted plain text
 * @throws Exception if the file is fast-saved, password protected, or
 *         otherwise cannot be parsed
 */
protected String extractText(InputStream in) throws Exception {
    // NOTE(review): this list is never used below and could be removed.
    ArrayList text = new ArrayList();
    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];
    din.read(header);
    din.close();

    // FIB flags word at offset 0xA: bit 0x4 = fast-saved, bit 0x100 = encrypted
    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        throw new PasswordProtectedException("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    // Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    // get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use (bit 0x200 selects 1Table).
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }
    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];
    din = fsys.createDocumentInputStream(tableName);
    din.read(tableStream);
    din.close();

    // character-run (CHP) bin table offset/size and fcMin from the FIB
    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // advance to the text piece that contains the start of this run
        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            // run lies entirely inside the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            // run spans multiple pieces: append each piece's tail until the
            // run's end is inside the current piece
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    // ran out of pieces mid-run: emit what we have
                    return finalTextBuf.toString();
                }
            }
            // append the final partial piece covered by this run
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            // run ends exactly at the piece boundary; append and advance
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}