Usage examples for org.apache.poi.poifs.filesystem.POIFSFileSystem#getRoot()
public DirectoryNode getRoot()
From source file:NewEmptyJUnitTest.java
/** * Test that we can get data from two different * embeded word documents/*from ww w . ja v a 2 s. com*/ * @throws Exception */ public void testExtractFromEmbeded() throws Exception { POIFSFileSystem fs = new POIFSFileSystem( POIDataSamples.getSpreadSheetInstance().openResourceAsStream(filename3)); HWPFDocument doc; WordExtractor extractor3; DirectoryNode dirA = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B7"); DirectoryNode dirB = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B2"); // Should have WordDocument and 1Table assertNotNull(dirA.getEntry("1Table")); assertNotNull(dirA.getEntry("WordDocument")); assertNotNull(dirB.getEntry("1Table")); assertNotNull(dirB.getEntry("WordDocument")); // Check each in turn doc = new HWPFDocument(dirA, fs); extractor3 = new WordExtractor(doc); assertNotNull(extractor3.getText()); assertTrue(extractor3.getText().length() > 20); assertEquals("I am a sample document\r\nNot much on me\r\nI am document 1\r\n", extractor3.getText()); assertEquals("Sample Doc 1", extractor3.getSummaryInformation().getTitle()); assertEquals("Sample Test", extractor3.getSummaryInformation().getSubject()); doc = new HWPFDocument(dirB, fs); extractor3 = new WordExtractor(doc); assertNotNull(extractor3.getText()); assertTrue(extractor3.getText().length() > 20); assertEquals("I am another sample document\r\nNot much on me\r\nI am document 2\r\n", extractor3.getText()); assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle()); assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject()); }
From source file:NewEmptyJUnitTest.java
/** * [RESOLVED FIXED] Bug 51686 - Update to POI 3.8 beta 4 causes * ConcurrentModificationException in Tika's OfficeParser *///w ww . jav a 2 s .c o m public void testBug51686() throws IOException { InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream("Bug51686.doc"); POIFSFileSystem fs = new POIFSFileSystem(is); String text = null; for (Entry entry : fs.getRoot()) { if ("WordDocument".equals(entry.getName())) { WordExtractor ex = new WordExtractor(fs); try { text = ex.getText(); } finally { ex.close(); } } } assertNotNull(text); }
From source file:com.auxilii.msgparser.MsgParser.java
License:Open Source License
/** * Parses a .msg file provided by an input stream. * * @param msgFileStream The .msg file as a InputStream. * @return A {@link Message} object representing the .msg file. * @throws IOException Thrown if the file could not be loaded or parsed. *///from w w w.java2 s . com public Message parseMsg(InputStream msgFileStream) throws IOException { // the .msg file, like a file system, contains directories // and documents within this directories // we now gain access to the root node // and recursively go through the complete 'filesystem'. POIFSFileSystem fs = new POIFSFileSystem(msgFileStream); DirectoryEntry dir = fs.getRoot(); Message msg = new Message(); parseMsg(dir, msg); return msg; }
From source file:com.healthmarketscience.jackcess.util.OleBlobTest.java
License:Apache License
private static void checkCompoundStorage(OleBlob.CompoundContent cc, Attachment attach) throws Exception { File tmpData = File.createTempFile("attach_", ".dat"); try {/*from w w w . ja v a 2 s . c o m*/ FileOutputStream fout = new FileOutputStream(tmpData); fout.write(attach.getFileData()); fout.close(); POIFSFileSystem attachFs = new POIFSFileSystem(tmpData, true); for (OleBlob.CompoundContent.Entry e : cc) { DocumentEntry attachE = null; try { attachE = CompoundOleUtil.getDocumentEntry(e.getName(), attachFs.getRoot()); } catch (FileNotFoundException fnfe) { // ignored, the ole data has extra entries continue; } byte[] attachEBytes = toByteArray(new DocumentInputStream(attachE), attachE.getSize()); byte[] entryBytes = toByteArray(e.getStream(), e.length()); assertTrue(Arrays.equals(attachEBytes, entryBytes)); } ByteUtil.closeQuietly(attachFs); } finally { tmpData.delete(); } }
From source file:com.hp.octane.integrations.uft.UftTestDiscoveryUtils.java
License:Apache License
private static String extractXmlContentFromTspFile(InputStream stream) throws IOException { POIFSFileSystem poiFS = new POIFSFileSystem(stream); DirectoryNode root = poiFS.getRoot(); String xmlData = ""; for (Entry entry : root) { String name = entry.getName(); if ("ComponentInfo".equals(name)) { if (entry instanceof DirectoryEntry) { System.out.println(entry); } else if (entry instanceof DocumentEntry) { byte[] content = new byte[((DocumentEntry) entry).getSize()]; int readBytes = poiFS.createDocumentInputStream("ComponentInfo").read(content); if (readBytes < content.length) { // [YG] probably should handle this case and continue to read logger.warn("expected to read " + content.length + " bytes, but read and stopped after " + readBytes); }/*from w w w.jav a2s. c o m*/ String fromUnicodeLE = StringUtil.getFromUnicodeLE(content); xmlData = fromUnicodeLE.substring(fromUnicodeLE.indexOf('<')).replaceAll("\u0000", ""); } } } return xmlData; }
From source file:com.hpe.application.automation.tools.octane.actions.UFTTestUtil.java
License:Open Source License
/**
 * Decodes the XML text held in the root-level "ComponentInfo" document of an
 * OLE2 compound file.
 *
 * <p>The document bytes are decoded with {@code StringUtil.getFromUnicodeLE}.
 * Everything before the first {@code '<'} is dropped and NUL characters are
 * removed. The document is read in a loop until it is fully consumed or the
 * stream ends, and the document stream is always closed.
 *
 * @param stream stream positioned at the start of the compound file
 * @return the decoded XML text, or an empty string when there is no
 *         "ComponentInfo" document or it contains no {@code '<'}
 * @throws IOException if the compound file cannot be read
 */
public static String decodeXmlContent(InputStream stream) throws IOException {
    POIFSFileSystem poiFS = new POIFSFileSystem(stream);
    DirectoryNode root = poiFS.getRoot();
    String xmlData = "";

    for (Entry entry : root) {
        if (!"ComponentInfo".equals(entry.getName())) {
            continue;
        }

        if (entry instanceof DirectoryEntry) {
            // a directory with this name carries no content; it is only printed
            System.out.println(entry);
        } else if (entry instanceof DocumentEntry) {
            byte[] content = new byte[((DocumentEntry) entry).getSize()];

            InputStream componentInfo = poiFS.createDocumentInputStream("ComponentInfo");
            try {
                // a single read() may return fewer bytes than requested, so keep reading
                int filled = 0;
                while (filled < content.length) {
                    int count = componentInfo.read(content, filled, content.length - filled);
                    if (count <= 0) {
                        // best effort, as before: continue with whatever was read
                        break;
                    }
                    filled += count;
                }
            } finally {
                componentInfo.close();
            }

            String fromUnicodeLE = StringUtil.getFromUnicodeLE(content);
            int xmlStart = fromUnicodeLE.indexOf('<');
            if (xmlStart >= 0) {
                // guard: substring(-1) would throw when the document holds no '<'
                xmlData = fromUnicodeLE.substring(xmlStart).replace("\u0000", "");
            }
        }
    }
    return xmlData;
}
From source file:com.krawler.esp.fileparser.word.ExtractWordFile.java
License:Open Source License
public String extractText(String filepath) throws FastSavedException, IOException { InputStream iStream = new BufferedInputStream(new FileInputStream(filepath)); POIFSFileSystem fsys = new POIFSFileSystem(iStream); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header);/*from w ww . j av a2 s.com*/ din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { System.out.println("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header); } // Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; // get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. 
String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); // load our text pieces and our character runs ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); List textPieces = tpt.getTextPieces(); CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't // been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if 
(textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:com.krawler.esp.fileparser.wordparser.ExtractWordFile.java
License:Open Source License
public String extractText(String filepath) throws FastSavedException, IOException { InputStream iStream = new BufferedInputStream(new FileInputStream(filepath)); ArrayList text = new ArrayList(); POIFSFileSystem fsys = new POIFSFileSystem(iStream); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header);/* w w w . j a va 2 s. co m*/ din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { System.out.println("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); // Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; // get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. 
Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header, tpt); } CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt); // load our text pieces and our character runs List textPieces = tpt.getTextPieces(); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't // been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); 
currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:com.oneis.graphics.ThumbnailFinder.java
License:Mozilla Public License
/** * Try and get a thumbnail from an old Microsoft Office document *//* w w w . ja va 2 s . c o m*/ private void findFromOldMSOffice() { try { File poiFilesystem = new File(inFilename); // Open the POI filesystem. InputStream is = new FileInputStream(poiFilesystem); POIFSFileSystem poifs = new POIFSFileSystem(is); is.close(); // Read the summary information. DirectoryEntry dir = poifs.getRoot(); DocumentEntry siEntry = (DocumentEntry) dir.getEntry(SummaryInformation.DEFAULT_STREAM_NAME); DocumentInputStream dis = new DocumentInputStream(siEntry); PropertySet ps = new PropertySet(dis); dis.close(); SummaryInformation si = new SummaryInformation(ps); if (si != null) { byte[] thumbnailData = si.getThumbnail(); if (thumbnailData != null) { Thumbnail thumbnail = new Thumbnail(thumbnailData); byte[] wmf = thumbnail.getThumbnailAsWMF(); // Got something! thumbnailDimensions = tryWMFFormat(new ByteArrayInputStream(wmf), outFilename, outFormat, maxDimension); } } } catch (Exception e) { logIgnoredException("ThumbnailFinder Apache POI file reading failed", e); } }
From source file:com.orange.ocara.model.export.docx.AuditDocxExporter.java
License:Mozilla Public License
/** * Create OleObject using a sample.//from w w w. ja v a 2 s. c o m * * @param from File to embed * @param to Destination file */ private void createOleObject(File from, File to) throws IOException, Ole10NativeException { File existingOleObject = new File(templateDirectory, "word/embeddings/oleObject.bin"); OutputStream os = null; try { // When POIFSFileSystem fs = new POIFSFileSystem(FileUtils.openInputStream(existingOleObject)); fs.getRoot().getEntry(Ole10Native.OLE10_NATIVE).delete(); Ole10Native ole = new Ole10Native(from.getName(), from.getName(), from.getName(), IOUtils.toByteArray(FileUtils.openInputStream(from))); ByteArrayOutputStream stream = new ByteArrayOutputStream(); ole.writeOut(stream); fs.getRoot().createDocument(Ole10Native.OLE10_NATIVE, new ByteArrayInputStream(stream.toByteArray())); os = FileUtils.openOutputStream(to); fs.writeFilesystem(os); } finally { IOUtils.closeQuietly(os); } }