Usage examples for org.apache.poi.poifs.filesystem.DocumentInputStream.read(byte[])
@Override
public int read(byte[] b) throws IOException
From source file:com.argo.hwp.v5.HwpTextExtractorV5.java
License:Open Source License
/** * HWP? FileHeader /* w w w . j av a 2s . c om*/ * * @param fs * @return * @throws IOException */ private static FileHeader getHeader(NPOIFSFileSystem fs) throws IOException { DirectoryNode root = fs.getRoot(); // ??? p.18 // FileHeader Entry headerEntry = root.getEntry("FileHeader"); if (!headerEntry.isDocumentEntry()) return null; // ? byte[] header = new byte[256]; // FileHeader ? 256 DocumentInputStream headerStream = new DocumentInputStream((DocumentEntry) headerEntry); try { int read = headerStream.read(header); if (read != 256 || !Arrays.equals(HWP_V5_SIGNATURE, Arrays.copyOfRange(header, 0, HWP_V5_SIGNATURE.length))) return null; } finally { headerStream.close(); } FileHeader fileHeader = new FileHeader(); // . debug fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt(header, 32)); long flags = LittleEndian.getUInt(header, 36); log.debug("Flags={}", Long.toBinaryString(flags).replace(' ', '0')); fileHeader.compressed = (flags & 0x01) == 0x01; fileHeader.encrypted = (flags & 0x02) == 0x02; fileHeader.viewtext = (flags & 0x04) == 0x04; return fileHeader; }
From source file:com.krawler.esp.fileparser.word.ExtractWordFile.java
License:Open Source License
public String extractText(String filepath) throws FastSavedException, IOException { InputStream iStream = new BufferedInputStream(new FileInputStream(filepath)); POIFSFileSystem fsys = new POIFSFileSystem(iStream); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header); din.close();// w w w. j a va2s. co m int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { System.out.println("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header); } // Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; // get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. 
String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); // load our text pieces and our character runs ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); List textPieces = tpt.getTextPieces(); CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't // been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if 
(textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:com.krawler.esp.fileparser.wordparser.ExtractWordFile.java
License:Open Source License
public String extractText(String filepath) throws FastSavedException, IOException { InputStream iStream = new BufferedInputStream(new FileInputStream(filepath)); ArrayList text = new ArrayList(); POIFSFileSystem fsys = new POIFSFileSystem(iStream); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header); din.close();/*from ww w .j a va2 s. c o m*/ int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { System.out.println("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); // Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; // get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. 
Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header, tpt); } CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt); // load our text pieces and our character runs List textPieces = tpt.getTextPieces(); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't // been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); 
currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:com.progdan.doc2txt.WordExtractor.java
License:Apache License
/** * Gets the text from a Word document./*from w w w .j a v a 2 s. c o m*/ * * @param in The InputStream representing the Word file. */ public String extractText(InputStream in) throws Exception { ArrayList text = new ArrayList(); POIFSFileSystem fsys = new POIFSFileSystem(in); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header); din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { throw new PasswordProtectedException("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header); } //Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; //get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. 
String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin); // load our text pieces and our character runs ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); List textPieces = tpt.getTextPieces(); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if (textIt.hasNext()) { 
currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:net.freeutils.tnef.msg.Msg.java
License:Open Source License
/**
 * Copies the full content of a document entry into memory and wraps the
 * resulting bytes in a {@code RawInputStream}.
 *
 * @param entry the document entry to read
 * @return a raw input stream over a copy of the entry's bytes
 * @throws IOException if the entry cannot be opened or read
 */
protected static RawInputStream toRawInputStream(DocumentEntry entry) throws IOException {
    final DocumentInputStream source = new DocumentInputStream(entry);
    // Pre-size the sink with what the stream reports as available.
    final ByteArrayOutputStream collected = new ByteArrayOutputStream(source.available());
    try {
        final byte[] chunk = new byte[4096];
        for (int n = source.read(chunk); n >= 0; n = source.read(chunk)) {
            collected.write(chunk, 0, n);
        }
    } finally {
        source.close();
    }
    return new RawInputStream(collected.toByteArray());
}
From source file:net.sf.mmm.content.parser.impl.poi.ContentParserPpt.java
License:Apache License
/** * {@inheritDoc}//www.ja va 2s . com */ @Override protected String extractText(POIFSFileSystem poiFs, long filesize, ContentParserOptions options) throws Exception { // PowerPointExtractor pptExtractor = new PowerPointExtractor(poiFs); // return pptExtractor.getText(); DocumentInputStream docStream = poiFs.createDocumentInputStream(POIFS_POWERPOINT_DOC); int length = docStream.available(); int maximumBufferSize = options.getMaximumBufferSize(); if (maximumBufferSize < length) { length = maximumBufferSize; } int capacity = length / 10; StringBuffer textBuffer = new StringBuffer(capacity); byte[] buffer = new byte[length]; docStream.read(buffer); docStream.close(); extractRecursive(buffer, 0, length, textBuffer); return textBuffer.toString(); }
From source file:net.sf.mpxj.utility.MppClean.java
License:Open Source License
/** * Extracts a block of data from the MPP file, and iterates through the map * of find/replace pairs to make the data anonymous. * //from w w w .ja va 2s. c o m * @param parentDirectory parent directory object * @param fileName target file name * @param replacements find/replace data * @param unicode true for double byte text * @throws IOException */ private void processReplacements(DirectoryEntry parentDirectory, String fileName, Map<String, String> replacements, boolean unicode) throws IOException { // // Populate a list of keys and sort into descending order of length // List<String> keys = new ArrayList<String>(replacements.keySet()); Collections.sort(keys, new Comparator<String>() { @Override public int compare(String o1, String o2) { return (o2.length() - o1.length()); } }); // // Extract the raw file data // DocumentEntry targetFile = (DocumentEntry) parentDirectory.getEntry(fileName); DocumentInputStream dis = new DocumentInputStream(targetFile); int dataSize = dis.available(); byte[] data = new byte[dataSize]; dis.read(data); // // Replace the text // for (String findText : keys) { String replaceText = replacements.get(findText); replaceData(data, findText, replaceText, unicode); } // // Remove the document entry // targetFile.delete(); // // Replace it with a new one // parentDirectory.createDocument(fileName, new ByteArrayInputStream(data)); }
From source file:net.sf.mpxj.utility.MppCleanUtility.java
License:Open Source License
/** * Extracts a block of data from the MPP file, and iterates through the map * of find/replace pairs to make the data anonymous. * //from w ww. j a v a2s .c om * @param parentDirectory parent directory object * @param fileName target file name * @param replacements find/replace data * @param unicode true for double byte text * @throws IOException */ private void processReplacements(DirectoryEntry parentDirectory, String fileName, Map<String, String> replacements, boolean unicode) throws IOException { // // Populate a list of keys and sort into descending order of length // List<String> keys = new ArrayList<String>(replacements.keySet()); Collections.sort(keys, new Comparator<String>() { @Override public int compare(String o1, String o2) { return (o2.length() - o1.length()); } }); // // Extract the raw file data // DocumentEntry targetFile = (DocumentEntry) parentDirectory.getEntry(fileName); DocumentInputStream dis = new DocumentInputStream(targetFile); int dataSize = dis.available(); byte[] data = new byte[dataSize]; dis.read(data); dis.close(); // // Replace the text // for (String findText : keys) { String replaceText = replacements.get(findText); replaceData(data, findText, replaceText, unicode); } // // Remove the document entry // targetFile.delete(); // // Replace it with a new one // parentDirectory.createDocument(fileName, new ByteArrayInputStream(data)); }
From source file:nz.govt.natlib.adapter.excel.ExcelAdapter.java
License:Apache License
public void readDocument(POIFSFileSystem fs, DocumentEntry doc) throws Exception { // load file system DocumentInputStream stream = new DocumentInputStream(doc); if (stream.available() > 256) { return;//ww w . j a v a2 s . c om } // process data from stream byte[] content = new byte[stream.available()]; stream.read(content); stream.close(); for (int i = 0; i < content.length; i++) { int c = content[i]; if (c < 0) { c = 0x100 + c; } } }
From source file:nz.govt.natlib.adapter.powerpoint.PowerPointAdapter.java
License:Apache License
public void readDocument(POIFSFileSystem fs, DocumentEntry doc) throws Exception { // load file system DocumentInputStream stream = new DocumentInputStream(doc); if (stream.available() > 256) { return;/*from w ww . ja v a 2 s.c om*/ } // process data from stream byte[] content = new byte[stream.available()]; stream.read(content); stream.close(); for (int i = 0; i < content.length; i++) { int c = content[i]; if (c < 0) { c = 0x100 + c; } System.out.println(i + ", " + Integer.toString(c) + "\t" + Integer.toHexString(c) + "\t" + (char) c); } }