List of usage examples for org.apache.poi.util LittleEndian getShort
public static short getShort(byte[] data, int offset)
From source file:com.krawler.esp.fileparser.word.ExtractWordFile.java
License:Open Source License
public String extractText(String filepath) throws FastSavedException, IOException { InputStream iStream = new BufferedInputStream(new FileInputStream(filepath)); POIFSFileSystem fsys = new POIFSFileSystem(iStream); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header);//www. j a va 2 s .c om din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { System.out.println("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header); } // Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; // get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); // load our text pieces and our character runs ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); List textPieces = tpt.getTextPieces(); CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't // been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:com.krawler.esp.fileparser.wordparser.ExtractWordFile.java
License:Open Source License
public String extractText(String filepath) throws FastSavedException, IOException { InputStream iStream = new BufferedInputStream(new FileInputStream(filepath)); ArrayList text = new ArrayList(); POIFSFileSystem fsys = new POIFSFileSystem(iStream); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header);// w w w .j a va2s . co m din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { System.out.println("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); // Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; // get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header, tpt); } CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt); // load our text pieces and our character runs List textPieces = tpt.getTextPieces(); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't // been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:com.progdan.doc2txt.sprm.SprmOperation.java
License:Apache License
public SprmOperation(byte[] grpprl, int offset) { short sprmStart = LittleEndian.getShort(grpprl, offset); offset += 2;//from ww w. ja va2s .co m _operation = OP_BITFIELD.getValue(sprmStart); _type = TYPE_BITFIELD.getValue(sprmStart); int sizeCode = SIZECODE_BITFIELD.getValue(sprmStart); switch (sizeCode) { case 0: case 1: _operand = LittleEndian.getUnsignedByte(grpprl, offset); _sizeNeeded = 3; break; case 2: case 4: case 5: _operand = LittleEndian.getShort(grpprl, offset); _sizeNeeded = 4; break; case 3: _operand = LittleEndian.getInt(grpprl, offset); _sizeNeeded = 6; break; case 6: _varOperand = new byte[grpprl[offset++]]; System.arraycopy(grpprl, offset, _varOperand, 0, _varOperand.length); _sizeNeeded = _varOperand.length + 3; break; case 7: byte threeByteInt[] = new byte[4]; threeByteInt[0] = grpprl[offset]; threeByteInt[1] = grpprl[offset + 1]; threeByteInt[2] = grpprl[offset + 2]; threeByteInt[3] = (byte) 0; _operand = LittleEndian.getInt(threeByteInt, 0); _sizeNeeded = 5; break; } }
From source file:com.progdan.doc2txt.WordExtractor.java
License:Apache License
/** * Gets the text from a Word document./*from w w w .j av a 2 s .c o m*/ * * @param in The InputStream representing the Word file. */ public String extractText(InputStream in) throws Exception { ArrayList text = new ArrayList(); POIFSFileSystem fsys = new POIFSFileSystem(in); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header); din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { throw new PasswordProtectedException("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header); } //Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; //get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin); // load our text pieces and our character runs ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); List textPieces = tpt.getTextPieces(); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:org.apache.nutch.parse.msword.WordExtractor.java
License:Apache License
/** * Gets the text from a Word document./*from w ww.ja v a 2 s. co m*/ * * @param in The InputStream representing the Word file. */ protected String extractText(InputStream in) throws Exception { ArrayList text = new ArrayList(); POIFSFileSystem fsys = new POIFSFileSystem(in); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header); din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { throw new PasswordProtectedException("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header); } //Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; //get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin); // load our text pieces and our character runs ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); List textPieces = tpt.getTextPieces(); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:org.ddt.listener.dsi.HeadingPairProperty.java
License:Apache License
/** * the constructor.//from ww w. j av a 2 s. c o m * * * @param data the data to read from. * @param dataOffset the offset into the * <code>data</code> byte array. * @param docPartsOffset the offset of the corresponding docparts. * @throws IllegalVariantTypeException if the data is malformed. * @throws UnsupportedEncodingException */ HeadingPairProperty(byte[] data, int dataOffset, int docPartsOffset) throws IllegalVariantTypeException, UnsupportedEncodingException { int off = dataOffset; name = new StringProperty(data, off); off += name.getSize(); long type = LittleEndian.getUInt(data, off); if (type != Variant.VT_I4) { log.log(Level.WARNING, "Not a proper VT_I4 type."); throw new IllegalVariantTypeException(type, name); } off += LittleEndian.INT_SIZE; //this is a horrible workaround, around the bug in HPSF, that returns //cutoff byte arrays from Section.getProperty() (HPFS Bug #52337) //It hopes that there aren't too many parts per heading (i.e. worst //case it can be store in one byte...) int left = data.length - off; if (left >= LittleEndian.INT_SIZE) { partsCount = (int) LittleEndian.getUInt(data, off); off += LittleEndian.INT_SIZE; } else if (left >= LittleEndian.SHORT_SIZE) { partsCount = LittleEndian.getShort(data, off); off += left; } else if (left >= LittleEndian.BYTE_SIZE) { partsCount = LittleEndian.getUByte(data, off); off += left; } else { partsCount = 1; //default... maybe not a good idea. } size = off - dataOffset; this.docPartsOffset = docPartsOffset; }
From source file:org.ddt.listener.records.DConRefRecord.java
License:Apache License
/** * Read constructor./* ww w . j av a 2 s .c o m*/ * * @param data byte array containing a DConRef Record, including the header. */ public DConRefRecord(byte[] data) { int offset = 0; if (!(LittleEndian.getShort(data, offset) == DConRefRecord.sid)) throw new RecordFormatException("incompatible sid."); offset += LittleEndian.SHORT_SIZE; //length = LittleEndian.getShort(data, offset); offset += LittleEndian.SHORT_SIZE; firstRow = LittleEndian.getUShort(data, offset); offset += LittleEndian.SHORT_SIZE; lastRow = LittleEndian.getUShort(data, offset); offset += LittleEndian.SHORT_SIZE; firstCol = LittleEndian.getUByte(data, offset); offset += LittleEndian.BYTE_SIZE; lastCol = LittleEndian.getUByte(data, offset); offset += LittleEndian.BYTE_SIZE; charCount = LittleEndian.getUShort(data, offset); offset += LittleEndian.SHORT_SIZE; if (charCount < 2) throw new org.apache.poi.hssf.record.RecordFormatException("Character count must be >= 2"); charType = LittleEndian.getUByte(data, offset); offset += LittleEndian.BYTE_SIZE; //7 bits reserved + 1 bit type /* * bytelength is the length of the string in bytes, which depends on whether the string is * made of single- or double-byte chars. This is given by charType, which equals 0 if * single-byte, 1 if double-byte. */ int byteLength = charCount * ((charType & 1) + 1); path = LittleEndian.getByteArray(data, offset, byteLength); offset += byteLength; /* * If it's a self reference, the last one or two bytes (depending on char type) are the * unused field. Not sure If i need to bother with this... */ if (path[0] == 0x02) _unused = LittleEndian.getByteArray(data, offset, (charType + 1)); }
From source file:org.textmining.extraction.excel.ExcelTextExtractor.java
License:Open Source License
public void getText(Writer writer) throws IOException { while (_offset < _recordStream.length) { int type = LittleEndian.getShort(_recordStream, _offset); _offset += LittleEndian.SHORT_SIZE; if (type == 0xa) { //if (_offset == _recordStream.length) break; // else // { // continue; // } }//from w w w .j a va 2s .co m int size = LittleEndian.getShort(_recordStream, _offset); _offset += LittleEndian.SHORT_SIZE; if (type == Record.SST_RECORD) { int totalStrings = LittleEndian.getInt(_recordStream, _offset); _offset += LittleEndian.INT_SIZE; int sharedStrings = LittleEndian.getInt(_recordStream, _offset); _offset += LittleEndian.INT_SIZE; for (int x = 0; x < sharedStrings; x++) { int strLength = LittleEndian.getShort(_recordStream, _offset); _offset += LittleEndian.SHORT_SIZE; int flags = _recordStream[_offset++]; boolean compression = (flags & 0x1) == 0; boolean asian = (flags & 0x4) != 0; boolean richText = (flags & 8) != 0; int numRuns = 0; int sizeofAsian = 0; if (richText) { numRuns = LittleEndian.getShort(_recordStream, _offset); _offset += LittleEndian.SHORT_SIZE; } if (asian) { sizeofAsian = LittleEndian.getInt(_recordStream, _offset); _offset += LittleEndian.SHORT_SIZE; } int byteLength = !compression ? strLength * 2 : strLength; String string = new String(_recordStream, _offset, byteLength, compression ? "Cp1252" : "UTF-16LE"); //System.out.println(string); writer.write(string + ' '); _offset += byteLength; if (richText) { _offset += (numRuns * 4); } } } else { _offset += size; } } }
From source file:org.textmining.extraction.word.ComplexFileTable.java
License:Open Source License
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException { //skips through the prms before we reach the piece table. These contain data //for actual fast saved files while (tableStream[offset] == GRPPRL_TYPE) { offset++;// www .java 2 s . co m int skip = LittleEndian.getShort(tableStream, offset); offset += LittleEndian.SHORT_SIZE + skip; } if (tableStream[offset] != TEXT_PIECE_TABLE_TYPE) { throw new IOException("The text piece table is corrupted"); } else { int pieceTableSize = LittleEndian.getInt(tableStream, ++offset); offset += LittleEndian.INT_SIZE; _tpt = new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin); } }
From source file:org.textmining.extraction.word.model.PieceDescriptor.java
License:Open Source License
public PieceDescriptor(byte[] buf, int offset) { descriptor = LittleEndian.getShort(buf, offset); offset += LittleEndian.SHORT_SIZE;/*from w ww . j a va2s .c om*/ fc = LittleEndian.getInt(buf, offset); offset += LittleEndian.INT_SIZE; prm = LittleEndian.getShort(buf, offset); // see if this piece uses unicode. if ((fc & 0x40000000) == 0) { unicode = true; } else { unicode = false; fc &= ~(0x40000000);//gives me FC in doc stream fc /= 2; } }