List of usage examples for org.apache.poi.util.LittleEndian.getInt
public static int getInt(byte[] data, int offset)
From source file:com.krawler.esp.fileparser.word.ExtractWordFile.java
License:Open Source License
public String extractText(String filepath) throws FastSavedException, IOException { InputStream iStream = new BufferedInputStream(new FileInputStream(filepath)); POIFSFileSystem fsys = new POIFSFileSystem(iStream); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header);/* w w w. j a v a 2s . c om*/ din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { System.out.println("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header); } // Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; // get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. 
String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); // load our text pieces and our character runs ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); List textPieces = tpt.getTextPieces(); CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't // been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if 
(textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:com.krawler.esp.fileparser.word.Word6Extractor.java
License:Open Source License
public String extractText(byte[] mainStream) throws IOException { int fcMin = LittleEndian.getInt(mainStream, 0x18); int fcMax = LittleEndian.getInt(mainStream, 0x1C); int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8); int chpTableSize = LittleEndian.getInt(mainStream, 0xbc); // get a list of character properties Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset, chpTableSize, fcMin); List textRuns = chpTable.getTextRuns(); // iterate through the WordTextBuffer finalTextBuf = new WordTextBuffer(); Iterator runsIt = textRuns.iterator(); while (runsIt.hasNext()) { CHPX chpx = (CHPX) runsIt.next(); int runStart = chpx.getStart() + fcMin; int runEnd = chpx.getEnd() + fcMin; if (!isDeleted(chpx.getGrpprl())) { String s = new String(mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252"); finalTextBuf.append(s);/*www . j a v a 2 s.c om*/ if (runEnd >= fcMax) { break; } } } return finalTextBuf.toString(); }
From source file:com.krawler.esp.fileparser.word.Word6Extractor.java
License:Open Source License
public String extractText(byte[] mainStream, TextPieceTable tpt) throws IOException { int fcMin = LittleEndian.getInt(mainStream, 0x18); int fcMax = LittleEndian.getInt(mainStream, 0x1C); int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8); int chpTableSize = LittleEndian.getInt(mainStream, 0xbc); // get a list of character properties Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset, chpTableSize, fcMin, tpt); List textRuns = chpTable.getTextRuns(); // iterate through the WordTextBuffer finalTextBuf = new WordTextBuffer(); Iterator runsIt = textRuns.iterator(); while (runsIt.hasNext()) { CHPX chpx = (CHPX) runsIt.next(); int runStart = chpx.getStart() + fcMin; int runEnd = chpx.getEnd() + fcMin; if (!isDeleted(chpx.getGrpprl())) { String s = new String(mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252"); finalTextBuf.append(s);/*from w ww . j av a2s . c om*/ if (runEnd >= fcMax) { break; } } } return finalTextBuf.toString(); }
From source file:com.krawler.esp.fileparser.wordparser.ExtractWordFile.java
License:Open Source License
public String extractText(String filepath) throws FastSavedException, IOException { InputStream iStream = new BufferedInputStream(new FileInputStream(filepath)); ArrayList text = new ArrayList(); POIFSFileSystem fsys = new POIFSFileSystem(iStream); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header);//from ww w . j av a 2s.c o m din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { System.out.println("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); // Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; // get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. 
Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header, tpt); } CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt); // load our text pieces and our character runs List textPieces = tpt.getTextPieces(); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't // been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); 
currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:com.progdan.doc2txt.sprm.SprmOperation.java
License:Apache License
public SprmOperation(byte[] grpprl, int offset) { short sprmStart = LittleEndian.getShort(grpprl, offset); offset += 2;//w w w .j a va 2 s.c om _operation = OP_BITFIELD.getValue(sprmStart); _type = TYPE_BITFIELD.getValue(sprmStart); int sizeCode = SIZECODE_BITFIELD.getValue(sprmStart); switch (sizeCode) { case 0: case 1: _operand = LittleEndian.getUnsignedByte(grpprl, offset); _sizeNeeded = 3; break; case 2: case 4: case 5: _operand = LittleEndian.getShort(grpprl, offset); _sizeNeeded = 4; break; case 3: _operand = LittleEndian.getInt(grpprl, offset); _sizeNeeded = 6; break; case 6: _varOperand = new byte[grpprl[offset++]]; System.arraycopy(grpprl, offset, _varOperand, 0, _varOperand.length); _sizeNeeded = _varOperand.length + 3; break; case 7: byte threeByteInt[] = new byte[4]; threeByteInt[0] = grpprl[offset]; threeByteInt[1] = grpprl[offset + 1]; threeByteInt[2] = grpprl[offset + 2]; threeByteInt[3] = (byte) 0; _operand = LittleEndian.getInt(threeByteInt, 0); _sizeNeeded = 5; break; } }
From source file:com.progdan.doc2txt.Word6Extractor.java
License:Apache License
/** * Extracts the text/*from www . j a va 2s .com*/ * * @param mainStream The POIFS document stream entitled "WordDocument". * * @return The text from the document * @throws Exception If there are any unexpected exceptions. */ public String extractText(byte[] mainStream) throws Exception { int fcMin = LittleEndian.getInt(mainStream, 0x18); int fcMax = LittleEndian.getInt(mainStream, 0x1C); int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8); int chpTableSize = LittleEndian.getInt(mainStream, 0xbc); // get a list of character properties Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset, chpTableSize, fcMin); List textRuns = chpTable.getTextRuns(); // iterate through the WordTextBuffer finalTextBuf = new WordTextBuffer(); Iterator runsIt = textRuns.iterator(); while (runsIt.hasNext()) { CHPX chpx = (CHPX) runsIt.next(); int runStart = chpx.getStart() + fcMin; int runEnd = chpx.getEnd() + fcMin; if (!isDeleted(chpx.getGrpprl())) { String s = new String(mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252"); finalTextBuf.append(s); if (runEnd >= fcMax) { break; } } } return finalTextBuf.toString(); }
From source file:com.progdan.doc2txt.WordExtractor.java
License:Apache License
/** * Gets the text from a Word document./*www .j a v a2s.c o m*/ * * @param in The InputStream representing the Word file. */ public String extractText(InputStream in) throws Exception { ArrayList text = new ArrayList(); POIFSFileSystem fsys = new POIFSFileSystem(in); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header); din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { throw new PasswordProtectedException("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header); } //Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; //get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. 
String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin); // load our text pieces and our character runs ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); List textPieces = tpt.getTextPieces(); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if (textIt.hasNext()) { 
currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:org.apache.nutch.parse.mspowerpoint.ContentReaderListener.java
License:Apache License
/** * Extracts the client text boxes of a slide. * /*from www . j av a 2 s . co m*/ * @param containerTextBox * @param offset * @param pptdata * @param offsetPD * @return Hashtable * @see TextBox */ protected Hashtable/* <Long, TextBox> */ extractTextBoxes(final Hashtable/* <Long, TextBox> */ containerTextBox, final int offset, final byte[] pptdata, final long offsetPD) { // To hold temporary data FilteredStringWriter outStream = new FilteredStringWriter(); TextBox textBox; // Traversing the bytearray up to Presist directory position for (int i = offset; i < offsetPD - 20; i++) { try { // Record info // final long rinfo = LittleEndian.getUShort(pptdata, (int) i); // Record Type final long recordType = LittleEndian.getUShort(pptdata, i + 2); // Record Size final long recordSize = LittleEndian.getUInt(pptdata, i + 4); if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) { /* * Record type is of Drawing Group */ // Total number of objects // final long objectCount = LittleEndian.getUInt(pptdata, (int) i + // 8); // currentID = Group ID+number of objects long currentID = LittleEndian.getInt(pptdata, i + 12); currentID = ((int) (currentID / 1024)) * 1024; if (currentID == PPTConstants.PPT_MASTERSLIDE) { // Ignore Master Slide objects if (LOG.isTraceEnabled()) { LOG.trace("Ignore master slide."); } i++; continue; } // Check for the ClientTextBox GroupID existence if (containerTextBox.containsKey(new Long(currentID))) { // If exists get Client Textbox Group textBox = (TextBox) containerTextBox.get(new Long(currentID)); textBox.setContent(""); } else { textBox = new TextBox(currentID); containerTextBox.put(new Long(currentID), textBox); } /* * Iterating the bytearray for TextCharAtoms and TextBytesAtom */ if ((offsetPD - 20) != recordSize) { // TODO something wrong? Probably an OLE-Object, which we ignore. 
if (LOG.isDebugEnabled()) { LOG.debug("offsetPD - 20=" + (offsetPD - 20) + " recordsize=" + recordSize); } } else { for (int startPos = i + 8; startPos < offsetPD - 20 && startPos < recordSize; startPos++) { // && startPos < // recordSize?? try { // Record info // final long nrinfo = LittleEndian.getUShort(pptdata, (int) j); // Record Type final long ntype = LittleEndian.getUShort(pptdata, startPos + 2); // Record size // Note that the size doesn't include the 8 byte atom header final long nsize = LittleEndian.getUInt(pptdata, startPos + 4); if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) { /* * Break the loop if next GroupID found */ i = startPos - 1; break; } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) { // TextByteAtom record outStream = new FilteredStringWriter(); long ii = 0; for (ii = startPos + 6; ii <= startPos + 6 + nsize; ii++) { // For loop to changed to a function // if ((ii + 2) >= pptdata.length) // break; // FIXME outStream.write((char) (pptdata[(int) ii + 2])); } // Setting the identified text for Current // groupID textBox.setContent(textBox.getContent() + outStream.toString()); } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) { // TextCharAtom record final String strTempContent = new String(pptdata, startPos + 6, (int) (nsize) + 2); final byte bytes[] = strTempContent.getBytes(); if (true) { outStream = new FilteredStringWriter(); for (int ii = 0; ii < bytes.length - 1; ii += 2) { // For loop to changed to a function outStream.write((char) (pptdata[ii + 2])); } textBox.setContent(textBox.getContent() + outStream.toString()); } else { // this version is used within POI String text = StringUtil.getFromCompressedUnicode(bytes, 0, bytes.length); textBox.setContent(textBox.getContent() + text); } } else { // ignored // if (LOG.isTraceEnabled()) { // LOG.trace("Ignored atom type: " + type); // } } } catch (Throwable e) { if (LOG.isErrorEnabled()) { LOG.error("extractTextBoxes", e); } break; } } } } else { // Record type is ignored // if 
(LOG.isTraceEnabled()) { // LOG.trace("Ignored record type: " + type); // } } } catch (Throwable ee) { if (LOG.isErrorEnabled()) { LOG.error("extractClientTextBoxes", ee); } break; } } return containerTextBox; }
From source file:org.apache.nutch.parse.msword.WordExtractor.java
License:Apache License
/** * Gets the text from a Word document.//from ww w .j av a 2 s.c om * * @param in The InputStream representing the Word file. */ protected String extractText(InputStream in) throws Exception { ArrayList text = new ArrayList(); POIFSFileSystem fsys = new POIFSFileSystem(in); // load our POIFS document streams. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header); din.close(); int info = LittleEndian.getShort(header, 0xa); if ((info & 0x4) != 0) { throw new FastSavedException("Fast-saved files are unsupported at this time"); } if ((info & 0x100) != 0) { throw new PasswordProtectedException("This document is password protected"); } // determine the version of Word this document came from. int nFib = LittleEndian.getShort(header, 0x2); switch (nFib) { case 101: case 102: case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. Word6Extractor oldExtractor = new Word6Extractor(); return oldExtractor.extractText(header); } //Get the information we need from the header boolean useTable1 = (info & 0x200) != 0; //get the location of the piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // determine which table stream we must use. 
String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 0x18); CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin); // load our text pieces and our character runs ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); List textPieces = tpt.getTextPieces(); // make the POIFS objects available for garbage collection din = null; fsys = null; table = null; headerProps = null; List textRuns = cbt.getTextRuns(); Iterator runIt = textRuns.iterator(); Iterator textIt = textPieces.iterator(); TextPiece currentPiece = (TextPiece) textIt.next(); int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); WordTextBuffer finalTextBuf = new WordTextBuffer(); // iterate through all text runs extract the text only if they haven't been // deleted while (runIt.hasNext()) { CHPX chpx = (CHPX) runIt.next(); boolean deleted = isDeleted(chpx.getGrpprl()); if (deleted) { continue; } int runStart = chpx.getStart(); int runEnd = chpx.getEnd(); while (runStart >= currentTextEnd) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } if (runEnd < currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); finalTextBuf.append(str); } else if (runEnd > currentTextEnd) { while (runEnd > currentTextEnd) { String str = currentPiece.substring(runStart - currentTextStart, currentTextEnd - currentTextStart); finalTextBuf.append(str); if (textIt.hasNext()) { 
currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); runStart = currentTextStart; currentTextEnd = currentPiece.getEnd(); } else { return finalTextBuf.toString(); } } String str = currentPiece.substring(0, runEnd - currentTextStart); finalTextBuf.append(str); } else { String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart); if (textIt.hasNext()) { currentPiece = (TextPiece) textIt.next(); currentTextStart = currentPiece.getStart(); currentTextEnd = currentPiece.getEnd(); } finalTextBuf.append(str); } } return finalTextBuf.toString(); }
From source file:org.apache.tika.parser.executable.ExecutableParser.java
License:Apache License
/** * Parses a DOS or Windows PE file// w w w . j a va2 s . com */ public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream, byte[] first4) throws TikaException, IOException { metadata.add(Metadata.CONTENT_TYPE, PE_EXE.toString()); metadata.set(PLATFORM, PLATFORM_WINDOWS); // Skip over the MS-DOS bit byte[] msdosSection = new byte[0x3c - 4]; IOUtils.readFully(stream, msdosSection); // Grab the PE header offset int peOffset = LittleEndian.readInt(stream); // Sanity check - while it may go anywhere, it's normally in the first few kb if (peOffset > 4096 || peOffset < 0x3f) return; // Skip the rest of the MS-DOS stub (if PE), until we reach what should // be the PE header (if this is a PE executable) stream.skip(peOffset - 0x40); // Read the PE header byte[] pe = new byte[24]; IOUtils.readFully(stream, pe); // Check it really is a PE header if (pe[0] == (byte) 'P' && pe[1] == (byte) 'E' && pe[2] == 0 && pe[3] == 0) { // Good, has a valid PE signature } else { // Old style MS-DOS return; } // Read the header values int machine = LittleEndian.getUShort(pe, 4); int numSectors = LittleEndian.getUShort(pe, 6); long createdAt = LittleEndian.getInt(pe, 8); long symbolTableOffset = LittleEndian.getInt(pe, 12); long numSymbols = LittleEndian.getInt(pe, 16); int sizeOptHdrs = LittleEndian.getUShort(pe, 20); int characteristcs = LittleEndian.getUShort(pe, 22); // Turn this into helpful metadata Date createdAtD = new Date(createdAt * 1000l); metadata.set(Metadata.CREATION_DATE, createdAtD); switch (machine) { case 0x14c: metadata.set(MACHINE_TYPE, MACHINE_x86_32); metadata.set(ENDIAN, Endian.LITTLE.getName()); metadata.set(ARCHITECTURE_BITS, "32"); break; case 0x8664: metadata.set(MACHINE_TYPE, MACHINE_x86_32); metadata.set(ENDIAN, Endian.LITTLE.getName()); metadata.set(ARCHITECTURE_BITS, "64"); break; case 0x200: metadata.set(MACHINE_TYPE, MACHINE_IA_64); metadata.set(ENDIAN, Endian.LITTLE.getName()); metadata.set(ARCHITECTURE_BITS, "64"); break; case 
0x184: metadata.set(MACHINE_TYPE, MACHINE_ALPHA); metadata.set(ENDIAN, Endian.LITTLE.getName()); metadata.set(ARCHITECTURE_BITS, "32"); break; case 0x284: metadata.set(MACHINE_TYPE, MACHINE_ALPHA); metadata.set(ENDIAN, Endian.LITTLE.getName()); metadata.set(ARCHITECTURE_BITS, "64"); break; case 0x1c0: case 0x1c4: metadata.set(MACHINE_TYPE, MACHINE_ARM); metadata.set(ENDIAN, Endian.LITTLE.getName()); metadata.set(ARCHITECTURE_BITS, "32"); break; case 0x268: metadata.set(MACHINE_TYPE, MACHINE_M68K); metadata.set(ENDIAN, Endian.BIG.getName()); metadata.set(ARCHITECTURE_BITS, "32"); break; case 0x266: case 0x366: case 0x466: metadata.set(MACHINE_TYPE, MACHINE_MIPS); metadata.set(ENDIAN, Endian.BIG.getName()); metadata.set(ARCHITECTURE_BITS, "16"); break; case 0x162: case 0x166: case 0x168: case 0x169: metadata.set(MACHINE_TYPE, MACHINE_MIPS); metadata.set(ENDIAN, Endian.LITTLE.getName()); metadata.set(ARCHITECTURE_BITS, "16"); break; case 0x1f0: case 0x1f1: metadata.set(MACHINE_TYPE, MACHINE_PPC); metadata.set(ENDIAN, Endian.LITTLE.getName()); metadata.set(ARCHITECTURE_BITS, "32"); break; case 0x1a2: case 0x1a3: metadata.set(MACHINE_TYPE, MACHINE_SH3); metadata.set(ENDIAN, Endian.BIG.getName()); metadata.set(ARCHITECTURE_BITS, "32"); break; case 0x1a6: metadata.set(MACHINE_TYPE, MACHINE_SH4); metadata.set(ENDIAN, Endian.BIG.getName()); metadata.set(ARCHITECTURE_BITS, "32"); break; case 0x1a8: metadata.set(MACHINE_TYPE, MACHINE_SH3); metadata.set(ENDIAN, Endian.BIG.getName()); metadata.set(ARCHITECTURE_BITS, "32"); break; case 0x9041: metadata.set(MACHINE_TYPE, MACHINE_M32R); metadata.set(ENDIAN, Endian.BIG.getName()); metadata.set(ARCHITECTURE_BITS, "32"); break; case 0xebc: metadata.set(MACHINE_TYPE, MACHINE_EFI); break; default: metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN); break; } }