Example usage for org.apache.poi.util LittleEndian getInt

List of usage examples for org.apache.poi.util LittleEndian getInt

Introduction

This page collects example usages of the method org.apache.poi.util.LittleEndian.getInt.

Prototype

public static int getInt(byte[] data, int offset) 

Source Link

Document

Gets an int value from a byte array, starting at the given offset.

Usage

From source file:com.krawler.esp.fileparser.word.ExtractWordFile.java

License:Open Source License

/**
 * Extracts the text of the Word document stored at {@code filepath}.
 *
 * <p>The file is opened as a POIFS file system and its "WordDocument" stream
 * is read into memory. Flags at offset 0xa of that stream decide how to
 * proceed: bit 0x4 rejects the file as fast-saved, bit 0x100 only prints a
 * password-protection notice, and bit 0x200 selects the "1Table" stream over
 * "0Table". Documents whose version field (offset 0x2) is 101-104 are handed
 * to {@link Word6Extractor}. Otherwise the text pieces and character runs are
 * loaded and the text of every run not reported as deleted is concatenated.
 *
 * @param filepath path of the Word file to read
 * @return the extracted text
 * @throws FastSavedException if the fast-saved flag is set in the header
 * @throws IOException if the file cannot be opened or read
 */
public String extractText(String filepath) throws FastSavedException, IOException {
    InputStream iStream = new BufferedInputStream(new FileInputStream(filepath));
    POIFSFileSystem fsys;
    try {
        fsys = new POIFSFileSystem(iStream);
    } finally {
        // The file system is fully loaded by the constructor, so the file
        // handle can be released here. Closing again is harmless if the POI
        // version in use has already closed the stream.
        iStream.close();
    }

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    din.read(header);
    din.close();

    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        System.out.println("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    // Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    // get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = useTable1 ? "1Table" : "0Table";

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    din.read(tableStream);
    din.close();

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt);

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs and extract the text only if they
    // haven't been deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // skip forward to the text piece that contains the start of this run
        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            // the run lies entirely inside the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            // the run spans several pieces: take the tail of each piece it
            // covers, then the head of the piece in which it ends
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    // no more pieces: return what has been collected so far
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            // the run ends exactly at the end of the current piece; move on
            // to the next piece (if any) for the following run
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:com.krawler.esp.fileparser.word.Word6Extractor.java

License:Open Source License

/**
 * Builds the text of a Word 6 document from its main stream.
 *
 * <p>The text bounds are read from offsets 0x18 (start) and 0x1C (end) of the
 * stream, and the location and size of the character-property table from
 * offsets 0xb8 and 0xbc. Each character run for which {@code isDeleted}
 * returns false is decoded from the stream as Cp1252 and appended; the scan
 * stops once such a run reaches or passes the end bound.
 *
 * @param mainStream bytes of the document's main stream
 * @return the concatenated text of the accepted runs
 * @throws IOException declared by the signature; the Cp1252 decoding is the
 *         only checked-exception source visible in this method
 */
public String extractText(byte[] mainStream) throws IOException {
    final int fcMin = LittleEndian.getInt(mainStream, 0x18);
    final int fcMax = LittleEndian.getInt(mainStream, 0x1C);

    final int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8);
    final int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);

    // character properties describe the runs that make up the text
    Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset, chpTableSize, fcMin);
    List runs = chpTable.getTextRuns();

    WordTextBuffer out = new WordTextBuffer();
    for (Iterator it = runs.iterator(); it.hasNext();) {
        CHPX run = (CHPX) it.next();
        // run positions are relative to fcMin; shift them to stream offsets
        int start = run.getStart() + fcMin;
        int end = run.getEnd() + fcMin;

        if (isDeleted(run.getGrpprl())) {
            continue;
        }

        // never read past the end of the text area
        int length = Math.min(end, fcMax) - start;
        out.append(new String(mainStream, start, length, "Cp1252"));
        if (end >= fcMax) {
            break;
        }
    }

    return out.toString();
}

From source file:com.krawler.esp.fileparser.word.Word6Extractor.java

License:Open Source License

/**
 * Builds the text of a Word 6 document from its main stream, using a
 * caller-supplied text piece table when loading the character runs.
 *
 * <p>The text bounds are read from offsets 0x18 (start) and 0x1C (end) of the
 * stream, and the location and size of the character-property table from
 * offsets 0xb8 and 0xbc. Each character run for which {@code isDeleted}
 * returns false is decoded from the stream as Cp1252 and appended; the scan
 * stops once such a run reaches or passes the end bound.
 *
 * @param mainStream bytes of the document's main stream
 * @param tpt text piece table passed through to {@code Word6CHPBinTable}
 * @return the concatenated text of the accepted runs
 * @throws IOException declared by the signature; the Cp1252 decoding is the
 *         only checked-exception source visible in this method
 */
public String extractText(byte[] mainStream, TextPieceTable tpt) throws IOException {
    final int fcMin = LittleEndian.getInt(mainStream, 0x18);
    final int fcMax = LittleEndian.getInt(mainStream, 0x1C);

    final int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8);
    final int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);

    // character properties describe the runs that make up the text
    Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset, chpTableSize, fcMin, tpt);
    List runs = chpTable.getTextRuns();

    WordTextBuffer out = new WordTextBuffer();
    for (Iterator it = runs.iterator(); it.hasNext();) {
        CHPX run = (CHPX) it.next();
        // run positions are relative to fcMin; shift them to stream offsets
        int start = run.getStart() + fcMin;
        int end = run.getEnd() + fcMin;

        if (isDeleted(run.getGrpprl())) {
            continue;
        }

        // never read past the end of the text area
        int length = Math.min(end, fcMax) - start;
        out.append(new String(mainStream, start, length, "Cp1252"));
        if (end >= fcMax) {
            break;
        }
    }

    return out.toString();
}

From source file:com.krawler.esp.fileparser.wordparser.ExtractWordFile.java

License:Open Source License

/**
 * Extracts the text of the Word document stored at {@code filepath}.
 *
 * <p>The file is opened as a POIFS file system and its "WordDocument" stream
 * is read into memory. Flags at offset 0xa of that stream decide how to
 * proceed: bit 0x4 rejects the file as fast-saved, bit 0x100 only prints a
 * password-protection notice, and bit 0x200 selects the "1Table" stream over
 * "0Table". After the table stream and the text piece table are loaded,
 * documents whose version field (offset 0x2) is 101-104 are handed to
 * {@link Word6Extractor} together with the text piece table. Otherwise the
 * character runs are loaded and the text of every run not reported as deleted
 * is concatenated.
 *
 * @param filepath path of the Word file to read
 * @return the extracted text
 * @throws FastSavedException if the fast-saved flag is set in the header
 * @throws IOException if the file cannot be opened or read
 */
public String extractText(String filepath) throws FastSavedException, IOException {
    InputStream iStream = new BufferedInputStream(new FileInputStream(filepath));
    POIFSFileSystem fsys;
    try {
        fsys = new POIFSFileSystem(iStream);
    } finally {
        // The file system is fully loaded by the constructor, so the file
        // handle can be released here. Closing again is harmless if the POI
        // version in use has already closed the stream.
        iStream.close();
    }

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    din.read(header);
    din.close();

    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        System.out.println("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    // Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    // get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = useTable1 ? "1Table" : "0Table";

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    din.read(tableStream);
    din.close();

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);

    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header, tpt);
    }

    // load our character runs and our text pieces
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt);
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs and extract the text only if they
    // haven't been deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // skip forward to the text piece that contains the start of this run
        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            // the run lies entirely inside the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            // the run spans several pieces: take the tail of each piece it
            // covers, then the head of the piece in which it ends
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    // no more pieces: return what has been collected so far
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            // the run ends exactly at the end of the current piece; move on
            // to the next piece (if any) for the following run
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:com.progdan.doc2txt.sprm.SprmOperation.java

License:Apache License

/**
 * Decodes one sprm from {@code grpprl}, starting at {@code offset}.
 *
 * <p>The first two bytes are read as a little-endian short from which the
 * operation, type and size code are extracted via the class's bit fields.
 * The size code then determines how the operand that follows is read and
 * how many bytes the whole sprm occupies ({@code _sizeNeeded}).
 *
 * @param grpprl byte array containing the sprm
 * @param offset index of the sprm's first byte within {@code grpprl}
 */
public SprmOperation(byte[] grpprl, int offset) {
    short sprmStart = LittleEndian.getShort(grpprl, offset);
    offset += 2;

    _operation = OP_BITFIELD.getValue(sprmStart);
    _type = TYPE_BITFIELD.getValue(sprmStart);
    int sizeCode = SIZECODE_BITFIELD.getValue(sprmStart);

    switch (sizeCode) {
    case 0:
    case 1:
        // one-byte operand
        _operand = LittleEndian.getUnsignedByte(grpprl, offset);
        _sizeNeeded = 3;
        break;
    case 2:
    case 4:
    case 5:
        // two-byte operand
        _operand = LittleEndian.getShort(grpprl, offset);
        _sizeNeeded = 4;
        break;
    case 3:
        // four-byte operand
        _operand = LittleEndian.getInt(grpprl, offset);
        _sizeNeeded = 6;
        break;
    case 6:
        // Variable-length operand prefixed by a one-byte length. Java bytes
        // are signed, so mask the prefix: without the mask a length of 128
        // or more becomes negative and the array allocation throws.
        int varLength = grpprl[offset++] & 0xFF;
        _varOperand = new byte[varLength];
        System.arraycopy(grpprl, offset, _varOperand, 0, _varOperand.length);
        _sizeNeeded = _varOperand.length + 3;
        break;
    case 7:
        // three-byte operand: pad with a zero high byte and read it as an int
        byte threeByteInt[] = new byte[4];
        threeByteInt[0] = grpprl[offset];
        threeByteInt[1] = grpprl[offset + 1];
        threeByteInt[2] = grpprl[offset + 2];
        threeByteInt[3] = (byte) 0;
        _operand = LittleEndian.getInt(threeByteInt, 0);
        _sizeNeeded = 5;
        break;

    }
}

From source file:com.progdan.doc2txt.Word6Extractor.java

License:Apache License

/**
 * Extracts the text/*from  www .  j a  va  2s  .com*/
 *
 * @param mainStream The POIFS document stream entitled "WordDocument".
 *
 * @return The text from the document
 * @throws Exception If there are any unexpected exceptions.
 */
public String extractText(byte[] mainStream) throws Exception {
    int fcMin = LittleEndian.getInt(mainStream, 0x18);
    int fcMax = LittleEndian.getInt(mainStream, 0x1C);

    int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8);
    int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);

    // get a list of character properties
    Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset, chpTableSize, fcMin);
    List textRuns = chpTable.getTextRuns();

    // iterate through the
    WordTextBuffer finalTextBuf = new WordTextBuffer();
    Iterator runsIt = textRuns.iterator();
    while (runsIt.hasNext()) {
        CHPX chpx = (CHPX) runsIt.next();
        int runStart = chpx.getStart() + fcMin;
        int runEnd = chpx.getEnd() + fcMin;

        if (!isDeleted(chpx.getGrpprl())) {
            String s = new String(mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252");
            finalTextBuf.append(s);
            if (runEnd >= fcMax) {
                break;
            }
        }
    }

    return finalTextBuf.toString();
}

From source file:com.progdan.doc2txt.WordExtractor.java

License:Apache License

/**
 * Gets the text from a Word document./*www  .j  a v a2s.c  o m*/
 *
 * @param in The InputStream representing the Word file.
 */
public String extractText(InputStream in) throws Exception {
    ArrayList text = new ArrayList();
    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    din.read(header);
    din.close();

    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        throw new PasswordProtectedException("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    //Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    //get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    din.read(tableStream);
    din.close();

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:org.apache.nutch.parse.mspowerpoint.ContentReaderListener.java

License:Apache License

/**
 * Extracts the client text boxes of a slide.
 * /*from  www  .  j av a 2  s .  co  m*/
 * @param containerTextBox
 * @param offset
 * @param pptdata
 * @param offsetPD
 * @return Hashtable
 * @see TextBox
 */
protected Hashtable/* <Long, TextBox> */ extractTextBoxes(final Hashtable/* <Long, TextBox> */ containerTextBox,
        final int offset, final byte[] pptdata, final long offsetPD) {

    // To hold temporary data
    FilteredStringWriter outStream = new FilteredStringWriter();

    TextBox textBox;

    // Traversing the bytearray up to Presist directory position
    for (int i = offset; i < offsetPD - 20; i++) {
        try {
            // Record info
            // final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
            // Record Type
            final long recordType = LittleEndian.getUShort(pptdata, i + 2);
            // Record Size
            final long recordSize = LittleEndian.getUInt(pptdata, i + 4);

            if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
                /*
                 * Record type is of Drawing Group
                 */

                // Total number of objects
                // final long objectCount = LittleEndian.getUInt(pptdata, (int) i +
                // 8);
                // currentID = Group ID+number of objects
                long currentID = LittleEndian.getInt(pptdata, i + 12);
                currentID = ((int) (currentID / 1024)) * 1024;

                if (currentID == PPTConstants.PPT_MASTERSLIDE) {
                    // Ignore Master Slide objects
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Ignore master slide.");
                    }
                    i++;
                    continue;
                }

                // Check for the ClientTextBox GroupID existence
                if (containerTextBox.containsKey(new Long(currentID))) {
                    // If exists get Client Textbox Group
                    textBox = (TextBox) containerTextBox.get(new Long(currentID));
                    textBox.setContent("");

                } else {
                    textBox = new TextBox(currentID);
                    containerTextBox.put(new Long(currentID), textBox);
                }

                /*
                 * Iterating the bytearray for TextCharAtoms and TextBytesAtom
                 */
                if ((offsetPD - 20) != recordSize) {
                    // TODO something wrong? Probably an OLE-Object, which we ignore.
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("offsetPD - 20=" + (offsetPD - 20) + " recordsize=" + recordSize);
                    }
                } else {
                    for (int startPos = i + 8; startPos < offsetPD - 20 && startPos < recordSize; startPos++) { // && startPos <
                        // recordSize??
                        try {

                            // Record info
                            // final long nrinfo = LittleEndian.getUShort(pptdata, (int) j);

                            // Record Type
                            final long ntype = LittleEndian.getUShort(pptdata, startPos + 2);

                            // Record size
                            // Note that the size doesn't include the 8 byte atom header
                            final long nsize = LittleEndian.getUInt(pptdata, startPos + 4);

                            if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
                                /*
                                 * Break the loop if next GroupID found
                                 */
                                i = startPos - 1;
                                break;
                            } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) {
                                // TextByteAtom record
                                outStream = new FilteredStringWriter();
                                long ii = 0;
                                for (ii = startPos + 6; ii <= startPos + 6 + nsize; ii++) {
                                    // For loop to changed to a function
                                    // if ((ii + 2) >= pptdata.length)
                                    // break; // FIXME
                                    outStream.write((char) (pptdata[(int) ii + 2]));
                                }

                                // Setting the identified text for Current
                                // groupID
                                textBox.setContent(textBox.getContent() + outStream.toString());

                            } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) {
                                // TextCharAtom record

                                final String strTempContent = new String(pptdata, startPos + 6,
                                        (int) (nsize) + 2);
                                final byte bytes[] = strTempContent.getBytes();
                                if (true) {
                                    outStream = new FilteredStringWriter();
                                    for (int ii = 0; ii < bytes.length - 1; ii += 2) {
                                        // For loop to changed to a function
                                        outStream.write((char) (pptdata[ii + 2]));
                                    }
                                    textBox.setContent(textBox.getContent() + outStream.toString());
                                } else {
                                    // this version is used within POI
                                    String text = StringUtil.getFromCompressedUnicode(bytes, 0, bytes.length);
                                    textBox.setContent(textBox.getContent() + text);
                                }

                            } else {
                                // ignored
                                // if (LOG.isTraceEnabled()) {
                                //   LOG.trace("Ignored atom type: " + type);
                                // }
                            }
                        } catch (Throwable e) {
                            if (LOG.isErrorEnabled()) {
                                LOG.error("extractTextBoxes", e);
                            }
                            break;
                        }
                    }
                }
            } else {
                // Record type is ignored
                // if (LOG.isTraceEnabled()) {
                //   LOG.trace("Ignored record type: " + type);
                // }
            }
        } catch (Throwable ee) {
            if (LOG.isErrorEnabled()) {
                LOG.error("extractClientTextBoxes", ee);
            }
            break;
        }
    }
    return containerTextBox;
}

From source file:org.apache.nutch.parse.msword.WordExtractor.java

License:Apache License

/**
 * Gets the text from a Word document.
 *
 * <p>The "WordDocument" stream is read into memory and the flags at its
 * offset 0xa are inspected: bit 0x4 rejects the file as fast-saved, bit 0x100
 * rejects it as password protected, and bit 0x200 selects the "1Table" stream
 * over "0Table". Documents whose version field (offset 0x2) is 101-104 are
 * handed to {@link Word6Extractor}. Otherwise the character runs and text
 * pieces are loaded and the text of every run not reported as deleted is
 * concatenated.
 *
 * @param in The InputStream representing the Word file.
 * @return the extracted text
 * @throws Exception if the document is fast-saved or password protected, or
 *         if reading or parsing it fails
 */
protected String extractText(InputStream in) throws Exception {

    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // pull the whole main document stream into memory
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    final byte[] header = new byte[headerProps.getSize()];
    din.read(header);
    din.close();

    // reject documents this extractor cannot handle
    final int info = LittleEndian.getShort(header, 0xa);
    final boolean fastSaved = (info & 0x4) != 0;
    if (fastSaved) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    final boolean passwordProtected = (info & 0x100) != 0;
    if (passwordProtected) {
        throw new PasswordProtectedException("This document is password protected");
    }

    // Word 6.0 documents (version 101-104) go to the extractor for that version
    final int nFib = LittleEndian.getShort(header, 0x2);
    if (nFib >= 101 && nFib <= 104) {
        return new Word6Extractor().extractText(header);
    }

    // location of the piece table
    final int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // pick the table stream named by the header flag and load it
    final String tableName = ((info & 0x200) != 0) ? "1Table" : "0Table";
    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    final byte[] tableStream = new byte[table.getSize()];
    din = fsys.createDocumentInputStream(tableName);
    din.read(tableStream);
    din.close();

    final int chpOffset = LittleEndian.getInt(header, 0xfa);
    final int chpSize = LittleEndian.getInt(header, 0xfe);
    final int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // text pieces come from the complex file table
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    List textPieces = cft.getTextPieceTable().getTextPieces();

    // the POIFS objects are no longer needed; let them be collected
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    Iterator runIt = cbt.getTextRuns().iterator();
    Iterator pieceIt = textPieces.iterator();

    TextPiece piece = (TextPiece) pieceIt.next();
    int pieceStart = piece.getStart();
    int pieceEnd = piece.getEnd();

    WordTextBuffer out = new WordTextBuffer();

    // walk the character runs in order, skipping those marked as deleted
    while (runIt.hasNext()) {
        CHPX run = (CHPX) runIt.next();
        if (isDeleted(run.getGrpprl())) {
            continue;
        }

        int runStart = run.getStart();
        final int runEnd = run.getEnd();

        // advance to the piece in which this run begins
        while (runStart >= pieceEnd) {
            piece = (TextPiece) pieceIt.next();
            pieceStart = piece.getStart();
            pieceEnd = piece.getEnd();
        }

        if (runEnd == pieceEnd) {
            // run finishes exactly with the piece: emit it, then step to the
            // next piece when there is one
            out.append(piece.substring(runStart - pieceStart, runEnd - pieceStart));
            if (pieceIt.hasNext()) {
                piece = (TextPiece) pieceIt.next();
                pieceStart = piece.getStart();
                pieceEnd = piece.getEnd();
            }
        } else if (runEnd < pieceEnd) {
            // run is contained in the current piece
            out.append(piece.substring(runStart - pieceStart, runEnd - pieceStart));
        } else {
            // run continues past the current piece: emit the remainder of
            // each piece it covers, then the head of the piece where it ends
            do {
                out.append(piece.substring(runStart - pieceStart, pieceEnd - pieceStart));
                if (!pieceIt.hasNext()) {
                    return out.toString();
                }
                piece = (TextPiece) pieceIt.next();
                pieceStart = piece.getStart();
                runStart = pieceStart;
                pieceEnd = piece.getEnd();
            } while (runEnd > pieceEnd);
            out.append(piece.substring(0, runEnd - pieceStart));
        }
    }
    return out.toString();
}

From source file:org.apache.tika.parser.executable.ExecutableParser.java

License:Apache License

/**
 * Parses a DOS or Windows PE file and records what it finds in the metadata.
 *
 * <p>The content type and platform are always set. If a valid PE signature is
 * found at the offset given in the MS-DOS header, the creation date, machine
 * type, endianness and architecture bits are set as well. Plain MS-DOS
 * executables, implausible PE offsets and truncated streams cause an early
 * return with only the content type and platform recorded.
 *
 * @param xhtml    content handler; not used by this method
 * @param metadata metadata object that receives the extracted values
 * @param stream   the executable's data; presumably positioned just after the
 *                 first 4 bytes of the file (the MS-DOS header read below is
 *                 sized as {@code 0x3c - 4}) - confirm against the caller
 * @param first4   the bytes already consumed by the caller; not used by this method
 * @throws TikaException declared for callers; not thrown directly by this method
 * @throws IOException   if reading from the stream fails
 */
public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream, byte[] first4)
        throws TikaException, IOException {
    metadata.add(Metadata.CONTENT_TYPE, PE_EXE.toString());
    metadata.set(PLATFORM, PLATFORM_WINDOWS);

    // Skip over the MS-DOS bit, up to the PE header offset field at 0x3c
    byte[] msdosSection = new byte[0x3c - 4];
    IOUtils.readFully(stream, msdosSection);

    // Grab the PE header offset
    int peOffset = LittleEndian.readInt(stream);

    // Sanity check - while it may go anywhere, it's normally in the first few kb
    if (peOffset > 4096 || peOffset < 0x3f) {
        return;
    }

    // Skip the rest of the MS-DOS stub (if PE), until we reach what should
    //  be the PE header (if this is a PE executable).
    // InputStream.skip() may skip fewer bytes than requested, so loop until
    //  the full distance is covered, falling back to read() to detect EOF.
    long remaining = peOffset - 0x40;
    while (remaining > 0) {
        long skipped = stream.skip(remaining);
        if (skipped <= 0) {
            if (stream.read() < 0) {
                // Stream ended before the claimed PE header: not a usable PE file
                return;
            }
            skipped = 1;
        }
        remaining -= skipped;
    }

    // Read the PE header: 4 byte signature followed by the 20 byte COFF header
    byte[] pe = new byte[24];
    IOUtils.readFully(stream, pe);

    // Check it really is a PE header
    boolean hasPeSignature = pe[0] == (byte) 'P' && pe[1] == (byte) 'E' && pe[2] == 0 && pe[3] == 0;
    if (!hasPeSignature) {
        // Old style MS-DOS
        return;
    }

    // Read the header values we use. The remaining COFF fields (number of
    //  sections at 6, symbol table offset at 12, number of symbols at 16,
    //  optional header size at 20, characteristics at 22) are not needed.
    int machine = LittleEndian.getUShort(pe, 4);
    // The timestamp is an unsigned 32 bit count of seconds since the epoch;
    //  mask it so values >= 2^31 don't turn into negative (pre-1970) dates
    long createdAt = LittleEndian.getInt(pe, 8) & 0xFFFFFFFFL;

    // Turn this into helpful metadata
    Date createdAtD = new Date(createdAt * 1000L);
    metadata.set(Metadata.CREATION_DATE, createdAtD);

    switch (machine) {
    case 0x14c:
        metadata.set(MACHINE_TYPE, MACHINE_x86_32);
        metadata.set(ENDIAN, Endian.LITTLE.getName());
        metadata.set(ARCHITECTURE_BITS, "32");
        break;
    case 0x8664:
        // AMD64 / x86-64
        metadata.set(MACHINE_TYPE, MACHINE_x86_64);
        metadata.set(ENDIAN, Endian.LITTLE.getName());
        metadata.set(ARCHITECTURE_BITS, "64");
        break;
    case 0x200:
        metadata.set(MACHINE_TYPE, MACHINE_IA_64);
        metadata.set(ENDIAN, Endian.LITTLE.getName());
        metadata.set(ARCHITECTURE_BITS, "64");
        break;

    case 0x184:
        metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
        metadata.set(ENDIAN, Endian.LITTLE.getName());
        metadata.set(ARCHITECTURE_BITS, "32");
        break;
    case 0x284:
        metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
        metadata.set(ENDIAN, Endian.LITTLE.getName());
        metadata.set(ARCHITECTURE_BITS, "64");
        break;

    case 0x1c0:
    case 0x1c4:
        metadata.set(MACHINE_TYPE, MACHINE_ARM);
        metadata.set(ENDIAN, Endian.LITTLE.getName());
        metadata.set(ARCHITECTURE_BITS, "32");
        break;

    case 0x268:
        metadata.set(MACHINE_TYPE, MACHINE_M68K);
        metadata.set(ENDIAN, Endian.BIG.getName());
        metadata.set(ARCHITECTURE_BITS, "32");
        break;

    case 0x266:
    case 0x366:
    case 0x466:
        metadata.set(MACHINE_TYPE, MACHINE_MIPS);
        metadata.set(ENDIAN, Endian.BIG.getName());
        metadata.set(ARCHITECTURE_BITS, "16");
        break;
    case 0x162:
    case 0x166:
    case 0x168:
    case 0x169:
        metadata.set(MACHINE_TYPE, MACHINE_MIPS);
        metadata.set(ENDIAN, Endian.LITTLE.getName());
        metadata.set(ARCHITECTURE_BITS, "16");
        break;

    case 0x1f0:
    case 0x1f1:
        metadata.set(MACHINE_TYPE, MACHINE_PPC);
        metadata.set(ENDIAN, Endian.LITTLE.getName());
        metadata.set(ARCHITECTURE_BITS, "32");
        break;

    case 0x1a2:
    case 0x1a3:
        metadata.set(MACHINE_TYPE, MACHINE_SH3);
        metadata.set(ENDIAN, Endian.BIG.getName());
        metadata.set(ARCHITECTURE_BITS, "32");
        break;
    case 0x1a6:
        metadata.set(MACHINE_TYPE, MACHINE_SH4);
        metadata.set(ENDIAN, Endian.BIG.getName());
        metadata.set(ARCHITECTURE_BITS, "32");
        break;
    case 0x1a8:
        // Hitachi SH5
        metadata.set(MACHINE_TYPE, MACHINE_SH5);
        metadata.set(ENDIAN, Endian.BIG.getName());
        metadata.set(ARCHITECTURE_BITS, "32");
        break;

    case 0x9041:
        metadata.set(MACHINE_TYPE, MACHINE_M32R);
        metadata.set(ENDIAN, Endian.BIG.getName());
        metadata.set(ARCHITECTURE_BITS, "32");
        break;

    case 0xebc:
        metadata.set(MACHINE_TYPE, MACHINE_EFI);
        break;

    default:
        metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN);
        break;
    }
}