Example usage for org.apache.poi.poifs.filesystem DocumentInputStream read

List of usage examples for org.apache.poi.poifs.filesystem DocumentInputStream read

Introduction

On this page you can find example usages of the read(byte[]) method of org.apache.poi.poifs.filesystem.DocumentInputStream.

Prototype

@Override
    public int read(byte[] b) throws IOException 

Source Link

Usage

From source file:com.argo.hwp.v5.HwpTextExtractorV5.java

License:Open Source License

/**
 * HWP? FileHeader /* w  w w  . j  av  a 2s  . c om*/
 * 
 * @param fs
 * @return
 * @throws IOException
 */
private static FileHeader getHeader(NPOIFSFileSystem fs) throws IOException {
    DirectoryNode root = fs.getRoot();

    // ??? p.18

    // FileHeader  
    Entry headerEntry = root.getEntry("FileHeader");
    if (!headerEntry.isDocumentEntry())
        return null;

    //  ?
    byte[] header = new byte[256]; // FileHeader ? 256
    DocumentInputStream headerStream = new DocumentInputStream((DocumentEntry) headerEntry);
    try {
        int read = headerStream.read(header);
        if (read != 256
                || !Arrays.equals(HWP_V5_SIGNATURE, Arrays.copyOfRange(header, 0, HWP_V5_SIGNATURE.length)))
            return null;
    } finally {
        headerStream.close();
    }

    FileHeader fileHeader = new FileHeader();

    // . debug
    fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt(header, 32));
    long flags = LittleEndian.getUInt(header, 36);
    log.debug("Flags={}", Long.toBinaryString(flags).replace(' ', '0'));

    fileHeader.compressed = (flags & 0x01) == 0x01;
    fileHeader.encrypted = (flags & 0x02) == 0x02;
    fileHeader.viewtext = (flags & 0x04) == 0x04;

    return fileHeader;
}

From source file:com.krawler.esp.fileparser.word.ExtractWordFile.java

License:Open Source License

/**
 * Extracts the text of the Word document stored at {@code filepath}.
 * <p>
 * The "WordDocument" stream and the table stream it selects ("0Table" or
 * "1Table") are loaded from the OLE2 container. The text is then assembled
 * by walking the character runs over the text pieces, skipping every run
 * that {@code isDeleted} reports as deleted. Documents whose nFib value is
 * 101-104 are handed to {@link Word6Extractor} instead.
 *
 * @param filepath path of the Word file to read
 * @return the extracted text
 * @throws FastSavedException if the header flags mark the file as fast-saved
 * @throws IOException if the file or one of its streams cannot be read
 */
public String extractText(String filepath) throws FastSavedException, IOException {
    byte[] header;
    byte[] tableStream;

    // The file stream is closed on every exit path (normal return, Word 6
    // early return, or exception); previously it was never closed at all.
    InputStream iStream = new BufferedInputStream(new FileInputStream(filepath));
    try {
        POIFSFileSystem fsys = new POIFSFileSystem(iStream);

        // load the main document stream.
        DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
        DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
        try {
            header = new byte[headerProps.getSize()];
            din.read(header);
        } finally {
            din.close();
        }

        int info = LittleEndian.getShort(header, 0xa);
        if ((info & 0x4) != 0) {
            throw new FastSavedException("Fast-saved files are unsupported at this time");
        }
        if ((info & 0x100) != 0) {
            System.out.println("This document is password protected");
        }

        // determine the version of Word this document came from.
        int nFib = LittleEndian.getShort(header, 0x2);
        switch (nFib) {
        case 101:
        case 102:
        case 103:
        case 104:
            // this is a Word 6.0 doc send it to the extractor for that version.
            Word6Extractor oldExtractor = new Word6Extractor();
            return oldExtractor.extractText(header);
        }

        // determine which table stream we must use.
        boolean useTable1 = (info & 0x200) != 0;
        String tableName = useTable1 ? "1Table" : "0Table";

        DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
        tableStream = new byte[table.getSize()];
        din = fsys.createDocumentInputStream(tableName);
        try {
            din.read(tableStream);
        } finally {
            din.close();
        }
    } finally {
        iStream.close();
    }

    // get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt);

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // A document without any text piece has nothing to extract; without
    // this guard the first next() call would throw NoSuchElementException.
    if (!textIt.hasNext()) {
        return finalTextBuf.toString();
    }

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    // iterate through all text runs extract the text only if they haven't
    // been deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // skip forward to the piece that contains the start of this run
        while (runStart >= currentTextEnd) {
            if (!textIt.hasNext()) {
                // Ran out of text pieces: return what was collected so far,
                // the same way the multi-piece branch below handles it.
                return finalTextBuf.toString();
            }
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            // run lies completely inside the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            // run spans several pieces: take the rest of each piece in turn
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            // run ends exactly at the end of the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:com.krawler.esp.fileparser.wordparser.ExtractWordFile.java

License:Open Source License

/**
 * Extracts the text of the Word document stored at {@code filepath}.
 * <p>
 * The "WordDocument" stream and the table stream it selects ("0Table" or
 * "1Table") are loaded from the OLE2 container and the text piece table is
 * built from them. Documents whose nFib value is 101-104 are handed to
 * {@link Word6Extractor} together with that piece table. For all others the
 * text is assembled by walking the character runs over the text pieces,
 * skipping every run that {@code isDeleted} reports as deleted.
 *
 * @param filepath path of the Word file to read
 * @return the extracted text
 * @throws FastSavedException if the header flags mark the file as fast-saved
 * @throws IOException if the file or one of its streams cannot be read
 */
public String extractText(String filepath) throws FastSavedException, IOException {
    byte[] header;
    byte[] tableStream;

    // The file stream is closed on every exit path (normal completion or
    // exception); previously it was never closed at all.
    InputStream iStream = new BufferedInputStream(new FileInputStream(filepath));
    try {
        POIFSFileSystem fsys = new POIFSFileSystem(iStream);

        // load the main document stream.
        DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
        DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
        try {
            header = new byte[headerProps.getSize()];
            din.read(header);
        } finally {
            din.close();
        }

        int info = LittleEndian.getShort(header, 0xa);
        if ((info & 0x4) != 0) {
            throw new FastSavedException("Fast-saved files are unsupported at this time");
        }
        if ((info & 0x100) != 0) {
            System.out.println("This document is password protected");
        }

        // determine which table stream we must use.
        boolean useTable1 = (info & 0x200) != 0;
        String tableName = useTable1 ? "1Table" : "0Table";

        DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
        tableStream = new byte[table.getSize()];
        din = fsys.createDocumentInputStream(tableName);
        try {
            din.read(tableStream);
        } finally {
            din.close();
        }
    } finally {
        iStream.close();
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);

    // get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);

    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header, tpt);
    }

    // load our character runs and our text pieces
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt);
    List textPieces = tpt.getTextPieces();

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // A document without any text piece has nothing to extract; without
    // this guard the first next() call would throw NoSuchElementException.
    if (!textIt.hasNext()) {
        return finalTextBuf.toString();
    }

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    // iterate through all text runs extract the text only if they haven't
    // been deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // skip forward to the piece that contains the start of this run
        while (runStart >= currentTextEnd) {
            if (!textIt.hasNext()) {
                // Ran out of text pieces: return what was collected so far,
                // the same way the multi-piece branch below handles it.
                return finalTextBuf.toString();
            }
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            // run lies completely inside the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            // run spans several pieces: take the rest of each piece in turn
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            // run ends exactly at the end of the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:com.progdan.doc2txt.WordExtractor.java

License:Apache License

/**
 * Gets the text from a Word document./*from w  w w .j  a v a  2 s. c o m*/
 *
 * @param in The InputStream representing the Word file.
 */
public String extractText(InputStream in) throws Exception {
    ArrayList text = new ArrayList();
    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    din.read(header);
    din.close();

    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        throw new PasswordProtectedException("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    //Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    //get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    din.read(tableStream);
    din.close();

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:net.freeutils.tnef.msg.Msg.java

License:Open Source License

/**
 * Copies the complete content of a document entry into memory and wraps the
 * resulting bytes in a {@code RawInputStream}.
 *
 * @param entry the document entry to read
 * @return a RawInputStream over the entry's bytes
 * @throws IOException if reading the entry fails
 */
protected static RawInputStream toRawInputStream(DocumentEntry entry) throws IOException {
    DocumentInputStream source = new DocumentInputStream(entry);
    // pre-size the in-memory buffer with the byte count the stream reports
    ByteArrayOutputStream collected = new ByteArrayOutputStream(source.available());
    try {
        final byte[] chunk = new byte[4096];
        // copy chunk by chunk until read() signals end of stream
        for (int n = source.read(chunk); n >= 0; n = source.read(chunk)) {
            collected.write(chunk, 0, n);
        }
    } finally {
        source.close();
    }
    return new RawInputStream(collected.toByteArray());
}

From source file:net.sf.mmm.content.parser.impl.poi.ContentParserPpt.java

License:Apache License

/**
 * {@inheritDoc}
 */
@Override
protected String extractText(POIFSFileSystem poiFs, long filesize, ContentParserOptions options)
        throws Exception {

    // NOTE(review): a PowerPointExtractor-based variant (pptExtractor.getText())
    // was left commented out here in favour of the manual record walk below.

    DocumentInputStream pptStream = poiFs.createDocumentInputStream(POIFS_POWERPOINT_DOC);

    // read at most as many bytes as the options allow
    int byteCount = Math.min(pptStream.available(), options.getMaximumBufferSize());

    // initial text capacity is a tenth of the raw byte count
    StringBuffer text = new StringBuffer(byteCount / 10);
    byte[] raw = new byte[byteCount];
    pptStream.read(raw);
    pptStream.close();

    extractRecursive(raw, 0, byteCount, text);
    return text.toString();
}

From source file:net.sf.mpxj.utility.MppClean.java

License:Open Source License

/**
 * Extracts a block of data from the MPP file, and iterates through the map
 * of find/replace pairs to make the data anonymous. The document entry is
 * then deleted and re-created under the same name with the modified data.
 *
 * @param parentDirectory parent directory object
 * @param fileName target file name
 * @param replacements find/replace data
 * @param unicode true for double byte text
 * @throws IOException if the entry cannot be read or re-created
 */
private void processReplacements(DirectoryEntry parentDirectory, String fileName,
        Map<String, String> replacements, boolean unicode) throws IOException {
    //
    // Populate a list of keys and sort into descending order of length
    // (presumably so that longer search strings are replaced before any
    // shorter string they contain - confirm against replaceData)
    //
    List<String> keys = new ArrayList<String>(replacements.keySet());
    Collections.sort(keys, new Comparator<String>() {
        @Override
        public int compare(String o1, String o2) {
            return (o2.length() - o1.length());
        }
    });

    //
    // Extract the raw file data; the stream is closed in all cases
    // (it was previously left open)
    //
    DocumentEntry targetFile = (DocumentEntry) parentDirectory.getEntry(fileName);
    byte[] data;
    DocumentInputStream dis = new DocumentInputStream(targetFile);
    try {
        data = new byte[dis.available()];
        dis.read(data);
    } finally {
        dis.close();
    }

    //
    // Replace the text
    //
    for (String findText : keys) {
        String replaceText = replacements.get(findText);
        replaceData(data, findText, replaceText, unicode);
    }

    //
    // Remove the document entry
    //
    targetFile.delete();

    //
    // Replace it with a new one
    //
    parentDirectory.createDocument(fileName, new ByteArrayInputStream(data));
}

From source file:net.sf.mpxj.utility.MppCleanUtility.java

License:Open Source License

/**
 * Anonymises one document entry of the MPP file: its bytes are loaded, every
 * find/replace pair is applied to them, and the entry is then deleted and
 * re-created under the same name with the modified bytes.
 *
 * @param parentDirectory parent directory object
 * @param fileName target file name
 * @param replacements find/replace data
 * @param unicode true for double byte text
 * @throws IOException if the entry cannot be read or re-created
 */
private void processReplacements(DirectoryEntry parentDirectory, String fileName,
        Map<String, String> replacements, boolean unicode) throws IOException {
    // Orders search strings from longest to shortest.
    Comparator<String> longestFirst = new Comparator<String>() {
        @Override
        public int compare(String left, String right) {
            return right.length() - left.length();
        }
    };
    List<String> searchTerms = new ArrayList<String>(replacements.keySet());
    Collections.sort(searchTerms, longestFirst);

    // Load the entry's raw bytes.
    DocumentEntry entry = (DocumentEntry) parentDirectory.getEntry(fileName);
    DocumentInputStream input = new DocumentInputStream(entry);
    byte[] raw = new byte[input.available()];
    input.read(raw);
    input.close();

    // Apply each replacement in sorted order.
    for (int i = 0; i < searchTerms.size(); i++) {
        String term = searchTerms.get(i);
        replaceData(raw, term, replacements.get(term), unicode);
    }

    // Swap the old entry for one holding the modified bytes.
    entry.delete();
    parentDirectory.createDocument(fileName, new ByteArrayInputStream(raw));
}

From source file:nz.govt.natlib.adapter.excel.ExcelAdapter.java

License:Apache License

/**
 * Reads the content of a document entry, provided the stream reports no
 * more than 256 available bytes; larger entries are skipped.
 * <p>
 * The bytes are read but not interpreted. The stream is closed on every
 * path, including the early return for large entries.
 *
 * @param fs the file system the entry belongs to (not used here)
 * @param doc the document entry to read
 * @throws Exception if opening or reading the entry fails
 */
public void readDocument(POIFSFileSystem fs, DocumentEntry doc) throws Exception {
    DocumentInputStream stream = new DocumentInputStream(doc);
    try {
        if (stream.available() > 256) {
            return;
        }

        // process data from stream
        byte[] content = new byte[stream.available()];
        stream.read(content);
        // A loop that only widened each byte to its unsigned value, without
        // using the result, used to follow here; it had no effect.
    } finally {
        stream.close();
    }
}

From source file:nz.govt.natlib.adapter.powerpoint.PowerPointAdapter.java

License:Apache License

/**
 * Dumps the content of a document entry to standard output, provided the
 * stream reports no more than 256 available bytes; larger entries are
 * skipped.
 * <p>
 * One line is printed per byte: its index, its unsigned decimal value, its
 * hex value and the corresponding character. The stream is closed on every
 * path, including the early return for large entries.
 *
 * @param fs the file system the entry belongs to (not used here)
 * @param doc the document entry to read
 * @throws Exception if opening or reading the entry fails
 */
public void readDocument(POIFSFileSystem fs, DocumentEntry doc) throws Exception {
    byte[] content;
    DocumentInputStream stream = new DocumentInputStream(doc);
    try {
        if (stream.available() > 256) {
            return;
        }

        // process data from stream
        content = new byte[stream.available()];
        stream.read(content);
    } finally {
        stream.close();
    }

    for (int i = 0; i < content.length; i++) {
        // widen the signed byte to its unsigned value (0-255)
        int c = content[i] & 0xFF;
        System.out.println(i + ", " + Integer.toString(c) + "\t" + Integer.toHexString(c) + "\t" + (char) c);
    }
}