Example usage for org.apache.poi.poifs.filesystem DocumentInputStream read

List of usage examples for org.apache.poi.poifs.filesystem DocumentInputStream read

Introduction

This page collects usage examples for the read method of org.apache.poi.poifs.filesystem.DocumentInputStream.

Prototype

@Override
    public int read(byte[] b, int off, int len) throws IOException 

Source Link

Usage

From source file:com.duroty.lucene.parser.MSPowerPointParser.java

License:Open Source License

/**
 * Copies text records out of the "PowerPoint Document" stream.
 *
 * <p>Events for any other stream name are ignored. The stream is loaded
 * into memory and scanned one byte at a time: whenever the value returned by
 * {@code LittleEndian.getUShort} at {@code pos + 2} equals 4008, the bytes
 * starting at {@code pos + 5} are written to {@code writer}, followed by two
 * line feeds, and the scan continues behind the copied region. The number of
 * bytes written is the value read at {@code pos + 4} plus 3.
 *
 * <p>Any exception raised along the way ends the extraction silently.
 *
 * @param event the reader event that supplies the stream name and content
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    try {
        if (event.getName().equalsIgnoreCase("PowerPoint Document")) {
            DocumentInputStream stream = event.getStream();

            byte[] data = new byte[stream.available()];
            stream.read(data, 0, stream.available());

            byte[] separator = "\n\n".getBytes();
            int limit = data.length - 20;
            int pos = 0;

            while (pos < limit) {
                long recordType = LittleEndian.getUShort(data, pos + 2);
                long recordSize = LittleEndian.getUInt(data, pos + 4);

                // By default the scan advances a single byte.
                int next = pos + 1;

                if (recordType == 4008) {
                    writer.write(data, pos + 4 + 1, (int) recordSize + 3);
                    writer.write(separator);
                    // Resume right after the region that was just written.
                    next = pos + 4 + 1 + (int) recordSize;
                }

                pos = next;
            }
        }
    } catch (Exception ex) {
        // Extraction is best effort: failures are deliberately not reported.
    }
}

From source file:com.flexive.extractor.PowerpointExtractor.java

License:Open Source License

/**
 * Dispatches a single stream of the file being read.
 *
 * <p>The "PowerPoint Document" stream (name compared ignoring case) is loaded
 * into memory and passed to {@code processContent}. The
 * "\005SummaryInformation" stream is turned into a property set and wrapped
 * in an {@code FxSummaryInformation} stored in {@code fxsi}. Every other
 * stream is ignored. Exceptions are printed to standard error and not
 * propagated.
 *
 * @param event the reader event that supplies the stream name and content
 */
@Override
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    try {
        final String streamName = event.getName();

        if (streamName.equalsIgnoreCase("PowerPoint Document")) {
            DocumentInputStream stream = event.getStream();
            byte[] content = new byte[stream.available()];
            //noinspection ResultOfMethodCallIgnored
            stream.read(content, 0, stream.available());
            processContent(content, 0, content.length);
            return;
        }

        if (streamName.equals("\005SummaryInformation")) {
            SummaryInformation summary = (SummaryInformation) PropertySetFactory.create(event.getStream());
            fxsi = new FxSummaryInformation(summary);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}

From source file:edu.tsinghua.lumaqq.customface.EIPImporter.java

License:Open Source License

/**
 * Saves the current face into the directory of the given group.
 *
 * <p>Groups whose id equals {@code FaceConstant.CUSTOM_HEAD_GROUP_ID} are
 * delegated to {@code saveCustomHead}. For every other group the content of
 * {@code currentFace} is copied to
 * {@code destDir + g.getId() + '/' + entry.filename}; the written file is
 * then loaded again, scaled to 20x20 and stored as a BMP named
 * {@code entry.md5 + "fixed.bmp"} in the same directory.
 *
 * @param g
 *       the group that receives the face
 * @return
 *       {@code true} if the face and its thumbnail were written,
 *       {@code false} if an {@code IOException} or {@code SWTException}
 *       occurred
 */
public boolean saveEntry(FaceGroup g) {
    if (g.getId() == FaceConstant.CUSTOM_HEAD_GROUP_ID)
        return saveCustomHead(g);

    FileOutputStream fos = null;
    DocumentInputStream dis = null;
    try {
        // Copy the document content to the destination file.
        String filename = destDir + g.getId() + '/' + entry.filename;
        fos = new FileOutputStream(filename);
        dis = new DocumentInputStream(currentFace);
        int count;
        while ((count = dis.read(buffer, 0, buffer.length)) != -1)
            fos.write(buffer, 0, count);

        // Create the 20x20 thumbnail from the file that was just written.
        try {
            ImageLoader loader = new ImageLoader();
            loader.load(filename);
            ImageData data = loader.data[0].scaledTo(20, 20);
            loader = new ImageLoader();
            loader.data = new ImageData[] { data };
            loader.save(destDir + g.getId() + '/' + entry.md5 + "fixed.bmp", SWT.IMAGE_BMP);
        } catch (SWTException e) {
            return false;
        }

        return true;
    } catch (IOException e) {
        return false;
    } finally {
        // Close each stream in its own try block: a failure while closing
        // the output stream must not prevent the input stream from being
        // closed as well.
        try {
            if (fos != null)
                fos.close();
        } catch (IOException ignored) {
            // nothing sensible can be done about a failed close
        }
        try {
            if (dis != null)
                dis.close();
        } catch (IOException ignored) {
            // nothing sensible can be done about a failed close
        }
    }
}

From source file:edu.tsinghua.lumaqq.customface.EIPImporter.java

License:Open Source License

/**
 * Saves the current face as a custom head in two sizes.
 *
 * <p>The content of {@code currentFace} is read into memory, decoded as an
 * image and stored twice as BMP below {@code destDir + g.getId() + '/'}:
 * a 40x40 version named {@code entry.md5 + ".bmp"} and a 20x20 version named
 * {@code entry.md5 + "fixed.bmp"}.
 *
 * @param g
 *       the group whose id selects the destination directory
 * @return
 *       {@code true} if both images were written, {@code false} if an
 *       {@code SWTException} or {@code IOException} occurred
 */
private boolean saveCustomHead(FaceGroup g) {
    DocumentInputStream in = null;
    try {
        // Pull the complete document into memory.
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        in = new DocumentInputStream(currentFace);
        int count;
        while ((count = in.read(buffer, 0, buffer.length)) != -1)
            collected.write(buffer, 0, count);

        // Decode the collected bytes as an image.
        ImageData origin = new ImageData(new ByteArrayInputStream(collected.toByteArray()));

        // Write the 40x40 bitmap first, then the 20x20 one.
        int[] edges = { 40, 20 };
        String[] suffixes = { ".bmp", "fixed.bmp" };
        for (int k = 0; k < edges.length; k++) {
            ImageData scaled = origin.scaledTo(edges[k], edges[k]);
            ImageLoader saver = new ImageLoader();
            saver.data = new ImageData[] { scaled };
            saver.save(destDir + g.getId() + '/' + entry.md5 + suffixes[k], SWT.IMAGE_BMP);
        }

        return true;
    } catch (SWTException e) {
        return false;
    } catch (IOException e) {
        return false;
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
            }
        }
    }
}

From source file:lius.index.powerpoint.PPTIndexer.java

License:Apache License

/**
 * Copies text records out of the "PowerPoint Document" stream.
 *
 * <p>Events for any other stream name are ignored. The stream is loaded
 * into memory and scanned one byte at a time: whenever the value returned by
 * {@code LittleEndian.getUShort} at {@code i + 2} equals 4008, the bytes
 * starting at {@code i + 5} are written to {@code writer} and the scan
 * continues behind the copied region. The number of bytes written is the
 * value read at {@code i + 4} plus 3.
 *
 * <p>Exceptions are logged and not propagated.
 *
 * @param event the reader event that supplies the stream name and content
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    try {
        if (!event.getName().equalsIgnoreCase("PowerPoint Document"))
            return;
        DocumentInputStream input = event.getStream();
        byte[] buffer = new byte[input.available()];
        input.read(buffer, 0, input.available());
        for (int i = 0; i < buffer.length - 20; i++) {
            long type = LittleEndian.getUShort(buffer, i + 2);
            long size = LittleEndian.getUInt(buffer, i + 4);
            if (type == 4008L) {
                writer.write(buffer, i + 4 + 1, (int) size + 3);
                // The loop header adds the final + 1.
                i = i + 4 + 1 + (int) size - 1;
            }
        }
    } catch (Exception ex) {
        // Hand the exception itself to the logger: getMessage() alone drops
        // the exception type and stack trace and is frequently null.
        logger.error("Failed to extract text from PowerPoint Document stream", ex);
    }
}

From source file:org.apache.nutch.parse.mspowerpoint.ContentReaderListener.java

License:Apache License

/**
 * Reads the internal PowerPoint document stream.
 * /*from  w  w w.jav a2s. com*/
 * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
 */
public void processPOIFSReaderEvent(final POIFSReaderEvent event) {

    if (event == null || event.getName() == null
            || !event.getName().startsWith(PPTConstants.POWERPOINT_DOCUMENT)) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Stream not processed. It is not a PowerPoint document: : " + event.getName());
        }
        return;
    }

    try {
        final DocumentInputStream dis = event.getStream();
        final byte pptdata[] = new byte[dis.available()];
        dis.read(pptdata, 0, dis.available());
        int offset = 0;
        long offsetPD = 0;

        /*
         * Traverse Bytearray to get CurrentUserEditAtom Call to extract the Text
         * in all PlaceHolders to hold PPTClientTextBox objects for mapping into
         * Slide Objects
         */
        Hashtable/* <Long, TextBox> */ containerTextBox = new Hashtable/*
                                                                        * <Long,
                                                                        * TextBox>
                                                                        */();
        // Traverse ByteArray to identiy edit paths of ClientTextBoxes
        long n = pptdata.length - 20;
        for (long i = 0; i < n; i++) {

            final long type = LittleEndian.getUShort(pptdata, (int) i + 2);
            // final long size = LittleEndian.getUInt(pptdata, (int) i + 4);

            if (PPTConstants.PPT_ATOM_USEREDIT == type) {
                /*
                 * Checking the Record Header (UserEditAtom)
                 */
                // final long lastSlideID = LittleEndian.getInt(pptdata, (int) i + 8);
                // final long version = LittleEndian.getUInt(pptdata, (int) i + 12);
                offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16);
                offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20);

                /*
                 * Call to extract ClientTextBox text in each UserEditAtom
                 */
                containerTextBox = extractTextBoxes(containerTextBox, offset, pptdata, offsetPD);
            } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) {
                // if (LOG.isTraceEnabled()) {
                //   LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type);
                // }
            } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) {
                // if (LOG.isTraceEnabled()) {
                //   LOG.trace("PPT_TEXTBYTE_ATOM ignored: " + type);
                // }
            } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) {
                // if (LOG.isTraceEnabled()) {
                //   LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type);
                // }
            } else {
                // no action
                // if (LOG.isTraceEnabled()) {
                //   LOG.trace("type not handled: " + type);
                // }
            }
        }

        final List/* <PPTSlide> */ slides = extractSlides(offset, pptdata, offsetPD);

        if (slides.size() == 0) {
            if (LOG.isInfoEnabled()) {
                LOG.info("No slides extracted!");
            }

        } else {
            Slide slide = (Slide) slides.get(slides.size() - 1);

            for (Enumeration enumeration = containerTextBox.elements(); enumeration.hasMoreElements();) {
                final TextBox textBox = (TextBox) enumeration.nextElement();
                slide.addContent(textBox.getContent());
            }

            /*
             * Merging TextBox data with Slide Data Printing the text from Slides
             * vector object.
             */
            List scontent;
            for (int i = 0; i < slides.size(); i++) {
                slide = (Slide) slides.get(i);
                scontent = slide.getContent();
                String contentText;

                for (int j = 0; j < scontent.size(); j++) {
                    contentText = scontent.get(j).toString();
                    this.buf.append(contentText);

                    // to avoid concatinated words we add a blank additional
                    if (contentText.length() > 0
                            && !(contentText.endsWith("\r") || contentText.endsWith("\n"))) {
                        this.buf.append(" ");
                    }
                }
            }
        }
    } catch (Throwable ex) {
        // because of not killing complete crawling all Throwables are catched.
        if (LOG.isErrorEnabled()) {
            LOG.error("processPOIFSReaderEvent", ex);
        }
    }
}

From source file:org.apache.slide.extractor.MSPowerPointExtractor.java

License:Apache License

/**
 * Copies text records out of the "PowerPoint Document" stream.
 *
 * <p>Events for any other stream name are ignored. The stream is loaded
 * into memory and scanned one byte at a time: whenever the value returned by
 * {@code LittleEndian.getUShort} at {@code pos + 2} equals 4008, the bytes
 * starting at {@code pos + 5} are written to {@code writer} and the scan
 * continues behind the copied region. The number of bytes written is the
 * value read at {@code pos + 4} plus 3.
 *
 * <p>Any exception raised along the way ends the extraction silently.
 *
 * @param event the reader event that supplies the stream name and content
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    try {
        if (event.getName().equalsIgnoreCase("PowerPoint Document")) {
            DocumentInputStream stream = event.getStream();

            byte[] data = new byte[stream.available()];
            stream.read(data, 0, stream.available());

            final int limit = data.length - 20;
            int pos = 0;
            while (pos < limit) {
                long recordType = LittleEndian.getUShort(data, pos + 2);
                long recordSize = LittleEndian.getUInt(data, pos + 4);

                if (recordType == 4008) {
                    writer.write(data, pos + 4 + 1, (int) recordSize + 3);
                    // Resume right after the region that was just written.
                    pos = pos + 4 + 1 + (int) recordSize;
                } else {
                    pos++;
                }
            }
        }
    } catch (Exception e) {
        // Extraction is best effort: failures are deliberately not reported.
    }
}

From source file:org.jlibrary.core.search.extraction.PowerPointExtractor.java

License:Open Source License

/**
 * Appends the text found in a PowerPoint document stream to
 * {@code m_buffer}.
 *
 * <p>The event is first passed to the super implementation. Streams whose
 * name does not start with {@code POWERPOINT_EVENT_NAME} are then ignored.
 * Otherwise the stream is loaded into memory and scanned one byte at a time
 * for records of type {@code PPT_TEXTBYTE_ATOM} (decoded with
 * {@code ENCODING_CP1252}) and {@code PPT_TEXTCHAR_ATOM} (decoded with
 * {@code ENCODING_UTF16}). Any exception ends the extraction silently.
 *
 * @param event the reader event that supplies the stream name and content
 * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {

    try {

        // super implementation handles document summary
        super.processPOIFSReaderEvent(event);

        // make sure this is a PPT document
        if (!event.getName().startsWith(POWERPOINT_EVENT_NAME)) {
            return;
        }

        DocumentInputStream input = event.getStream();
        byte[] buffer = new byte[input.available()];
        input.read(buffer, 0, input.available());

        for (int i = 0; i < buffer.length - 20; i++) {
            int type = LittleEndian.getUShort(buffer, i + 2);

            String encoding;
            if (type == PPT_TEXTBYTE_ATOM) {
                // this piece is single-byte encoded; Cp1252 is assumed since
                // it is the most likely encoding
                encoding = ENCODING_CP1252;
            } else if (type == PPT_TEXTCHAR_ATOM) {
                // this piece is double-byte encoded, use UTF-16
                encoding = ENCODING_UTF16;
            } else {
                // not a text record, keep scanning
                continue;
            }

            int size = (int) LittleEndian.getUInt(buffer, i + 4) + 3;
            int start = i + 4 + 1;
            int end = start + size;

            byte[] buf = new byte[size];
            System.arraycopy(buffer, start, buf, 0, buf.length);

            m_buffer.append(new String(buf, encoding));

            // 'end' is the first byte behind the copied region. The loop
            // header still increments i, so step back by one here; otherwise
            // the byte at 'end' would be skipped and a record beginning
            // exactly there would never be examined.
            i = end - 1;
        }
    } catch (Exception e) {
        // ignore
    }
}

From source file:org.opencms.search.extractors.CmsExtractorMsPowerPoint.java

License:Open Source License

/**
 * Appends the text found in a PowerPoint document stream to
 * {@code m_buffer}.
 *
 * <p>The event is first passed to the super implementation. Streams whose
 * name does not start with {@code POWERPOINT_EVENT_NAME} are then ignored.
 * Otherwise the stream is loaded into memory and scanned one byte at a time
 * for records of type {@code PPT_TEXTBYTE_ATOM} (decoded with
 * {@code ENCODING_CP1252}) and {@code PPT_TEXTCHAR_ATOM} (decoded with
 * {@code ENCODING_UTF16}). Any exception ends the extraction silently.
 *
 * @param event the reader event that supplies the stream name and content
 * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {

    try {

        // super implementation handles document summary
        super.processPOIFSReaderEvent(event);

        // make sure this is a PPT document
        if (!event.getName().startsWith(POWERPOINT_EVENT_NAME)) {
            return;
        }

        DocumentInputStream input = event.getStream();
        byte[] buffer = new byte[input.available()];
        input.read(buffer, 0, input.available());

        for (int i = 0; i < buffer.length - 20; i++) {
            int type = LittleEndian.getUShort(buffer, i + 2);

            String encoding;
            if (type == PPT_TEXTBYTE_ATOM) {
                // this piece is single-byte encoded; Cp1252 is assumed since
                // it is the most likely encoding
                encoding = ENCODING_CP1252;
            } else if (type == PPT_TEXTCHAR_ATOM) {
                // this piece is double-byte encoded, use UTF-16
                encoding = ENCODING_UTF16;
            } else {
                // not a text record, keep scanning
                continue;
            }

            int size = (int) LittleEndian.getUInt(buffer, i + 4) + 3;
            int start = i + 4 + 1;
            int end = start + size;

            byte[] buf = new byte[size];
            System.arraycopy(buffer, start, buf, 0, buf.length);

            m_buffer.append(CmsEncoder.createString(buf, encoding));

            // 'end' is the first byte behind the copied region. The loop
            // header still increments i, so step back by one here; otherwise
            // the byte at 'end' would be skipped and a record beginning
            // exactly there would never be examined.
            i = end - 1;
        }
    } catch (Exception e) {
        // ignore (this also covers every RuntimeException)
    }
}

From source file:org.sakaiproject.search.component.adapter.contenthosting.PPTContentDigester.java

License:Educational Community License

/**
 * Extracts the text of a PowerPoint resource.
 *
 * <p>The resource content is fed through a {@code POIFSReader}. A listener
 * picks the "PowerPoint Document" stream, scans it one byte at a time and
 * copies every region whose type value (read at {@code pos + 2}) equals 4008
 * into an in-memory buffer. The collected bytes are finally decoded as
 * UTF-8 and passed through {@code SearchUtils.appendCleanString}.
 *
 * @param contentResource the resource to read; must not be {@code null}
 * @return the cleaned text collected from the resource
 * @throws RuntimeException if {@code contentResource} is {@code null} or the
 *         content cannot be read
 */
public String getContent(ContentResource contentResource) {
    if (contentResource == null) {
        throw new RuntimeException("Null contentResource passed to getContent");
    }

    InputStream contentStream = null;
    try {
        // this is informed by the text extractors in Jackrabbit
        final ByteArrayOutputStream extracted = new ByteArrayOutputStream();

        POIFSReaderListener listener = new POIFSReaderListener() {
            public void processPOIFSReaderEvent(POIFSReaderEvent event) {
                try {
                    if (event.getName().equalsIgnoreCase("PowerPoint Document")) {
                        DocumentInputStream stream = event.getStream();
                        byte[] data = new byte[stream.available()];
                        stream.read(data, 0, stream.available());

                        final int limit = data.length - 20;
                        int pos = 0;
                        while (pos < limit) {
                            long recordType = LittleEndian.getUShort(data, pos + 2);
                            long recordSize = LittleEndian.getUInt(data, pos + 4);
                            if (recordType == 4008) {
                                extracted.write(data, pos + 4 + 1, (int) recordSize + 3);
                                // Resume right after the copied region.
                                pos = pos + 4 + 1 + (int) recordSize;
                            } else {
                                pos++;
                            }
                        }
                    }
                } catch (Exception e) {
                    log.debug(e);
                }
            }
        };

        POIFSReader reader = new POIFSReader();
        reader.registerListener(listener);
        contentStream = contentResource.streamContent();
        reader.read(contentStream);
        extracted.flush();

        StringBuilder cleaned = new StringBuilder();
        String raw = new String(extracted.toByteArray(), "UTF-8");
        SearchUtils.appendCleanString(raw, cleaned);
        return cleaned.toString();
    } catch (Exception e) {
        throw new RuntimeException("Failed to read content for indexing ", e);
    } finally {
        if (contentStream != null) {
            try {
                contentStream.close();
            } catch (IOException e) {
                log.debug(e);
            }
        }
    }
}