List of usage examples for org.apache.poi.poifs.filesystem.DocumentInputStream#read
@Override
public int read(byte[] b, int off, int len) throws IOException
From source file:com.duroty.lucene.parser.MSPowerPointParser.java
License:Open Source License
/**
 * Handles a POIFS reader event by scanning the "PowerPoint Document" stream
 * and copying the payload of every record of type 4008 to {@code writer},
 * followed by two newlines.
 *
 * <p>Events for any other stream are ignored. Any exception raised while
 * reading or scanning silently ends the extraction.
 *
 * @param event the POIFS reader event providing the stream name and content
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    try {
        boolean isPowerPointStream = event.getName().equalsIgnoreCase("PowerPoint Document");
        if (!isPowerPointStream) {
            return;
        }

        DocumentInputStream input = event.getStream();
        byte[] data = new byte[input.available()];
        input.read(data, 0, input.available());

        byte[] separator = "\n\n".getBytes();
        int scanLimit = data.length - 20;
        int pos = 0;

        while (pos < scanLimit) {
            long recordType = LittleEndian.getUShort(data, pos + 2);
            long recordSize = LittleEndian.getUInt(data, pos + 4);

            // 4008 is presumably the PowerPoint text-bytes atom -- confirm against the format spec.
            if (recordType == 4008) {
                writer.write(data, pos + 5, (int) recordSize + 3);
                writer.write(separator);
                // Resume scanning right after the bytes the size field accounts for.
                pos += 5 + (int) recordSize;
            } else {
                pos++;
            }
        }
    } catch (Exception ignored) {
        // Best-effort extraction: failures are deliberately not reported.
    }
}
From source file:com.flexive.extractor.PowerpointExtractor.java
License:Open Source License
@Override public void processPOIFSReaderEvent(POIFSReaderEvent event) { try {/*from w w w . j av a 2s . c om*/ if (event.getName().equalsIgnoreCase("PowerPoint Document")) { DocumentInputStream input = event.getStream(); byte[] buffer = new byte[input.available()]; //noinspection ResultOfMethodCallIgnored input.read(buffer, 0, input.available()); processContent(buffer, 0, buffer.length); } else if (event.getName().equals("\005SummaryInformation")) { SummaryInformation si = (SummaryInformation) PropertySetFactory.create(event.getStream()); fxsi = new FxSummaryInformation(si); } } catch (Exception ex) { ex.printStackTrace(); } }
From source file:edu.tsinghua.lumaqq.customface.EIPImporter.java
License:Open Source License
/** * ?//from w w w. j a v a 2 s . co m * * @param g * * @return * true? */ public boolean saveEntry(FaceGroup g) { if (g.getId() == FaceConstant.CUSTOM_HEAD_GROUP_ID) return saveCustomHead(g); else { FileOutputStream fos = null; DocumentInputStream dis = null; try { // ? String filename = destDir + g.getId() + '/' + entry.filename; fos = new FileOutputStream(filename); dis = new DocumentInputStream(currentFace); for (int i = 0; i != -1; i = dis.read(buffer, 0, buffer.length)) fos.write(buffer, 0, i); // ? try { ImageLoader loader = new ImageLoader(); loader.load(filename); ImageData data = loader.data[0].scaledTo(20, 20); loader = new ImageLoader(); loader.data = new ImageData[] { data }; loader.save(destDir + g.getId() + '/' + entry.md5 + "fixed.bmp", SWT.IMAGE_BMP); } catch (SWTException e) { return false; } return true; } catch (IOException e) { return false; } finally { try { if (fos != null) fos.close(); if (dis != null) dis.close(); } catch (IOException e) { } } } }
From source file:edu.tsinghua.lumaqq.customface.EIPImporter.java
License:Open Source License
/** * ??//from w w w .j ava2 s. co m * * @return * true?? */ private boolean saveCustomHead(FaceGroup g) { DocumentInputStream dis = null; try { // ? ByteArrayOutputStream baos = new ByteArrayOutputStream(); dis = new DocumentInputStream(currentFace); for (int i = 0; i != -1; i = dis.read(buffer, 0, buffer.length)) baos.write(buffer, 0, i); // ?ImageData ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); ImageData origin = new ImageData(bais); ImageData data = origin.scaledTo(40, 40); // save 40x40 bmp ImageLoader saveLoader = new ImageLoader(); saveLoader.data = new ImageData[] { data }; saveLoader.save(destDir + g.getId() + '/' + entry.md5 + ".bmp", SWT.IMAGE_BMP); // save 20x20 bmp data = origin.scaledTo(20, 20); saveLoader = new ImageLoader(); saveLoader.data = new ImageData[] { data }; saveLoader.save(destDir + g.getId() + '/' + entry.md5 + "fixed.bmp", SWT.IMAGE_BMP); return true; } catch (SWTException e) { return false; } catch (IOException e) { return false; } finally { try { if (dis != null) dis.close(); } catch (IOException e) { } } }
From source file:lius.index.powerpoint.PPTIndexer.java
License:Apache License
/**
 * Handles a POIFS reader event by scanning the "PowerPoint Document" stream
 * and copying the payload of every record of type 4008 to {@code writer}.
 *
 * <p>Events for any other stream are ignored. Failures end the extraction
 * and are logged together with their stack trace.
 *
 * @param event the POIFS reader event providing the stream name and content
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    try {
        if (!event.getName().equalsIgnoreCase("PowerPoint Document")) {
            return;
        }

        DocumentInputStream input = event.getStream();
        byte[] buffer = new byte[input.available()];
        input.read(buffer, 0, input.available());

        for (int i = 0; i < buffer.length - 20; i++) {
            long type = LittleEndian.getUShort(buffer, i + 2);
            long size = LittleEndian.getUInt(buffer, i + 4);

            // 4008 is presumably the PowerPoint text-bytes atom -- confirm against the format spec.
            if (type == 4008L) {
                writer.write(buffer, i + 4 + 1, (int) size + 3);
                // Skip the bytes just written; the loop increment adds the final +1.
                i = i + 4 + 1 + (int) size - 1;
            }
        }
    } catch (Exception ex) {
        // Pass the exception itself so the stack trace is kept and a
        // message-less exception is not logged as a bare "null".
        logger.error(ex.getMessage(), ex);
    }
}
From source file:org.apache.nutch.parse.mspowerpoint.ContentReaderListener.java
License:Apache License
/** * Reads the internal PowerPoint document stream. * /*from w w w.jav a2s. com*/ * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent) */ public void processPOIFSReaderEvent(final POIFSReaderEvent event) { if (event == null || event.getName() == null || !event.getName().startsWith(PPTConstants.POWERPOINT_DOCUMENT)) { if (LOG.isWarnEnabled()) { LOG.warn("Stream not processed. It is not a PowerPoint document: : " + event.getName()); } return; } try { final DocumentInputStream dis = event.getStream(); final byte pptdata[] = new byte[dis.available()]; dis.read(pptdata, 0, dis.available()); int offset = 0; long offsetPD = 0; /* * Traverse Bytearray to get CurrentUserEditAtom Call to extract the Text * in all PlaceHolders to hold PPTClientTextBox objects for mapping into * Slide Objects */ Hashtable/* <Long, TextBox> */ containerTextBox = new Hashtable/* * <Long, * TextBox> */(); // Traverse ByteArray to identiy edit paths of ClientTextBoxes long n = pptdata.length - 20; for (long i = 0; i < n; i++) { final long type = LittleEndian.getUShort(pptdata, (int) i + 2); // final long size = LittleEndian.getUInt(pptdata, (int) i + 4); if (PPTConstants.PPT_ATOM_USEREDIT == type) { /* * Checking the Record Header (UserEditAtom) */ // final long lastSlideID = LittleEndian.getInt(pptdata, (int) i + 8); // final long version = LittleEndian.getUInt(pptdata, (int) i + 12); offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16); offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20); /* * Call to extract ClientTextBox text in each UserEditAtom */ containerTextBox = extractTextBoxes(containerTextBox, offset, pptdata, offsetPD); } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) { // if (LOG.isTraceEnabled()) { // LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type); // } } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) { // if (LOG.isTraceEnabled()) { // LOG.trace("PPT_TEXTBYTE_ATOM 
ignored: " + type); // } } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) { // if (LOG.isTraceEnabled()) { // LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type); // } } else { // no action // if (LOG.isTraceEnabled()) { // LOG.trace("type not handled: " + type); // } } } final List/* <PPTSlide> */ slides = extractSlides(offset, pptdata, offsetPD); if (slides.size() == 0) { if (LOG.isInfoEnabled()) { LOG.info("No slides extracted!"); } } else { Slide slide = (Slide) slides.get(slides.size() - 1); for (Enumeration enumeration = containerTextBox.elements(); enumeration.hasMoreElements();) { final TextBox textBox = (TextBox) enumeration.nextElement(); slide.addContent(textBox.getContent()); } /* * Merging TextBox data with Slide Data Printing the text from Slides * vector object. */ List scontent; for (int i = 0; i < slides.size(); i++) { slide = (Slide) slides.get(i); scontent = slide.getContent(); String contentText; for (int j = 0; j < scontent.size(); j++) { contentText = scontent.get(j).toString(); this.buf.append(contentText); // to avoid concatinated words we add a blank additional if (contentText.length() > 0 && !(contentText.endsWith("\r") || contentText.endsWith("\n"))) { this.buf.append(" "); } } } } } catch (Throwable ex) { // because of not killing complete crawling all Throwables are catched. if (LOG.isErrorEnabled()) { LOG.error("processPOIFSReaderEvent", ex); } } }
From source file:org.apache.slide.extractor.MSPowerPointExtractor.java
License:Apache License
/**
 * Handles a POIFS reader event by scanning the "PowerPoint Document" stream
 * and copying the payload of every record of type 4008 to {@code writer}.
 *
 * <p>Events for any other stream are ignored. Any exception raised while
 * reading or scanning silently ends the extraction.
 *
 * @param event the POIFS reader event providing the stream name and content
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    try {
        boolean isPowerPointStream = event.getName().equalsIgnoreCase("PowerPoint Document");
        if (!isPowerPointStream) {
            return;
        }

        DocumentInputStream input = event.getStream();
        byte[] data = new byte[input.available()];
        input.read(data, 0, input.available());

        int scanLimit = data.length - 20;
        int pos = 0;
        while (pos < scanLimit) {
            long recordType = LittleEndian.getUShort(data, pos + 2);
            long recordSize = LittleEndian.getUInt(data, pos + 4);

            if (recordType != 4008) {
                pos++;
                continue;
            }

            // 4008 is presumably the PowerPoint text-bytes atom -- confirm against the format spec.
            writer.write(data, pos + 5, (int) recordSize + 3);
            // Resume scanning right after the bytes the size field accounts for.
            pos += 5 + (int) recordSize;
        }
    } catch (Exception ignored) {
        // Best-effort extraction: failures are deliberately not reported.
    }
}
From source file:org.jlibrary.core.search.extraction.PowerPointExtractor.java
License:Open Source License
/** * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent) *//*from www. jav a 2 s .c om*/ public void processPOIFSReaderEvent(POIFSReaderEvent event) { try { // super implementation handles document summary super.processPOIFSReaderEvent(event); // make sue this is a PPT document if (!event.getName().startsWith(POWERPOINT_EVENT_NAME)) { return; } DocumentInputStream input = event.getStream(); byte[] buffer = new byte[input.available()]; input.read(buffer, 0, input.available()); for (int i = 0; i < buffer.length - 20; i++) { int type = LittleEndian.getUShort(buffer, i + 2); int size = (int) LittleEndian.getUInt(buffer, i + 4) + 3; String encoding = null; switch (type) { case PPT_TEXTBYTE_ATOM: // this pice is single-byte encoded, let's assume Cp1252 since this is most likley // anyone who knows how to find out the "right" encoding - please email me encoding = ENCODING_CP1252; case PPT_TEXTCHAR_ATOM: if (encoding == null) { // this piece is double-byte encoded, use UTF-16 encoding = ENCODING_UTF16; } int start = i + 4 + 1; int end = start + size; byte[] buf = new byte[size]; System.arraycopy(buffer, start, buf, 0, buf.length); m_buffer.append(new String(buf, encoding)); i = end; default: // noop } } } catch (Exception e) { // ignore } }
From source file:org.opencms.search.extractors.CmsExtractorMsPowerPoint.java
License:Open Source License
/** * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent) */// w w w . jav a 2 s . c o m public void processPOIFSReaderEvent(POIFSReaderEvent event) { try { // super implementation handles document summary super.processPOIFSReaderEvent(event); // make sue this is a PPT document if (!event.getName().startsWith(POWERPOINT_EVENT_NAME)) { return; } DocumentInputStream input = event.getStream(); byte[] buffer = new byte[input.available()]; input.read(buffer, 0, input.available()); for (int i = 0; i < buffer.length - 20; i++) { int type = LittleEndian.getUShort(buffer, i + 2); int size = (int) LittleEndian.getUInt(buffer, i + 4) + 3; String encoding = null; switch (type) { case PPT_TEXTBYTE_ATOM: // this pice is single-byte encoded, let's assume Cp1252 since this is most likley // anyone who knows how to find out the "right" encoding - please email me encoding = ENCODING_CP1252; case PPT_TEXTCHAR_ATOM: if (encoding == null) { // this piece is double-byte encoded, use UTF-16 encoding = ENCODING_UTF16; } int start = i + 4 + 1; int end = start + size; byte[] buf = new byte[size]; System.arraycopy(buffer, start, buf, 0, buf.length); m_buffer.append(CmsEncoder.createString(buf, encoding)); i = end; default: // noop } } } catch (RuntimeException e) { // ignore } catch (Exception e) { // ignore } }
From source file:org.sakaiproject.search.component.adapter.contenthosting.PPTContentDigester.java
License:Educational Community License
public String getContent(ContentResource contentResource) { if (contentResource == null) { throw new RuntimeException("Null contentResource passed to getContent"); }/*from w w w . ja v a 2s . c om*/ InputStream contentStream = null; try { // this is informed by the text extractors in Jackrabbit final ByteArrayOutputStream os = new ByteArrayOutputStream(); POIFSReaderListener listener = new POIFSReaderListener() { public void processPOIFSReaderEvent(POIFSReaderEvent event) { try { if (!event.getName().equalsIgnoreCase("PowerPoint Document")) { return; } DocumentInputStream input = event.getStream(); byte[] buffer = new byte[input.available()]; input.read(buffer, 0, input.available()); for (int i = 0; i < buffer.length - 20; i++) { long type = LittleEndian.getUShort(buffer, i + 2); long size = LittleEndian.getUInt(buffer, i + 4); if (type == 4008) { os.write(buffer, i + 4 + 1, (int) size + 3); i = i + 4 + 1 + (int) size - 1; } } } catch (Exception e) { log.debug(e); } } }; POIFSReader reader = new POIFSReader(); reader.registerListener(listener); contentStream = contentResource.streamContent(); reader.read(contentStream); os.flush(); StringBuilder sb = new StringBuilder(); SearchUtils.appendCleanString(new String(os.toByteArray(), "UTF-8"), sb); return sb.toString(); } catch (Exception e) { throw new RuntimeException("Failed to read content for indexing ", e); } finally { if (contentStream != null) { try { contentStream.close(); } catch (IOException e) { log.debug(e); } } } }