List of usage examples for org.apache.poi.poifs.eventfilesystem POIFSReaderEvent getName
public String getName()
From source file:com.duroty.lucene.parser.MSPowerPointParser.java
License:Open Source License
/**
 * Copies text-bearing records of the "PowerPoint Document" stream to {@code writer}.
 *
 * <p>Events for any other stream name are ignored. The stream content is loaded into a
 * byte array and scanned one byte at a time; each position whose 16-bit value at offset 2
 * equals 4008 is treated as a record, a slice of the buffer is written out followed by two
 * line feeds, and the scan jumps past that record. Any exception raised along the way is
 * discarded without notice.
 *
 * @param event the reader event describing the stream currently being visited
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    try {
        final String streamName = event.getName();
        if (!streamName.equalsIgnoreCase("PowerPoint Document")) {
            return;
        }

        final DocumentInputStream stream = event.getStream();
        final byte[] data = new byte[stream.available()];
        stream.read(data, 0, stream.available());

        // Separator emitted after every extracted record.
        final byte[] separator = "\n\n".getBytes();

        final int scanLimit = data.length - 20;
        int pos = 0;
        while (pos < scanLimit) {
            final long recordType = LittleEndian.getUShort(data, pos + 2);
            final long recordSize = LittleEndian.getUInt(data, pos + 4);

            if (recordType == 4008) {
                // NOTE(review): 4008 is presumably the text-bytes record type of the
                // PowerPoint binary format - confirm against the format specification.
                writer.write(data, pos + 5, (int) recordSize + 3);
                writer.write(separator);
                // Continue right behind the record that was just written.
                pos += 5 + (int) recordSize;
            } else {
                pos++;
            }
        }
    } catch (Exception ex) {
        // Intentionally left empty: failures are ignored by this parser.
    }
}
From source file:com.flexive.extractor.PowerpointExtractor.java
License:Open Source License
@Override public void processPOIFSReaderEvent(POIFSReaderEvent event) { try {/*from www .j a v a 2 s. c o m*/ if (event.getName().equalsIgnoreCase("PowerPoint Document")) { DocumentInputStream input = event.getStream(); byte[] buffer = new byte[input.available()]; //noinspection ResultOfMethodCallIgnored input.read(buffer, 0, input.available()); processContent(buffer, 0, buffer.length); } else if (event.getName().equals("\005SummaryInformation")) { SummaryInformation si = (SummaryInformation) PropertySetFactory.create(event.getStream()); fxsi = new FxSummaryInformation(si); } } catch (Exception ex) { ex.printStackTrace(); } }
From source file:com.frameworkset.platform.cms.searchmanager.extractors.A_CmsTextExtractorMsOfficeBase.java
License:Open Source License
/** * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent) */// w ww . j a v a 2s.c o m public void processPOIFSReaderEvent(POIFSReaderEvent event) { try { if ((m_summary == null) && event.getName().startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) { m_summary = (SummaryInformation) PropertySetFactory.create(event.getStream()); return; } if ((m_documentSummary == null) && event.getName().startsWith(DocumentSummaryInformation.DEFAULT_STREAM_NAME)) { m_documentSummary = (DocumentSummaryInformation) PropertySetFactory.create(event.getStream()); return; } } catch (Exception e) { // ignore } }
From source file:com.knowgate.ole.OLEListener.java
License:Open Source License
public void processPOIFSReaderEvent(POIFSReaderEvent event) { try {// ww w . ja v a 2s. c om si = (SummaryInformation) PropertySetFactory.create(event.getStream()); } catch (MarkUnsupportedException ex) { if (DebugFile.trace) DebugFile.writeln("com.knowgate.ole.OLEListener MarkUnsupportedException " + event.getPath() + event.getName() + " " + ex.getMessage()); } catch (NoPropertySetStreamException ex) { if (DebugFile.trace) DebugFile.writeln("com.knowgate.ole.OLEListener NoPropertySetStreamException " + event.getPath() + event.getName() + " " + ex.getMessage()); } catch (IOException ex) { if (DebugFile.trace) DebugFile.writeln("com.knowgate.ole.OLEListener IOException " + event.getPath() + event.getName() + " " + ex.getMessage()); } }
From source file:com.villemos.ispace.aperture.enricher.MicrosoftPropertyReader.java
License:Open Source License
/**
 * Reads the visited stream as a property set and records its properties in
 * {@code msProperties}.
 *
 * <p>For every property of every section, the property's PID string is used as key and the
 * string form of its value as entry. A key that is already present is never overwritten, and
 * properties whose value is {@code null} or whose string form is empty are skipped.
 *
 * <p>If the stream is not a property set stream, or cannot be parsed for any other reason,
 * the problem is logged and the method returns without touching {@code msProperties}.
 *
 * @param event the reader event describing the stream currently being visited
 */
public void processPOIFSReaderEvent(final POIFSReaderEvent event) {
    PropertySet ps;
    try {
        ps = PropertySetFactory.create(event.getStream());
    } catch (NoPropertySetStreamException ex) {
        LOG.debug("No property set stream: \"" + event.getPath() + event.getName() + "\"");
        return;
    } catch (Exception ex) {
        LOG.error("Exception while processing microsoft property set " + ex);
        // Without a property set there is nothing to iterate; carrying on would
        // dereference a null reference below.
        return;
    }

    LOG.debug("Property set stream \"" + event.getPath() + event.getName() + "\":");

    List<Section> sections = ps.getSections();
    for (Section sec : sections) {
        for (Property p : sec.getProperties()) {
            Object value = p.getValue();
            if (value == null) {
                // A property without a value has nothing to contribute.
                continue;
            }

            String propertyName = sec.getPIDString(p.getID());
            if (msProperties.containsKey(propertyName)) {
                // First value seen for a name wins.
                continue;
            }

            String valueStr = value.toString();
            if (!valueStr.equals("")) {
                msProperties.put(propertyName, valueStr);
            }
        }
    }
}
From source file:lius.index.powerpoint.PPTIndexer.java
License:Apache License
/**
 * Copies text-bearing records of the "PowerPoint Document" stream to {@code writer}.
 *
 * <p>Events for any other stream name are ignored. The stream content is loaded into a
 * byte array and scanned one byte at a time; each position whose 16-bit value at offset 2
 * equals 4008 is treated as a record, a slice of the buffer is written out, and the scan
 * jumps past that record. Failures are logged together with their stack trace and are not
 * rethrown.
 *
 * @param event the reader event describing the stream currently being visited
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    try {
        if (!event.getName().equalsIgnoreCase("PowerPoint Document")) {
            return;
        }

        DocumentInputStream input = event.getStream();
        byte[] buffer = new byte[input.available()];
        input.read(buffer, 0, input.available());

        for (int i = 0; i < buffer.length - 20; i++) {
            long type = LittleEndian.getUShort(buffer, i + 2);
            long size = LittleEndian.getUInt(buffer, i + 4);
            if (type == 4008L) {
                // NOTE(review): 4008 is presumably the text-bytes record type of the
                // PowerPoint binary format - confirm against the format specification.
                writer.write(buffer, i + 4 + 1, (int) size + 3);
                // Land on the last byte of the record; the loop increment moves past it.
                i = i + 4 + 1 + (int) size - 1;
            }
        }
    } catch (Exception ex) {
        // Hand the throwable to the logger as well: getMessage() may be null and, on its
        // own, drops the stack trace needed to diagnose the failure.
        logger.error(ex.getMessage(), ex);
    }
}
From source file:org.apache.nutch.parse.mspowerpoint.ContentReaderListener.java
License:Apache License
/** * Reads the internal PowerPoint document stream. * /*from ww w. j a v a 2 s.co m*/ * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent) */ public void processPOIFSReaderEvent(final POIFSReaderEvent event) { if (event == null || event.getName() == null || !event.getName().startsWith(PPTConstants.POWERPOINT_DOCUMENT)) { if (LOG.isWarnEnabled()) { LOG.warn("Stream not processed. It is not a PowerPoint document: : " + event.getName()); } return; } try { final DocumentInputStream dis = event.getStream(); final byte pptdata[] = new byte[dis.available()]; dis.read(pptdata, 0, dis.available()); int offset = 0; long offsetPD = 0; /* * Traverse Bytearray to get CurrentUserEditAtom Call to extract the Text * in all PlaceHolders to hold PPTClientTextBox objects for mapping into * Slide Objects */ Hashtable/* <Long, TextBox> */ containerTextBox = new Hashtable/* * <Long, * TextBox> */(); // Traverse ByteArray to identiy edit paths of ClientTextBoxes long n = pptdata.length - 20; for (long i = 0; i < n; i++) { final long type = LittleEndian.getUShort(pptdata, (int) i + 2); // final long size = LittleEndian.getUInt(pptdata, (int) i + 4); if (PPTConstants.PPT_ATOM_USEREDIT == type) { /* * Checking the Record Header (UserEditAtom) */ // final long lastSlideID = LittleEndian.getInt(pptdata, (int) i + 8); // final long version = LittleEndian.getUInt(pptdata, (int) i + 12); offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16); offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20); /* * Call to extract ClientTextBox text in each UserEditAtom */ containerTextBox = extractTextBoxes(containerTextBox, offset, pptdata, offsetPD); } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) { // if (LOG.isTraceEnabled()) { // LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type); // } } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) { // if (LOG.isTraceEnabled()) { // LOG.trace("PPT_TEXTBYTE_ATOM 
ignored: " + type); // } } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) { // if (LOG.isTraceEnabled()) { // LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type); // } } else { // no action // if (LOG.isTraceEnabled()) { // LOG.trace("type not handled: " + type); // } } } final List/* <PPTSlide> */ slides = extractSlides(offset, pptdata, offsetPD); if (slides.size() == 0) { if (LOG.isInfoEnabled()) { LOG.info("No slides extracted!"); } } else { Slide slide = (Slide) slides.get(slides.size() - 1); for (Enumeration enumeration = containerTextBox.elements(); enumeration.hasMoreElements();) { final TextBox textBox = (TextBox) enumeration.nextElement(); slide.addContent(textBox.getContent()); } /* * Merging TextBox data with Slide Data Printing the text from Slides * vector object. */ List scontent; for (int i = 0; i < slides.size(); i++) { slide = (Slide) slides.get(i); scontent = slide.getContent(); String contentText; for (int j = 0; j < scontent.size(); j++) { contentText = scontent.get(j).toString(); this.buf.append(contentText); // to avoid concatinated words we add a blank additional if (contentText.length() > 0 && !(contentText.endsWith("\r") || contentText.endsWith("\n"))) { this.buf.append(" "); } } } } } catch (Throwable ex) { // because of not killing complete crawling all Throwables are catched. if (LOG.isErrorEnabled()) { LOG.error("processPOIFSReaderEvent", ex); } } }
From source file:org.apache.slide.extractor.MSPowerPointExtractor.java
License:Apache License
/**
 * Copies text-bearing records of the "PowerPoint Document" stream to {@code writer}.
 *
 * <p>Streams with any other name are skipped. The stream is read into memory and walked
 * byte by byte; wherever the 16-bit value two bytes ahead equals 4008, a slice of the data
 * is written out and the walk resumes behind that record. Exceptions are swallowed.
 *
 * @param event the reader event describing the stream currently being visited
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    try {
        final String streamName = event.getName();
        if (!streamName.equalsIgnoreCase("PowerPoint Document")) {
            return;
        }

        final DocumentInputStream stream = event.getStream();
        final byte[] data = new byte[stream.available()];
        stream.read(data, 0, stream.available());

        for (int cursor = 0; cursor < data.length - 20; cursor++) {
            final long recordType = LittleEndian.getUShort(data, cursor + 2);
            final long recordLength = LittleEndian.getUInt(data, cursor + 4);
            if (recordType != 4008) {
                continue;
            }

            final int sliceStart = cursor + 4 + 1;
            writer.write(data, sliceStart, (int) recordLength + 3);
            // Park on the record's last byte; the loop increment steps past it.
            cursor = sliceStart + (int) recordLength - 1;
        }
    } catch (Exception e) {
        // Intentionally left empty: failures are ignored by this extractor.
    }
}
From source file:org.ddt.listener.dsi.DocumentSummaryInfoListener.java
License:Apache License
/**
 * Collects the entries listed under the "Links" heading of a document summary stream.
 *
 * <p>The stream is parsed as a property set; anything that is not a
 * {@code DocumentSummaryInformation} is skipped. The heading-pair and doc-parts properties
 * are located, and for each part belonging to the "Links" heading a {@code Link} carrying
 * the trimmed part value is added to this listener. Parsing problems are logged and never
 * propagated; the stream is closed in every case.
 *
 * @param event the reader event describing the stream currently being visited
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    log.log(Level.FINEST, "reading {0}{1}", new Object[] { event.getPath(), event.getName() });
    DocumentInputStream is = event.getStream();

    try {
        PropertySet ps = PropertySetFactory.create(is);
        if (!(ps instanceof DocumentSummaryInformation)) {
            return;
        }

        Property headingPairs = null;
        Property docParts = null;
        for (Property candidate : ps.getProperties()) {
            final long id = candidate.getID();
            if (id == PropertyIDMap.PID_HEADINGPAIR) { // == 12
                headingPairs = candidate;
            } else if (id == PropertyIDMap.PID_DOCPARTS) {
                docParts = candidate;
            }
        }

        if (docParts == null) {
            log.log(Level.FINE, "No DOCPARTS section");
            return;
        }
        if (headingPairs == null) {
            return;
        }

        HeadingPairVector headingVector = new HeadingPairVector((byte[]) headingPairs.getValue(), 0);
        StringVector partNames = new StringVector((byte[]) docParts.getValue(), 0, docParts.getType());

        HeadingPairProperty linkHeader = headingVector.getHeadingPairByName("Links"); //*NOT* null terminated
        if (linkHeader == null) {
            log.log(Level.INFO, "No 'Links' header found.");
            return;
        }
        log.log(Level.FINEST, "Found {0} link parts", linkHeader.getPartsCount());

        // A heading may own several doc parts; walk all of them starting at its offset.
        final int firstPart = linkHeader.getOffset();
        for (int i = 0; i < linkHeader.getPartsCount(); i++) {
            String url = partNames.get(firstPart + i).getValue();
            log.log(Level.FINEST, "adding {0} to list of links.", url);

            Link link = new Link(3);
            link.addUnkownPath(url.trim());
            this.add(link);
        }
    } catch (NoPropertySetStreamException ex) {
        log.log(Level.INFO, "Not a PropertySetStream {0}{1}",
                new Object[] { event.getPath(), event.getName() });
    } catch (MarkUnsupportedException ex) {
        log.log(Level.INFO, "Couldn't create PropertySet: {0}", ex.getLocalizedMessage());
    } catch (UnsupportedEncodingException ex) {
        log.log(Level.INFO, null, ex);
    } catch (IOException ex) {
        log.log(Level.INFO, null, ex);
    } catch (HPSFException ex) {
        log.log(Level.WARNING, "Couldn't construct HeadingPair vector.", ex);
    } finally {
        is.close();
    }
}
From source file:org.ddt.listener.ole.OleStreamListener.java
License:Apache License
/**
 * Reads a "\1Ole" stream and stores the links found in it.
 * <p/>
 * This method returns (fairly) quietly if the stream fails (it doesn't throw exceptions).
 * There are a number of ways that this can fail, not all of which are bad.
 * For instance, if the POIFSReaderEvent doesn't contain an OLE stream, this is not
 * a disaster, we just need to return quickly.
 * <p/>
 * The stream is closed exactly once on every path, including the early returns.
 *
 * @param event document to process
 */
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    log.log(Level.FINEST, "Processing Document: {0}/{1}",
            new Object[] { event.getPath(), event.getName() });
    DocumentInputStream docInStream = event.getStream();

    try {
        // Too short to even hold the version field.
        if (docInStream.available() < LittleEndian.INT_SIZE) {
            return;
        }

        if (docInStream.readInt() != VALID_VERSION) {
            log.log(Level.INFO, "Invalid signature - not an OLE Stream.");
            return;
        }

        docInStream.skip(LittleEndian.INT_SIZE); //ignore what I think might be LinkUpdateOptions

        //check it's a linked object, not embedded
        if (docInStream.readInt() != 1) {
            log.log(Level.FINER, "Not a link");
            return;
        }

        //check reserved field = 0
        if (docInStream.readInt() != 0x000000) {
            return;
        }

        //source moniker, not really interesting - read only to advance the stream.
        readMoniker(docInStream);

        Moniker relative = readMoniker(docInStream);
        String relPath = (relative != null) ? relative.getLink() : null;

        Moniker absolute = readMoniker(docInStream);
        String absPath = (absolute != null) ? absolute.getLink() : null;

        Link l = new Link(1);
        l.addRelativePath(cleanURLString(relPath));
        l.addAbsolutePath(absPath);
        this.add(l);
    } catch (IOException ex) {
        log.log(Level.FINE, ex.getLocalizedMessage());
    } catch (BadOleStreamException ex) {
        log.log(Level.INFO, ex.getMessage());
    } finally {
        docInStream.close();
    }
}

/**
 * Reads one optional moniker: a size field and, if that is positive, a 16-byte class id
 * followed by the moniker data. Returns {@code null} when the size field is not positive
 * or the factory yields no moniker.
 */
private static Moniker readMoniker(DocumentInputStream in) throws IOException, BadOleStreamException {
    if (in.readInt() <= 0) {
        return null;
    }
    byte[] clsid = new byte[16];
    in.read(clsid);
    return MonikerFactory.getMoniker(new ClassID(clsid, 0), in);
}