List of usage examples for org.apache.poi.util.LittleEndian.getUInt
public static long getUInt(byte[] data, int offset)
From source file:com.argo.hwp.v5.HwpTextExtractorV5.java
License:Open Source License
/** * HWP? FileHeader // w w w . jav a2 s . c om * * @param fs * @return * @throws IOException */ private static FileHeader getHeader(NPOIFSFileSystem fs) throws IOException { DirectoryNode root = fs.getRoot(); // ??? p.18 // FileHeader Entry headerEntry = root.getEntry("FileHeader"); if (!headerEntry.isDocumentEntry()) return null; // ? byte[] header = new byte[256]; // FileHeader ? 256 DocumentInputStream headerStream = new DocumentInputStream((DocumentEntry) headerEntry); try { int read = headerStream.read(header); if (read != 256 || !Arrays.equals(HWP_V5_SIGNATURE, Arrays.copyOfRange(header, 0, HWP_V5_SIGNATURE.length))) return null; } finally { headerStream.close(); } FileHeader fileHeader = new FileHeader(); // . debug fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt(header, 32)); long flags = LittleEndian.getUInt(header, 36); log.debug("Flags={}", Long.toBinaryString(flags).replace(' ', '0')); fileHeader.compressed = (flags & 0x01) == 0x01; fileHeader.encrypted = (flags & 0x02) == 0x02; fileHeader.viewtext = (flags & 0x04) == 0x04; return fileHeader; }
From source file:com.duroty.lucene.parser.MSPowerPointParser.java
License:Open Source License
/** * DOCUMENT ME!/*from ww w.ja v a2 s .c o m*/ * * @param event DOCUMENT ME! */ public void processPOIFSReaderEvent(POIFSReaderEvent event) { try { if (!event.getName().equalsIgnoreCase("PowerPoint Document")) { return; } DocumentInputStream input = event.getStream(); byte[] buffer = new byte[input.available()]; input.read(buffer, 0, input.available()); byte[] espace = new String("\n\n").getBytes(); for (int i = 0; i < (buffer.length - 20); i++) { long type = LittleEndian.getUShort(buffer, i + 2); long size = LittleEndian.getUInt(buffer, i + 4); if (type == 4008) { writer.write(buffer, i + 4 + 1, (int) size + 3); writer.write(espace); i = (i + 4 + 1 + (int) size) - 1; } /*if (sleep > 0) { try { Thread.sleep(sleep); } catch (Exception ex) { } }*/ } } catch (Exception ex) { } }
From source file:com.flexive.extractor.PowerpointExtractor.java
License:Open Source License
/**
 * Walks the records stored in {@code buffer[beginIndex, endIndex)} and writes the
 * payload of every record of type 4008 to {@code writer}, followed by a blank.
 *
 * <p>Each record starts with an 8 byte header: a 16-bit flag word, a 16-bit record
 * type and an unsigned 32-bit payload length, all little endian. Records whose flag
 * word has its low four bits set are treated as containers and are descended into
 * recursively.
 *
 * <p>Malformed input is handled on a best-effort basis: processing stops when a
 * header no longer fits into the buffer, and a declared payload length that exceeds
 * the remaining bytes is clamped to what is actually present. This also guarantees
 * that the cursor always moves forward, so the loop terminates.
 *
 * @param buffer     the raw bytes of the document stream
 * @param beginIndex index of the first record header to process
 * @param endIndex   index just past the last byte that belongs to this level
 */
private void processContent(byte[] buffer, int beginIndex, int endIndex) {
    final int headerSize = 8;

    while (beginIndex < endIndex) {
        if (beginIndex < 0 || buffer.length - beginIndex < headerSize) {
            // Truncated or invalid header: nothing more can be read safely.
            return;
        }

        int containerFlag = LittleEndian.getUShort(buffer, beginIndex);
        int recordType = LittleEndian.getUShort(buffer, beginIndex + 2);
        long declaredLength = LittleEndian.getUInt(buffer, beginIndex + 4);
        beginIndex += headerSize;

        // The length is an unsigned 32-bit value; clamping it to the bytes that are
        // really available keeps the int cast from going negative and keeps every
        // later index inside the buffer.
        int available = buffer.length - beginIndex;
        int recordLength = (int) Math.min(declaredLength, (long) available);

        if ((containerFlag & 0x0f) == 0x0f) {
            processContent(buffer, beginIndex, beginIndex + recordLength);
        } else if (recordType == 4008) {
            writer.write(buffer, beginIndex, recordLength);
            writer.write(' ');
        }
        beginIndex += recordLength;
    }
}
From source file:lius.index.powerpoint.PPTIndexer.java
License:Apache License
public void processPOIFSReaderEvent(POIFSReaderEvent event) { try {// w ww. ja v a 2 s. c om if (!event.getName().equalsIgnoreCase("PowerPoint Document")) return; DocumentInputStream input = event.getStream(); byte[] buffer = new byte[input.available()]; input.read(buffer, 0, input.available()); for (int i = 0; i < buffer.length - 20; i++) { long type = LittleEndian.getUShort(buffer, i + 2); long size = LittleEndian.getUInt(buffer, i + 4); if (type == 4008L) { writer.write(buffer, i + 4 + 1, (int) size + 3); i = i + 4 + 1 + (int) size - 1; } } } catch (Exception ex) { logger.error(ex.getMessage()); } }
From source file:net.sf.mmm.content.parser.impl.poi.ContentParserPpt.java
License:Apache License
/** * //from w w w. j a v a 2 s . c om * @param buffer * @param offset * @param length * @param textBuffer * @throws UnsupportedEncodingException */ private void extractRecursive(byte[] buffer, int offset, int length, StringBuffer textBuffer) throws UnsupportedEncodingException { int offsetLength = offset + length - 8; if (offsetLength > buffer.length - 8) { /* * System.out.println("Illegal array index: offset=" + offset + ", * length=" + length + ", bufferSize=" + buffer.length); */ offsetLength = buffer.length - 8; } int index = offset; while (index < offsetLength) { // int info = LittleEndian.getUShort(buffer, index + // PPT_RECORD_INFO_OFFSET); int type = LittleEndian.getUShort(buffer, index + PPT_RECORD_TYPE_OFFSET); long longSize = LittleEndian.getUInt(buffer, index + PPT_RECORD_SIZE_OFFSET); // System.out.println("Index is: " + index); // System.out.println("record info: " + info); // System.out.println("record type: " + type); // System.out.println("record size: " + longSize); int size = (int) longSize; if (size < 0) { // System.out.println("size truncated: " + longSize + ""); return; } index += PPT_RECORD_LENGTH; if (type == PPT_TYPE_SLIDE_LIST_WITH_TEXT) { extractRecursive(buffer, index, size, textBuffer); } else if (type == PPT_TYPE_HEADER_FOOTER) { extractRecursive(buffer, index, size, textBuffer); } else if (type == PPT_TYPE_DOCUMENT) { extractRecursive(buffer, index, size, textBuffer); } else if (type == PPT_TYPE_SLIDE) { // extractRecursive(buffer, index, size, textBuffer); } else if (type == PPT_TYPE_MAIN_MASTER) { // extractRecursive(buffer, index, size, textBuffer); } else if (type == PPT_TYPE_CHAR_STRING) { String text = new String(buffer, index, size, ENCODING_UTF16LE); textBuffer.append(text); textBuffer.append('\n'); } else if (type == PPT_TYPE_TEXT_BYTES_ATOM) { String text = new String(buffer, index, size); textBuffer.append(text); textBuffer.append('\n'); } else if (type == PPT_TYPE_TEXT_CHARS_ATOM) { String text = new String(buffer, index, 
size, ENCODING_UTF16LE); textBuffer.append(text); textBuffer.append('\n'); } else if (type == PPT_TYPE_NOTES) { // --> PPT_TYPE_DRAWING extractRecursive(buffer, index, size, textBuffer); } else if (type == PPT_TYPE_DRAWING) { // --> PPT_TYPE_ESCHER_DG_CONTAINER extractRecursive(buffer, index, size, textBuffer); } else if (type == PPT_TYPE_ESCHER_DG_CONTAINER) { // --> PPT_TYPE_ESCHER_SPGR_CONTAINER extractRecursive(buffer, index, size, textBuffer); } else if (type == PPT_TYPE_ESCHER_SPGR_CONTAINER) { // --> PPT_TYPE_ESCHER_SP_CONTAINER extractRecursive(buffer, index, size, textBuffer); } else if (type == PPT_TYPE_ESCHER_SP_CONTAINER) { // --> PPT_TYPE_ESCHER_TEXTBOX extractRecursive(buffer, index, size, textBuffer); } else if (type == PPT_TYPE_ESCHER_TEXTBOX) { // --> PPT_TYPE_TEXT_BYTES_ATOM extractRecursive(buffer, index, size, textBuffer); /* * } else if (type == PPT_TYPE_DOCUMENT_ATOM) { // ignore } else if * (type == PPT_TYPE_SLIDE_ATOM) { // ignore } else if (type == * PPT_TYPE_NOTES_ATOM) { // ignore } else if (type == * PPT_TYPE_ENVIRONMENT) { // ignore } else if (type == * PPT_TYPE_SLIDE_PERSIST_ATOM) { // ignore } else if (type == * PPT_TYPE_EXTENDED_OBJECT_LIST) { // ignore } else if (type == * PPT_TYPE_DRAWING_GROUP) { // ignore } else if (type == * PPT_TYPE_ESCHER_DG) { // ignore } else if (type == * PPT_TYPE_INTERACTIVE_INFO) { // ignore } else if (type == * PPT_TYPE_HEADER_FOOTER_ATOM) { // ignore } else if (type == * PPT_TYPE_SPEC_INFO_ATOM) { // ignore } else if (type == * PPT_TYPE_STYLE_TEXT_PROPERTY_ATOM) { // ignore } else if (type == * PPT_TYPE_SLIDE_ATOM) { // ignore } else if (type == * PPT_TYPE_SLIDE_PERSIST_ATOM) { // ignore } else if (type == * PPT_TYPE_SLIDE_PERSIST_ATOM) { // ignore } else if (type == * PPT_TYPE_TEXT_HEADER_ATOM) { // ignore } else if (type == * PPT_TYPE_TX_INTERACTIVE_INFO_ATOM) { // ignore */ } index += size; } }
From source file:org.apache.nutch.parse.mspowerpoint.ContentReaderListener.java
License:Apache License
/** * Reads the internal PowerPoint document stream. * //from w w w. j av a2 s . c o m * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent) */ public void processPOIFSReaderEvent(final POIFSReaderEvent event) { if (event == null || event.getName() == null || !event.getName().startsWith(PPTConstants.POWERPOINT_DOCUMENT)) { if (LOG.isWarnEnabled()) { LOG.warn("Stream not processed. It is not a PowerPoint document: : " + event.getName()); } return; } try { final DocumentInputStream dis = event.getStream(); final byte pptdata[] = new byte[dis.available()]; dis.read(pptdata, 0, dis.available()); int offset = 0; long offsetPD = 0; /* * Traverse Bytearray to get CurrentUserEditAtom Call to extract the Text * in all PlaceHolders to hold PPTClientTextBox objects for mapping into * Slide Objects */ Hashtable/* <Long, TextBox> */ containerTextBox = new Hashtable/* * <Long, * TextBox> */(); // Traverse ByteArray to identiy edit paths of ClientTextBoxes long n = pptdata.length - 20; for (long i = 0; i < n; i++) { final long type = LittleEndian.getUShort(pptdata, (int) i + 2); // final long size = LittleEndian.getUInt(pptdata, (int) i + 4); if (PPTConstants.PPT_ATOM_USEREDIT == type) { /* * Checking the Record Header (UserEditAtom) */ // final long lastSlideID = LittleEndian.getInt(pptdata, (int) i + 8); // final long version = LittleEndian.getUInt(pptdata, (int) i + 12); offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16); offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20); /* * Call to extract ClientTextBox text in each UserEditAtom */ containerTextBox = extractTextBoxes(containerTextBox, offset, pptdata, offsetPD); } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) { // if (LOG.isTraceEnabled()) { // LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type); // } } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) { // if (LOG.isTraceEnabled()) { // LOG.trace("PPT_TEXTBYTE_ATOM 
ignored: " + type); // } } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) { // if (LOG.isTraceEnabled()) { // LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type); // } } else { // no action // if (LOG.isTraceEnabled()) { // LOG.trace("type not handled: " + type); // } } } final List/* <PPTSlide> */ slides = extractSlides(offset, pptdata, offsetPD); if (slides.size() == 0) { if (LOG.isInfoEnabled()) { LOG.info("No slides extracted!"); } } else { Slide slide = (Slide) slides.get(slides.size() - 1); for (Enumeration enumeration = containerTextBox.elements(); enumeration.hasMoreElements();) { final TextBox textBox = (TextBox) enumeration.nextElement(); slide.addContent(textBox.getContent()); } /* * Merging TextBox data with Slide Data Printing the text from Slides * vector object. */ List scontent; for (int i = 0; i < slides.size(); i++) { slide = (Slide) slides.get(i); scontent = slide.getContent(); String contentText; for (int j = 0; j < scontent.size(); j++) { contentText = scontent.get(j).toString(); this.buf.append(contentText); // to avoid concatinated words we add a blank additional if (contentText.length() > 0 && !(contentText.endsWith("\r") || contentText.endsWith("\n"))) { this.buf.append(" "); } } } } } catch (Throwable ex) { // because of not killing complete crawling all Throwables are catched. if (LOG.isErrorEnabled()) { LOG.error("processPOIFSReaderEvent", ex); } } }
From source file:org.apache.nutch.parse.mspowerpoint.ContentReaderListener.java
License:Apache License
/** * Extracts the client text boxes of a slide. * //from ww w . ja va2s . c o m * @param containerTextBox * @param offset * @param pptdata * @param offsetPD * @return Hashtable * @see TextBox */ protected Hashtable/* <Long, TextBox> */ extractTextBoxes(final Hashtable/* <Long, TextBox> */ containerTextBox, final int offset, final byte[] pptdata, final long offsetPD) { // To hold temporary data FilteredStringWriter outStream = new FilteredStringWriter(); TextBox textBox; // Traversing the bytearray up to Presist directory position for (int i = offset; i < offsetPD - 20; i++) { try { // Record info // final long rinfo = LittleEndian.getUShort(pptdata, (int) i); // Record Type final long recordType = LittleEndian.getUShort(pptdata, i + 2); // Record Size final long recordSize = LittleEndian.getUInt(pptdata, i + 4); if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) { /* * Record type is of Drawing Group */ // Total number of objects // final long objectCount = LittleEndian.getUInt(pptdata, (int) i + // 8); // currentID = Group ID+number of objects long currentID = LittleEndian.getInt(pptdata, i + 12); currentID = ((int) (currentID / 1024)) * 1024; if (currentID == PPTConstants.PPT_MASTERSLIDE) { // Ignore Master Slide objects if (LOG.isTraceEnabled()) { LOG.trace("Ignore master slide."); } i++; continue; } // Check for the ClientTextBox GroupID existence if (containerTextBox.containsKey(new Long(currentID))) { // If exists get Client Textbox Group textBox = (TextBox) containerTextBox.get(new Long(currentID)); textBox.setContent(""); } else { textBox = new TextBox(currentID); containerTextBox.put(new Long(currentID), textBox); } /* * Iterating the bytearray for TextCharAtoms and TextBytesAtom */ if ((offsetPD - 20) != recordSize) { // TODO something wrong? Probably an OLE-Object, which we ignore. 
if (LOG.isDebugEnabled()) { LOG.debug("offsetPD - 20=" + (offsetPD - 20) + " recordsize=" + recordSize); } } else { for (int startPos = i + 8; startPos < offsetPD - 20 && startPos < recordSize; startPos++) { // && startPos < // recordSize?? try { // Record info // final long nrinfo = LittleEndian.getUShort(pptdata, (int) j); // Record Type final long ntype = LittleEndian.getUShort(pptdata, startPos + 2); // Record size // Note that the size doesn't include the 8 byte atom header final long nsize = LittleEndian.getUInt(pptdata, startPos + 4); if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) { /* * Break the loop if next GroupID found */ i = startPos - 1; break; } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) { // TextByteAtom record outStream = new FilteredStringWriter(); long ii = 0; for (ii = startPos + 6; ii <= startPos + 6 + nsize; ii++) { // For loop to changed to a function // if ((ii + 2) >= pptdata.length) // break; // FIXME outStream.write((char) (pptdata[(int) ii + 2])); } // Setting the identified text for Current // groupID textBox.setContent(textBox.getContent() + outStream.toString()); } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) { // TextCharAtom record final String strTempContent = new String(pptdata, startPos + 6, (int) (nsize) + 2); final byte bytes[] = strTempContent.getBytes(); if (true) { outStream = new FilteredStringWriter(); for (int ii = 0; ii < bytes.length - 1; ii += 2) { // For loop to changed to a function outStream.write((char) (pptdata[ii + 2])); } textBox.setContent(textBox.getContent() + outStream.toString()); } else { // this version is used within POI String text = StringUtil.getFromCompressedUnicode(bytes, 0, bytes.length); textBox.setContent(textBox.getContent() + text); } } else { // ignored // if (LOG.isTraceEnabled()) { // LOG.trace("Ignored atom type: " + type); // } } } catch (Throwable e) { if (LOG.isErrorEnabled()) { LOG.error("extractTextBoxes", e); } break; } } } } else { // Record type is ignored // if 
(LOG.isTraceEnabled()) { // LOG.trace("Ignored record type: " + type); // } } } catch (Throwable ee) { if (LOG.isErrorEnabled()) { LOG.error("extractClientTextBoxes", ee); } break; } } return containerTextBox; }
From source file:org.apache.nutch.parse.mspowerpoint.ContentReaderListener.java
License:Apache License
/** * Returns the Powerpoint <code>Slide</code> s of document as vector. * //from w w w .j a va2s .c o m * @param offset * @param pptdata * @param offsetPD * @return Vector of the powerpoint slides. Contains * <code>{@link Slide Slide}</code> * @see Slide */ protected List /* <Slide> */ extractSlides(final long offset, final byte[] pptdata, final long offsetPD) { int sNum = 0; // List of all slides found final List/* <Slide> */ slides = new Vector/* <Slide> */(); // current slide data Slide currentSlide = null; // To store data found in TextCharAtoms and TextBytesAtoms FilteredStringWriter outStream; for (long i = offset; i < pptdata.length - 20; i++) { final long recordInfo = LittleEndian.getUShort(pptdata, (int) i); final long atomType = LittleEndian.getUShort(pptdata, (int) i + 2); final long atomSize = LittleEndian.getUInt(pptdata, (int) i + 4); if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) { /* * TextByteAtom record */ outStream = new FilteredStringWriter(); for (long ii = i + 6; (ii <= i + 6 + atomSize) && (ii + 2 < pptdata.length); ii++) { try { // if(ii+2 >= pptdata.length) break; //FIXME byte value = pptdata[(int) ii + 2]; outStream.write(value); } catch (ArrayIndexOutOfBoundsException ex) { if (LOG.isTraceEnabled()) { LOG.trace("size=" + pptdata.length); } if (LOG.isErrorEnabled()) { LOG.error("extractSlides", ex); } } } // Setting the identified text for Current Slide if (currentSlide != null) { currentSlide.addContent(outStream.toString()); } } else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) { /* * TextCharAtom record */ outStream = new FilteredStringWriter(); final String strTempContent = new String(pptdata, (int) i + 6, (int) (atomSize) + 2); final byte bytes[] = strTempContent.getBytes(); for (int ii = 0; ii < bytes.length - 1; ii += 2) { outStream.write(Utils.getUnicodeCharacter(bytes, ii)); } // Setting the identified text for Current Slide if (currentSlide != null) { currentSlide.addContent(outStream.toString()); } } else if (atomType == 
PPTConstants.PPT_ATOM_SLIDEPERSISTANT) { /* * SlidePresistAtom Record */ if (sNum != 0) { outStream = new FilteredStringWriter(); final long slideID = LittleEndian.getUInt(pptdata, (int) i + 20); currentSlide = new Slide(slideID); // currentSlide.addContent(outStream.toString()); slides.add(currentSlide); } sNum++; } else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) { /* * Diagram records are ignored */ if (LOG.isTraceEnabled()) { LOG.trace("Drawing Groups are ignored."); } break; } else { // ignored // if (LOG.isTraceEnabled()) { // LOG.trace("Unhandled atomType: " + atomType); // } } } return slides; }
From source file:org.apache.slide.extractor.MSPowerPointExtractor.java
License:Apache License
public void processPOIFSReaderEvent(POIFSReaderEvent event) { try {// ww w . ja v a 2s . co m if (!event.getName().equalsIgnoreCase("PowerPoint Document")) return; DocumentInputStream input = event.getStream(); byte[] buffer = new byte[input.available()]; input.read(buffer, 0, input.available()); for (int i = 0; i < buffer.length - 20; i++) { long type = LittleEndian.getUShort(buffer, i + 2); long size = LittleEndian.getUInt(buffer, i + 4); if (type == 4008) { writer.write(buffer, i + 4 + 1, (int) size + 3); i = i + 4 + 1 + (int) size - 1; } } } catch (Exception e) { } }
From source file:org.ddt.listener.dsi.HeadingPairProperty.java
License:Apache License
/** * the constructor./*from w w w .j a va 2 s .c o m*/ * * * @param data the data to read from. * @param dataOffset the offset into the * <code>data</code> byte array. * @param docPartsOffset the offset of the corresponding docparts. * @throws IllegalVariantTypeException if the data is malformed. * @throws UnsupportedEncodingException */ HeadingPairProperty(byte[] data, int dataOffset, int docPartsOffset) throws IllegalVariantTypeException, UnsupportedEncodingException { int off = dataOffset; name = new StringProperty(data, off); off += name.getSize(); long type = LittleEndian.getUInt(data, off); if (type != Variant.VT_I4) { log.log(Level.WARNING, "Not a proper VT_I4 type."); throw new IllegalVariantTypeException(type, name); } off += LittleEndian.INT_SIZE; //this is a horrible workaround, around the bug in HPSF, that returns //cutoff byte arrays from Section.getProperty() (HPFS Bug #52337) //It hopes that there aren't too many parts per heading (i.e. worst //case it can be store in one byte...) int left = data.length - off; if (left >= LittleEndian.INT_SIZE) { partsCount = (int) LittleEndian.getUInt(data, off); off += LittleEndian.INT_SIZE; } else if (left >= LittleEndian.SHORT_SIZE) { partsCount = LittleEndian.getShort(data, off); off += left; } else if (left >= LittleEndian.BYTE_SIZE) { partsCount = LittleEndian.getUByte(data, off); off += left; } else { partsCount = 1; //default... maybe not a good idea. } size = off - dataOffset; this.docPartsOffset = docPartsOffset; }