List of usage examples for org.apache.poi.util StringUtil getFromCompressedUnicode
public static String getFromCompressedUnicode(final byte[] string, final int offset, final int len)
From source file:org.apache.nutch.parse.mspowerpoint.ContentReaderListener.java
License:Apache License
/** * Extracts the client text boxes of a slide. * //from w w w . j a v a 2 s.c o m * @param containerTextBox * @param offset * @param pptdata * @param offsetPD * @return Hashtable * @see TextBox */ protected Hashtable/* <Long, TextBox> */ extractTextBoxes(final Hashtable/* <Long, TextBox> */ containerTextBox, final int offset, final byte[] pptdata, final long offsetPD) { // To hold temporary data FilteredStringWriter outStream = new FilteredStringWriter(); TextBox textBox; // Traversing the bytearray up to Presist directory position for (int i = offset; i < offsetPD - 20; i++) { try { // Record info // final long rinfo = LittleEndian.getUShort(pptdata, (int) i); // Record Type final long recordType = LittleEndian.getUShort(pptdata, i + 2); // Record Size final long recordSize = LittleEndian.getUInt(pptdata, i + 4); if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) { /* * Record type is of Drawing Group */ // Total number of objects // final long objectCount = LittleEndian.getUInt(pptdata, (int) i + // 8); // currentID = Group ID+number of objects long currentID = LittleEndian.getInt(pptdata, i + 12); currentID = ((int) (currentID / 1024)) * 1024; if (currentID == PPTConstants.PPT_MASTERSLIDE) { // Ignore Master Slide objects if (LOG.isTraceEnabled()) { LOG.trace("Ignore master slide."); } i++; continue; } // Check for the ClientTextBox GroupID existence if (containerTextBox.containsKey(new Long(currentID))) { // If exists get Client Textbox Group textBox = (TextBox) containerTextBox.get(new Long(currentID)); textBox.setContent(""); } else { textBox = new TextBox(currentID); containerTextBox.put(new Long(currentID), textBox); } /* * Iterating the bytearray for TextCharAtoms and TextBytesAtom */ if ((offsetPD - 20) != recordSize) { // TODO something wrong? Probably an OLE-Object, which we ignore. if (LOG.isDebugEnabled()) { LOG.debug("offsetPD - 20=" + (offsetPD - 20) + " recordsize=" + recordSize); } } else { for (int startPos = i + 8; startPos < offsetPD - 20 && startPos < recordSize; startPos++) { // && startPos < // recordSize?? try { // Record info // final long nrinfo = LittleEndian.getUShort(pptdata, (int) j); // Record Type final long ntype = LittleEndian.getUShort(pptdata, startPos + 2); // Record size // Note that the size doesn't include the 8 byte atom header final long nsize = LittleEndian.getUInt(pptdata, startPos + 4); if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) { /* * Break the loop if next GroupID found */ i = startPos - 1; break; } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) { // TextByteAtom record outStream = new FilteredStringWriter(); long ii = 0; for (ii = startPos + 6; ii <= startPos + 6 + nsize; ii++) { // For loop to changed to a function // if ((ii + 2) >= pptdata.length) // break; // FIXME outStream.write((char) (pptdata[(int) ii + 2])); } // Setting the identified text for Current // groupID textBox.setContent(textBox.getContent() + outStream.toString()); } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) { // TextCharAtom record final String strTempContent = new String(pptdata, startPos + 6, (int) (nsize) + 2); final byte bytes[] = strTempContent.getBytes(); if (true) { outStream = new FilteredStringWriter(); for (int ii = 0; ii < bytes.length - 1; ii += 2) { // For loop to changed to a function outStream.write((char) (pptdata[ii + 2])); } textBox.setContent(textBox.getContent() + outStream.toString()); } else { // this version is used within POI String text = StringUtil.getFromCompressedUnicode(bytes, 0, bytes.length); textBox.setContent(textBox.getContent() + text); } } else { // ignored // if (LOG.isTraceEnabled()) { // LOG.trace("Ignored atom type: " + type); // } } } catch (Throwable e) { if (LOG.isErrorEnabled()) { LOG.error("extractTextBoxes", e); } break; } } } } else { // Record type is ignored // if (LOG.isTraceEnabled()) { // LOG.trace("Ignored record type: " + type); // } } } catch (Throwable ee) { if (LOG.isErrorEnabled()) { LOG.error("extractClientTextBoxes", ee); } break; } } return containerTextBox; }
From source file:org.apache.tika.parser.dwg.DWGParser.java
License:Apache License
private String read2004String(InputStream stream) throws IOException, TikaException { int stringLen = EndianUtils.readUShortLE(stream); byte[] stringData = new byte[stringLen]; IOUtils.readFully(stream, stringData); // Often but not always null terminated if (stringData[stringLen - 1] == 0) { stringLen--;// w w w . ja v a2s . co m } String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen); return value; }
From source file:org.apache.tika.parser.dwg.DWGParser.java
License:Apache License
private void get2000Props(InputStream stream, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, TikaException, SAXException { int propCount = 0; while (propCount < 30) { int propIdx = EndianUtils.readUShortLE(stream); int length = EndianUtils.readUShortLE(stream); int valueType = stream.read(); if (propIdx == 0x28) { // This one seems not to follow the pattern length = 0x19;/* w ww. j av a 2 s . c om*/ } else if (propIdx == 90) { // We think this means the end of properties break; } byte[] value = new byte[length]; IOUtils.readFully(stream, value); if (valueType == 0x1e) { // Normal string, good String val = StringUtil.getFromCompressedUnicode(value, 0, length); // Is it one we can look up by index? if (propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) { metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val); xhtml.element("p", val); } else if (propIdx == 0x012c) { int splitAt = val.indexOf('='); if (splitAt > -1) { String propName = val.substring(0, splitAt); String propVal = val.substring(splitAt + 1); metadata.add(propName, propVal); } } } else { // No idea... } propCount++; } }
From source file:org.ddt.listener.dsi.StringProperty.java
License:Apache License
/** * reads the data from the byte array./*from ww w . j a v a2 s.c om*/ * * @param data byte array to read from * @param offset offset into the data array * @throws IllegalVariantTypeException */ private void read(final byte data[], final int offset) throws IllegalVariantTypeException, UnsupportedEncodingException { int o = offset; charCount = LittleEndian.getUInt(data, o); length = charCount; o += LittleEndian.INT_SIZE; if (type == Variant.VT_LPWSTR) { //the smallest number of bytes to pad it to a multiple of 4... there must be a nicer way paddingBytes = (int) (4 - (length % 4)) % 4; } else if (type == Variant.VT_LPSTR) { paddingBytes = 0; } else { throw new IllegalVariantTypeException(type, value, "At offset " + o + ": Not a string, type = " + Long.toHexString(type) + " should be " + Integer.toHexString(Variant.VT_LPSTR) + " or " + Integer.toHexString(Variant.VT_LPWSTR)); } length = Math.min(length, data.length - o); if (type == Variant.VT_LPWSTR) { // value = new String(LittleEndian.getByteArray(data, o, // (int) (length - 2)), "UTF-16LE"); value = StringUtil.getFromUnicodeLE(data, o, (int) (length - 1)); } else { // value = new String(LittleEndian.getByteArray(data, o, (int) (length - 1))); value = StringUtil.getFromCompressedUnicode(data, o, (int) (length - 1)); } }