Example usage for org.apache.poi.util StringUtil getFromCompressedUnicode

List of usage examples for org.apache.poi.util StringUtil getFromCompressedUnicode

Introduction

On this page you can find example usages of org.apache.poi.util.StringUtil.getFromCompressedUnicode.

Prototype

public static String getFromCompressedUnicode(final byte[] string, final int offset, final int len) 

Source Link

Document

Reads 8-bit data (in the ISO-8859-1 code page) into a (Unicode) Java String and returns it.

Usage

From source file:org.apache.nutch.parse.mspowerpoint.ContentReaderListener.java

License:Apache License

/**
 * Extracts the client text boxes of a slide.
 * <p>
 * Scans {@code pptdata} one byte at a time from {@code offset} up to
 * {@code offsetPD - 20}. At each position a record type is read at
 * {@code i + 2} and a record size at {@code i + 4}. When the type equals
 * {@link PPTConstants#PPT_ATOM_DRAWINGGROUP}, a group id is read at
 * {@code i + 12}, rounded down to a multiple of 1024, and used as the key
 * of the {@link TextBox} that collects the text found in the following
 * text atoms.
 * <p>
 * Any {@link Throwable} raised while scanning is logged and ends the
 * corresponding loop; it is never propagated to the caller.
 *
 * @param containerTextBox
 *          map from group id ({@code Long}) to {@link TextBox}; it is
 *          modified in place (existing entries have their content reset,
 *          missing entries are added)
 * @param offset
 *          index in {@code pptdata} at which scanning starts
 * @param pptdata
 *          the raw bytes to scan
 * @param offsetPD
 *          position of the persist directory; scanning stops 20 bytes
 *          before it
 * @return Hashtable the same {@code containerTextBox} instance that was
 *         passed in
 * @see TextBox
 */
protected Hashtable/* <Long, TextBox> */ extractTextBoxes(final Hashtable/* <Long, TextBox> */ containerTextBox,
        final int offset, final byte[] pptdata, final long offsetPD) {

    // To hold temporary data (replaced by a fresh writer for every text atom)
    FilteredStringWriter outStream = new FilteredStringWriter();

    TextBox textBox;

    // Traversing the byte array up to the Persist directory position
    for (int i = offset; i < offsetPD - 20; i++) {
        try {
            // Record info (unused)
            // final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
            // Record Type
            final long recordType = LittleEndian.getUShort(pptdata, i + 2);
            // Record Size
            final long recordSize = LittleEndian.getUInt(pptdata, i + 4);

            if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
                /*
                 * Record type is of Drawing Group
                 */

                // Total number of objects (unused)
                // final long objectCount = LittleEndian.getUInt(pptdata, (int) i +
                // 8);
                // currentID = Group ID+number of objects; the division and
                // multiplication below round it down to a multiple of 1024
                long currentID = LittleEndian.getInt(pptdata, i + 12);
                currentID = ((int) (currentID / 1024)) * 1024;

                if (currentID == PPTConstants.PPT_MASTERSLIDE) {
                    // Ignore Master Slide objects
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Ignore master slide.");
                    }
                    // Together with the loop's own i++ this advances two bytes
                    i++;
                    continue;
                }

                // Check for the ClientTextBox GroupID existence
                if (containerTextBox.containsKey(new Long(currentID))) {
                    // If exists get Client Textbox Group and discard its previous content
                    textBox = (TextBox) containerTextBox.get(new Long(currentID));
                    textBox.setContent("");

                } else {
                    textBox = new TextBox(currentID);
                    containerTextBox.put(new Long(currentID), textBox);
                }

                /*
                 * Iterating the bytearray for TextCharAtoms and TextBytesAtom
                 */
                // NOTE(review): text atoms are only scanned when the record size
                // equals offsetPD - 20 exactly; every other record is skipped.
                // Confirm this gate is intended.
                if ((offsetPD - 20) != recordSize) {
                    // TODO something wrong? Probably an OLE-Object, which we ignore.
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("offsetPD - 20=" + (offsetPD - 20) + " recordsize=" + recordSize);
                    }
                } else {
                    // In this branch recordSize == offsetPD - 20, so both loop
                    // bounds below are the same value.
                    for (int startPos = i + 8; startPos < offsetPD - 20 && startPos < recordSize; startPos++) {
                        try {

                            // Record info (unused)
                            // final long nrinfo = LittleEndian.getUShort(pptdata, (int) j);

                            // Record Type
                            final long ntype = LittleEndian.getUShort(pptdata, startPos + 2);

                            // Record size
                            // Note that the size doesn't include the 8 byte atom header
                            final long nsize = LittleEndian.getUInt(pptdata, startPos + 4);

                            if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
                                /*
                                 * Break the loop if next GroupID found. The outer
                                 * loop's i++ then resumes exactly at startPos.
                                 */
                                i = startPos - 1;
                                break;
                            } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) {
                                // TextByteAtom record: copy the bytes following the
                                // 8 byte header (index ii + 2 starts at startPos + 8)
                                outStream = new FilteredStringWriter();
                                long ii = 0;
                                // NOTE(review): the bound is inclusive, so nsize + 1
                                // bytes are written - looks like one byte too many;
                                // confirm. Each byte is cast straight to char, so
                                // negative byte values are sign-extended.
                                for (ii = startPos + 6; ii <= startPos + 6 + nsize; ii++) {
                                    // For loop to changed to a function
                                    // if ((ii + 2) >= pptdata.length)
                                    // break; // FIXME
                                    outStream.write((char) (pptdata[(int) ii + 2]));
                                }

                                // Setting the identified text for Current
                                // groupID
                                textBox.setContent(textBox.getContent() + outStream.toString());

                            } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) {
                                // TextCharAtom record

                                // NOTE(review): the bytes are decoded and re-encoded
                                // with the platform default charset; 'bytes' is then
                                // only used for its length in the active branch.
                                final String strTempContent = new String(pptdata, startPos + 6,
                                        (int) (nsize) + 2);
                                final byte bytes[] = strTempContent.getBytes();
                                if (true) {
                                    outStream = new FilteredStringWriter();
                                    // NOTE(review): this reads pptdata[ii + 2] with ii
                                    // starting at 0, i.e. from the start of the whole
                                    // buffer rather than from this atom (startPos) or
                                    // from 'bytes' - presumably a bug; verify.
                                    for (int ii = 0; ii < bytes.length - 1; ii += 2) {
                                        // For loop to changed to a function
                                        outStream.write((char) (pptdata[ii + 2]));
                                    }
                                    textBox.setContent(textBox.getContent() + outStream.toString());
                                } else {
                                    // this version is used within POI
                                    // (unreachable while the condition above is 'true')
                                    String text = StringUtil.getFromCompressedUnicode(bytes, 0, bytes.length);
                                    textBox.setContent(textBox.getContent() + text);
                                }

                            } else {
                                // ignored
                                // if (LOG.isTraceEnabled()) {
                                //   LOG.trace("Ignored atom type: " + type);
                                // }
                            }
                        } catch (Throwable e) {
                            // Any failure ends the scan of this drawing group
                            if (LOG.isErrorEnabled()) {
                                LOG.error("extractTextBoxes", e);
                            }
                            break;
                        }
                    }
                }
            } else {
                // Record type is ignored
                // if (LOG.isTraceEnabled()) {
                //   LOG.trace("Ignored record type: " + type);
                // }
            }
        } catch (Throwable ee) {
            // Any failure ends the whole scan; what was collected so far is returned
            if (LOG.isErrorEnabled()) {
                LOG.error("extractClientTextBoxes", ee);
            }
            break;
        }
    }
    return containerTextBox;
}

From source file:org.apache.tika.parser.dwg.DWGParser.java

License:Apache License

/**
 * Reads one length-prefixed 8-bit string from the stream.
 * <p>
 * The length prefix is read with {@code EndianUtils.readUShortLE}, then that
 * many bytes are read and decoded with
 * {@code StringUtil.getFromCompressedUnicode}. A single trailing zero byte,
 * if present, is not included in the result. A length prefix of zero yields
 * an empty string.
 *
 * @param stream the stream positioned at the length prefix
 * @return the decoded string, without a trailing null terminator
 * @throws IOException   if reading from the stream fails
 * @throws TikaException if the underlying read helpers report a format problem
 */
private String read2004String(InputStream stream) throws IOException, TikaException {
    int stringLen = EndianUtils.readUShortLE(stream);

    byte[] stringData = new byte[stringLen];
    IOUtils.readFully(stream, stringData);

    // Often but not always null terminated. The length check keeps an empty
    // string (length prefix 0) from indexing stringData[-1].
    if (stringLen > 0 && stringData[stringLen - 1] == 0) {
        stringLen--;
    }
    return StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
}

From source file:org.apache.tika.parser.dwg.DWGParser.java

License:Apache License

/**
 * Reads header property records from the stream and copies the string-valued
 * ones into the metadata.
 * <p>
 * At most 30 records are processed; a record with property index 90 ends
 * processing early. Each record consists of an index, a length and a value
 * type, followed by the value bytes. Only values of type {@code 0x1e} are
 * decoded; all other values are read and discarded.
 *
 * @param stream   the stream positioned at the first property record
 * @param metadata receives the decoded properties
 * @param xhtml    receives a {@code p} element for each indexed property
 * @throws IOException   if reading from the stream fails
 * @throws TikaException if the underlying read helpers report a format problem
 * @throws SAXException  if writing to the content handler fails
 */
private void get2000Props(InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
        throws IOException, TikaException, SAXException {
    for (int processed = 0; processed < 30; processed++) {
        final int propIdx = EndianUtils.readUShortLE(stream);
        final int declaredLength = EndianUtils.readUShortLE(stream);
        final int valueType = stream.read();

        if (propIdx == 90) {
            // We think this means the end of properties
            return;
        }

        // Property 0x28 seems not to follow the pattern, so its length is fixed
        final int length = (propIdx == 0x28) ? 0x19 : declaredLength;

        final byte[] value = new byte[length];
        IOUtils.readFully(stream, value);

        if (valueType != 0x1e) {
            // No idea what this holds; the value bytes were consumed above
            continue;
        }

        // Normal string, good
        final String val = StringUtil.getFromCompressedUnicode(value, 0, length);

        if (propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
            // One we can look up by index
            metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val);
            xhtml.element("p", val);
            continue;
        }

        if (propIdx != 0x012c) {
            continue;
        }

        // Custom property stored as "name=value"
        final int splitAt = val.indexOf('=');
        if (splitAt > -1) {
            final String propName = val.substring(0, splitAt);
            final String propVal = val.substring(splitAt + 1);
            metadata.add(propName, propVal);
        }
    }
}

From source file:org.ddt.listener.dsi.StringProperty.java

License:Apache License

/**
 * reads the data from the byte array./*from   ww w . j  a  v a2  s.c om*/
 *
 * @param data   byte array to read from
 * @param offset offset into the data array
 * @throws IllegalVariantTypeException
 */
private void read(final byte data[], final int offset)
        throws IllegalVariantTypeException, UnsupportedEncodingException {
    int o = offset;
    charCount = LittleEndian.getUInt(data, o);
    length = charCount;
    o += LittleEndian.INT_SIZE;
    if (type == Variant.VT_LPWSTR) {
        //the smallest number of bytes to pad it to a multiple of 4... there must be a nicer way
        paddingBytes = (int) (4 - (length % 4)) % 4;
    } else if (type == Variant.VT_LPSTR) {
        paddingBytes = 0;
    } else {
        throw new IllegalVariantTypeException(type, value,
                "At offset " + o + ": Not a string, type = " + Long.toHexString(type) + " should be "
                        + Integer.toHexString(Variant.VT_LPSTR) + " or "
                        + Integer.toHexString(Variant.VT_LPWSTR));
    }

    length = Math.min(length, data.length - o);
    if (type == Variant.VT_LPWSTR) {
        //            value = new String(LittleEndian.getByteArray(data, o,
        //                    (int) (length - 2)), "UTF-16LE");
        value = StringUtil.getFromUnicodeLE(data, o, (int) (length - 1));
    } else {
        //            value = new String(LittleEndian.getByteArray(data, o, (int) (length - 1)));
        value = StringUtil.getFromCompressedUnicode(data, o, (int) (length - 1));
    }
}