Example usage for org.apache.poi.hsmf.datatypes StringChunk getRawValue

List of usage examples for org.apache.poi.hsmf.datatypes StringChunk getRawValue

Introduction

On this page you can find example usages of org.apache.poi.hsmf.datatypes StringChunk getRawValue.

Prototype

public byte[] getRawValue() 

Source Link

Usage

From source file:mj.ocraptor.extraction.tika.parser.microsoft.OutlookExtractor.java

License:Apache License

/**
 * Writes the metadata, header summary, body and attachments of the Outlook
 * message held in {@code msg} to the supplied metadata object and XHTML handler.
 *
 * <p>Steps, in order: settle an encoding for non-Unicode strings, populate the
 * metadata (sender, recipients, subject, dates), emit the subject and a
 * {@code dl} of From/To/Cc/Bcc/Recipients, emit the body (preferring HTML,
 * then RTF, then plain text) and finally hand every attachment to the
 * embedded-resource handlers.</p>
 *
 * @param xhtml    handler receiving the generated XHTML events
 * @param metadata metadata object to populate
 * @throws TikaException if POI reports a missing chunk despite being asked to
 *                       return {@code null} instead, or if a nested parser fails
 * @throws SAXException  if the XHTML handler rejects an event
 * @throws IOException   if reading message or attachment data fails
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata)
        throws TikaException, SAXException, IOException {
    try {
        // Ask for null rather than an exception when a chunk is absent
        msg.setReturnNullOnMissingChunk(true);

        // If the message contains strings that aren't stored
        //  as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            if (msg.getHeaders() != null) {
                // There's normally something in the headers
                msg.guess7BitEncoding();
            } else {
                // Nothing in the header, try encoding detection
                //  on the message body
                StringChunk text = msg.getMainChunks().textBodyChunk;
                if (text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText(text.getRawValue());
                    CharsetMatch match = detector.detect();
                    // detect() may come back without any match, so guard against null
                    if (match != null && match.getConfidence() > 35) {
                        msg.set7BitEncoding(match.getName());
                    }
                }
            }
        }

        // Start with the metadata
        String subject = msg.getSubject();
        String from = msg.getDisplayFrom();

        metadata.set(TikaCoreProperties.CREATOR, from);
        metadata.set(Metadata.MESSAGE_FROM, from);
        metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
        metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
        metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());

        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null) {
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                }
            }
        } catch (ChunkNotFoundException ignored) {
            // Best effort: recipient addresses are optional.
            // Will be fixed in POI 3.7 Final
        }

        // Date - try two ways to find it
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            try {
                // Failing that try via the raw headers
                String[] headers = msg.getHeaders();
                if (headers != null && headers.length > 0) {
                    for (String header : headers) {
                        if (header.toLowerCase().startsWith("date:")) {
                            String date = header.substring(header.indexOf(':') + 1).trim();

                            // See if we can parse it as a normal mail date
                            try {
                                Date d = MboxParser.parseDate(date);
                                metadata.set(TikaCoreProperties.CREATED, d);
                                metadata.set(TikaCoreProperties.MODIFIED, d);
                            } catch (ParseException e) {
                                // Store it as-is, and hope for the best...
                                metadata.set(TikaCoreProperties.CREATED, date);
                                metadata.set(TikaCoreProperties.MODIFIED, date);
                            }
                            // Only the first Date header is used
                            break;
                        }
                    }
                }
            } catch (ChunkNotFoundException ignored) {
                // We can't find the date, sorry...
            }
        }

        xhtml.element("h1", subject);

        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException ignored) {
            // Best effort: simply omit the Recipients entry
        }
        xhtml.endElement("dl");

        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }

        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            // The HTML body may be stored either as bytes or as a string chunk
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                HtmlParser htmlParser = new HtmlParser();
                htmlParser.parse(new ByteArrayInputStream(data),
                        new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(),
                        new ParseContext());
                doneBody = true;
            }
        }
        // instanceof is false for null, and also protects against an unexpected
        // chunk type instead of failing with a ClassCastException
        if (!doneBody && rtfChunk instanceof ByteChunk) {
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(),
                    chunk.getValue());
            RTFParser rtfParser = new RTFParser();
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()),
                    new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(),
                    new ParseContext());
            doneBody = true;
        }
        if (!doneBody && textChunk instanceof StringChunk) {
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");

        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");

            // Prefer the long file name, fall back to the short one
            String filename = null;
            if (attachment.attachLongFileName != null) {
                filename = attachment.attachLongFileName.getValue();
            } else if (attachment.attachFileName != null) {
                filename = attachment.attachFileName.getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }

            if (attachment.attachData != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.attachData.getValue()), filename, null,
                        null, xhtml, true);
            }
            if (attachment.attachmentDirectory != null) {
                handleEmbeddedOfficeDoc(attachment.attachmentDirectory.getDirectory(), xhtml);
            }

            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    }
}

From source file:org.alfresco.repo.content.transform.MSGParser.java

License:Apache License

/**
 * Extracts properties and text from an Msg Document input stream.
 *
 * <p>The stream is loaded into a {@code MAPIMessage}, an encoding is chosen for
 * any non-Unicode strings, and the header and multipart content are then written
 * to an XHTML handler wrapping {@code handler}.</p>
 *
 * @param stream
 *            the stream containing the message
 * @param handler
 *            the handler receiving the generated XHTML events
 * @param metadata
 *            the metadata to populate
 * @param context
 *            the parse context, passed on to the multipart extraction
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 * @throws SAXException
 *             if starting the XHTML document fails
 * @throws TikaException
 *             if anything fails while processing the message; every exception
 *             raised inside the processing section is wrapped in this type
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    try {
        this.message = new MAPIMessage(new NPOIFSFileSystem(stream));
        // Ask for null rather than an exception when a chunk is absent
        message.setReturnNullOnMissingChunk(true);
        // If the message contains strings that aren't stored
        // as Unicode, try to sort out an encoding for them
        if (message.has7BitEncodingStrings()) {
            if (message.getHeaders() != null) {
                // There's normally something in the headers
                message.guess7BitEncoding();
                // NOTE(review): the encoding is recorded as "utf-7" regardless of
                // what guess7BitEncoding() selected - confirm this is intended
                encoding = "utf-7";
            } else {
                // Nothing in the header, try encoding detection
                // on the message body
                StringChunk text = message.getMainChunks().textBodyChunk;
                if (text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText(text.getRawValue());
                    CharsetMatch match = detector.detect();
                    // detect() may come back without any match, so guard against null
                    if (match != null && match.getConfidence() > 35) {
                        message.set7BitEncoding(match.getName());
                        encoding = match.getName();
                    }
                }
            }
        } else {
            encoding = UTF_8;
        }

        processHeader(message, metadata, xhtml);

        // real work.
        adaptedExtractMultipart(xhtml, message, context);

        xhtml.endDocument();

    } catch (Exception e) {
        // Everything, including IOException and SAXException raised above,
        // is reported to the caller as a TikaException with the cause attached
        throw new TikaException("Error while processing message", e);
    }
}

From source file:org.alfresco.repo.content.transform.MSGParser.java

License:Apache License

/**
 * Process header./*from  ww  w.jav  a2 s  .c  om*/
 *
 * @param msg
 *            the msg
 * @param metadata
 *            the metadata
 * @param xhtml
 *            the xhtml
 * @throws Exception
 *             the exception
 */
private void processHeader(MAPIMessage msg, Metadata metadata, XHTMLContentHandler xhtml) throws Exception {
    StringChunk subjectChunk = msg.getMainChunks().subjectChunk;
    if (msg.has7BitEncodingStrings()) {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(subjectChunk.getRawValue());
        CharsetMatch detect = detector.detect();
        if (detect.getConfidence() >= 20) {
            subjectChunk.set7BitEncoding(detect.getName());
        }
    }
    String subject = subjectChunk.getValue();
    String from = msg.getDisplayFrom();

    metadata.set(DublinCore.CREATOR, from);
    metadata.set(Metadata.MESSAGE_FROM, from);
    metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
    metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
    metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

    metadata.set(DublinCore.TITLE, subject);
    metadata.set(DublinCore.SUBJECT, msg.getConversationTopic());

    try {
        for (String recipientAddress : msg.getRecipientEmailAddressList()) {
            if (recipientAddress != null)
                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
        }
    } catch (ChunkNotFoundException he) {
    } // Will be fixed in POI 3.7 Final

    // Date - try two ways to find it
    // First try via the proper chunk
    if (msg.getMessageDate() != null) {
        metadata.set(DublinCore.DATE, msg.getMessageDate().getTime());
        metadata.set(Office.CREATION_DATE, msg.getMessageDate().getTime());
        metadata.set(Office.SAVE_DATE, msg.getMessageDate().getTime());
    } else {
        try {
            // Failing that try via the raw headers
            String[] headers = msg.getHeaders();
            if (headers != null && headers.length > 0) {
                for (String header : headers) {
                    if (header.toLowerCase().startsWith("date:")) {
                        String date = header.substring(header.indexOf(':') + 1).trim();

                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MboxParser.parseDate(date);
                            metadata.set(DublinCore.DATE, d);
                            metadata.set(Office.CREATION_DATE, d);
                            metadata.set(Office.SAVE_DATE, d);
                        } catch (ParseException e) {
                            // Store it as-is, and hope for the best...
                            metadata.set(DublinCore.DATE, date);
                            metadata.set(Office.CREATION_DATE, date);
                            metadata.set(Office.SAVE_DATE, date);
                        }
                        break;
                    }
                }
            }
        } catch (ChunkNotFoundException he) {
            // We can't find the date, sorry...
        }
    }

    xhtml.element("h1", subject);

    // Output the from and to details in text, as you
    // often want them in text form for searching
    xhtml.startElement("dl");
    if (from != null) {
        header(xhtml, "From", from);
    }
    header(xhtml, "To", msg.getDisplayTo());
    header(xhtml, "Cc", msg.getDisplayCC());
    header(xhtml, "Bcc", msg.getDisplayBCC());
    try {
        header(xhtml, "Recipients", msg.getRecipientEmailAddress());
    } catch (ChunkNotFoundException e) {
    }
    List<String> attachmentList = new ArrayList<String>();
    // // prepare attachments
    prepareExtractMultipart(xhtml, message, attachmentList);
    if (attachmentList.size() > 0) {
        header(xhtml, "Attachments", attachmentList.toString());
    }
    xhtml.endElement("dl");

}

From source file:org.apache.tika.parser.microsoft.OutlookExtractor.java

License:Apache License

/**
 * Tries to identify the correct encoding for 7-bit (non-unicode)
 *  strings in the file./*from   w w  w  .  jav  a  2 s  . co m*/
 * <p>Many messages store their strings as unicode, which is
 *  nice and easy. Some use one-byte encodings for their
 *  strings, but don't always store the encoding anywhere
 *  helpful in the file.</p>
 * <p>This method checks for codepage properties, and failing that
 *  looks at the headers for the message, and uses these to
 *  guess the correct encoding for your file.</p>
 * <p>Bug #49441 has more on why this is needed</p>
 * <p>This is taken verbatim from POI (TIKA-1238)
 * as a temporary workaround to prevent unsupported encoding exceptions</p>
 */
private void guess7BitEncoding(MAPIMessage msg) {
    Chunks mainChunks = msg.getMainChunks();
    //sanity check
    if (mainChunks == null) {
        return;
    }

    Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
    if (props != null) {
        // First choice is a codepage property
        for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE,
                MAPIProperty.INTERNET_CPID }) {
            List<PropertyValue> val = props.get(prop);
            if (val != null && val.size() > 0) {
                int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue();
                String encoding = null;
                try {
                    encoding = CodePageUtil.codepageToEncoding(codepage, true);
                } catch (UnsupportedEncodingException e) {
                    //swallow
                }
                if (tryToSet7BitEncoding(msg, encoding)) {
                    return;
                }
            }
        }
    }

    // Second choice is a charset on a content type header
    try {
        String[] headers = msg.getHeaders();
        if (headers != null && headers.length > 0) {
            // Look for a content type with a charset
            Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?",
                    Pattern.CASE_INSENSITIVE);

            for (String header : headers) {
                if (header.startsWith("Content-Type")) {
                    Matcher m = p.matcher(header);
                    if (m.matches()) {
                        // Found it! Tell all the string chunks
                        String charset = m.group(1);
                        if (tryToSet7BitEncoding(msg, charset)) {
                            return;
                        }
                    }
                }
            }
        }
    } catch (ChunkNotFoundException e) {
    }

    // Nothing suitable in the headers, try HTML
    // TODO: do we need to replicate this in Tika? If we wind up
    // parsing the html version of the email, this is duplicative??
    // Or do we need to reset the header strings based on the html
    // meta header if there is no other information?
    try {
        String html = msg.getHtmlBody();
        if (html != null && html.length() > 0) {
            Charset charset = null;
            try {
                charset = detector.detect(new ByteArrayInputStream(html.getBytes(UTF_8)), EMPTY_METADATA);
            } catch (IOException e) {
                //swallow
            }
            if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
                return;
            }
        }
    } catch (ChunkNotFoundException e) {
    }

    //absolute last resort, try charset detector
    StringChunk text = mainChunks.textBodyChunk;
    if (text != null) {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(text.getRawValue());
        CharsetMatch match = detector.detect();
        if (match != null && match.getConfidence() > 35 && tryToSet7BitEncoding(msg, match.getName())) {
            return;
        }
    }
}