Example usage for org.apache.poi.hsmf.datatypes MAPIProperty BODY_HTML

List of usage examples for org.apache.poi.hsmf.datatypes MAPIProperty BODY_HTML

Introduction

On this page you can find example usages of org.apache.poi.hsmf.datatypes MAPIProperty BODY_HTML.

Prototype

MAPIProperty BODY_HTML

To view the source code for org.apache.poi.hsmf.datatypes MAPIProperty BODY_HTML, click the Source Link.

Usage

From source file:mj.ocraptor.extraction.tika.parser.microsoft.OutlookExtractor.java

License:Apache License

/**
 * Extracts the Outlook message referenced by {@code msg} into XHTML events and metadata.
 * <p>
 * The method works in this order: resolve an encoding for non-Unicode strings, copy
 * sender/recipient/subject/date information into {@code metadata}, emit the subject and
 * address headers, emit the message body (preferring HTML, then RTF, then plain text),
 * and finally hand every attachment to the embedded-resource handlers.
 *
 * @param xhtml    handler that receives the generated XHTML events
 * @param metadata metadata object that is populated with the message properties
 * @throws TikaException if a {@code ChunkNotFoundException} escapes even though the message
 *                       was asked to return null on missing chunks
 * @throws SAXException  propagated from the XHTML handler or the nested parsers
 * @throws IOException   propagated from the nested parsers or attachment handling
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata)
        throws TikaException, SAXException, IOException {
    try {
        msg.setReturnNullOnMissingChunk(true);

        // If the message contains strings that aren't stored
        //  as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            if (msg.getHeaders() != null) {
                // There's normally something in the headers
                msg.guess7BitEncoding();
            } else {
                // Nothing in the header, try encoding detection
                //  on the message body
                StringChunk text = msg.getMainChunks().textBodyChunk;
                if (text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText(text.getRawValue());
                    CharsetMatch match = detector.detect();
                    // detect() can yield null when no candidate charset matches, so guard
                    //  before reading the confidence; 35 is the minimum accepted confidence
                    if (match != null && match.getConfidence() > 35) {
                        msg.set7BitEncoding(match.getName());
                    }
                }
            }
        }

        // Start with the metadata
        String subject = msg.getSubject();
        String from = msg.getDisplayFrom();

        metadata.set(TikaCoreProperties.CREATOR, from);
        metadata.set(Metadata.MESSAGE_FROM, from);
        metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
        metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
        metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());

        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null) {
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                }
            }
        } catch (ChunkNotFoundException he) {
            // Deliberately ignored: recipient addresses are optional metadata.
            // Will be fixed in POI 3.7 Final
        }

        // Date - try two ways to find it
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            try {
                // Failing that try via the raw headers
                String[] headers = msg.getHeaders();
                if (headers != null && headers.length > 0) {
                    for (String header : headers) {
                        // Case-insensitive, locale-independent prefix test for "date:"
                        if (header.regionMatches(true, 0, "date:", 0, 5)) {
                            String date = header.substring(header.indexOf(':') + 1).trim();

                            // See if we can parse it as a normal mail date
                            try {
                                Date d = MboxParser.parseDate(date);
                                metadata.set(TikaCoreProperties.CREATED, d);
                                metadata.set(TikaCoreProperties.MODIFIED, d);
                            } catch (ParseException e) {
                                // Store it as-is, and hope for the best...
                                metadata.set(TikaCoreProperties.CREATED, date);
                                metadata.set(TikaCoreProperties.MODIFIED, date);
                            }
                            // Only the first date header is used
                            break;
                        }
                    }
                }
            } catch (ChunkNotFoundException he) {
                // We can't find the date, sorry...
            }
        }

        xhtml.element("h1", subject);

        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException e) {
            // Deliberately ignored: the "Recipients" entry is simply left out
        }
        xhtml.endElement("dl");

        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }

        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            // The HTML body may be stored either as raw bytes or as a string chunk
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                HtmlParser htmlParser = new HtmlParser();
                htmlParser.parse(new ByteArrayInputStream(data),
                        new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(),
                        new ParseContext());
                doneBody = true;
            }
        }
        if (rtfChunk != null && !doneBody) {
            ByteChunk chunk = (ByteChunk) rtfChunk;
            // Wrap the compressed chunk so that getData() supplies the RTF bytes to parse
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(),
                    chunk.getValue());
            RTFParser rtfParser = new RTFParser();
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()),
                    new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(),
                    new ParseContext());
            doneBody = true;
        }
        if (textChunk != null && !doneBody) {
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");

        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");

            // Prefer the long file name, fall back to the short one
            String filename = null;
            if (attachment.attachLongFileName != null) {
                filename = attachment.attachLongFileName.getValue();
            } else if (attachment.attachFileName != null) {
                filename = attachment.attachFileName.getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }

            if (attachment.attachData != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.attachData.getValue()), filename, null,
                        null, xhtml, true);
            }
            if (attachment.attachmentDirectory != null) {
                handleEmbeddedOfficeDoc(attachment.attachmentDirectory.getDirectory(), xhtml);
            }

            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    }
}

From source file:org.alfresco.repo.content.transform.MSGParser.java

License:Apache License

/**
 * Extracts the body of an Outlook message, preferring the HTML body, then the RTF body,
 * then the plain text body.
 * <p>
 * An HTML or RTF body is written to a temporary file inside {@code workingDirectory} and
 * registered in {@code parsedContent} under its mimetype; a plain text body is emitted to
 * {@code xhtml} as a single paragraph. If an HTML chunk exists but its bytes cannot be
 * obtained, the method falls through to the RTF and text bodies.
 *
 * @param xhtml
 *            the xhtml handler that receives the plain text fallback
 * @param msg
 *            the message whose main chunks are scanned for a body
 * @param context
 *            the parse context (not referenced by this method)
 * @throws MessagingException
 *             the messaging exception (presumably raised by the string preparation
 *             helpers - TODO confirm)
 * @throws IOException
 *             Signals that an I/O exception has occurred while writing a temporary file.
 * @throws SAXException
 *             the SAX exception raised while emitting the text body
 * @throws TikaException
 *             the tika exception
 */
public void adaptedExtractMultipart(XHTMLContentHandler xhtml, MAPIMessage msg, ParseContext context)
        throws MessagingException, IOException, SAXException, TikaException {
    // Get the message body. Preference order is: html, rtf, text
    Chunk htmlChunk = null;
    Chunk rtfChunk = null;
    Chunk textChunk = null;
    for (Chunk chunk : msg.getMainChunks().getAll()) {
        if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
            htmlChunk = chunk;
        }
        if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
            rtfChunk = chunk;
        }
        if (chunk.getChunkId() == MAPIProperty.BODY.id) {
            textChunk = chunk;
        }
    }

    boolean doneBody = false;
    if (htmlChunk != null) {
        // The HTML body may be stored either as raw bytes or as a string chunk
        byte[] data = null;
        if (htmlChunk instanceof ByteChunk) {
            data = ((ByteChunk) htmlChunk).getValue();
        } else if (htmlChunk instanceof StringChunk) {
            data = ((StringChunk) htmlChunk).getRawValue();
        }
        // Without readable bytes there is nothing to write; leave doneBody false so the
        // rtf/text fallbacks below still get their chance
        if (data != null) {
            File tempHtmlFile = new File(workingDirectory, System.currentTimeMillis() + ".html");
            // NOTE(review): new String(..)/getBytes() use the platform default charset - confirm
            // whether the 'encoding' field should be applied here instead
            byte[] preparedStringData = referencesCache.size() > 0
                    ? prepareHTMLString(new String(data)).getBytes()
                    : data;
            writeBodyFile(tempHtmlFile, preparedStringData);
            parsedContent.put(MimetypeMap.MIMETYPE_HTML, new Pair<File, String>(tempHtmlFile, encoding));
            doneBody = true;
        }
    }
    if (rtfChunk != null && !doneBody) {
        ByteChunk chunk = (ByteChunk) rtfChunk;

        MAPIProperty property = MAPIProperty.RTF_COMPRESSED;
        int type = Types.BINARY.getId();
        byte[] data = chunk.getValue();
        MAPIRtfAttribute rtf = new MAPIRtfAttribute(property, type, data);

        File tempRtfFile = new File(workingDirectory, System.currentTimeMillis() + ".rtf");

        // NOTE(review): same platform default charset caveat as for the HTML body
        byte[] rtfData = rtf.getData();
        byte[] preparedStringData = referencesCache.size() > 0
                ? prepareRTFString(new String(rtfData)).getBytes()
                : rtfData;
        writeBodyFile(tempRtfFile, preparedStringData);

        parsedContent.put(MIMETYPE_RTF, new Pair<File, String>(tempRtfFile, encoding));
        doneBody = true;
    }
    if (textChunk != null && !doneBody) {
        xhtml.element("p", ((StringChunk) textChunk).getValue());
    }

}

/**
 * Writes {@code content} to {@code target}, always releasing the stream.
 *
 * @param target
 *            the file to create or overwrite
 * @param content
 *            the bytes to write
 * @throws IOException
 *             if the file cannot be opened, written or flushed
 */
private void writeBodyFile(File target, byte[] content) throws IOException {
    BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(target));
    try {
        out.write(content);
        // Flush explicitly so that a failed write surfaces here rather than being
        // swallowed by the quiet close below
        out.flush();
    } finally {
        IOUtils.closeQuietly(out);
    }
}

From source file:org.apache.tika.parser.microsoft.OutlookExtractor.java

License:Apache License

/**
 * Converts the Outlook message referenced by {@code msg} into XHTML events and metadata.
 * <p>
 * Metadata (sender, recipients, subject, dates) is filled in first, then the subject and
 * address headers are emitted, followed by the message body (HTML preferred over RTF,
 * RTF preferred over plain text) and one entry per attachment.
 *
 * @param xhtml    handler that receives the generated XHTML events
 * @param metadata metadata object that is populated with the message properties
 * @throws TikaException if a {@code ChunkNotFoundException} escapes even though the message
 *                       was asked to return null on missing chunks
 * @throws SAXException  propagated from the XHTML handler or the nested parsers
 * @throws IOException   propagated from the nested parsers or attachment handling
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata)
        throws TikaException, SAXException, IOException {
    try {
        msg.setReturnNullOnMissingChunk(true);

        // Strings that are not stored as Unicode need an encoding guess first
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }

        // --- Metadata ---
        final String title = msg.getSubject();
        final String sender = msg.getDisplayFrom();

        metadata.set(TikaCoreProperties.CREATOR, sender);
        metadata.set(Metadata.MESSAGE_FROM, sender);
        metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
        metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
        metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

        metadata.set(TikaCoreProperties.TITLE, title);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());

        try {
            for (String address : msg.getRecipientEmailAddressList()) {
                if (address == null) {
                    continue;
                }
                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address);
            }
        } catch (ChunkNotFoundException ignored) {
            // Will be fixed in POI 3.7 Final
        }

        // --- Date: the dedicated chunk wins, the raw headers are the fallback ---
        if (msg.getMessageDate() == null) {
            try {
                String[] rawHeaders = msg.getHeaders();
                // Iterating an empty array is a no-op, so only null needs a guard
                if (rawHeaders != null) {
                    for (String line : rawHeaders) {
                        if (!line.toLowerCase(Locale.ROOT).startsWith("date:")) {
                            continue;
                        }
                        String rawDate = line.substring(line.indexOf(':') + 1).trim();

                        try {
                            // Try the normal mail date format first
                            Date parsed = MboxParser.parseDate(rawDate);
                            metadata.set(TikaCoreProperties.CREATED, parsed);
                            metadata.set(TikaCoreProperties.MODIFIED, parsed);
                        } catch (ParseException e) {
                            // Not parseable: keep the header text unchanged
                            metadata.set(TikaCoreProperties.CREATED, rawDate);
                            metadata.set(TikaCoreProperties.MODIFIED, rawDate);
                        }
                        // Only the first date header counts
                        break;
                    }
                }
            } catch (ChunkNotFoundException ignored) {
                // No date available at all
            }
        } else {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        }

        xhtml.element("h1", title);

        // --- Address headers, emitted as text so they are searchable ---
        xhtml.startElement("dl");
        if (sender != null) {
            header(xhtml, "From", sender);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException ignored) {
            // The "Recipients" entry is simply left out
        }
        xhtml.endElement("dl");

        // --- Body: locate the candidate chunks (preference: html, rtf, text) ---
        Chunk htmlBody = null;
        Chunk rtfBody = null;
        Chunk plainBody = null;
        for (Chunk candidate : msg.getMainChunks().getChunks()) {
            final int id = candidate.getChunkId();
            if (id == MAPIProperty.BODY_HTML.id) {
                htmlBody = candidate;
            }
            if (id == MAPIProperty.RTF_COMPRESSED.id) {
                rtfBody = candidate;
            }
            if (id == MAPIProperty.BODY.id) {
                plainBody = candidate;
            }
        }

        boolean bodyWritten = false;
        xhtml.startElement("div", "class", "message-body");

        // instanceof is false for null, so a missing HTML chunk leaves htmlBytes null
        byte[] htmlBytes = null;
        if (htmlBody instanceof ByteChunk) {
            htmlBytes = ((ByteChunk) htmlBody).getValue();
        } else if (htmlBody instanceof StringChunk) {
            htmlBytes = ((StringChunk) htmlBody).getRawValue();
        }
        if (htmlBytes != null) {
            HtmlParser htmlParser = new HtmlParser();
            ByteArrayInputStream htmlStream = new ByteArrayInputStream(htmlBytes);
            EmbeddedContentHandler htmlHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
            htmlParser.parse(htmlStream, htmlHandler, new Metadata(), new ParseContext());
            bodyWritten = true;
        }

        if (!bodyWritten && rtfBody != null) {
            byte[] compressed = ((ByteChunk) rtfBody).getValue();
            MAPIRtfAttribute rtfAttribute =
                    new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), compressed);
            RTFParser rtfParser = new RTFParser();
            ByteArrayInputStream rtfStream = new ByteArrayInputStream(rtfAttribute.getData());
            EmbeddedContentHandler rtfHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
            rtfParser.parse(rtfStream, rtfHandler, new Metadata(), new ParseContext());
            bodyWritten = true;
        }

        if (!bodyWritten && plainBody != null) {
            xhtml.element("p", ((StringChunk) plainBody).getValue());
        }
        xhtml.endElement("div");

        // --- Attachments ---
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");

            // Long file name first, short file name as fallback, otherwise none
            String filename = attachment.attachLongFileName != null
                    ? attachment.attachLongFileName.getValue()
                    : (attachment.attachFileName != null ? attachment.attachFileName.getValue() : null);
            if (filename != null && !filename.isEmpty()) {
                xhtml.element("h1", filename);
            }

            if (attachment.attachData != null) {
                TikaInputStream payload = TikaInputStream.get(attachment.attachData.getValue());
                handleEmbeddedResource(payload, filename, null, null, xhtml, true);
            }
            if (attachment.attachmentDirectory != null) {
                handleEmbeddedOfficeDoc(attachment.attachmentDirectory.getDirectory(), xhtml);
            }

            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    }
}