Example usage for org.apache.poi.hmef.attribute MAPIRtfAttribute MAPIRtfAttribute

List of usage examples for org.apache.poi.hmef.attribute MAPIRtfAttribute MAPIRtfAttribute

Introduction

This page collects example usages of the org.apache.poi.hmef.attribute.MAPIRtfAttribute constructor MAPIRtfAttribute.

Prototype

public MAPIRtfAttribute(MAPIProperty property, int type, byte[] data) throws IOException 

Source Link

Usage

From source file:mj.ocraptor.extraction.tika.parser.microsoft.OutlookExtractor.java

License:Apache License

/**
 * Extracts the Outlook message held in the {@code msg} field.
 * <p>
 * Sender, recipients, subject and message date are copied into {@code metadata};
 * the subject, the address headers, the message body (HTML preferred, then RTF,
 * then plain text) and the attachments are written to {@code xhtml}.
 *
 * @param xhtml    handler that receives the XHTML rendering of the message
 * @param metadata metadata object that is populated with the message properties
 * @throws TikaException if POI throws a {@code ChunkNotFoundException} even though
 *                       it was asked to return {@code null} for missing chunks
 * @throws SAXException  declared because the XHTML handler and nested parsers are invoked
 * @throws IOException   declared because message data is read and nested parsers are invoked
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata)
        throws TikaException, SAXException, IOException {
    try {
        msg.setReturnNullOnMissingChunk(true);

        // If the message contains strings that aren't stored
        //  as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            if (msg.getHeaders() != null) {
                // There's normally something in the headers
                msg.guess7BitEncoding();
            } else {
                // Nothing in the header, try encoding detection
                //  on the message body
                StringChunk text = msg.getMainChunks().textBodyChunk;
                if (text != null) {
                    CharsetDetector detector = new CharsetDetector();
                    detector.setText(text.getRawValue());
                    CharsetMatch match = detector.detect();
                    // detect() can return null when no charset matches at all;
                    //  only act on a reasonably confident guess
                    if (match != null && match.getConfidence() > 35) {
                        msg.set7BitEncoding(match.getName());
                    }
                }
            }
        }

        // Start with the metadata
        String subject = msg.getSubject();
        String from = msg.getDisplayFrom();

        metadata.set(TikaCoreProperties.CREATOR, from);
        metadata.set(Metadata.MESSAGE_FROM, from);
        metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
        metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
        metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());

        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null) {
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                }
            }
        } catch (ChunkNotFoundException ignored) {
            // Deliberately ignored: the recipient list is optional metadata.
            // Will be fixed in POI 3.7 Final
        }

        // Date - try two ways to find it
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            try {
                // Failing that try via the raw headers
                String[] headers = msg.getHeaders();
                if (headers != null && headers.length > 0) {
                    for (String header : headers) {
                        // Locale.ROOT keeps the header-name match independent of the
                        //  JVM's default locale
                        if (header.toLowerCase(java.util.Locale.ROOT).startsWith("date:")) {
                            String date = header.substring(header.indexOf(':') + 1).trim();

                            // See if we can parse it as a normal mail date
                            try {
                                Date d = MboxParser.parseDate(date);
                                metadata.set(TikaCoreProperties.CREATED, d);
                                metadata.set(TikaCoreProperties.MODIFIED, d);
                            } catch (ParseException e) {
                                // Store it as-is, and hope for the best...
                                metadata.set(TikaCoreProperties.CREATED, date);
                                metadata.set(TikaCoreProperties.MODIFIED, date);
                            }
                            // Only the first Date header is used
                            break;
                        }
                    }
                }
            } catch (ChunkNotFoundException ignored) {
                // We can't find the date, sorry...
            }
        }

        xhtml.element("h1", subject);

        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException ignored) {
            // Deliberately ignored: the "Recipients" entry is simply left out
        }
        xhtml.endElement("dl");

        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }

        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            // No bytes (unexpected chunk type or empty chunk): fall through to RTF/text
            if (data != null) {
                HtmlParser htmlParser = new HtmlParser();
                htmlParser.parse(new ByteArrayInputStream(data),
                        new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(),
                        new ParseContext());
                doneBody = true;
            }
        }
        if (rtfChunk != null && !doneBody) {
            ByteChunk chunk = (ByteChunk) rtfChunk;
            // MAPIRtfAttribute takes the compressed chunk bytes; getData() is what is fed to the RTF parser
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(),
                    chunk.getValue());
            RTFParser rtfParser = new RTFParser();
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()),
                    new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(),
                    new ParseContext());
            doneBody = true;
        }
        if (textChunk != null && !doneBody) {
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");

        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");

            // Prefer the long file name, fall back to the short one
            String filename = null;
            if (attachment.attachLongFileName != null) {
                filename = attachment.attachLongFileName.getValue();
            } else if (attachment.attachFileName != null) {
                filename = attachment.attachFileName.getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }

            if (attachment.attachData != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.attachData.getValue()), filename, null,
                        null, xhtml, true);
            }
            if (attachment.attachmentDirectory != null) {
                handleEmbeddedOfficeDoc(attachment.attachmentDirectory.getDirectory(), xhtml);
            }

            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    }
}

From source file:org.alfresco.repo.content.transform.MSGParser.java

License:Apache License

/**
 * Adapted extract multipart is parser that extracts the html body if exists, rtf body if exists
 * or at least plain text. The html or rtf file could be obtained as alternative.
 *
 * @param xhtml/*from w w  w .  java  2s.  c om*/
 *            the xhtml
 * @param msg
 *            the message part
 * @param context
 *            the context
 * @throws MessagingException
 *             the messaging exception
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 * @throws SAXException
 *             the sAX exception
 * @throws TikaException
 *             the tika exception
 */
public void adaptedExtractMultipart(XHTMLContentHandler xhtml, MAPIMessage msg, ParseContext context)
        throws MessagingException, IOException, SAXException, TikaException {
    // Get the message body. Preference order is: html, rtf, text
    Chunk htmlChunk = null;
    Chunk rtfChunk = null;
    Chunk textChunk = null;
    for (Chunk chunk : msg.getMainChunks().getAll()) {
        if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
            htmlChunk = chunk;
        }
        if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
            rtfChunk = chunk;
        }
        if (chunk.getChunkId() == MAPIProperty.BODY.id) {
            textChunk = chunk;
        }
    }

    boolean doneBody = false;
    if (htmlChunk != null) {
        byte[] data = null;
        if (htmlChunk instanceof ByteChunk) {
            data = ((ByteChunk) htmlChunk).getValue();
        } else if (htmlChunk instanceof StringChunk) {
            data = ((StringChunk) htmlChunk).getRawValue();
        }
        File tempHtmlFile = new File(workingDirectory, System.currentTimeMillis() + ".html");
        BufferedOutputStream rtfOutStream = new BufferedOutputStream(new FileOutputStream(tempHtmlFile));
        byte[] preparedStringData = referencesCache.size() > 0 ? prepareHTMLString(new String(data)).getBytes()
                : data;
        IOUtils.copy(new ByteArrayInputStream(preparedStringData), rtfOutStream);
        IOUtils.closeQuietly(rtfOutStream);
        parsedContent.put(MimetypeMap.MIMETYPE_HTML, new Pair<File, String>(tempHtmlFile, encoding));
        doneBody = true;

    }
    if (rtfChunk != null && !doneBody) {
        ByteChunk chunk = (ByteChunk) rtfChunk;

        MAPIProperty property = MAPIProperty.RTF_COMPRESSED;
        int type = Types.BINARY.getId();
        byte[] data = chunk.getValue();
        MAPIRtfAttribute rtf = new MAPIRtfAttribute(property, type, data);

        File tempRtfFile = new File(workingDirectory, System.currentTimeMillis() + ".rtf");
        BufferedOutputStream rtfOutStream = new BufferedOutputStream(new FileOutputStream(tempRtfFile));

        byte[] preparedStringData = referencesCache.size() > 0
                ? prepareRTFString(new String(rtf.getData())).getBytes()
                : rtf.getData();
        IOUtils.copy(new ByteArrayInputStream(preparedStringData), rtfOutStream);
        IOUtils.closeQuietly(rtfOutStream);

        parsedContent.put(MIMETYPE_RTF, new Pair<File, String>(tempRtfFile, encoding));
        doneBody = true;
    }
    if (textChunk != null && !doneBody) {
        xhtml.element("p", ((StringChunk) textChunk).getValue());
    }

}

From source file:org.apache.tika.parser.microsoft.OutlookExtractor.java

License:Apache License

/**
 * Converts the Outlook message held in the {@code msg} field into metadata entries
 * and XHTML output.
 * <p>
 * The method first fills {@code metadata} (sender, recipients, subject, dates), then
 * writes the subject, an address list, the message body (HTML, else RTF, else plain
 * text) and one section per attachment to {@code xhtml}.
 *
 * @param xhtml    handler receiving the XHTML output
 * @param metadata metadata object to populate
 * @throws TikaException if POI throws a {@code ChunkNotFoundException} outside the
 *                       places where it is tolerated
 * @throws SAXException  declared because the XHTML handler and nested parsers are invoked
 * @throws IOException   declared because message data is read and nested parsers are invoked
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata)
        throws TikaException, SAXException, IOException {
    try {
        msg.setReturnNullOnMissingChunk(true);

        // Strings that are not stored as Unicode need an encoding guess first
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }

        // ---- Metadata ----
        final String title = msg.getSubject();
        final String sender = msg.getDisplayFrom();

        metadata.set(TikaCoreProperties.CREATOR, sender);
        metadata.set(Metadata.MESSAGE_FROM, sender);
        metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
        metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
        metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
        metadata.set(TikaCoreProperties.TITLE, title);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());

        try {
            for (String address : msg.getRecipientEmailAddressList()) {
                if (address == null) {
                    continue;
                }
                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address);
            }
        } catch (ChunkNotFoundException ignored) {
            // Will be fixed in POI 3.7 Final
        }

        // ---- Dates: the dedicated chunk wins, the raw "Date:" header is the fallback ----
        if (msg.getMessageDate() == null) {
            try {
                final String[] rawHeaders = msg.getHeaders();
                if (rawHeaders != null && rawHeaders.length > 0) {
                    for (String line : rawHeaders) {
                        if (!line.toLowerCase(Locale.ROOT).startsWith("date:")) {
                            continue;
                        }
                        final int valueStart = line.indexOf(':') + 1;
                        final String rawDate = line.substring(valueStart).trim();
                        try {
                            // Preferred: a properly parsed mail date
                            final Date parsed = MboxParser.parseDate(rawDate);
                            metadata.set(TikaCoreProperties.CREATED, parsed);
                            metadata.set(TikaCoreProperties.MODIFIED, parsed);
                        } catch (ParseException unparseable) {
                            // Keep the raw text when it is not a parseable mail date
                            metadata.set(TikaCoreProperties.CREATED, rawDate);
                            metadata.set(TikaCoreProperties.MODIFIED, rawDate);
                        }
                        // Only the first Date header is considered
                        break;
                    }
                }
            } catch (ChunkNotFoundException ignored) {
                // No date available from the headers either
            }
        } else {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        }

        // ---- XHTML: subject and address list (kept as text so it is searchable) ----
        xhtml.element("h1", title);

        xhtml.startElement("dl");
        if (sender != null) {
            header(xhtml, "From", sender);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException ignored) {
            // The "Recipients" entry is left out when the chunk is missing
        }
        xhtml.endElement("dl");

        // ---- Locate the candidate body chunks ----
        Chunk htmlPart = null;
        Chunk rtfPart = null;
        Chunk textPart = null;
        for (Chunk candidate : msg.getMainChunks().getChunks()) {
            final int id = candidate.getChunkId();
            if (id == MAPIProperty.BODY_HTML.id) {
                htmlPart = candidate;
            }
            if (id == MAPIProperty.RTF_COMPRESSED.id) {
                rtfPart = candidate;
            }
            if (id == MAPIProperty.BODY.id) {
                textPart = candidate;
            }
        }

        // ---- Body: first available of html, rtf, text ----
        xhtml.startElement("div", "class", "message-body");
        boolean bodyWritten = false;

        // instanceof is false for null, so a missing HTML chunk leaves htmlBytes null
        byte[] htmlBytes = null;
        if (htmlPart instanceof ByteChunk) {
            htmlBytes = ((ByteChunk) htmlPart).getValue();
        } else if (htmlPart instanceof StringChunk) {
            htmlBytes = ((StringChunk) htmlPart).getRawValue();
        }
        if (htmlBytes != null) {
            new HtmlParser().parse(
                    new ByteArrayInputStream(htmlBytes),
                    new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                    new Metadata(),
                    new ParseContext());
            bodyWritten = true;
        }

        if (!bodyWritten && rtfPart != null) {
            final ByteChunk compressedRtf = (ByteChunk) rtfPart;
            final MAPIRtfAttribute rtfAttribute = new MAPIRtfAttribute(
                    MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), compressedRtf.getValue());
            new RTFParser().parse(
                    new ByteArrayInputStream(rtfAttribute.getData()),
                    new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                    new Metadata(),
                    new ParseContext());
            bodyWritten = true;
        }

        if (!bodyWritten && textPart != null) {
            final StringChunk plainText = (StringChunk) textPart;
            xhtml.element("p", plainText.getValue());
        }
        xhtml.endElement("div");

        // ---- Attachments: one div per attachment ----
        for (AttachmentChunks entry : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");

            // The long file name is preferred over the short one
            String name = null;
            if (entry.attachLongFileName != null) {
                name = entry.attachLongFileName.getValue();
            } else if (entry.attachFileName != null) {
                name = entry.attachFileName.getValue();
            }
            if (name != null && !name.isEmpty()) {
                xhtml.element("h1", name);
            }

            if (entry.attachData != null) {
                handleEmbeddedResource(
                        TikaInputStream.get(entry.attachData.getValue()), name, null, null, xhtml, true);
            }
            if (entry.attachmentDirectory != null) {
                handleEmbeddedOfficeDoc(entry.attachmentDirectory.getDirectory(), xhtml);
            }

            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    }
}