List of usage examples for org.apache.poi.hsmf.datatypes MAPIProperty BODY
MAPIProperty BODY
To view the source code for org.apache.poi.hsmf.datatypes MAPIProperty BODY, click Source Link.
From source file:mj.ocraptor.extraction.tika.parser.microsoft.OutlookExtractor.java
License:Apache License
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException { try {//from w ww . ja v a 2 s . c om msg.setReturnNullOnMissingChunk(true); // If the message contains strings that aren't stored // as Unicode, try to sort out an encoding for them if (msg.has7BitEncodingStrings()) { if (msg.getHeaders() != null) { // There's normally something in the headers msg.guess7BitEncoding(); } else { // Nothing in the header, try encoding detection // on the message body StringChunk text = msg.getMainChunks().textBodyChunk; if (text != null) { CharsetDetector detector = new CharsetDetector(); detector.setText(text.getRawValue()); CharsetMatch match = detector.detect(); if (match.getConfidence() > 35) { msg.set7BitEncoding(match.getName()); } } } } // Start with the metadata String subject = msg.getSubject(); String from = msg.getDisplayFrom(); metadata.set(TikaCoreProperties.CREATOR, from); metadata.set(Metadata.MESSAGE_FROM, from); metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo()); metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC()); metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC()); metadata.set(TikaCoreProperties.TITLE, subject); // TODO: Move to description in Tika 2.0 metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic()); try { for (String recipientAddress : msg.getRecipientEmailAddressList()) { if (recipientAddress != null) metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress); } } catch (ChunkNotFoundException he) { } // Will be fixed in POI 3.7 Final // Date - try two ways to find it // First try via the proper chunk if (msg.getMessageDate() != null) { metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime()); metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime()); } else { try { // Failing that try via the raw headers String[] headers = msg.getHeaders(); if (headers != null && headers.length > 0) { for (String header 
: headers) { if (header.toLowerCase().startsWith("date:")) { String date = header.substring(header.indexOf(':') + 1).trim(); // See if we can parse it as a normal mail date try { Date d = MboxParser.parseDate(date); metadata.set(TikaCoreProperties.CREATED, d); metadata.set(TikaCoreProperties.MODIFIED, d); } catch (ParseException e) { // Store it as-is, and hope for the best... metadata.set(TikaCoreProperties.CREATED, date); metadata.set(TikaCoreProperties.MODIFIED, date); } break; } } } } catch (ChunkNotFoundException he) { // We can't find the date, sorry... } } xhtml.element("h1", subject); // Output the from and to details in text, as you // often want them in text form for searching xhtml.startElement("dl"); if (from != null) { header(xhtml, "From", from); } header(xhtml, "To", msg.getDisplayTo()); header(xhtml, "Cc", msg.getDisplayCC()); header(xhtml, "Bcc", msg.getDisplayBCC()); try { header(xhtml, "Recipients", msg.getRecipientEmailAddress()); } catch (ChunkNotFoundException e) { } xhtml.endElement("dl"); // Get the message body. 
Preference order is: html, rtf, text Chunk htmlChunk = null; Chunk rtfChunk = null; Chunk textChunk = null; for (Chunk chunk : msg.getMainChunks().getChunks()) { if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) { htmlChunk = chunk; } if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) { rtfChunk = chunk; } if (chunk.getChunkId() == MAPIProperty.BODY.id) { textChunk = chunk; } } boolean doneBody = false; xhtml.startElement("div", "class", "message-body"); if (htmlChunk != null) { byte[] data = null; if (htmlChunk instanceof ByteChunk) { data = ((ByteChunk) htmlChunk).getValue(); } else if (htmlChunk instanceof StringChunk) { data = ((StringChunk) htmlChunk).getRawValue(); } if (data != null) { HtmlParser htmlParser = new HtmlParser(); htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), new ParseContext()); doneBody = true; } } if (rtfChunk != null && !doneBody) { ByteChunk chunk = (ByteChunk) rtfChunk; MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()); RTFParser rtfParser = new RTFParser(); rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), new ParseContext()); doneBody = true; } if (textChunk != null && !doneBody) { xhtml.element("p", ((StringChunk) textChunk).getValue()); } xhtml.endElement("div"); // Process the attachments for (AttachmentChunks attachment : msg.getAttachmentFiles()) { xhtml.startElement("div", "class", "attachment-entry"); String filename = null; if (attachment.attachLongFileName != null) { filename = attachment.attachLongFileName.getValue(); } else if (attachment.attachFileName != null) { filename = attachment.attachFileName.getValue(); } if (filename != null && filename.length() > 0) { xhtml.element("h1", filename); } if (attachment.attachData != null) { 
handleEmbeddedResource(TikaInputStream.get(attachment.attachData.getValue()), filename, null, null, xhtml, true); } if (attachment.attachmentDirectory != null) { handleEmbeddedOfficeDoc(attachment.attachmentDirectory.getDirectory(), xhtml); } xhtml.endElement("div"); } } catch (ChunkNotFoundException e) { throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e); } }
From source file:org.alfresco.repo.content.transform.MSGParser.java
License:Apache License
/** * Adapted extract multipart is parser that extracts the html body if exists, rtf body if exists * or at least plain text. The html or rtf file could be obtained as alternative. * * @param xhtml//from w ww . j a va 2 s.co m * the xhtml * @param msg * the message part * @param context * the context * @throws MessagingException * the messaging exception * @throws IOException * Signals that an I/O exception has occurred. * @throws SAXException * the sAX exception * @throws TikaException * the tika exception */ public void adaptedExtractMultipart(XHTMLContentHandler xhtml, MAPIMessage msg, ParseContext context) throws MessagingException, IOException, SAXException, TikaException { // Get the message body. Preference order is: html, rtf, text Chunk htmlChunk = null; Chunk rtfChunk = null; Chunk textChunk = null; for (Chunk chunk : msg.getMainChunks().getAll()) { if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) { htmlChunk = chunk; } if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) { rtfChunk = chunk; } if (chunk.getChunkId() == MAPIProperty.BODY.id) { textChunk = chunk; } } boolean doneBody = false; if (htmlChunk != null) { byte[] data = null; if (htmlChunk instanceof ByteChunk) { data = ((ByteChunk) htmlChunk).getValue(); } else if (htmlChunk instanceof StringChunk) { data = ((StringChunk) htmlChunk).getRawValue(); } File tempHtmlFile = new File(workingDirectory, System.currentTimeMillis() + ".html"); BufferedOutputStream rtfOutStream = new BufferedOutputStream(new FileOutputStream(tempHtmlFile)); byte[] preparedStringData = referencesCache.size() > 0 ? 
prepareHTMLString(new String(data)).getBytes() : data; IOUtils.copy(new ByteArrayInputStream(preparedStringData), rtfOutStream); IOUtils.closeQuietly(rtfOutStream); parsedContent.put(MimetypeMap.MIMETYPE_HTML, new Pair<File, String>(tempHtmlFile, encoding)); doneBody = true; } if (rtfChunk != null && !doneBody) { ByteChunk chunk = (ByteChunk) rtfChunk; MAPIProperty property = MAPIProperty.RTF_COMPRESSED; int type = Types.BINARY.getId(); byte[] data = chunk.getValue(); MAPIRtfAttribute rtf = new MAPIRtfAttribute(property, type, data); File tempRtfFile = new File(workingDirectory, System.currentTimeMillis() + ".rtf"); BufferedOutputStream rtfOutStream = new BufferedOutputStream(new FileOutputStream(tempRtfFile)); byte[] preparedStringData = referencesCache.size() > 0 ? prepareRTFString(new String(rtf.getData())).getBytes() : rtf.getData(); IOUtils.copy(new ByteArrayInputStream(preparedStringData), rtfOutStream); IOUtils.closeQuietly(rtfOutStream); parsedContent.put(MIMETYPE_RTF, new Pair<File, String>(tempRtfFile, encoding)); doneBody = true; } if (textChunk != null && !doneBody) { xhtml.element("p", ((StringChunk) textChunk).getValue()); } }
From source file:org.apache.tika.parser.microsoft.OutlookExtractor.java
License:Apache License
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException { try {/*from w w w. j ava 2 s . c o m*/ msg.setReturnNullOnMissingChunk(true); // If the message contains strings that aren't stored // as Unicode, try to sort out an encoding for them if (msg.has7BitEncodingStrings()) { guess7BitEncoding(msg); } // Start with the metadata String subject = msg.getSubject(); String from = msg.getDisplayFrom(); metadata.set(TikaCoreProperties.CREATOR, from); metadata.set(Metadata.MESSAGE_FROM, from); metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo()); metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC()); metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC()); metadata.set(TikaCoreProperties.TITLE, subject); // TODO: Move to description in Tika 2.0 metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic()); try { for (String recipientAddress : msg.getRecipientEmailAddressList()) { if (recipientAddress != null) metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress); } } catch (ChunkNotFoundException he) { } // Will be fixed in POI 3.7 Final // Date - try two ways to find it // First try via the proper chunk if (msg.getMessageDate() != null) { metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime()); metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime()); } else { try { // Failing that try via the raw headers String[] headers = msg.getHeaders(); if (headers != null && headers.length > 0) { for (String header : headers) { if (header.toLowerCase(Locale.ROOT).startsWith("date:")) { String date = header.substring(header.indexOf(':') + 1).trim(); // See if we can parse it as a normal mail date try { Date d = MboxParser.parseDate(date); metadata.set(TikaCoreProperties.CREATED, d); metadata.set(TikaCoreProperties.MODIFIED, d); } catch (ParseException e) { // Store it as-is, and hope for the best... 
metadata.set(TikaCoreProperties.CREATED, date); metadata.set(TikaCoreProperties.MODIFIED, date); } break; } } } } catch (ChunkNotFoundException he) { // We can't find the date, sorry... } } xhtml.element("h1", subject); // Output the from and to details in text, as you // often want them in text form for searching xhtml.startElement("dl"); if (from != null) { header(xhtml, "From", from); } header(xhtml, "To", msg.getDisplayTo()); header(xhtml, "Cc", msg.getDisplayCC()); header(xhtml, "Bcc", msg.getDisplayBCC()); try { header(xhtml, "Recipients", msg.getRecipientEmailAddress()); } catch (ChunkNotFoundException e) { } xhtml.endElement("dl"); // Get the message body. Preference order is: html, rtf, text Chunk htmlChunk = null; Chunk rtfChunk = null; Chunk textChunk = null; for (Chunk chunk : msg.getMainChunks().getChunks()) { if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) { htmlChunk = chunk; } if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) { rtfChunk = chunk; } if (chunk.getChunkId() == MAPIProperty.BODY.id) { textChunk = chunk; } } boolean doneBody = false; xhtml.startElement("div", "class", "message-body"); if (htmlChunk != null) { byte[] data = null; if (htmlChunk instanceof ByteChunk) { data = ((ByteChunk) htmlChunk).getValue(); } else if (htmlChunk instanceof StringChunk) { data = ((StringChunk) htmlChunk).getRawValue(); } if (data != null) { HtmlParser htmlParser = new HtmlParser(); htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), new ParseContext()); doneBody = true; } } if (rtfChunk != null && !doneBody) { ByteChunk chunk = (ByteChunk) rtfChunk; MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()); RTFParser rtfParser = new RTFParser(); rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), new ParseContext()); doneBody = true; } if 
(textChunk != null && !doneBody) { xhtml.element("p", ((StringChunk) textChunk).getValue()); } xhtml.endElement("div"); // Process the attachments for (AttachmentChunks attachment : msg.getAttachmentFiles()) { xhtml.startElement("div", "class", "attachment-entry"); String filename = null; if (attachment.attachLongFileName != null) { filename = attachment.attachLongFileName.getValue(); } else if (attachment.attachFileName != null) { filename = attachment.attachFileName.getValue(); } if (filename != null && filename.length() > 0) { xhtml.element("h1", filename); } if (attachment.attachData != null) { handleEmbeddedResource(TikaInputStream.get(attachment.attachData.getValue()), filename, null, null, xhtml, true); } if (attachment.attachmentDirectory != null) { handleEmbeddedOfficeDoc(attachment.attachmentDirectory.getDirectory(), xhtml); } xhtml.endElement("div"); } } catch (ChunkNotFoundException e) { throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e); } }