Usage examples for org.apache.poi.hsmf.datatypes.StringChunk#getRawValue()
public byte[] getRawValue()
From source file:mj.ocraptor.extraction.tika.parser.microsoft.OutlookExtractor.java
License:Apache License
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException { try {/*from ww w. ja v a 2s.co m*/ msg.setReturnNullOnMissingChunk(true); // If the message contains strings that aren't stored // as Unicode, try to sort out an encoding for them if (msg.has7BitEncodingStrings()) { if (msg.getHeaders() != null) { // There's normally something in the headers msg.guess7BitEncoding(); } else { // Nothing in the header, try encoding detection // on the message body StringChunk text = msg.getMainChunks().textBodyChunk; if (text != null) { CharsetDetector detector = new CharsetDetector(); detector.setText(text.getRawValue()); CharsetMatch match = detector.detect(); if (match.getConfidence() > 35) { msg.set7BitEncoding(match.getName()); } } } } // Start with the metadata String subject = msg.getSubject(); String from = msg.getDisplayFrom(); metadata.set(TikaCoreProperties.CREATOR, from); metadata.set(Metadata.MESSAGE_FROM, from); metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo()); metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC()); metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC()); metadata.set(TikaCoreProperties.TITLE, subject); // TODO: Move to description in Tika 2.0 metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic()); try { for (String recipientAddress : msg.getRecipientEmailAddressList()) { if (recipientAddress != null) metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress); } } catch (ChunkNotFoundException he) { } // Will be fixed in POI 3.7 Final // Date - try two ways to find it // First try via the proper chunk if (msg.getMessageDate() != null) { metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime()); metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime()); } else { try { // Failing that try via the raw headers String[] headers = msg.getHeaders(); if (headers != null && headers.length > 0) { for (String header : 
headers) { if (header.toLowerCase().startsWith("date:")) { String date = header.substring(header.indexOf(':') + 1).trim(); // See if we can parse it as a normal mail date try { Date d = MboxParser.parseDate(date); metadata.set(TikaCoreProperties.CREATED, d); metadata.set(TikaCoreProperties.MODIFIED, d); } catch (ParseException e) { // Store it as-is, and hope for the best... metadata.set(TikaCoreProperties.CREATED, date); metadata.set(TikaCoreProperties.MODIFIED, date); } break; } } } } catch (ChunkNotFoundException he) { // We can't find the date, sorry... } } xhtml.element("h1", subject); // Output the from and to details in text, as you // often want them in text form for searching xhtml.startElement("dl"); if (from != null) { header(xhtml, "From", from); } header(xhtml, "To", msg.getDisplayTo()); header(xhtml, "Cc", msg.getDisplayCC()); header(xhtml, "Bcc", msg.getDisplayBCC()); try { header(xhtml, "Recipients", msg.getRecipientEmailAddress()); } catch (ChunkNotFoundException e) { } xhtml.endElement("dl"); // Get the message body. 
Preference order is: html, rtf, text Chunk htmlChunk = null; Chunk rtfChunk = null; Chunk textChunk = null; for (Chunk chunk : msg.getMainChunks().getChunks()) { if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) { htmlChunk = chunk; } if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) { rtfChunk = chunk; } if (chunk.getChunkId() == MAPIProperty.BODY.id) { textChunk = chunk; } } boolean doneBody = false; xhtml.startElement("div", "class", "message-body"); if (htmlChunk != null) { byte[] data = null; if (htmlChunk instanceof ByteChunk) { data = ((ByteChunk) htmlChunk).getValue(); } else if (htmlChunk instanceof StringChunk) { data = ((StringChunk) htmlChunk).getRawValue(); } if (data != null) { HtmlParser htmlParser = new HtmlParser(); htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), new ParseContext()); doneBody = true; } } if (rtfChunk != null && !doneBody) { ByteChunk chunk = (ByteChunk) rtfChunk; MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()); RTFParser rtfParser = new RTFParser(); rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), new ParseContext()); doneBody = true; } if (textChunk != null && !doneBody) { xhtml.element("p", ((StringChunk) textChunk).getValue()); } xhtml.endElement("div"); // Process the attachments for (AttachmentChunks attachment : msg.getAttachmentFiles()) { xhtml.startElement("div", "class", "attachment-entry"); String filename = null; if (attachment.attachLongFileName != null) { filename = attachment.attachLongFileName.getValue(); } else if (attachment.attachFileName != null) { filename = attachment.attachFileName.getValue(); } if (filename != null && filename.length() > 0) { xhtml.element("h1", filename); } if (attachment.attachData != null) { 
handleEmbeddedResource(TikaInputStream.get(attachment.attachData.getValue()), filename, null, null, xhtml, true); } if (attachment.attachmentDirectory != null) { handleEmbeddedOfficeDoc(attachment.attachmentDirectory.getDirectory(), xhtml); } xhtml.endElement("div"); } } catch (ChunkNotFoundException e) { throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e); } }
From source file:org.alfresco.repo.content.transform.MSGParser.java
License:Apache License
/** * Extracts properties and text from an Msg Document input stream. * * @param stream//w ww .j a va 2 s.c o m * the stream * @param handler * the handler * @param metadata * the metadata * @param context * the context * @throws IOException * Signals that an I/O exception has occurred. * @throws SAXException * the sAX exception * @throws TikaException * the tika exception */ @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); try { this.message = new MAPIMessage(new NPOIFSFileSystem(stream)); message.setReturnNullOnMissingChunk(true); // // If the message contains strings that aren't stored // // as Unicode, try to sort out an encoding for them if (message.has7BitEncodingStrings()) { if (message.getHeaders() != null) { // There's normally something in the headers message.guess7BitEncoding(); encoding = "utf-7"; } else { // Nothing in the header, try encoding detection // on the message body StringChunk text = message.getMainChunks().textBodyChunk; if (text != null) { CharsetDetector detector = new CharsetDetector(); detector.setText(text.getRawValue()); CharsetMatch match = detector.detect(); if (match.getConfidence() > 35) { message.set7BitEncoding(match.getName()); encoding = match.getName(); } } } } else { encoding = UTF_8; } processHeader(message, metadata, xhtml); // real work. adaptedExtractMultipart(xhtml, message, context); xhtml.endDocument(); } catch (Exception e) { throw new TikaException("Error while processing message", e); } }
From source file:org.alfresco.repo.content.transform.MSGParser.java
License:Apache License
/** * Process header./*from ww w.jav a2 s .c om*/ * * @param msg * the msg * @param metadata * the metadata * @param xhtml * the xhtml * @throws Exception * the exception */ private void processHeader(MAPIMessage msg, Metadata metadata, XHTMLContentHandler xhtml) throws Exception { StringChunk subjectChunk = msg.getMainChunks().subjectChunk; if (msg.has7BitEncodingStrings()) { CharsetDetector detector = new CharsetDetector(); detector.setText(subjectChunk.getRawValue()); CharsetMatch detect = detector.detect(); if (detect.getConfidence() >= 20) { subjectChunk.set7BitEncoding(detect.getName()); } } String subject = subjectChunk.getValue(); String from = msg.getDisplayFrom(); metadata.set(DublinCore.CREATOR, from); metadata.set(Metadata.MESSAGE_FROM, from); metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo()); metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC()); metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC()); metadata.set(DublinCore.TITLE, subject); metadata.set(DublinCore.SUBJECT, msg.getConversationTopic()); try { for (String recipientAddress : msg.getRecipientEmailAddressList()) { if (recipientAddress != null) metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress); } } catch (ChunkNotFoundException he) { } // Will be fixed in POI 3.7 Final // Date - try two ways to find it // First try via the proper chunk if (msg.getMessageDate() != null) { metadata.set(DublinCore.DATE, msg.getMessageDate().getTime()); metadata.set(Office.CREATION_DATE, msg.getMessageDate().getTime()); metadata.set(Office.SAVE_DATE, msg.getMessageDate().getTime()); } else { try { // Failing that try via the raw headers String[] headers = msg.getHeaders(); if (headers != null && headers.length > 0) { for (String header : headers) { if (header.toLowerCase().startsWith("date:")) { String date = header.substring(header.indexOf(':') + 1).trim(); // See if we can parse it as a normal mail date try { Date d = MboxParser.parseDate(date); metadata.set(DublinCore.DATE, d); 
metadata.set(Office.CREATION_DATE, d); metadata.set(Office.SAVE_DATE, d); } catch (ParseException e) { // Store it as-is, and hope for the best... metadata.set(DublinCore.DATE, date); metadata.set(Office.CREATION_DATE, date); metadata.set(Office.SAVE_DATE, date); } break; } } } } catch (ChunkNotFoundException he) { // We can't find the date, sorry... } } xhtml.element("h1", subject); // Output the from and to details in text, as you // often want them in text form for searching xhtml.startElement("dl"); if (from != null) { header(xhtml, "From", from); } header(xhtml, "To", msg.getDisplayTo()); header(xhtml, "Cc", msg.getDisplayCC()); header(xhtml, "Bcc", msg.getDisplayBCC()); try { header(xhtml, "Recipients", msg.getRecipientEmailAddress()); } catch (ChunkNotFoundException e) { } List<String> attachmentList = new ArrayList<String>(); // // prepare attachments prepareExtractMultipart(xhtml, message, attachmentList); if (attachmentList.size() > 0) { header(xhtml, "Attachments", attachmentList.toString()); } xhtml.endElement("dl"); }
From source file:org.apache.tika.parser.microsoft.OutlookExtractor.java
License:Apache License
/** * Tries to identify the correct encoding for 7-bit (non-unicode) * strings in the file./*from w w w . jav a 2 s . co m*/ * <p>Many messages store their strings as unicode, which is * nice and easy. Some use one-byte encodings for their * strings, but don't always store the encoding anywhere * helpful in the file.</p> * <p>This method checks for codepage properties, and failing that * looks at the headers for the message, and uses these to * guess the correct encoding for your file.</p> * <p>Bug #49441 has more on why this is needed</p> * <p>This is taken verbatim from POI (TIKA-1238) * as a temporary workaround to prevent unsupported encoding exceptions</p> */ private void guess7BitEncoding(MAPIMessage msg) { Chunks mainChunks = msg.getMainChunks(); //sanity check if (mainChunks == null) { return; } Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties(); if (props != null) { // First choice is a codepage property for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID }) { List<PropertyValue> val = props.get(prop); if (val != null && val.size() > 0) { int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue(); String encoding = null; try { encoding = CodePageUtil.codepageToEncoding(codepage, true); } catch (UnsupportedEncodingException e) { //swallow } if (tryToSet7BitEncoding(msg, encoding)) { return; } } } } // Second choice is a charset on a content type header try { String[] headers = msg.getHeaders(); if (headers != null && headers.length > 0) { // Look for a content type with a charset Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); for (String header : headers) { if (header.startsWith("Content-Type")) { Matcher m = p.matcher(header); if (m.matches()) { // Found it! 
Tell all the string chunks String charset = m.group(1); if (tryToSet7BitEncoding(msg, charset)) { return; } } } } } } catch (ChunkNotFoundException e) { } // Nothing suitable in the headers, try HTML // TODO: do we need to replicate this in Tika? If we wind up // parsing the html version of the email, this is duplicative?? // Or do we need to reset the header strings based on the html // meta header if there is no other information? try { String html = msg.getHtmlBody(); if (html != null && html.length() > 0) { Charset charset = null; try { charset = detector.detect(new ByteArrayInputStream(html.getBytes(UTF_8)), EMPTY_METADATA); } catch (IOException e) { //swallow } if (charset != null && tryToSet7BitEncoding(msg, charset.name())) { return; } } } catch (ChunkNotFoundException e) { } //absolute last resort, try charset detector StringChunk text = mainChunks.textBodyChunk; if (text != null) { CharsetDetector detector = new CharsetDetector(); detector.setText(text.getRawValue()); CharsetMatch match = detector.detect(); if (match != null && match.getConfidence() > 35 && tryToSet7BitEncoding(msg, match.getName())) { return; } } }