List of usage examples for org.apache.poi.hsmf.datatypes MAPIProperty MESSAGE_CODEPAGE
MAPIProperty MESSAGE_CODEPAGE
To view the source code for org.apache.poi.hsmf.datatypes MAPIProperty MESSAGE_CODEPAGE, click Source Link.
From source file: org.apache.tika.parser.microsoft.OutlookExtractor.java
License:Apache License
/** * Tries to identify the correct encoding for 7-bit (non-unicode) * strings in the file./*from w w w.ja va 2s .c om*/ * <p>Many messages store their strings as unicode, which is * nice and easy. Some use one-byte encodings for their * strings, but don't always store the encoding anywhere * helpful in the file.</p> * <p>This method checks for codepage properties, and failing that * looks at the headers for the message, and uses these to * guess the correct encoding for your file.</p> * <p>Bug #49441 has more on why this is needed</p> * <p>This is taken verbatim from POI (TIKA-1238) * as a temporary workaround to prevent unsupported encoding exceptions</p> */ private void guess7BitEncoding(MAPIMessage msg) { Chunks mainChunks = msg.getMainChunks(); //sanity check if (mainChunks == null) { return; } Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties(); if (props != null) { // First choice is a codepage property for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID }) { List<PropertyValue> val = props.get(prop); if (val != null && val.size() > 0) { int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue(); String encoding = null; try { encoding = CodePageUtil.codepageToEncoding(codepage, true); } catch (UnsupportedEncodingException e) { //swallow } if (tryToSet7BitEncoding(msg, encoding)) { return; } } } } // Second choice is a charset on a content type header try { String[] headers = msg.getHeaders(); if (headers != null && headers.length > 0) { // Look for a content type with a charset Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); for (String header : headers) { if (header.startsWith("Content-Type")) { Matcher m = p.matcher(header); if (m.matches()) { // Found it! 
Tell all the string chunks String charset = m.group(1); if (tryToSet7BitEncoding(msg, charset)) { return; } } } } } } catch (ChunkNotFoundException e) { } // Nothing suitable in the headers, try HTML // TODO: do we need to replicate this in Tika? If we wind up // parsing the html version of the email, this is duplicative?? // Or do we need to reset the header strings based on the html // meta header if there is no other information? try { String html = msg.getHtmlBody(); if (html != null && html.length() > 0) { Charset charset = null; try { charset = detector.detect(new ByteArrayInputStream(html.getBytes(UTF_8)), EMPTY_METADATA); } catch (IOException e) { //swallow } if (charset != null && tryToSet7BitEncoding(msg, charset.name())) { return; } } } catch (ChunkNotFoundException e) { } //absolute last resort, try charset detector StringChunk text = mainChunks.textBodyChunk; if (text != null) { CharsetDetector detector = new CharsetDetector(); detector.setText(text.getRawValue()); CharsetMatch match = detector.detect(); if (match != null && match.getConfidence() > 35 && tryToSet7BitEncoding(msg, match.getName())) { return; } } }