Example usage for org.apache.poi.util CodePageUtil codepageToEncoding

List of usage examples for org.apache.poi.util CodePageUtil codepageToEncoding

Introduction

On this page you can find example usage of org.apache.poi.util.CodePageUtil.codepageToEncoding.

Prototype

public static String codepageToEncoding(final int codepage, boolean javaLangFormat)
        throws UnsupportedEncodingException 

Source Link

Document

Turns a codepage number into the equivalent character encoding's name, in either Java NIO or Java Lang canonical naming.

Usage

From source file: org.apache.tika.parser.microsoft.OutlookExtractor (OutlookExtractor.java)

License:Apache License

/**
 * Tries to identify the correct encoding for 7-bit (non-unicode)
 *  strings in the file.
 * <p>Many messages store their strings as unicode, which is
 *  nice and easy. Some use one-byte encodings for their
 *  strings, but don't always store the encoding anywhere
 *  helpful in the file.</p>
 * <p>Candidate encodings are tried in the following order, stopping at
 *  the first one that {@code tryToSet7BitEncoding} accepts:</p>
 * <ol>
 *  <li>the MESSAGE_CODEPAGE / INTERNET_CPID codepage properties;</li>
 *  <li>a charset parameter on a Content-Type message header;</li>
 *  <li>the charset the encoding detector reports for the HTML body;</li>
 *  <li>a CharsetDetector run over the raw value of the text body chunk,
 *      accepted only with a confidence above 35.</li>
 * </ol>
 * <p>If none of these yields an accepted encoding, the message is left
 *  untouched.</p>
 * <p>Bug #49441 has more on why this is needed</p>
 * <p>This was originally taken from POI (TIKA-1238) as a temporary
 *  workaround to prevent unsupported encoding exceptions.</p>
 *
 * @param msg the message whose 7-bit string encoding should be guessed
 */
private void guess7BitEncoding(MAPIMessage msg) {
    Chunks mainChunks = msg.getMainChunks();
    //sanity check
    if (mainChunks == null) {
        return;
    }

    Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
    if (props != null) {
        // First choice is a codepage property
        for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE,
                MAPIProperty.INTERNET_CPID }) {
            List<PropertyValue> val = props.get(prop);
            if (val == null || val.isEmpty()) {
                continue;
            }
            PropertyValue first = val.get(0);
            if (!(first instanceof PropertyValue.LongPropertyValue)) {
                // Value is not of the expected type: skip this property
                // instead of failing the whole guess with a ClassCastException.
                continue;
            }
            int codepage = ((PropertyValue.LongPropertyValue) first).getValue();
            String encoding = null;
            try {
                encoding = CodePageUtil.codepageToEncoding(codepage, true);
            } catch (UnsupportedEncodingException ignored) {
                // No encoding name for this codepage; fall through to the next candidate.
            }
            if (encoding != null && tryToSet7BitEncoding(msg, encoding)) {
                return;
            }
        }
    }

    // Second choice is a charset on a content type header
    try {
        String[] headers = msg.getHeaders();
        if (headers != null && headers.length > 0) {
            // Look for a content type with a charset
            Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?",
                    Pattern.CASE_INSENSITIVE);
            final String headerName = "Content-Type";

            for (String header : headers) {
                // Header names are case-insensitive, so the cheap pre-filter must
                // agree with the CASE_INSENSITIVE pattern above.
                if (!header.regionMatches(true, 0, headerName, 0, headerName.length())) {
                    continue;
                }
                Matcher m = p.matcher(header);
                // lookingAt() anchors at the start of the header but, unlike matches(),
                // tolerates further parameters after the charset (e.g. "; format=flowed").
                if (m.lookingAt()) {
                    // Found it! Tell all the string chunks
                    String charset = m.group(1).trim();
                    if (!charset.isEmpty() && tryToSet7BitEncoding(msg, charset)) {
                        return;
                    }
                }
            }
        }
    } catch (ChunkNotFoundException ignored) {
        // Headers are not available for this message; try the next strategy.
    }

    // Nothing suitable in the headers, try HTML
    // TODO: do we need to replicate this in Tika? If we wind up
    // parsing the html version of the email, this is duplicative??
    // Or do we need to reset the header strings based on the html
    // meta header if there is no other information?
    try {
        String html = msg.getHtmlBody();
        if (html != null && html.length() > 0) {
            Charset charset = null;
            try {
                charset = detector.detect(new ByteArrayInputStream(html.getBytes(UTF_8)), EMPTY_METADATA);
            } catch (IOException ignored) {
                // Detection failed; treat as "no charset found" and move on.
            }
            if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
                return;
            }
        }
    } catch (ChunkNotFoundException ignored) {
        // HTML body is not available for this message; try the next strategy.
    }

    //absolute last resort, try charset detector
    StringChunk text = mainChunks.textBodyChunk;
    if (text != null) {
        // Named differently from the 'detector' field used above to avoid shadowing it.
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText(text.getRawValue());
        CharsetMatch match = charsetDetector.detect();
        if (match != null && match.getConfidence() > 35) {
            // Last candidate: there is nothing further to try whether or not it is accepted.
            tryToSet7BitEncoding(msg, match.getName());
        }
    }
}