Java ByteBuffer Set getCharsetFromDocument(ByteBuffer bb)

Here you can find the source of getCharsetFromDocument(ByteBuffer bb)

Description

Get charset from a document.

License

Open Source License

Parameter

Parameter Description
bb The document to search

Return

The charset, if found. Otherwise null.

Declaration

private static String getCharsetFromDocument(ByteBuffer bb) 

Method Source Code


//package com.java2s;
/*   _______ __ __                    _______                    __
 *  |     __|__|  |.--.--.-----.----.|_     _|.----.-----.--.--.|  |_
 *  |__     |  |  ||  |  |  -__|   _|  |   |  |   _|  _  |  |  ||   _|
 *  |_______|__|__| \___/|_____|__|    |___|  |__| |_____|_____||____|
 *
 *  Copyright 2008 - Gustav Tiger, Henrik Steen and Gustav "Gussoh" Sohtell
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

import java.nio.ByteBuffer;
import java.nio.charset.Charset;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Main {
    /**
     * Charset used to turn the raw document bytes into text for scanning.
     * ISO-8859-1 maps every byte to exactly one char, so decoding cannot fail
     * regardless of the document's real encoding.
     */
    private static final Charset FALLBACK_CHARSET = Charset.forName("iso-8859-1");

    /** Matches a meta tag; group 1 is everything between the tag name and the closing bracket. */
    private static final Pattern META_PATTERN = Pattern.compile("(?i)<meta\\s([^>]*)>");

    /**
     * Matches one attribute. Group 1 is the name; the value is in group 2
     * (double-quoted), group 3 (single-quoted) or group 4 (unquoted). Each
     * quoted form must be closed by the same quote that opened it, so a value
     * may contain the other kind of quote.
     */
    private static final Pattern ATTRIBUTE_PATTERN = Pattern.compile(
            "(?i)([a-z\\-]+)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))");

    /**
     * Get charset from a document.
     *
     * This function searches the document for meta tags and parses their
     * attributes. A charset is taken from the first meta tag that carries
     * either a non-empty charset attribute, or a content-type http-equiv
     * whose content has a charset parameter. Meta tags that declare no
     * charset are skipped and the search continues.
     *
     * The bytes from index 0 up to the buffer's limit are examined, whatever
     * the current position is. The buffer's position, limit and mark are left
     * untouched, and read-only or direct buffers are supported.
     *
     * @param bb  The document to search
     *
     * @return    The charset, if found. Otherwise null (also for a null buffer).
     *
     */
    private static String getCharsetFromDocument(ByteBuffer bb) {
        if (bb == null) {
            return null;
        }

        // Work on a duplicate so the caller's position/limit are not disturbed.
        // Copying through get() instead of array() honours the limit and any
        // array offset, and also works when there is no accessible backing array.
        ByteBuffer view = bb.duplicate();
        view.rewind();
        byte[] bytes = new byte[view.remaining()];
        view.get(bytes);

        Matcher metaMatcher = META_PATTERN.matcher(new String(bytes, FALLBACK_CHARSET));
        while (metaMatcher.find()) {
            String httpEquiv = null;
            String content = null;
            String charsetAttribute = null;

            Matcher attributeMatcher = ATTRIBUTE_PATTERN.matcher(metaMatcher.group(1));
            while (attributeMatcher.find()) {
                String name = attributeMatcher.group(1);
                String value = attributeValue(attributeMatcher);
                if (name.equalsIgnoreCase("http-equiv")) {
                    httpEquiv = value;
                } else if (name.equalsIgnoreCase("content")) {
                    content = value;
                } else if (name.equalsIgnoreCase("charset")) {
                    charsetAttribute = value;
                }
            }

            // <meta charset="...">
            if (charsetAttribute != null) {
                String charset = charsetAttribute.trim();
                if (!charset.isEmpty()) {
                    return charset;
                }
            }

            // <meta http-equiv="Content-Type" content="...; charset=...">
            if (httpEquiv != null && content != null
                    && httpEquiv.trim().equalsIgnoreCase("Content-Type")) {
                String charset = getCharset(content);
                if (charset != null) {
                    return charset;
                }
            }
        }
        return null;
    }

    /**
     * Returns the value captured by the current attribute match, whichever
     * quoting form it used.
     */
    private static String attributeValue(Matcher attributeMatcher) {
        if (attributeMatcher.group(2) != null) {
            return attributeMatcher.group(2);
        }
        if (attributeMatcher.group(3) != null) {
            return attributeMatcher.group(3);
        }
        return attributeMatcher.group(4);
    }

    /**
     * Extracts a charset from a Content-Type.
     *
     * The part before the first semicolon is treated as the media type and
     * skipped; the remaining parameters are searched for a charset key.
     * Surrounding double or single quotes are removed from the value.
     *
     * @param  contentType  The Content-Type to extract from
     * @return              The extracted charset. Null if no charset was found
     *                      or if it was empty.
     *
     */
    private static String getCharset(String contentType) {
        if (contentType == null) {
            return null;
        }

        String[] parameters = contentType.split(";");
        for (int i = 1; i < parameters.length; i++) {
            String parameter = parameters[i];
            int split = parameter.indexOf('=');
            if (split == -1) {
                continue;
            }

            String key = parameter.substring(0, split).trim();
            if (!key.equalsIgnoreCase("charset")) {
                continue;
            }

            String charset = stripQuotes(parameter.substring(split + 1).trim());
            if (!charset.isEmpty()) {
                return charset;
            }
        }
        return null;
    }

    /**
     * Removes a leading quote and, when present, the matching trailing quote.
     * A value that does not start with a quote is returned unchanged.
     */
    private static String stripQuotes(String value) {
        if (value.isEmpty()) {
            return value;
        }
        char quote = value.charAt(0);
        if (quote != '"' && quote != '\'') {
            return value;
        }
        int end = value.length();
        // Only drop the last char if it really is the closing quote; a lone or
        // unterminated quote must not cost a character of the charset name.
        if (end > 1 && value.charAt(end - 1) == quote) {
            end--;
        }
        return value.substring(1, end).trim();
    }
}

Related

  1. find(ByteBuffer buffer, int offset, byte searchKey)
  2. findCommonPrefix(ByteBuffer buffer, int offsetLeft, int offsetRight, int limit)
  3. from(ByteBuffer buffer, int offset)
  4. fromListToSetByteArray(List list)
  5. get(ByteBuffer srcBuffer, byte[] dstBytes, int dstOffset, int length)
  6. getEquals(ByteBuffer buf, String s, String charsetName)
  7. getSignedInt(ByteBuffer buffer, int offset)
  8. hash_murmur3_128(ByteBuffer buf, int offset, int size, int i, byte[] result)
  9. isFree(int frameIx, int offset, ByteBuffer[] frames)