Examine a stream of text and make a judgement on what encoding type should be used for the text. - Java File Path IO

Java examples for File Path IO:File Operation

Description

Examine a stream of text and make a judgement on what encoding type should be used for the text.

Demo Code

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
//package com.java2s;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;

public class Main {
    /**
     * Maximum number of characters allowed on a single line, not counting the
     * line terminator, before the text can no longer be sent unencoded.
     * This is the line-length limit used by Internet mail (RFC 5322 / RFC 2045).
     */
    private static final int MAX_LINE_LENGTH = 998;

    /**
     * Examine a stream of text and decide which content transfer encoding
     * should be used for it.  Ideally the text can be sent as plain 7bit,
     * but it may need quoted-printable or base64 instead.  The choice is
     * based on whether any line is too long and on the ratio of acceptable
     * ASCII characters to characters that require encoding.
     *
     * <p>The stream is read one byte at a time until EOF.  CR and LF bytes
     * reset the line-length counter and are not included in either character
     * count.  The stream is not closed by this method.
     *
     * @param content     An input stream for the content we're examining.
     *
     * @return one of {@code "7bit"}, {@code "quoted-printable"} or
     *         {@code "base64"}.
     *
     * @throws IOException if reading from the stream fails.
     */
    public static String getTextTransferEncoding(InputStream content)
            throws IOException {

        // for efficiency, read through a buffer rather than hitting the
        // underlying stream once per byte.
        BufferedInputStream in = new BufferedInputStream(content, 4096);

        // long counters: an int would wrap on streams larger than 2 GiB and
        // corrupt both the long-line check and the ratio comparison.
        long span = 0; // characters seen since the last line break.
        boolean containsLongLines = false;
        long asciiChars = 0;
        long nonAsciiChars = 0;

        int ch;
        while ((ch = in.read()) != -1) {
            // either line-break character resets the line length counter; no
            // attempt is made to validate CRLF pairing.
            if (ch == '\n' || ch == '\r') {
                span = 0;
                continue;
            }

            span++;
            // the text has long lines, we can't transfer this as unencoded text.
            if (span > MAX_LINE_LENGTH) {
                containsLongLines = true;
            }

            if (isAscii(ch)) {
                asciiChars++;
            } else {
                nonAsciiChars++;
            }
        }

        return selectEncoding(asciiChars, nonAsciiChars, containsLongLines);
    }

    /**
     * Examine a string of text and decide which content transfer encoding
     * should be used for it.  Ideally the text can be sent as plain 7bit,
     * but it may need quoted-printable or base64 instead.  The choice is
     * based on whether any line is too long and on the ratio of acceptable
     * ASCII characters to characters that require encoding.
     *
     * <p>As with the stream variant, text made up only of acceptable ASCII
     * characters is still reported as quoted-printable when any line exceeds
     * the maximum line length, since such text cannot be sent unencoded.
     *
     * @param content     A string for the content we're examining.
     *
     * @return one of {@code "7bit"}, {@code "quoted-printable"} or
     *         {@code "base64"}.
     */
    public static String getTextTransferEncoding(String content) {

        int span = 0; // characters seen since the last line break.
        boolean containsLongLines = false;
        int asciiChars = 0;
        int nonAsciiChars = 0;

        for (int i = 0; i < content.length(); i++) {
            int ch = content.charAt(i);

            if (ch == '\n' || ch == '\r') {
                span = 0;
            } else if (++span > MAX_LINE_LENGTH) {
                containsLongLines = true;
            }

            // Unlike the stream variant, line breaks take part in the ratio
            // here (they count as ASCII); that long-standing behavior is kept.
            if (isAscii(ch)) {
                asciiChars++;
            } else {
                nonAsciiChars++;
            }
        }

        return selectEncoding(asciiChars, nonAsciiChars, containsLongLines);
    }

    /**
     * Pick the transfer encoding from the statistics gathered over the text.
     *
     * @param asciiChars        number of characters accepted by {@link #isAscii(int)}.
     * @param nonAsciiChars     number of characters rejected by {@link #isAscii(int)}.
     * @param containsLongLines true if any line exceeded {@link #MAX_LINE_LENGTH}.
     *
     * @return the name of the chosen encoding.
     */
    private static String selectEncoding(long asciiChars, long nonAsciiChars,
            boolean containsLongLines) {
        if (nonAsciiChars == 0) {
            // only valid characters: 7bit is ideal, unless long lines force
            // Q-P, which is only slightly longer but handles folding them.
            return containsLongLines ? "quoted-printable" : "7bit";
        }
        // mostly characters requiring encoding?  Base64 is the best bet;
        // otherwise Q-P will use fewer bytes than the full Base64.
        return nonAsciiChars > asciiChars ? "base64" : "quoted-printable";
    }

    /**
     * Test to see if this string contains only characters accepted by
     * {@link #isAscii(int)}.
     *
     * @param s      The test string.
     *
     * @return true if every character passes {@link #isAscii(int)} (also
     *         true for the empty string), false as soon as one does not.
     */
    public static boolean isAscii(String s) {
        for (int i = 0; i < s.length(); i++) {
            if (!isAscii(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Test to see if a given character can be considered "valid" ASCII.
     * Accepted are the characters in the range 32 to 126 inclusive, plus
     * the CR, LF and tab characters.  Everything else is rejected: the
     * remaining control characters below 32, DEL (127), and any value
     * above 127.
     *
     * @param ch     The test character.
     *
     * @return true if this character meets the "ascii-ness" criteria, false
     *         otherwise.
     */
    public static boolean isAscii(int ch) {
        // these control characters are explicitly considered valid.
        if (ch == '\r' || ch == '\n' || ch == '\t') {
            return true;
        }

        // otherwise only the range from space (32) up to, but excluding, DEL (127).
        return ch >= 32 && ch < 127;
    }
}

Related Tutorials