org.sonar.scanner.scan.filesystem.ByteCharsetDetector.java Source code

Introduction

Here is the source code for org.sonar.scanner.scan.filesystem.ByteCharsetDetector.java
Source

/*
 * SonarQube
 * Copyright (C) 2009-2017 SonarSource SA
 * mailto:info AT sonarsource DOT com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */
package org.sonar.scanner.scan.filesystem;

import static java.nio.charset.StandardCharsets.UTF_16;
import static java.nio.charset.StandardCharsets.UTF_16LE;
import static java.nio.charset.StandardCharsets.UTF_16BE;
import static java.nio.charset.StandardCharsets.UTF_8;

import java.nio.charset.Charset;
import java.util.Arrays;

import javax.annotation.CheckForNull;

import org.apache.commons.io.ByteOrderMark;
import org.sonar.scanner.scan.filesystem.CharsetValidation.Result;
import org.sonar.scanner.scan.filesystem.CharsetValidation.Validation;

/**
 * Guesses the charset of a byte buffer by combining the checks offered by
 * {@link CharsetValidation} with the charset configured by the user, and
 * recognizes a leading byte order mark (BOM).
 */
public class ByteCharsetDetector {
    // Order matters: a BOM must be listed before any shorter BOM that is a prefix of it.
    // The UTF-16LE mark (FF FE) is a prefix of the UTF-32LE mark (FF FE 00 00), so the
    // UTF-32 entries have to come before the UTF-16 ones. detectBOM returns the first match.
    private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_32LE,
            ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE };

    private final Charset userConfiguration;
    private final CharsetValidation validator;

    /**
     * @param validator         performs the actual byte-level validation and decoding attempts
     * @param userConfiguration charset configured by the user, used as a fallback candidate
     */
    public ByteCharsetDetector(CharsetValidation validator, Charset userConfiguration) {
        this.validator = validator;
        this.userConfiguration = userConfiguration;
    }

    /**
     * Tries to determine the charset in which {@code buf} is encoded.
     *
     * @param buf bytes to inspect
     * @return the detected charset, or {@code null} if none of the candidates could be confirmed
     */
    @CheckForNull
    public Charset detect(byte[] buf) {
        // Try UTF-8 first since we are very confident in it if it's a yes.
        // Fail if we see nulls to not have false positives if the text is ASCII encoded in UTF-16.
        Result utf8Result = validator.isUTF8(buf, true);
        if (utf8Result.valid() == Validation.YES) {
            return utf8Result.charset();
        } else if (utf8Result.valid() == Validation.MAYBE) {
            // Inconclusive UTF-8 check: fall back to the user-configured charset if it decodes.
            return detectAscii(buf);
        }

        // Try UTF-16 with both endiannesses. Fail if we see nulls to not have false positives if it's UTF-32.
        Result utf16 = validator.isUTF16(buf, true);
        if (utf16.valid() == Validation.YES && validator.isValidUTF16(buf, UTF_16LE.equals(utf16.charset()))) {
            return utf16.charset();
        }

        // At this point we know it can't be UTF-8, so never return a UTF-8 user configuration.
        // A UTF-16 user configuration is only accepted if the UTF-16 check was inconclusive.
        Charset c = userConfiguration;
        if (!UTF_8.equals(c) && (!isUtf16(c) || utf16.valid() == Validation.MAYBE) && validator.tryDecode(buf, c)) {
            return c;
        }

        return null;
    }

    /**
     * Fallback used when the UTF-8 check is inconclusive: accepts the user-configured charset
     * if it is not a UTF-16/UTF-32 variant and the buffer can be decoded with it.
     *
     * @return the user-configured charset, or {@code null} if it was rejected
     */
    @CheckForNull
    private Charset detectAscii(byte[] buf) {
        if (!isUtf16Or32(userConfiguration) && validator.tryDecode(buf, userConfiguration)) {
            return userConfiguration;
        }

        return null;
    }

    /** Returns whether {@code charset} is UTF-16, UTF-16BE or UTF-16LE. */
    private static boolean isUtf16(Charset charset) {
        return UTF_16.equals(charset) || UTF_16BE.equals(charset) || UTF_16LE.equals(charset);
    }

    /** Returns whether {@code charset} is one of the UTF-16 variants, UTF-32BE or UTF-32LE. */
    private static boolean isUtf16Or32(Charset charset) {
        return isUtf16(charset) || MetadataGenerator.UTF_32BE.equals(charset)
                || MetadataGenerator.UTF_32LE.equals(charset);
    }

    /**
     * Looks for a known byte order mark at the start of {@code buffer}.
     *
     * @param buffer bytes to inspect
     * @return the first entry of the known BOM list that {@code buffer} starts with,
     *         or {@code null} if there is none
     */
    @CheckForNull
    public ByteOrderMark detectBOM(byte[] buffer) {
        // findFirst, not findAny: the result must follow the declaration order of BOMS,
        // and findAny is free to return any matching element.
        return Arrays.stream(BOMS).filter(b -> isBom(b, buffer)).findFirst().orElse(null);
    }

    /** Returns whether {@code buffer} starts with all the bytes of {@code bom}. */
    private static boolean isBom(ByteOrderMark bom, byte[] buffer) {
        if (buffer.length < bom.length()) {
            return false;
        }
        for (int i = 0; i < bom.length(); i++) {
            if ((byte) bom.get(i) != buffer[i]) {
                return false;
            }
        }
        return true;
    }

}