Example usage for java.text Normalizer isNormalized

List of usage examples for java.text Normalizer isNormalized

Introduction

On this page you can find example usage for java.text Normalizer isNormalized.

Prototype

public static boolean isNormalized(CharSequence src, Form form) 

Document

Determines if the given sequence of char values is normalized.
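Before looking at the real-world examples below, here is a minimal, self-contained sketch of the typical pattern: check with isNormalized first and call Normalizer.normalize only when the check fails. The strings and class name are illustrative and not taken from the projects listed under Usage.

import java.text.Normalizer;

public class IsNormalizedExample {
    public static void main(String[] args) {
        // "é" as a single precomposed code point (U+00E9) -- already in NFC form.
        String composed = "caf\u00E9";
        // "é" as 'e' (U+0065) followed by a combining acute accent (U+0301) -- NFD form.
        String decomposed = "cafe\u0301";

        System.out.println(Normalizer.isNormalized(composed, Normalizer.Form.NFC));   // true
        System.out.println(Normalizer.isNormalized(decomposed, Normalizer.Form.NFC)); // false

        // Typical pattern: normalize only when the check fails.
        String result = Normalizer.isNormalized(decomposed, Normalizer.Form.NFC)
                ? decomposed
                : Normalizer.normalize(decomposed, Normalizer.Form.NFC);
        System.out.println(composed.equals(result)); // true
    }
}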

Usage

From source file:org.w3.i18n.ParsedDocument.java

public ParsedDocument(DocumentResource documentResource) {
    if (documentResource == null) {
        throw new NullPointerException("documentResource: " + documentResource);
    }
    // Prepare resources:
    this.documentResource = documentResource;
    // TODO: Currently a blocking operation.
    byte[] documentBodyBytes;
    try {
        documentBodyBytes = IOUtils.toByteArray(documentResource.getBody());
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }

    this.byteOrderMark = documentBodyBytes.length <= 5 ? null
            : Utils.findByteOrderMark(Arrays.copyOf(documentBodyBytes, 4));

    this.documentBody = new String(documentBodyBytes,
            byteOrderMark == null ? Charset.forName("UTF-8") : Charset.forName(byteOrderMark.getCharsetName()));

    // Use the HTML parser on the document body.
    this.source = new Source(documentBody);
    this.source.fullSequentialParse();

    // Process resources to find information.

    // Pattern for finding charset in "http-equiv=Content-Type" meta tags.
    Pattern contentTypeCharsetPattern = Pattern.compile("charset\\s*=\\s*([^\\s;]+)");

    // Document Type Declaration (DOCTYPE).
    // NB: In a valid document there is only one DTD.
    StartTag doctypeStartTag = source.getFirstStartTag(StartTagType.DOCTYPE_DECLARATION);
    this.doctypeTag = doctypeStartTag == null ? null : doctypeStartTag.toString().trim();
    this.doctypeClassification = classifyDoctype(doctypeTag);

    // XML Declaration ('?xml' tag at the start of the document).
    StartTag xmlStartTag = source.getFirstStartTag(StartTagType.XML_DECLARATION);
    if (xmlStartTag != null) {
        this.xmlDeclaration = xmlStartTag.toString().trim();
        Matcher charsetMatcher = Pattern.compile("encoding\\s*=\\s*'?\"?\\s*([^\"'\\s\\?>]+)")
                .matcher(this.xmlDeclaration);
        this.charsetXmlDeclaration = charsetMatcher.find() ? charsetMatcher.group(1) : null;
    } else {
        this.xmlDeclaration = null;
        this.charsetXmlDeclaration = null;
    }

    // Find all charset declarations in meta tags.
    this.charsetMetaTags = new TreeMap<>();
    this.charsetMetaTagsOutside1024 = new ArrayList<>();
    List<Element> metaElements = source.getAllElements("meta");
    for (Element metaElement : metaElements) {
        String charset = null;
        // Look for a "<meta charset="..." >" tag.
        if (metaElement.getAttributeValue("charset") != null) {
            charset = metaElement.getAttributeValue("charset").trim();
        } else {
            // Look for a "<meta http-equiv="Content-Type" ... >" tag.
            String httpEquiv = metaElement.getAttributeValue("http-equiv");
            String content = metaElement.getAttributeValue("content");
            if (httpEquiv != null && content != null && httpEquiv.equalsIgnoreCase("Content-Type")) {
                Matcher m = contentTypeCharsetPattern.matcher(content);
                if (m.find()) {
                    charset = m.group(1);
                }
            }
        }
        // If a charset declaration was found, add this tag to the list.
        if (charset != null) {
            charset = charset.trim();
            if (!charset.isEmpty()) {
                String tag = metaElement.getStartTag().toString().trim();
                if (!charsetMetaTags.containsKey(charset)) {
                    charsetMetaTags.put(charset, new ArrayList<String>());
                }
                charsetMetaTags.get(charset).add(tag);
                if (metaElement.getEnd() > 1024) {
                    charsetMetaTagsOutside1024.add(metaElement.getStartTag().toString());
                }
            }
        }
    }

    // Find the 'Content-Type' HTTP response header and process it.
    this.contentType = documentResource.getHeader("Content-Type");
    if (contentType != null
            /* TODO: DEBUG! This is a workaround for passing tests that
             * don't detect a bug in the old checker. See: 
             * http://qa-dev.w3.org/i18n-checker-test/check.php?uri=http%3A%
             * 2F%2Fwww.w3.org%2FInternational%2Ftests%2Fi18n-checker%2Fgene
             * rate%3Ftest%3D24%26format%3Dhtml%26serveas%3Dhtml
             * ~~~ Joe (Joseph.J.Short@gmail.com) */
            && !contentType.equals("text/html;; charset=UTF-8")) {
        Matcher m = contentTypeCharsetPattern.matcher(contentType);
        this.charsetHttp = m.find() ? m.group(1) : null;
        this.servedAsXml = contentType.contains("application/xhtml+xml");
    } else {
        this.charsetHttp = null;
        this.servedAsXml = false;
    }

    // Find the opening 'html' tag and look for some choice attributes.
    Element htmlElement = source.getFirstElement("html");
    if (htmlElement != null) {
        this.openingHtmlTag = htmlElement.getStartTag().toString();
        this.openingHtmlTagLang = htmlElement.getAttributeValue("lang");
        this.openingHtmlTagXmlLang = htmlElement.getAttributeValue("xml:lang");
        this.defaultDir = htmlElement.getAttributeValue("dir");
    } else {
        this.openingHtmlTag = null;
        this.openingHtmlTagLang = null;
        this.openingHtmlTagXmlLang = null;
        this.defaultDir = null;
    }

    // Find the 'Content-Language' HTTP response header.
    this.contentLanguage = documentResource.getHeader("Content-Language");

    // Find a 'meta' tag with 'http-equiv="Content-Language"'.
    /* TODO: Change this to a similar structure that the charset meta tags
     * are stored in. */
    {
        int i = 0;
        String langMetaS = null;
        while (langMetaS == null && i < metaElements.size()) {
            if (metaElements.get(i).getAttributeValue("http-equiv") != null && metaElements.get(i)
                    .getAttributeValue("http-equiv").equalsIgnoreCase("Content-Language")) {
                // NB: langMetaS will still be null if there is no content.
                langMetaS = metaElements.get(i).getAttributeValue("content");
            }
            i++;
        }
        this.langMeta = langMetaS;
    }

    // Find class and id names that are non-ASCII or non-NFC.
    this.allNonNfcClassIdNames = new TreeSet<>();
    this.allNonNfcClassIdTags = new ArrayList<>();
    Set<Element> nonNfcClassIdNamesElements = new LinkedHashSet<>();
    CharsetEncoder usAsciiEncoder = Charset.forName("US-ASCII").newEncoder();
    for (Element element : source.getAllElements()) {
        Set<String> names = new TreeSet<>();
        String classAttr = element.getAttributeValue("class");
        String idAttr = element.getAttributeValue("id");
        if (classAttr != null) {
            for (String className : classAttr.split(" ")) {
                if (!className.isEmpty()) {
                    names.add(className);
                }
            }
        }
        if (idAttr != null) {
            String id = idAttr.trim();
            if (!id.isEmpty()) {
                names.add(id);
            }
        }
        boolean nonNfcAscii = false;
        for (String name : names) {
            // If non-ASCII or non-NFC (Unicode normalisation):
            if (!usAsciiEncoder.canEncode(name)
                    || !Normalizer.isNormalized(name, Normalizer.Form.NFC)) {
                nonNfcAscii = true;
                allNonNfcClassIdNames.add(name);
            }
        }
        if (nonNfcAscii) {
            nonNfcClassIdNamesElements.add(element);
        }
    }
    for (Element element : nonNfcClassIdNamesElements) {
        this.allNonNfcClassIdTags.add(element.getStartTag().toString());
    }

    // Find any BOMs in the content.
    this.bomsInContent = new ArrayList<>();
    for (int i = 1; i < documentBodyBytes.length - 5; i++) {
        ByteOrderMark bom = Utils.findByteOrderMark(Arrays.copyOfRange(documentBodyBytes, i, i + 5));
        if (bom != null) {
            // Add a context of 15 characters either side to the list.
            int startofContext = Math.max(0, i - 15);
            int endOfContext = Math.min(documentBodyBytes.length - 1, i + 20);
            try {
                /* The context will look something like:
                 * " ... comes the BOM /???/. Ok, test that. ... "
                 * 
                 *  A BOM encoded in US-ASCII looks something like "???"
                 * (depending on the number of code points it uses). */
                bomsInContent.add((startofContext == 0 ? "\"" : "\" ... ")
                        + new String(Arrays.copyOfRange(documentBodyBytes, startofContext, endOfContext),
                                "US-ASCII").replaceAll("\\s+", " ")
                        + (endOfContext == documentBodyBytes.length - 1 ? "\"" : " ... \""));
            } catch (UnsupportedEncodingException ex) {
                throw new RuntimeException(ex);
            }
            i += 2;
        }
    }

    // Use the BOM to determine whether the document is in UTF-16.
    // NB: This is behaviour copied across from the old project.
    if (byteOrderMark == null) {
        this.utf16 = false;
    } else {
        this.utf16 = byteOrderMark.getCharsetName().toUpperCase().matches(".*UTF-16.*");
    }

    // Find all 'a' and 'link' tags with a 'charset' attribute.
    this.charsetLinkTags = new ArrayList<>();
    for (Element element : source.getAllElements()) {
        if ((element.getName().toLowerCase().equals("a") || element.getName().toLowerCase().equals("link"))
                && element.getAttributeValue("charset") != null) {
            this.charsetLinkTags.add(element.getStartTag().toString().trim());
        }
    }

    // Find 'bdo' tags without 'dir' attributes.
    this.bdoTagsWithoutDir = new ArrayList<>();
    for (Element element : source.getAllElements("bdo")) {
        if (element.getAttributeValue("dir") == null) {
            bdoTagsWithoutDir.add(element.getStartTag().toString().trim());
        }
    }

    // Find all 'b' and 'i' tags without a class name.
    this.bITagsWithoutClass = new ArrayList<>();
    for (Element element : source.getAllElements()) {
        if ((element.getName().toLowerCase().equals("b") || element.getName().toLowerCase().equals("i"))) {
            String classAttr = element.getAttributeValue("class");
            if (classAttr == null || classAttr.trim().isEmpty()) {
                String context = element.toString();
                if (context.length() > 15) {
                    context = context.substring(0, 14) + " ... ";
                }
                bITagsWithoutClass.add("\"" + context + "\"");
            }
        }
    }

    // Make aggregates of charset declarations.
    this.allCharsetDeclarations = new TreeSet<>();
    this.inDocCharsetDeclarations = new TreeSet<>();
    if (this.charsetHttp != null) {
        String d = this.charsetHttp.trim().toUpperCase();
        this.allCharsetDeclarations.add(d);
    }
    if (this.byteOrderMark != null) {
        String d = this.byteOrderMark.getCharsetName().toUpperCase();
        this.allCharsetDeclarations.add(d);
        this.inDocCharsetDeclarations.add(d);
    }
    if (this.charsetXmlDeclaration != null) {
        String d = this.charsetXmlDeclaration.trim().toUpperCase();
        this.allCharsetDeclarations.add(d);
        this.inDocCharsetDeclarations.add(d);
    }
    for (String charset : charsetMetaTags.keySet()) {
        this.allCharsetDeclarations.add(charset.toUpperCase());
        this.inDocCharsetDeclarations.add(charset.toUpperCase());
    }
    this.nonUtf8CharsetDeclarations = new TreeSet<>();
    for (String charsetDeclaration : this.allCharsetDeclarations) {
        if (!charsetDeclaration.equalsIgnoreCase("UTF-8")) {
            nonUtf8CharsetDeclarations.add(charsetDeclaration);
        }
    }

    // Make aggregates of language declarations.
    this.allConflictingLangAttributes = new ArrayList<>();
    this.allLangAttributes = new TreeSet<>();
    this.allXmlLangAttributes = new TreeSet<>();
    this.allLangAttributeTags = new ArrayList<>();
    this.allXmlLangAttributeTags = new ArrayList<>();
    for (Element element : source.getAllElements()) {
        String langAttr = element.getAttributeValue("lang");
        String xmlLangAttr = element.getAttributeValue("xml:lang");
        String lang = null;
        String xmlLang = null;
        String tag = element.getStartTag().toString().trim();
        if (langAttr != null) {
            lang = langAttr.trim();
            if (!lang.isEmpty()) {
                allLangAttributes.add(lang);
                allLangAttributeTags.add(tag);
            }
        }
        if (xmlLangAttr != null) {
            xmlLang = xmlLangAttr.trim();
            if (!xmlLang.isEmpty()) {
                allXmlLangAttributes.add(xmlLang);
                allXmlLangAttributeTags.add(tag);
            }
        }
        if (lang != null && xmlLang != null && !lang.equals(xmlLang)) {
            this.allConflictingLangAttributes.add(Arrays.asList(lang, xmlLang, tag));
        }
    }

    // Find all values of dir attributes.
    this.allDirAttributes = new TreeSet<>();
    for (Element element : source.getAllElements()) {
        if (element.getAttributeValue("dir") != null) {
            allDirAttributes.add(element.getAttributeValue("dir"));
        }
    }
}

From source file:org.jets3t.service.utils.FileComparer.java

/**
 * Normalize string into "Normalization Form Canonical Decomposition" (NFD).
 *
 * References:
 * http://stackoverflow.com/questions/3610013
 * http://en.wikipedia.org/wiki/Unicode_equivalence
 *
 * @param str string to normalize
 * @return string normalized into NFD form.
 */
protected String normalizeUnicode(String str) {
    Normalizer.Form form = Normalizer.Form.NFD;
    if (!Normalizer.isNormalized(str, form)) {
        return Normalizer.normalize(str, form);
    }
    return str;
}

From source file:org.opensextant.util.TextUtils.java

/**
 * Normalize to "Normalization Form Canonical Decomposition" (NFD). REF:
 * http://stackoverflow.com/questions/3610013/file-listfiles-mangles-unicode-names-with-jdk-6-unicode-normalization-issues
 * This supports proper file name retrieval from the file system, among
 * other things. In many situations we see Unicode file names -- Java can
 * list them, but when using the Java-provided version of the filename the
 * OS/FS may not be able to find the file by the name given in a particular
 * normalized form.
 *
 * @param str
 *            text
 * @return normalized string in NFD form
 */
public static String normalizeUnicode(String str) {
    Normalizer.Form form = Normalizer.Form.NFD;
    if (!Normalizer.isNormalized(str, form)) {
        return Normalizer.normalize(str, form);
    }
    return str;
}
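As a quick illustration of the NFD check used above (a minimal sketch, not taken from the OpenSextant sources), the snippet below shows isNormalized rejecting a precomposed file name and accepting it again after normalization.

import java.text.Normalizer;

public class NfdCheckDemo {
    public static void main(String[] args) {
        // Precomposed "é" (U+00E9): this string is in NFC form, not NFD.
        String fileName = "r\u00E9sum\u00E9.txt";

        System.out.println(Normalizer.isNormalized(fileName, Normalizer.Form.NFD)); // false

        String nfd = Normalizer.normalize(fileName, Normalizer.Form.NFD);
        System.out.println(Normalizer.isNormalized(nfd, Normalizer.Form.NFD)); // true

        // The two forms compare unequal as raw strings, which is why file
        // lookups can fail when normalization forms are mixed.
        System.out.println(fileName.equals(nfd)); // false
    }
}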

From source file:info.ajaxplorer.synchro.SyncJob.java

protected String normalizeUnicode(String str) {
    Normalizer.Form form = Normalizer.Form.NFD;
    if (!Normalizer.isNormalized(str, form)) {
        return Normalizer.normalize(str, form);
    }
    return str;
}

From source file:de.innovationgate.utils.WGUtils.java

/**
 * Performs a Unicode normalization to NFC form (java.text.Normalizer.Form.NFC) for the given input.
 * @param input The input string
 * @return the normalized value, or the original value if it is already in NFC form
 */
public static String normalizeUnicode(String input) {
    if (input != null && !Normalizer.isNormalized(input, Normalizer.Form.NFC)) {
        return Normalizer.normalize(input, Normalizer.Form.NFC);
    }
    return input;
}