JavaScript String normalize()

Introduction

Some Unicode characters can be encoded in more than one way.

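For example, an accented letter can be represented either by a single precomposed code point or by a base letter followed by one or more combining marks. Such sequences look identical when rendered but are different strings when compared.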

JavaScript's String normalize() method returns a string containing the Unicode Normalization Form of the given string.

str.normalize([form])
  • form - Optional. One of "NFC", "NFD", "NFKC", or "NFKD".

The form specifies the Unicode Normalization Form.

It defaults to "NFC".

These values have the following meanings:

Form  Meaning
NFC   Canonical Decomposition, followed by Canonical Composition.
NFD   Canonical Decomposition.
NFKC  Compatibility Decomposition, followed by Canonical Composition.
NFKD  Compatibility Decomposition.

For example, consider the following:

U+00C5 is Latin capital letter A with ring above

console.log(String.fromCharCode(0x00C5)); // Å (a single precomposed code point)

U+212B (Angstrom sign) is rendered as the same letter.

console.log(String.fromCharCode(0x212B));         // Å (Angstrom sign, a different code point from U+00C5)

U+0041 (Latin capital letter A) followed by U+030A (combining ring above) is also rendered as the same letter.

console.log(String.fromCharCode(0x0041, 0x030A));  // renders as Å ("A" followed by a combining ring above)

// Three different UTF-16 sequences that all render as the same glyph.
const a1 = String.fromCharCode(0x00C5);         // precomposed letter
const a2 = String.fromCharCode(0x212B);         // Angstrom sign
const a3 = String.fromCharCode(0x0041, 0x030A); // "A" + combining ring above

console.log(a1, a2, a3); // all three render as Å

// === compares the underlying code units, so no two of them are equal.
console.log(a1 === a2); // false
console.log(a1 === a3); // false
console.log(a2 === a3); // false

We can determine if a string is already normalized by checking it against the return value of normalize():

// The same three renderings of the letter, built from different code points.
const a1 = String.fromCharCode(0x00C5);
const a2 = String.fromCharCode(0x212B);
const a3 = String.fromCharCode(0x0041, 0x030A);

// A string is already in a given form when normalize() returns an equal string.

// U+00C5 is the NFC/NFKC normalized form of U+212B
console.log(a1 === a1.normalize("NFD")); // false
console.log(a1 === a1.normalize("NFC")); // true
console.log(a1 === a1.normalize("NFKD")); // false
console.log(a1 === a1.normalize("NFKC")); // true

// U+212B is non-normalized
console.log(a2 === a2.normalize("NFD")); // false
console.log(a2 === a2.normalize("NFC")); // false
console.log(a2 === a2.normalize("NFKD")); // false
console.log(a2 === a2.normalize("NFKC")); // false

// U+0041/U+030A is the NFD/NFKD normalized form of U+212B
console.log(a3 === a3.normalize("NFD")); // true
console.log(a3 === a3.normalize("NFC")); // false
console.log(a3 === a3.normalize("NFKD")); // true
console.log(a3 === a3.normalize("NFKC")); // false

// normalize() returns a new string and leaves its receiver untouched, so
// a1, a2 and a3 still hold their original values here. Bringing both sides
// to the same normalization form makes them compare equal.
console.log(a1.normalize("NFD") === a2.normalize("NFD")); // true
console.log(a2.normalize("NFKC") === a3.normalize("NFKC")); // true
console.log(a1.normalize("NFC") === a3.normalize("NFC")); // true

Another example

// Two spellings of the same letter: one precomposed, one decomposed.
let string1 = '\u00F1';            // ñ as a single code point
let string2 = '\u006E\u0303';      // "n" followed by a combining tilde

console.log(string1 === string2); // false
console.log(string1.length);      // 1
console.log(string2.length);      // 2

// Canonical equivalence normalization: NFD yields the decomposed form.

string1 = string1.normalize('NFD');
string2 = string2.normalize('NFD');

console.log(string1 === string2); // true
console.log(string1.length);      // 2
console.log(string2.length);      // 2

// Specify "NFC" to get the composed canonical form:
// multiple code points are replaced with single code points where possible.

string1 = string1.normalize('NFC');
string2 = string2.normalize('NFC');

console.log(string1 === string2);                 // true
console.log(string1.length);                      // 1
console.log(string2.length);                      // 1
console.log(string2.codePointAt(0).toString(16)); // f1

// Compatibility normalization: for these strings NFKD gives the same
// decomposed result as NFD ("n" followed by the combining tilde).

string1 = string1.normalize('NFKD');
string2 = string2.normalize('NFKD');

console.log(string1);             // renders as ñ ("n" + combining tilde)
console.log(string2);             // renders as ñ ("n" + combining tilde)
console.log(string1 === string2); // true
console.log(string1.length);      // 2
console.log(string2.length);      // 2



PreviousNext

Related