Determining validity of characters outside basic 7-bit range of Unicode, for XML 1.0 : XMLEncoder « XML « Java

Determining the validity of XML name characters outside the basic 7-bit (ASCII) range of Unicode, for XML 1.0 and XML 1.1

// Revised from ctc wstx

 * Simple utility class that encapsulates logic of determining validity
 * of characters outside basic 7-bit range of Unicode, for XML 1.0
public final class XmlChars
    /* We don't need full 64k bits... (0x80 - 0x312C) / 32. But to
     * simplify things, let's just include first 0x80 entries in there etc
    final static int SIZE = (0x3140 >> 5); // 32 bits per int

    final static int[] sXml10StartChars = new int[SIZE];
    static {
        SETBITS(sXml10StartChars, 0xC0, 0xD6);
        SETBITS(sXml10StartChars, 0xD8, 0xF6);
        SETBITS(sXml10StartChars, 0xF8, 0xFF);
        SETBITS(sXml10StartChars, 0x100, 0x131);
        SETBITS(sXml10StartChars, 0x134, 0x13e);
        SETBITS(sXml10StartChars, 0x141, 0x148);
        SETBITS(sXml10StartChars, 0x14a, 0x17e);
        SETBITS(sXml10StartChars, 0x180, 0x1c3);
        SETBITS(sXml10StartChars, 0x1cd, 0x1f0);
        SETBITS(sXml10StartChars, 0x1f4, 0x1f5);
        SETBITS(sXml10StartChars, 0x1fa, 0x217);
        SETBITS(sXml10StartChars, 0x250, 0x2a8);
        SETBITS(sXml10StartChars, 0x2bb, 0x2c1);
        SETBITS(sXml10StartChars, 0x386);
        SETBITS(sXml10StartChars, 0x388, 0x38a);
        SETBITS(sXml10StartChars, 0x38c);
        SETBITS(sXml10StartChars, 0x38e, 0x3a1);
        SETBITS(sXml10StartChars, 0x3a3, 0x3ce);
        SETBITS(sXml10StartChars, 0x3d0, 0x3d6);
        SETBITS(sXml10StartChars, 0x3da);
        SETBITS(sXml10StartChars, 0x3dc);
        SETBITS(sXml10StartChars, 0x3de);
        SETBITS(sXml10StartChars, 0x3e0);
        SETBITS(sXml10StartChars, 0x3e2, 0x3f3);
        SETBITS(sXml10StartChars, 0x401, 0x40c);
        SETBITS(sXml10StartChars, 0x40e, 0x44f);
        SETBITS(sXml10StartChars, 0x451, 0x45c);
        SETBITS(sXml10StartChars, 0x45e, 0x481);
        SETBITS(sXml10StartChars, 0x490, 0x4c4);
        SETBITS(sXml10StartChars, 0x4c7, 0x4c8);
        SETBITS(sXml10StartChars, 0x4cb, 0x4cc);
        SETBITS(sXml10StartChars, 0x4d0, 0x4eb);
        SETBITS(sXml10StartChars, 0x4ee, 0x4f5);
        SETBITS(sXml10StartChars, 0x4f8, 0x4f9);

        SETBITS(sXml10StartChars, 0x531, 0x556);
        SETBITS(sXml10StartChars, 0x559);
        SETBITS(sXml10StartChars, 0x561, 0x586);
        SETBITS(sXml10StartChars, 0x5d0, 0x5ea);
        SETBITS(sXml10StartChars, 0x5f0, 0x5f2);
        SETBITS(sXml10StartChars, 0x621, 0x63a);
        SETBITS(sXml10StartChars, 0x641, 0x64a);
        SETBITS(sXml10StartChars, 0x671, 0x6b7);
        SETBITS(sXml10StartChars, 0x6ba, 0x6be);
        SETBITS(sXml10StartChars, 0x6c0, 0x6ce);
        SETBITS(sXml10StartChars, 0x6d0, 0x6d3);
        SETBITS(sXml10StartChars, 0x6d5);

        SETBITS(sXml10StartChars, 0x6e5, 0x6e6);
        SETBITS(sXml10StartChars, 0x905, 0x939);
        SETBITS(sXml10StartChars, 0x93d);
        SETBITS(sXml10StartChars, 0x958, 0x961);
        SETBITS(sXml10StartChars, 0x985, 0x98c);
        SETBITS(sXml10StartChars, 0x98f, 0x990);
        SETBITS(sXml10StartChars, 0x993, 0x9a8);
        SETBITS(sXml10StartChars, 0x9aa, 0x9b0);
        SETBITS(sXml10StartChars, 0x9b2);
        SETBITS(sXml10StartChars, 0x9b6, 0x9b9);
        SETBITS(sXml10StartChars, 0x9dc);
        SETBITS(sXml10StartChars, 0x9dd);
        SETBITS(sXml10StartChars, 0x9df, 0x9e1);
        SETBITS(sXml10StartChars, 0x9f0); SETBITS(sXml10StartChars, 0x9f1);
        SETBITS(sXml10StartChars, 0xA05, 0xA0A);
        SETBITS(sXml10StartChars, 0xA0F); SETBITS(sXml10StartChars, 0xA10);
        SETBITS(sXml10StartChars, 0xA13, 0xA28);
        SETBITS(sXml10StartChars, 0xA2A, 0xA30);
        SETBITS(sXml10StartChars, 0xA32); SETBITS(sXml10StartChars, 0xA33);
        SETBITS(sXml10StartChars, 0xA35); SETBITS(sXml10StartChars, 0xA36);
        SETBITS(sXml10StartChars, 0xA38); SETBITS(sXml10StartChars, 0xA39);
        SETBITS(sXml10StartChars, 0xA59, 0xA5C);
        SETBITS(sXml10StartChars, 0xA5E);
        SETBITS(sXml10StartChars, 0xA72, 0xA74);
        SETBITS(sXml10StartChars, 0xA85, 0xA8B);
        SETBITS(sXml10StartChars, 0xA8D);
        SETBITS(sXml10StartChars, 0xA8F, 0xA91);
        SETBITS(sXml10StartChars, 0xA93, 0xAA8);
        SETBITS(sXml10StartChars, 0xAAA, 0xAB0);
        SETBITS(sXml10StartChars, 0xAB2, 0xAB3);
        SETBITS(sXml10StartChars, 0xAB5, 0xAB9);
        SETBITS(sXml10StartChars, 0xABD);
        SETBITS(sXml10StartChars, 0xAE0);
        SETBITS(sXml10StartChars, 0xB05, 0xB0C);
        SETBITS(sXml10StartChars, 0xB0F); SETBITS(sXml10StartChars, 0xB10);
        SETBITS(sXml10StartChars, 0xB13, 0xB28);

        SETBITS(sXml10StartChars, 0xB2A, 0xB30);
        SETBITS(sXml10StartChars, 0xB32); SETBITS(sXml10StartChars, 0xB33);
        SETBITS(sXml10StartChars, 0xB36, 0xB39);
        SETBITS(sXml10StartChars, 0xB3D);
        SETBITS(sXml10StartChars, 0xB5C); SETBITS(sXml10StartChars, 0xB5D);
        SETBITS(sXml10StartChars, 0xB5F, 0xB61);
        SETBITS(sXml10StartChars, 0xB85, 0xB8A);
        SETBITS(sXml10StartChars, 0xB8E, 0xB90);

        SETBITS(sXml10StartChars, 0xB92, 0xB95);
        SETBITS(sXml10StartChars, 0xB99, 0xB9A);
        SETBITS(sXml10StartChars, 0xB9C);
        SETBITS(sXml10StartChars, 0xB9E); SETBITS(sXml10StartChars, 0xB9F);
        SETBITS(sXml10StartChars, 0xBA3); SETBITS(sXml10StartChars, 0xBA4);
        SETBITS(sXml10StartChars, 0xBA8, 0xBAA);
        SETBITS(sXml10StartChars, 0xBAE, 0xBB5);
        SETBITS(sXml10StartChars, 0xBB7, 0xBB9);
        SETBITS(sXml10StartChars, 0xC05, 0xC0C);
        SETBITS(sXml10StartChars, 0xC0E, 0xC10);

        SETBITS(sXml10StartChars, 0xC12, 0xC28);
        SETBITS(sXml10StartChars, 0xC2A, 0xC33);
        SETBITS(sXml10StartChars, 0xC35, 0xC39);
        SETBITS(sXml10StartChars, 0xC60); SETBITS(sXml10StartChars, 0xC61);
        SETBITS(sXml10StartChars, 0xC85, 0xC8C);
        SETBITS(sXml10StartChars, 0xC8E, 0xC90);
        SETBITS(sXml10StartChars, 0xC92, 0xCA8);
        SETBITS(sXml10StartChars, 0xCAA, 0xCB3);
        SETBITS(sXml10StartChars, 0xCB5, 0xCB9);
        SETBITS(sXml10StartChars, 0xCDE);
        SETBITS(sXml10StartChars, 0xCE0); SETBITS(sXml10StartChars, 0xCE1);
        SETBITS(sXml10StartChars, 0xD05, 0xD0C);
        SETBITS(sXml10StartChars, 0xD0E, 0xD10);
        SETBITS(sXml10StartChars, 0xD12, 0xD28);
        SETBITS(sXml10StartChars, 0xD2A, 0xD39);
        SETBITS(sXml10StartChars, 0xD60); SETBITS(sXml10StartChars, 0xD61);
        SETBITS(sXml10StartChars, 0xE01, 0xE2E);
        SETBITS(sXml10StartChars, 0xE30);
        SETBITS(sXml10StartChars, 0xE32); SETBITS(sXml10StartChars, 0xE33);
        SETBITS(sXml10StartChars, 0xE40, 0xE45);
        SETBITS(sXml10StartChars, 0xE81); SETBITS(sXml10StartChars, 0xE82);
        SETBITS(sXml10StartChars, 0xE84);
        SETBITS(sXml10StartChars, 0xE87); SETBITS(sXml10StartChars, 0xE88);
        SETBITS(sXml10StartChars, 0xE8A); SETBITS(sXml10StartChars, 0xE8D);
        SETBITS(sXml10StartChars, 0xE94, 0xE97);
        SETBITS(sXml10StartChars, 0xE99, 0xE9F);
        SETBITS(sXml10StartChars, 0xEA1, 0xEA3);
        SETBITS(sXml10StartChars, 0xEA5); SETBITS(sXml10StartChars, 0xEA7);
        SETBITS(sXml10StartChars, 0xEAA); SETBITS(sXml10StartChars, 0xEAB);
        SETBITS(sXml10StartChars, 0xEAD); SETBITS(sXml10StartChars, 0xEAE);
        SETBITS(sXml10StartChars, 0xEB0);
        SETBITS(sXml10StartChars, 0xEB2); SETBITS(sXml10StartChars, 0xEB3);
        SETBITS(sXml10StartChars, 0xEBD);

        SETBITS(sXml10StartChars, 0xEC0, 0xEC4);
        SETBITS(sXml10StartChars, 0xF40, 0xF47);
        SETBITS(sXml10StartChars, 0xF49, 0xF69);
        SETBITS(sXml10StartChars, 0x10a0, 0x10c5);
        SETBITS(sXml10StartChars, 0x10d0, 0x10f6);
        SETBITS(sXml10StartChars, 0x1100);
        SETBITS(sXml10StartChars, 0x1102, 0x1103);
        SETBITS(sXml10StartChars, 0x1105, 0x1107);
        SETBITS(sXml10StartChars, 0x1109);
        SETBITS(sXml10StartChars, 0x110b, 0x110c);
        SETBITS(sXml10StartChars, 0x110e, 0x1112);
        SETBITS(sXml10StartChars, 0x113c);
        SETBITS(sXml10StartChars, 0x113e);
        SETBITS(sXml10StartChars, 0x1140);
        SETBITS(sXml10StartChars, 0x114c);
        SETBITS(sXml10StartChars, 0x114e);
        SETBITS(sXml10StartChars, 0x1150);
        SETBITS(sXml10StartChars, 0x1154, 0x1155);
        SETBITS(sXml10StartChars, 0x1159);
        SETBITS(sXml10StartChars, 0x115f, 0x1161);
        SETBITS(sXml10StartChars, 0x1163);
        SETBITS(sXml10StartChars, 0x1165);
        SETBITS(sXml10StartChars, 0x1167);
        SETBITS(sXml10StartChars, 0x1169);
        SETBITS(sXml10StartChars, 0x116d, 0x116e);
        SETBITS(sXml10StartChars, 0x1172, 0x1173);
        SETBITS(sXml10StartChars, 0x1175);
        SETBITS(sXml10StartChars, 0x119e);
        SETBITS(sXml10StartChars, 0x11a8);
        SETBITS(sXml10StartChars, 0x11ab);
        SETBITS(sXml10StartChars, 0x11ae, 0x11af);
        SETBITS(sXml10StartChars, 0x11b7, 0x11b8);
        SETBITS(sXml10StartChars, 0x11ba);
        SETBITS(sXml10StartChars, 0x11bc, 0x11c2);
        SETBITS(sXml10StartChars, 0x11eb);
        SETBITS(sXml10StartChars, 0x11f0);
        SETBITS(sXml10StartChars, 0x11f9);
        SETBITS(sXml10StartChars, 0x1e00, 0x1e9b);
        SETBITS(sXml10StartChars, 0x1ea0, 0x1ef9);
        SETBITS(sXml10StartChars, 0x1f00, 0x1f15);
        SETBITS(sXml10StartChars, 0x1f18, 0x1f1d);
        SETBITS(sXml10StartChars, 0x1f20, 0x1f45);
        SETBITS(sXml10StartChars, 0x1f48, 0x1f4d);
        SETBITS(sXml10StartChars, 0x1f50, 0x1f57);
        SETBITS(sXml10StartChars, 0x1f59);
        SETBITS(sXml10StartChars, 0x1f5b);
        SETBITS(sXml10StartChars, 0x1f5d);
        SETBITS(sXml10StartChars, 0x1f5f, 0x1f7d);
        SETBITS(sXml10StartChars, 0x1f80, 0x1fb4);
        SETBITS(sXml10StartChars, 0x1fb6, 0x1fbc);
        SETBITS(sXml10StartChars, 0x1fbe);
        SETBITS(sXml10StartChars, 0x1fc2, 0x1fc4);
        SETBITS(sXml10StartChars, 0x1fc6, 0x1fcc);
        SETBITS(sXml10StartChars, 0x1fd0, 0x1fd3);
        SETBITS(sXml10StartChars, 0x1fd6, 0x1fdb);
        SETBITS(sXml10StartChars, 0x1fe0, 0x1fec);
        SETBITS(sXml10StartChars, 0x1ff2, 0x1ff4);
        SETBITS(sXml10StartChars, 0x1ff6, 0x1ffc);
        SETBITS(sXml10StartChars, 0x2126);
        SETBITS(sXml10StartChars, 0x212a, 0x212b);
        SETBITS(sXml10StartChars, 0x212e);
        SETBITS(sXml10StartChars, 0x2180, 0x2182);
        SETBITS(sXml10StartChars, 0x3041, 0x3094);
        SETBITS(sXml10StartChars, 0x30a1, 0x30fa);
        SETBITS(sXml10StartChars, 0x3105, 0x312c);
        // note: AC00 - D7A3 handled separately

        // [86] Ideographic (but note: > 0x312c handled separately)
        SETBITS(sXml10StartChars, 0x3007);
        SETBITS(sXml10StartChars, 0x3021, 0x3029);

    final static int[] sXml10Chars = new int[SIZE];
    static {
        // Let's start with all valid start chars:
        System.arraycopy(sXml10StartChars, 0, sXml10Chars, 0, SIZE);

        // [87] CombiningChar    ::=
        SETBITS(sXml10Chars, 0x300, 0x345);
        SETBITS(sXml10Chars, 0x360, 0x361);
        SETBITS(sXml10Chars, 0x483, 0x486);
        SETBITS(sXml10Chars, 0x591, 0x5a1);
        SETBITS(sXml10Chars, 0x5a3, 0x5b9);
        SETBITS(sXml10Chars, 0x5bb, 0x5bd);
        SETBITS(sXml10Chars, 0x5bf);

        SETBITS(sXml10Chars, 0x5c1, 0x5c2);
        SETBITS(sXml10Chars, 0x5c4);
        SETBITS(sXml10Chars, 0x64b, 0x652);
        SETBITS(sXml10Chars, 0x670);
        SETBITS(sXml10Chars, 0x6d6, 0x6dc);
        SETBITS(sXml10Chars, 0x6dd, 0x6df);
        SETBITS(sXml10Chars, 0x6e0, 0x6e4);
        SETBITS(sXml10Chars, 0x6e7, 0x6e8);
        SETBITS(sXml10Chars, 0x6ea, 0x6ed);

        SETBITS(sXml10Chars, 0x901, 0x903);
        SETBITS(sXml10Chars, 0x93c);
        SETBITS(sXml10Chars, 0x93e, 0x94c);
        SETBITS(sXml10Chars, 0x94d);
        SETBITS(sXml10Chars, 0x951, 0x954);
        SETBITS(sXml10Chars, 0x962); SETBITS(sXml10Chars, 0x963);
        SETBITS(sXml10Chars, 0x981, 0x983);
        SETBITS(sXml10Chars, 0x9bc);
        SETBITS(sXml10Chars, 0x9be); SETBITS(sXml10Chars, 0x9bf);
        SETBITS(sXml10Chars, 0x9c0, 0x9c4);
        SETBITS(sXml10Chars, 0x9c7); SETBITS(sXml10Chars, 0x9c8);
        SETBITS(sXml10Chars, 0x9cb, 0x9cd);
        SETBITS(sXml10Chars, 0x9d7);
        SETBITS(sXml10Chars, 0x9e2); SETBITS(sXml10Chars, 0x9e3);
        SETBITS(sXml10Chars, 0xA02);
        SETBITS(sXml10Chars, 0xA3C);
        SETBITS(sXml10Chars, 0xA3E); SETBITS(sXml10Chars, 0xA3F);
        SETBITS(sXml10Chars, 0xA40, 0xA42);
        SETBITS(sXml10Chars, 0xA47); SETBITS(sXml10Chars, 0xA48);
        SETBITS(sXml10Chars, 0xA4B, 0xA4D);
        SETBITS(sXml10Chars, 0xA70); SETBITS(sXml10Chars, 0xA71);
        SETBITS(sXml10Chars, 0xA81, 0xA83);
        SETBITS(sXml10Chars, 0xABC);
        SETBITS(sXml10Chars, 0xABE, 0xAC5);
        SETBITS(sXml10Chars, 0xAC7, 0xAC9);
        SETBITS(sXml10Chars, 0xACB, 0xACD);
        SETBITS(sXml10Chars, 0xB01, 0xB03);
        SETBITS(sXml10Chars, 0xB3C);
        SETBITS(sXml10Chars, 0xB3E, 0xB43);
        SETBITS(sXml10Chars, 0xB47); SETBITS(sXml10Chars, 0xB48);
        SETBITS(sXml10Chars, 0xB4B, 0xB4D);
        SETBITS(sXml10Chars, 0xB56); SETBITS(sXml10Chars, 0xB57);
        SETBITS(sXml10Chars, 0xB82); SETBITS(sXml10Chars, 0xB83);
        SETBITS(sXml10Chars, 0xBBE, 0xBC2);
        SETBITS(sXml10Chars, 0xBC6, 0xBC8);
        SETBITS(sXml10Chars, 0xBCA, 0xBCD);
        SETBITS(sXml10Chars, 0xBD7);
        SETBITS(sXml10Chars, 0xC01, 0xC03);
        SETBITS(sXml10Chars, 0xC3E, 0xC44);
        SETBITS(sXml10Chars, 0xC46, 0xC48);
        SETBITS(sXml10Chars, 0xC4A, 0xC4D);
        SETBITS(sXml10Chars, 0xC55, 0xC56);
        SETBITS(sXml10Chars, 0xC82, 0xC83);
        SETBITS(sXml10Chars, 0xCBE, 0xCC4);
        SETBITS(sXml10Chars, 0xCC6, 0xCC8);
        SETBITS(sXml10Chars, 0xCCA, 0xCCD);
        SETBITS(sXml10Chars, 0xCD5, 0xCD6);
        SETBITS(sXml10Chars, 0xD02, 0xD03);
        SETBITS(sXml10Chars, 0xD3E, 0xD43);
        SETBITS(sXml10Chars, 0xD46, 0xD48);
        SETBITS(sXml10Chars, 0xD4A, 0xD4D);
        SETBITS(sXml10Chars, 0xD57);
        SETBITS(sXml10Chars, 0xE31);
        SETBITS(sXml10Chars, 0xE34, 0xE3A);
        SETBITS(sXml10Chars, 0xE47, 0xE4E);
        SETBITS(sXml10Chars, 0xEB1);
        SETBITS(sXml10Chars, 0xEB4, 0xEB9);
        SETBITS(sXml10Chars, 0xEBB, 0xEBC);
        SETBITS(sXml10Chars, 0xEC8, 0xECD);
        SETBITS(sXml10Chars, 0xF18, 0xF19);
        SETBITS(sXml10Chars, 0xF35); SETBITS(sXml10Chars, 0xF37);
        SETBITS(sXml10Chars, 0xF39);
        SETBITS(sXml10Chars, 0xF3E); SETBITS(sXml10Chars, 0xF3F);
        SETBITS(sXml10Chars, 0xF71, 0xF84);
        SETBITS(sXml10Chars, 0xF86, 0xF8B);
        SETBITS(sXml10Chars, 0xF90, 0xF95);
        SETBITS(sXml10Chars, 0xF97);
        SETBITS(sXml10Chars, 0xF99, 0xFAD);
        SETBITS(sXml10Chars, 0xFB1, 0xFB7);
        SETBITS(sXml10Chars, 0xFB9);
        SETBITS(sXml10Chars, 0x20D0, 0x20DC);
        SETBITS(sXml10Chars, 0x20E1);
        SETBITS(sXml10Chars, 0x302A, 0x302F);
        SETBITS(sXml10Chars, 0x3099); SETBITS(sXml10Chars, 0x309A);
        // [88] Digit:
        SETBITS(sXml10Chars, 0x660, 0x669);
        SETBITS(sXml10Chars, 0x6f0, 0x6f9);
        SETBITS(sXml10Chars, 0x966, 0x96f);
        SETBITS(sXml10Chars, 0x9e6, 0x9ef);
        SETBITS(sXml10Chars, 0xa66, 0xa6f);
        SETBITS(sXml10Chars, 0xae6, 0xaef);
        SETBITS(sXml10Chars, 0xb66, 0xb6f);
        SETBITS(sXml10Chars, 0xbe7, 0xbef);
        SETBITS(sXml10Chars, 0xc66, 0xc6f);
        SETBITS(sXml10Chars, 0xce6, 0xcef);
        SETBITS(sXml10Chars, 0xd66, 0xd6f);
        SETBITS(sXml10Chars, 0xe50, 0xe59);
        SETBITS(sXml10Chars, 0xed0, 0xed9);
        SETBITS(sXml10Chars, 0xf20, 0xf29);
        // [89] Extender:
        SETBITS(sXml10Chars, 0xb7);
        SETBITS(sXml10Chars, 0x2d0);
        SETBITS(sXml10Chars, 0x2d1);
        SETBITS(sXml10Chars, 0x387);
        SETBITS(sXml10Chars, 0x640);
        SETBITS(sXml10Chars, 0xE46);
        SETBITS(sXml10Chars, 0xEC6);
        SETBITS(sXml10Chars, 0x3005);
        SETBITS(sXml10Chars, 0x3031, 0x3035);
        SETBITS(sXml10Chars, 0x309d, 0x309e);
        SETBITS(sXml10Chars, 0x30fc, 0x30fe);
    private XmlChars() { }

    public final static boolean is10NameStartChar(char c)
        // First, let's deal with outliers
        if (c > 0x312C) { // Most valid chars are below this..
            if (c < 0xAC00) {
                return (c >= 0x4E00 && c <= 0x9FA5); // valid ideograms
            if (c <= 0xD7A3) { // 0xAC00 - 0xD7A3, valid base chars
                return true;
            /* As to surrogate pairs... let's do the bare minimum;
             * 0xD800 - 0xDBFF (high surrogate) are ok; low surrogates
             * can only follow high one
            return (c <= 0xDBFF && c >= 0xD800);
        // but then we'll just need to use the table...
        int ix = (int) c;
        return (sXml10StartChars[ix >> 5] & (1 << (ix & 31))) != 0;

    public final static boolean is10NameChar(char c)
        // First, let's deal with outliers
        if (c > 0x312C) { // Most valid chars are below this..
            if (c < 0xAC00) {
                return (c >= 0x4E00 && c <= 0x9FA5); // valid ideograms
            if (c <= 0xD7A3) { // 0xAC00 - 0xD7A3, valid base chars
                return true;
            /* As to surrogate pairs... let's do the bare minimum;
             * 0xD800 - 0xDFFF (high, low surrogate) are ok (need to
             * check pairing in future)
            return (c >= 0xD800 && c <= 0xDFFF);
        // but then we'll just need to use the table...
        int ix = (int) c;
        return (sXml10Chars[ix >> 5] & (1 << (ix & 31))) != 0;

    public final static boolean is11NameStartChar(char c)
        // Others are checked block-by-block:
        if (c <= 0x2FEF) {
            if (c < 0x300) {
                if (c < 0x00C0) { // 8-bit ctrl chars
                    return false;
                // most of the rest are fine...
                return (c != 0xD7 && c != 0xF7);
            if (c >= 0x2C00) {
                // 0x2C00 - 0x2FEF are ok
                return true;
            if (c < 0x370 || c > 0x218F) {
                // 0x300 - 0x36F, 0x2190 - 0x2BFF invalid
                return false;
            if (c < 0x2000) {
                // 0x370 - 0x37D, 0x37F - 0x1FFF are ok
                return (c != 0x37E);
            if (c >= 0x2070) {
                // 0x2070 - 0x218F are ok
                return (c <= 0x218F);
            // And finally, 0x200C - 0x200D
            return (c == 0x200C || c == 0x200D);

        // 0x3000 and above:
        if (c >= 0x3001) {
            /* Hmmh, let's allow high surrogates here, without checking
             * that they are properly followed... crude basic support,
             * I know, but allows valid combinations, just doesn't catch
             * invalid ones
            if (c <= 0xDBFF) { // 0x3001 - 0xD7FF (chars),
                // 0xD800 - 0xDBFF (high surrogate) are ok (unlike DC00-DFFF)
                return true;
            if (c >= 0xF900 && c <= 0xFFFD) {
                /* Check above removes low surrogate (since one can not
                 * START an identifier), and byte-order markers..
                return (c <= 0xFDCF || c >= 0xFDF0);

        return false;

    public final static boolean is11NameChar(char c)
        // Others are checked block-by-block:
        if (c <= 0x2FEF) {
            if (c < 0x2000) { // only 8-bit ctrl chars and 0x37E to filter out
                return (c >= 0x00C0 && c != 0x37E) || (c == 0xB7);
            if (c >= 0x2C00) {
                // 0x100 - 0x1FFF, 0x2C00 - 0x2FEF are ok
                return true;
            if (c < 0x200C || c > 0x218F) {
                // 0x2000 - 0x200B, 0x2190 - 0x2BFF invalid
                return false;
            if (c >= 0x2070) {
                // 0x2070 - 0x218F are ok
                return true;
            // And finally, 0x200C - 0x200D, 0x203F - 0x2040 are ok
            return (c == 0x200C || c == 0x200D
                || c == 0x203F || c == 0x2040);

        // 0x3000 and above:
        if (c >= 0x3001) {
            /* Hmmh, let's allow surrogate heres, without checking that
             * they have proper ordering. For non-first name chars, both are
             * ok, for valid names. Crude basic support,
             * I know, but allows valid combinations, just doesn't catch
             * invalid ones
            if (c <= 0xDFFF) { // 0x3001 - 0xD7FF (chars),
                // 0xD800 - 0xDFFF (high, low surrogate) are ok:
                return true;
            if (c >= 0xF900 && c <= 0xFFFD) {
                /* Check above removes other invalid chars (below valid
                 * range), and byte-order markers (0xFFFE, 0xFFFF).
                return (c <= 0xFDCF || c >= 0xFDF0);

        return false;

    private static void SETBITS(int[] array, int start, int end)
        int bit1 = (start & 31);
        int bit2 = (end & 31);
        start >>= 5;
        end >>= 5;

        /* Ok; this is not perfectly optimal, but should be good enough...
         * we'll only do one-by-one at the ends.
        if (start == end) {
            for (; bit1 <= bit2; ++bit1) {
                array[start] |= (1 << bit1);
        } else {
            for (int bit = bit1; bit <= 31; ++bit) {
                array[start] |= (1 << bit);
            while (++start < end) {
                array[start] = -1;
            for (int bit = 0; bit <= bit2; ++bit) {
                array[end] |= (1 << bit);

    private static void SETBITS(int[] array, int point) {
        int ix = (point >> 5);
        int bit = (point & 31);

        array[ix] |= (1 << bit);


Related examples in the same category

1.XMLEncoder a bean
2.Returns true if the argument, a UCS-4 character code, is valid in XML documents.
3.XML-related tasks and Readers: IETF standard encoding names, automatic detection of most XML encodings
4.Xml Encoding Sniffer
5.Returns true if the character is an XML "letter"
6.XML character properties
7.Encode Xml Attribute
8.Escape / unescape special chars according XML specifications
9.Verify whether the specified character conforms to the XML 1.0 definition of whitespace
10.Returns true if the character is a non-initial character in names according to the XML recommendation
11.Provides HTML and XML entity utilities.