From 8e640a5f32160fd786cc4722c8b46bbca7ba500a Mon Sep 17 00:00:00 2001 From: Arnaud Charlet Date: Wed, 10 Jun 2020 08:26:06 -0400 Subject: [PATCH] [Ada] AI12-0004 Normalization and allowed characters gcc/ada/ * scng.adb (Scan): Detect wide characters not in NFKC. * libgnat/a-chahan.adb, libgnat/a-chahan.ads, libgnat/a-wichha.adb, libgnat/a-wichha.ads, libgnat/a-wichun.adb, libgnat/a-wichun.ads, libgnat/a-zchhan.adb, libgnat/a-zchhan.ads, libgnat/a-zchuni.adb, libgnat/a-zchuni.ads (Is_NFKC): New. * libgnat/s-utf_32.ads, libgnat/s-utf_32.adb (Is_UTF_32_NFKC): New. --- gcc/ada/libgnat/a-chahan.adb | 11 + gcc/ada/libgnat/a-chahan.ads | 1 + gcc/ada/libgnat/a-wichha.adb | 7 + gcc/ada/libgnat/a-wichha.ads | 6 + gcc/ada/libgnat/a-wichun.adb | 9 + gcc/ada/libgnat/a-wichun.ads | 8 +- gcc/ada/libgnat/a-zchhan.adb | 7 + gcc/ada/libgnat/a-zchhan.ads | 6 + gcc/ada/libgnat/a-zchuni.adb | 9 + gcc/ada/libgnat/a-zchuni.ads | 6 + gcc/ada/libgnat/s-utf_32.adb | 411 ++++++++++++++++++++++++++++++++++- gcc/ada/libgnat/s-utf_32.ads | 6 + gcc/ada/scng.adb | 9 +- 13 files changed, 490 insertions(+), 6 deletions(-) diff --git a/gcc/ada/libgnat/a-chahan.adb b/gcc/ada/libgnat/a-chahan.adb index faee41bbc2a..de66846b3a7 100644 --- a/gcc/ada/libgnat/a-chahan.adb +++ b/gcc/ada/libgnat/a-chahan.adb @@ -399,6 +399,17 @@ package body Ada.Characters.Handling is return False; end Is_Mark; + ------------- + -- Is_NFKC -- + ------------- + + function Is_NFKC (Item : Character) return Boolean is + begin + return Character'Pos (Item) not in + 160 | 168 | 170 | 175 | 178 | 179 | 180 | 181 | 184 | 185 | 186 | + 188 | 189 | 190; + end Is_NFKC; + --------------------- -- Is_Other_Format -- --------------------- diff --git a/gcc/ada/libgnat/a-chahan.ads b/gcc/ada/libgnat/a-chahan.ads index 957d6236113..04f975c0432 100644 --- a/gcc/ada/libgnat/a-chahan.ads +++ b/gcc/ada/libgnat/a-chahan.ads @@ -58,6 +58,7 @@ package Ada.Characters.Handling is function Is_Other_Format (Item : Character) return Boolean; function 
Is_Punctuation_Connector (Item : Character) return Boolean; function Is_Space (Item : Character) return Boolean; + function Is_NFKC (Item : Character) return Boolean; --------------------------------------------------- -- Conversion Functions for Character and String -- diff --git a/gcc/ada/libgnat/a-wichha.adb b/gcc/ada/libgnat/a-wichha.adb index 7531ef67444..feccc233f78 100644 --- a/gcc/ada/libgnat/a-wichha.adb +++ b/gcc/ada/libgnat/a-wichha.adb @@ -124,6 +124,13 @@ package body Ada.Wide_Characters.Handling is function Is_Mark (Item : Wide_Character) return Boolean renames Ada.Wide_Characters.Unicode.Is_Mark; + ------------- + -- Is_NFKC -- + ------------- + + function Is_NFKC (Item : Wide_Character) return Boolean + renames Ada.Wide_Characters.Unicode.Is_NFKC; + --------------------- -- Is_Other_Format -- --------------------- diff --git a/gcc/ada/libgnat/a-wichha.ads b/gcc/ada/libgnat/a-wichha.ads index bb9452fb878..23eb468c3c1 100644 --- a/gcc/ada/libgnat/a-wichha.ads +++ b/gcc/ada/libgnat/a-wichha.ads @@ -101,6 +101,12 @@ package Ada.Wide_Characters.Handling is -- Returns True if the Wide_Character designated by Item is categorized as -- separator_space, otherwise returns False. + function Is_NFKC (Item : Wide_Character) return Boolean; + pragma Inline (Is_NFKC); + -- Returns True if the Wide_Character designated by Item could be present + -- in a string normalized to Normalization Form KC (as defined by Clause + -- 21 of ISO/IEC 10646:2017), otherwise returns False. 
+ function Is_Graphic (Item : Wide_Character) return Boolean; pragma Inline (Is_Graphic); -- Returns True if the Wide_Character designated by Item is categorized as diff --git a/gcc/ada/libgnat/a-wichun.adb b/gcc/ada/libgnat/a-wichun.adb index cfd84da677e..09cbad2f89c 100644 --- a/gcc/ada/libgnat/a-wichun.adb +++ b/gcc/ada/libgnat/a-wichun.adb @@ -116,6 +116,15 @@ package body Ada.Wide_Characters.Unicode is return G.Is_UTF_32_Non_Graphic (G.Category (C)); end Is_Non_Graphic; + ------------- + -- Is_NFKC -- + ------------- + + function Is_NFKC (U : Wide_Character) return Boolean is + begin + return G.Is_UTF_32_NFKC (Wide_Character'Pos (U)); + end Is_NFKC; + -------------- -- Is_Other -- -------------- diff --git a/gcc/ada/libgnat/a-wichun.ads b/gcc/ada/libgnat/a-wichun.ads index c9eb938e5c9..9e427499ddd 100644 --- a/gcc/ada/libgnat/a-wichun.ads +++ b/gcc/ada/libgnat/a-wichun.ads @@ -131,7 +131,7 @@ package Ada.Wide_Characters.Unicode is pragma Inline (Is_Other); -- Returns true iff U is an other format character, which means that it -- can be used to extend an identifier, but is ignored for the purposes of - -- matching of identiers, or if C is one of the corresponding categories, + -- matching of identifiers, or if C is one of the corresponding categories, -- which are the following: -- Other, Format (Cf) @@ -150,6 +150,12 @@ package Ada.Wide_Characters.Unicode is -- of the corresponding categories, which are the following: -- Separator, Space (Zs) + function Is_NFKC (U : Wide_Character) return Boolean; + pragma Inline (Is_NFKC); + -- Returns True if the Wide_Character designated by U could be present + -- in a string normalized to Normalization Form KC (as defined by Clause + -- 21 of ISO/IEC 10646:2017), otherwise returns False. 
+ function Is_Non_Graphic (U : Wide_Character) return Boolean; function Is_Non_Graphic (C : Category) return Boolean; pragma Inline (Is_Non_Graphic); diff --git a/gcc/ada/libgnat/a-zchhan.adb b/gcc/ada/libgnat/a-zchhan.adb index 4fd7eba8037..6930121ac1f 100644 --- a/gcc/ada/libgnat/a-zchhan.adb +++ b/gcc/ada/libgnat/a-zchhan.adb @@ -108,6 +108,13 @@ package body Ada.Wide_Wide_Characters.Handling is function Is_Mark (Item : Wide_Wide_Character) return Boolean renames Ada.Wide_Wide_Characters.Unicode.Is_Mark; + ------------- + -- Is_NFKC -- + ------------- + + function Is_NFKC (Item : Wide_Wide_Character) return Boolean + renames Ada.Wide_Wide_Characters.Unicode.Is_NFKC; + --------------------- -- Is_Other_Format -- --------------------- diff --git a/gcc/ada/libgnat/a-zchhan.ads b/gcc/ada/libgnat/a-zchhan.ads index 354452b49f5..74fab2abd65 100644 --- a/gcc/ada/libgnat/a-zchhan.ads +++ b/gcc/ada/libgnat/a-zchhan.ads @@ -98,6 +98,12 @@ package Ada.Wide_Wide_Characters.Handling is -- Returns True if the Wide_Wide_Character designated by Item is -- categorized as separator_space, otherwise returns false. + function Is_NFKC (Item : Wide_Wide_Character) return Boolean; + pragma Inline (Is_NFKC); + -- Returns True if the Wide_Wide_Character designated by Item could be + -- present in a string normalized to Normalization Form KC (as defined by + -- Clause 21 of ISO/IEC 10646:2017), otherwise returns False. 
+ function Is_Graphic (Item : Wide_Wide_Character) return Boolean; pragma Inline (Is_Graphic); -- Returns True if the Wide_Wide_Character designated by Item is diff --git a/gcc/ada/libgnat/a-zchuni.adb b/gcc/ada/libgnat/a-zchuni.adb index b754af9b0d9..203c3aa25d8 100644 --- a/gcc/ada/libgnat/a-zchuni.adb +++ b/gcc/ada/libgnat/a-zchuni.adb @@ -107,6 +107,15 @@ package body Ada.Wide_Wide_Characters.Unicode is return G.Is_UTF_32_Non_Graphic (G.Category (C)); end Is_Non_Graphic; + ------------- + -- Is_NFKC -- + ------------- + + function Is_NFKC (U : Wide_Wide_Character) return Boolean is + begin + return G.Is_UTF_32_NFKC (Wide_Wide_Character'Pos (U)); + end Is_NFKC; + -------------- -- Is_Other -- -------------- diff --git a/gcc/ada/libgnat/a-zchuni.ads b/gcc/ada/libgnat/a-zchuni.ads index 162d18d9a3b..7f4a30ba28d 100644 --- a/gcc/ada/libgnat/a-zchuni.ads +++ b/gcc/ada/libgnat/a-zchuni.ads @@ -147,6 +147,12 @@ package Ada.Wide_Wide_Characters.Unicode is -- of the corresponding categories, which are the following: -- Separator, Space (Zs) + function Is_NFKC (U : Wide_Wide_Character) return Boolean; + pragma Inline (Is_NFKC); + -- Returns True if the Wide_Wide_Character designated by U could be present + -- in a string normalized to Normalization Form KC (as defined by Clause + -- 21 of ISO/IEC 10646:2017), otherwise returns False. + function Is_Non_Graphic (U : Wide_Wide_Character) return Boolean; function Is_Non_Graphic (C : Category) return Boolean; pragma Inline (Is_Non_Graphic); diff --git a/gcc/ada/libgnat/s-utf_32.adb b/gcc/ada/libgnat/s-utf_32.adb index a722d62e591..a1346f3bb3e 100644 --- a/gcc/ada/libgnat/s-utf_32.adb +++ b/gcc/ada/libgnat/s-utf_32.adb @@ -49,7 +49,7 @@ package body System.UTF_32 is ---------------------- -- Note these tables are derived from those given in AI-285. For details - -- see //www.ada-auth.org/cgi-bin/cvsweb.cgi/AIs/AI-00285.TXT?rev=1.22. + -- see www.ada-auth.org/cgi-bin/cvsweb.cgi/AIs/AI-00285.TXT?rev=1.22. 
type UTF_32_Range is record Lo : UTF_32; @@ -6071,9 +6071,6 @@ package body System.UTF_32 is 40, -- DESERET CAPITAL LETTER LONG I .. DESERET CAPITAL LETTER EW 32); -- TAG LATIN CAPITAL LETTER A .. TAG LATIN CAPITAL LETTER Z - pragma Warnings (On); - -- Temporary until pragma Warnings at start can be activated ??? - -- The following is a list of the 10646 names for CAPITAL LETTER entries -- that have no matching SMALL LETTER entry and are thus not folded @@ -6117,6 +6114,403 @@ package body System.UTF_32 is -- GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI -- GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + -- The following array includes all characters in the Unicode table with + -- the category NFKC_Quick_Check=No, taken from + -- www.unicode.org/Public/UCD/latest/ucd/DerivedNormalizationProps.txt + + UTF_32_NFKC_QC_No : constant UTF_32_Ranges := ( + (16#00A0#, 16#00A0#), -- NO-BREAK SPACE + (16#00A8#, 16#00A8#), -- DIAERESIS + (16#00AA#, 16#00AA#), -- FEMININE ORDINAL INDICATOR + (16#00AF#, 16#00AF#), -- MACRON + (16#00B2#, 16#00B3#), -- SUPERSCRIPT TWO..SUPERSCRIPT THREE + (16#00B4#, 16#00B4#), -- ACUTE ACCENT + (16#00B5#, 16#00B5#), -- MICRO SIGN + (16#00B8#, 16#00B8#), -- CEDILLA + (16#00B9#, 16#00B9#), -- SUPERSCRIPT ONE + (16#00BA#, 16#00BA#), -- MASCULINE ORDINAL INDICATOR + (16#00BC#, 16#00BE#), -- VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS + (16#0132#, 16#0133#), -- LATIN CAPITAL LIGATURE IJ..LATIN SMALL LIGATURE IJ + (16#013F#, 16#0140#), -- LATIN CAPITAL LETTER L WITH MIDDLE DOT..LATIN SMALL LETTER L WITH MIDDLE DOT + (16#0149#, 16#0149#), -- LATIN SMALL LETTER N PRECEDED BY APOSTROPHE + (16#017F#, 16#017F#), -- LATIN SMALL LETTER LONG S + (16#01C4#, 16#01CC#), -- LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER NJ + (16#01F1#, 16#01F3#), -- LATIN CAPITAL LETTER DZ..LATIN SMALL LETTER DZ + (16#02B0#, 16#02B8#), -- MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y + (16#02D8#, 16#02DD#), -- BREVE..DOUBLE ACUTE ACCENT + (16#02E0#, 
16#02E4#), -- MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP + (16#0340#, 16#0341#), -- COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK + (16#0343#, 16#0344#), -- COMBINING GREEK KORONIS..COMBINING GREEK DIALYTIKA TONOS + (16#0374#, 16#0374#), -- GREEK NUMERAL SIGN + (16#037A#, 16#037A#), -- GREEK YPOGEGRAMMENI + (16#037E#, 16#037E#), -- GREEK QUESTION MARK + (16#0384#, 16#0385#), -- GREEK TONOS..GREEK DIALYTIKA TONOS + (16#0387#, 16#0387#), -- GREEK ANO TELEIA + (16#03D0#, 16#03D6#), -- GREEK BETA SYMBOL..GREEK PI SYMBOL + (16#03F0#, 16#03F2#), -- GREEK KAPPA SYMBOL..GREEK LUNATE SIGMA SYMBOL + (16#03F4#, 16#03F5#), -- GREEK CAPITAL THETA SYMBOL..GREEK LUNATE EPSILON SYMBOL + (16#03F9#, 16#03F9#), -- GREEK CAPITAL LUNATE SIGMA SYMBOL + (16#0587#, 16#0587#), -- ARMENIAN SMALL LIGATURE ECH YIWN + (16#0675#, 16#0678#), -- ARABIC LETTER HIGH HAMZA ALEF..ARABIC LETTER HIGH HAMZA YEH + (16#0958#, 16#095F#), -- DEVANAGARI LETTER QA..DEVANAGARI LETTER YYA + (16#09DC#, 16#09DD#), -- BENGALI LETTER RRA..BENGALI LETTER RHA + (16#09DF#, 16#09DF#), -- BENGALI LETTER YYA + (16#0A33#, 16#0A33#), -- GURMUKHI LETTER LLA + (16#0A36#, 16#0A36#), -- GURMUKHI LETTER SHA + (16#0A59#, 16#0A5B#), -- GURMUKHI LETTER KHHA..GURMUKHI LETTER ZA + (16#0A5E#, 16#0A5E#), -- GURMUKHI LETTER FA + (16#0B5C#, 16#0B5D#), -- ORIYA LETTER RRA..ORIYA LETTER RHA + (16#0E33#, 16#0E33#), -- THAI CHARACTER SARA AM + (16#0EB3#, 16#0EB3#), -- LAO VOWEL SIGN AM + (16#0EDC#, 16#0EDD#), -- LAO HO NO..LAO HO MO + (16#0F0C#, 16#0F0C#), -- TIBETAN MARK DELIMITER TSHEG BSTAR + (16#0F43#, 16#0F43#), -- TIBETAN LETTER GHA + (16#0F4D#, 16#0F4D#), -- TIBETAN LETTER DDHA + (16#0F52#, 16#0F52#), -- TIBETAN LETTER DHA + (16#0F57#, 16#0F57#), -- TIBETAN LETTER BHA + (16#0F5C#, 16#0F5C#), -- TIBETAN LETTER DZHA + (16#0F69#, 16#0F69#), -- TIBETAN LETTER KSSA + (16#0F73#, 16#0F73#), -- TIBETAN VOWEL SIGN II + (16#0F75#, 16#0F79#), -- TIBETAN VOWEL SIGN UU..TIBETAN VOWEL SIGN VOCALIC LL + 
(16#0F81#, 16#0F81#), -- TIBETAN VOWEL SIGN REVERSED II + (16#0F93#, 16#0F93#), -- TIBETAN SUBJOINED LETTER GHA + (16#0F9D#, 16#0F9D#), -- TIBETAN SUBJOINED LETTER DDHA + (16#0FA2#, 16#0FA2#), -- TIBETAN SUBJOINED LETTER DHA + (16#0FA7#, 16#0FA7#), -- TIBETAN SUBJOINED LETTER BHA + (16#0FAC#, 16#0FAC#), -- TIBETAN SUBJOINED LETTER DZHA + (16#0FB9#, 16#0FB9#), -- TIBETAN SUBJOINED LETTER KSSA + (16#10FC#, 16#10FC#), -- MODIFIER LETTER GEORGIAN NAR + (16#1D2C#, 16#1D2E#), -- MODIFIER LETTER CAPITAL A..MODIFIER LETTER CAPITAL B + (16#1D30#, 16#1D3A#), -- MODIFIER LETTER CAPITAL D..MODIFIER LETTER CAPITAL N + (16#1D3C#, 16#1D4D#), -- MODIFIER LETTER CAPITAL O..MODIFIER LETTER SMALL G + (16#1D4F#, 16#1D6A#), -- MODIFIER LETTER SMALL K..GREEK SUBSCRIPT SMALL LETTER CHI + (16#1D78#, 16#1D78#), -- MODIFIER LETTER CYRILLIC EN + (16#1D9B#, 16#1DBF#), -- MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA + (16#1E9A#, 16#1E9B#), -- LATIN SMALL LETTER A WITH RIGHT HALF RING..LATIN SMALL LETTER LONG S WITH DOT ABOVE + (16#1F71#, 16#1F71#), -- GREEK SMALL LETTER ALPHA WITH OXIA + (16#1F73#, 16#1F73#), -- GREEK SMALL LETTER EPSILON WITH OXIA + (16#1F75#, 16#1F75#), -- GREEK SMALL LETTER ETA WITH OXIA + (16#1F77#, 16#1F77#), -- GREEK SMALL LETTER IOTA WITH OXIA + (16#1F79#, 16#1F79#), -- GREEK SMALL LETTER OMICRON WITH OXIA + (16#1F7B#, 16#1F7B#), -- GREEK SMALL LETTER UPSILON WITH OXIA + (16#1F7D#, 16#1F7D#), -- GREEK SMALL LETTER OMEGA WITH OXIA + (16#1FBB#, 16#1FBB#), -- GREEK CAPITAL LETTER ALPHA WITH OXIA + (16#1FBD#, 16#1FBD#), -- GREEK KORONIS + (16#1FBE#, 16#1FBE#), -- GREEK PROSGEGRAMMENI + (16#1FBF#, 16#1FC1#), -- GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI + (16#1FC9#, 16#1FC9#), -- GREEK CAPITAL LETTER EPSILON WITH OXIA + (16#1FCB#, 16#1FCB#), -- GREEK CAPITAL LETTER ETA WITH OXIA + (16#1FCD#, 16#1FCF#), -- GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI + (16#1FD3#, 16#1FD3#), -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + (16#1FDB#, 16#1FDB#), 
-- GREEK CAPITAL LETTER IOTA WITH OXIA + (16#1FDD#, 16#1FDF#), -- GREEK DASIA AND VARIA..GREEK DASIA AND PERISPOMENI + (16#1FE3#, 16#1FE3#), -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + (16#1FEB#, 16#1FEB#), -- GREEK CAPITAL LETTER UPSILON WITH OXIA + (16#1FED#, 16#1FEF#), -- GREEK DIALYTIKA AND VARIA..GREEK VARIA + (16#1FF9#, 16#1FF9#), -- GREEK CAPITAL LETTER OMICRON WITH OXIA + (16#1FFB#, 16#1FFB#), -- GREEK CAPITAL LETTER OMEGA WITH OXIA + (16#1FFD#, 16#1FFE#), -- GREEK OXIA..GREEK DASIA + (16#2000#, 16#200A#), -- EN QUAD..HAIR SPACE + (16#2011#, 16#2011#), -- NON-BREAKING HYPHEN + (16#2017#, 16#2017#), -- DOUBLE LOW LINE + (16#2024#, 16#2026#), -- ONE DOT LEADER..HORIZONTAL ELLIPSIS + (16#202F#, 16#202F#), -- NARROW NO-BREAK SPACE + (16#2033#, 16#2034#), -- DOUBLE PRIME..TRIPLE PRIME + (16#2036#, 16#2037#), -- REVERSED DOUBLE PRIME..REVERSED TRIPLE PRIME + (16#203C#, 16#203C#), -- DOUBLE EXCLAMATION MARK + (16#203E#, 16#203E#), -- OVERLINE + (16#2047#, 16#2049#), -- DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK + (16#2057#, 16#2057#), -- QUADRUPLE PRIME + (16#205F#, 16#205F#), -- MEDIUM MATHEMATICAL SPACE + (16#2070#, 16#2070#), -- SUPERSCRIPT ZERO + (16#2071#, 16#2071#), -- SUPERSCRIPT LATIN SMALL LETTER I + (16#2074#, 16#2079#), -- SUPERSCRIPT FOUR..SUPERSCRIPT NINE + (16#207A#, 16#207C#), -- SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN + (16#207D#, 16#207D#), -- SUPERSCRIPT LEFT PARENTHESIS + (16#207E#, 16#207E#), -- SUPERSCRIPT RIGHT PARENTHESIS + (16#207F#, 16#207F#), -- SUPERSCRIPT LATIN SMALL LETTER N + (16#2080#, 16#2089#), -- SUBSCRIPT ZERO..SUBSCRIPT NINE + (16#208A#, 16#208C#), -- SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN + (16#208D#, 16#208D#), -- SUBSCRIPT LEFT PARENTHESIS + (16#208E#, 16#208E#), -- SUBSCRIPT RIGHT PARENTHESIS + (16#2090#, 16#209C#), -- LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T + (16#20A8#, 16#20A8#), -- RUPEE SIGN + (16#2100#, 16#2101#), -- ACCOUNT OF..ADDRESSED TO THE SUBJECT + (16#2102#, 
16#2102#), -- DOUBLE-STRUCK CAPITAL C + (16#2103#, 16#2103#), -- DEGREE CELSIUS + (16#2105#, 16#2106#), -- CARE OF..CADA UNA + (16#2107#, 16#2107#), -- EULER CONSTANT + (16#2109#, 16#2109#), -- DEGREE FAHRENHEIT + (16#210A#, 16#2113#), -- SCRIPT SMALL G..SCRIPT SMALL L + (16#2115#, 16#2115#), -- DOUBLE-STRUCK CAPITAL N + (16#2116#, 16#2116#), -- NUMERO SIGN + (16#2119#, 16#211D#), -- DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R + (16#2120#, 16#2122#), -- SERVICE MARK..TRADE MARK SIGN + (16#2124#, 16#2124#), -- DOUBLE-STRUCK CAPITAL Z + (16#2126#, 16#2126#), -- OHM SIGN + (16#2128#, 16#2128#), -- BLACK-LETTER CAPITAL Z + (16#212A#, 16#212D#), -- KELVIN SIGN..BLACK-LETTER CAPITAL C + (16#212F#, 16#2131#), -- SCRIPT SMALL E..SCRIPT CAPITAL F + (16#2133#, 16#2134#), -- SCRIPT CAPITAL M..SCRIPT SMALL O + (16#2135#, 16#2138#), -- ALEF SYMBOL..DALET SYMBOL + (16#2139#, 16#2139#), -- INFORMATION SOURCE + (16#213B#, 16#213B#), -- FACSIMILE SIGN + (16#213C#, 16#213F#), -- DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI + (16#2140#, 16#2140#), -- DOUBLE-STRUCK N-ARY SUMMATION + (16#2145#, 16#2149#), -- DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J + (16#2150#, 16#215F#), -- VULGAR FRACTION ONE SEVENTH..FRACTION NUMERATOR ONE + (16#2160#, 16#217F#), -- ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND + (16#2189#, 16#2189#), -- VULGAR FRACTION ZERO THIRDS + (16#222C#, 16#222D#), -- DOUBLE INTEGRAL..TRIPLE INTEGRAL + (16#222F#, 16#2230#), -- SURFACE INTEGRAL..VOLUME INTEGRAL + (16#2329#, 16#2329#), -- LEFT-POINTING ANGLE BRACKET + (16#232A#, 16#232A#), -- RIGHT-POINTING ANGLE BRACKET + (16#2460#, 16#249B#), -- CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP + (16#249C#, 16#24E9#), -- PARENTHESIZED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z + (16#24EA#, 16#24EA#), -- CIRCLED DIGIT ZERO + (16#2A0C#, 16#2A0C#), -- QUADRUPLE INTEGRAL OPERATOR + (16#2A74#, 16#2A76#), -- DOUBLE COLON EQUAL..THREE CONSECUTIVE EQUALS SIGNS + (16#2ADC#, 16#2ADC#), -- FORKING + 
(16#2C7C#, 16#2C7D#), -- LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V + (16#2D6F#, 16#2D6F#), -- TIFINAGH MODIFIER LETTER LABIALIZATION MARK + (16#2E9F#, 16#2E9F#), -- CJK RADICAL MOTHER + (16#2EF3#, 16#2EF3#), -- CJK RADICAL C-SIMPLIFIED TURTLE + (16#2F00#, 16#2FD5#), -- KANGXI RADICAL ONE..KANGXI RADICAL FLUTE + (16#3000#, 16#3000#), -- IDEOGRAPHIC SPACE + (16#3036#, 16#3036#), -- CIRCLED POSTAL MARK + (16#3038#, 16#303A#), -- HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY + (16#309B#, 16#309C#), -- KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK + (16#309F#, 16#309F#), -- HIRAGANA DIGRAPH YORI + (16#30FF#, 16#30FF#), -- KATAKANA DIGRAPH KOTO + (16#3131#, 16#318E#), -- HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE + (16#3192#, 16#3195#), -- IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK + (16#3196#, 16#319F#), -- IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK + (16#3200#, 16#321E#), -- PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED KOREAN CHARACTER O HU + (16#3220#, 16#3229#), -- PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN + (16#322A#, 16#3247#), -- PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO + (16#3250#, 16#3250#), -- PARTNERSHIP SIGN + (16#3251#, 16#325F#), -- CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE + (16#3260#, 16#327E#), -- CIRCLED HANGUL KIYEOK..CIRCLED HANGUL IEUNG U + (16#3280#, 16#3289#), -- CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN + (16#328A#, 16#32B0#), -- CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT + (16#32B1#, 16#32BF#), -- CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY + (16#32C0#, 16#33FF#), -- IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..SQUARE GAL + (16#A69C#, 16#A69D#), -- MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN + (16#A770#, 16#A770#), -- MODIFIER LETTER US + (16#A7F8#, 16#A7F9#), -- MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE + (16#AB5C#, 16#AB5F#), -- MODIFIER LETTER SMALL 
HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK + (16#AB69#, 16#AB69#), -- MODIFIER LETTER SMALL TURNED W + (16#F900#, 16#FA0D#), -- CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D + (16#FA10#, 16#FA10#), -- CJK COMPATIBILITY IDEOGRAPH-FA10 + (16#FA12#, 16#FA12#), -- CJK COMPATIBILITY IDEOGRAPH-FA12 + (16#FA15#, 16#FA1E#), -- CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E + (16#FA20#, 16#FA20#), -- CJK COMPATIBILITY IDEOGRAPH-FA20 + (16#FA22#, 16#FA22#), -- CJK COMPATIBILITY IDEOGRAPH-FA22 + (16#FA25#, 16#FA26#), -- CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26 + (16#FA2A#, 16#FA6D#), -- CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA6D + (16#FA70#, 16#FAD9#), -- CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 + (16#FB00#, 16#FB06#), -- LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST + (16#FB13#, 16#FB17#), -- ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH + (16#FB1D#, 16#FB1D#), -- HEBREW LETTER YOD WITH HIRIQ + (16#FB1F#, 16#FB28#), -- HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV + (16#FB29#, 16#FB29#), -- HEBREW LETTER ALTERNATIVE PLUS SIGN + (16#FB2A#, 16#FB36#), -- HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH + (16#FB38#, 16#FB3C#), -- HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH + (16#FB3E#, 16#FB3E#), -- HEBREW LETTER MEM WITH DAGESH + (16#FB40#, 16#FB41#), -- HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH + (16#FB43#, 16#FB44#), -- HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH + (16#FB46#, 16#FBB1#), -- HEBREW LETTER TSADI WITH DAGESH..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM + (16#FBD3#, 16#FD3D#), -- ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM + (16#FD50#, 16#FD8F#), -- ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM + (16#FD92#, 16#FDC7#), -- 
ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM + (16#FDF0#, 16#FDFB#), -- ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU + (16#FDFC#, 16#FDFC#), -- RIAL SIGN + (16#FE10#, 16#FE16#), -- PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK + (16#FE17#, 16#FE17#), -- PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET + (16#FE18#, 16#FE18#), -- PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET + (16#FE19#, 16#FE19#), -- PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS + (16#FE30#, 16#FE30#), -- PRESENTATION FORM FOR VERTICAL TWO DOT LEADER + (16#FE31#, 16#FE32#), -- PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH + (16#FE33#, 16#FE34#), -- PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE + (16#FE35#, 16#FE35#), -- PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS + (16#FE36#, 16#FE36#), -- PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS + (16#FE37#, 16#FE37#), -- PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET + (16#FE38#, 16#FE38#), -- PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET + (16#FE39#, 16#FE39#), -- PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET + (16#FE3A#, 16#FE3A#), -- PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET + (16#FE3B#, 16#FE3B#), -- PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET + (16#FE3C#, 16#FE3C#), -- PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET + (16#FE3D#, 16#FE3D#), -- PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET + (16#FE3E#, 16#FE3E#), -- PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET + (16#FE3F#, 16#FE3F#), -- PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET + (16#FE40#, 16#FE40#), -- PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET + (16#FE41#, 16#FE41#), -- PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET + (16#FE42#, 
16#FE42#), -- PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET + (16#FE43#, 16#FE43#), -- PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET + (16#FE44#, 16#FE44#), -- PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET + (16#FE47#, 16#FE47#), -- PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET + (16#FE48#, 16#FE48#), -- PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET + (16#FE49#, 16#FE4C#), -- DASHED OVERLINE..DOUBLE WAVY OVERLINE + (16#FE4D#, 16#FE4F#), -- DASHED LOW LINE..WAVY LOW LINE + (16#FE50#, 16#FE52#), -- SMALL COMMA..SMALL FULL STOP + (16#FE54#, 16#FE57#), -- SMALL SEMICOLON..SMALL EXCLAMATION MARK + (16#FE58#, 16#FE58#), -- SMALL EM DASH + (16#FE59#, 16#FE59#), -- SMALL LEFT PARENTHESIS + (16#FE5A#, 16#FE5A#), -- SMALL RIGHT PARENTHESIS + (16#FE5B#, 16#FE5B#), -- SMALL LEFT CURLY BRACKET + (16#FE5C#, 16#FE5C#), -- SMALL RIGHT CURLY BRACKET + (16#FE5D#, 16#FE5D#), -- SMALL LEFT TORTOISE SHELL BRACKET + (16#FE5E#, 16#FE5E#), -- SMALL RIGHT TORTOISE SHELL BRACKET + (16#FE5F#, 16#FE61#), -- SMALL NUMBER SIGN..SMALL ASTERISK + (16#FE62#, 16#FE62#), -- SMALL PLUS SIGN + (16#FE63#, 16#FE63#), -- SMALL HYPHEN-MINUS + (16#FE64#, 16#FE66#), -- SMALL LESS-THAN SIGN..SMALL EQUALS SIGN + (16#FE68#, 16#FE68#), -- SMALL REVERSE SOLIDUS + (16#FE69#, 16#FE69#), -- SMALL DOLLAR SIGN + (16#FE6A#, 16#FE6B#), -- SMALL PERCENT SIGN..SMALL COMMERCIAL AT + (16#FE70#, 16#FE72#), -- ARABIC FATHATAN ISOLATED FORM..ARABIC DAMMATAN ISOLATED FORM + (16#FE74#, 16#FE74#), -- ARABIC KASRATAN ISOLATED FORM + (16#FE76#, 16#FEFC#), -- ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM + (16#FF01#, 16#FF03#), -- FULLWIDTH EXCLAMATION MARK..FULLWIDTH NUMBER SIGN + (16#FF04#, 16#FF04#), -- FULLWIDTH DOLLAR SIGN + (16#FF05#, 16#FF07#), -- FULLWIDTH PERCENT SIGN..FULLWIDTH APOSTROPHE + (16#FF08#, 16#FF08#), -- FULLWIDTH LEFT PARENTHESIS + (16#FF09#, 16#FF09#), -- FULLWIDTH RIGHT PARENTHESIS + (16#FF0A#, 16#FF0A#), -- FULLWIDTH ASTERISK + (16#FF0B#, 
16#FF0B#), -- FULLWIDTH PLUS SIGN + (16#FF0C#, 16#FF0C#), -- FULLWIDTH COMMA + (16#FF0D#, 16#FF0D#), -- FULLWIDTH HYPHEN-MINUS + (16#FF0E#, 16#FF0F#), -- FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS + (16#FF10#, 16#FF19#), -- FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE + (16#FF1A#, 16#FF1B#), -- FULLWIDTH COLON..FULLWIDTH SEMICOLON + (16#FF1C#, 16#FF1E#), -- FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN + (16#FF1F#, 16#FF20#), -- FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT + (16#FF21#, 16#FF3A#), -- FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z + (16#FF3B#, 16#FF3B#), -- FULLWIDTH LEFT SQUARE BRACKET + (16#FF3C#, 16#FF3C#), -- FULLWIDTH REVERSE SOLIDUS + (16#FF3D#, 16#FF3D#), -- FULLWIDTH RIGHT SQUARE BRACKET + (16#FF3E#, 16#FF3E#), -- FULLWIDTH CIRCUMFLEX ACCENT + (16#FF3F#, 16#FF3F#), -- FULLWIDTH LOW LINE + (16#FF40#, 16#FF40#), -- FULLWIDTH GRAVE ACCENT + (16#FF41#, 16#FF5A#), -- FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z + (16#FF5B#, 16#FF5B#), -- FULLWIDTH LEFT CURLY BRACKET + (16#FF5C#, 16#FF5C#), -- FULLWIDTH VERTICAL LINE + (16#FF5D#, 16#FF5D#), -- FULLWIDTH RIGHT CURLY BRACKET + (16#FF5E#, 16#FF5E#), -- FULLWIDTH TILDE + (16#FF5F#, 16#FF5F#), -- FULLWIDTH LEFT WHITE PARENTHESIS + (16#FF60#, 16#FF60#), -- FULLWIDTH RIGHT WHITE PARENTHESIS + (16#FF61#, 16#FF61#), -- HALFWIDTH IDEOGRAPHIC FULL STOP + (16#FF62#, 16#FF62#), -- HALFWIDTH LEFT CORNER BRACKET + (16#FF63#, 16#FF63#), -- HALFWIDTH RIGHT CORNER BRACKET + (16#FF64#, 16#FF65#), -- HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT + (16#FF66#, 16#FF6F#), -- HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU + (16#FF70#, 16#FF70#), -- HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK + (16#FF71#, 16#FF9D#), -- HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N + (16#FF9E#, 16#FF9F#), -- HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK + (16#FFA0#, 16#FFBE#), -- HALFWIDTH HANGUL FILLER..HALFWIDTH 
HANGUL LETTER HIEUH + (16#FFC2#, 16#FFC7#), -- HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E + (16#FFCA#, 16#FFCF#), -- HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE + (16#FFD2#, 16#FFD7#), -- HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU + (16#FFDA#, 16#FFDC#), -- HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I + (16#FFE0#, 16#FFE1#), -- FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN + (16#FFE2#, 16#FFE2#), -- FULLWIDTH NOT SIGN + (16#FFE3#, 16#FFE3#), -- FULLWIDTH MACRON + (16#FFE4#, 16#FFE4#), -- FULLWIDTH BROKEN BAR + (16#FFE5#, 16#FFE6#), -- FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN + (16#FFE8#, 16#FFE8#), -- HALFWIDTH FORMS LIGHT VERTICAL + (16#FFE9#, 16#FFEC#), -- HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW + (16#FFED#, 16#FFEE#), -- HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE + (16#1D15E#, 16#1D164#), -- MUSICAL SYMBOL HALF NOTE..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE + (16#1D1BB#, 16#1D1C0#), -- MUSICAL SYMBOL MINIMA..MUSICAL SYMBOL FUSA BLACK + (16#1D400#, 16#1D454#), -- MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G + (16#1D456#, 16#1D49C#), -- MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A + (16#1D49E#, 16#1D49F#), -- MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D + (16#1D4A2#, 16#1D4A2#), -- MATHEMATICAL SCRIPT CAPITAL G + (16#1D4A5#, 16#1D4A6#), -- MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K + (16#1D4A9#, 16#1D4AC#), -- MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q + (16#1D4AE#, 16#1D4B9#), -- MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D + (16#1D4BB#, 16#1D4BB#), -- MATHEMATICAL SCRIPT SMALL F + (16#1D4BD#, 16#1D4C3#), -- MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N + (16#1D4C5#, 16#1D505#), -- MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B + (16#1D507#, 16#1D50A#), -- MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G + (16#1D50D#, 16#1D514#), -- MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL 
FRAKTUR CAPITAL Q + (16#1D516#, 16#1D51C#), -- MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y + (16#1D51E#, 16#1D539#), -- MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B + (16#1D53B#, 16#1D53E#), -- MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G + (16#1D540#, 16#1D544#), -- MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M + (16#1D546#, 16#1D546#), -- MATHEMATICAL DOUBLE-STRUCK CAPITAL O + (16#1D54A#, 16#1D550#), -- MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y + (16#1D552#, 16#1D6A5#), -- MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J + (16#1D6A8#, 16#1D6C0#), -- MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA + (16#1D6C1#, 16#1D6C1#), -- MATHEMATICAL BOLD NABLA + (16#1D6C2#, 16#1D6DA#), -- MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA + (16#1D6DB#, 16#1D6DB#), -- MATHEMATICAL BOLD PARTIAL DIFFERENTIAL + (16#1D6DC#, 16#1D6FA#), -- MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA + (16#1D6FB#, 16#1D6FB#), -- MATHEMATICAL ITALIC NABLA + (16#1D6FC#, 16#1D714#), -- MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA + (16#1D715#, 16#1D715#), -- MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL + (16#1D716#, 16#1D734#), -- MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA + (16#1D735#, 16#1D735#), -- MATHEMATICAL BOLD ITALIC NABLA + (16#1D736#, 16#1D74E#), -- MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA + (16#1D74F#, 16#1D74F#), -- MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL + (16#1D750#, 16#1D76E#), -- MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA + (16#1D76F#, 16#1D76F#), -- MATHEMATICAL SANS-SERIF BOLD NABLA + (16#1D770#, 16#1D788#), -- MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA + (16#1D789#, 16#1D789#), -- MATHEMATICAL SANS-SERIF BOLD 
PARTIAL DIFFERENTIAL + (16#1D78A#, 16#1D7A8#), -- MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA + (16#1D7A9#, 16#1D7A9#), -- MATHEMATICAL SANS-SERIF BOLD ITALIC NABLA + (16#1D7AA#, 16#1D7C2#), -- MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA + (16#1D7C3#, 16#1D7C3#), -- MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL + (16#1D7C4#, 16#1D7CB#), -- MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA + (16#1D7CE#, 16#1D7FF#), -- MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE + (16#1EE00#, 16#1EE03#), -- ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL + (16#1EE05#, 16#1EE1F#), -- ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF + (16#1EE21#, 16#1EE22#), -- ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM + (16#1EE24#, 16#1EE24#), -- ARABIC MATHEMATICAL INITIAL HEH + (16#1EE27#, 16#1EE27#), -- ARABIC MATHEMATICAL INITIAL HAH + (16#1EE29#, 16#1EE32#), -- ARABIC MATHEMATICAL INITIAL YEH..ARABIC MATHEMATICAL INITIAL QAF + (16#1EE34#, 16#1EE37#), -- ARABIC MATHEMATICAL INITIAL SHEEN..ARABIC MATHEMATICAL INITIAL KHAH + (16#1EE39#, 16#1EE39#), -- ARABIC MATHEMATICAL INITIAL DAD + (16#1EE3B#, 16#1EE3B#), -- ARABIC MATHEMATICAL INITIAL GHAIN + (16#1EE42#, 16#1EE42#), -- ARABIC MATHEMATICAL TAILED JEEM + (16#1EE47#, 16#1EE47#), -- ARABIC MATHEMATICAL TAILED HAH + (16#1EE49#, 16#1EE49#), -- ARABIC MATHEMATICAL TAILED YEH + (16#1EE4B#, 16#1EE4B#), -- ARABIC MATHEMATICAL TAILED LAM + (16#1EE4D#, 16#1EE4F#), -- ARABIC MATHEMATICAL TAILED NOON..ARABIC MATHEMATICAL TAILED AIN + (16#1EE51#, 16#1EE52#), -- ARABIC MATHEMATICAL TAILED SAD..ARABIC MATHEMATICAL TAILED QAF + (16#1EE54#, 16#1EE54#), -- ARABIC MATHEMATICAL TAILED SHEEN + (16#1EE57#, 16#1EE57#), -- ARABIC MATHEMATICAL TAILED KHAH + (16#1EE59#, 16#1EE59#), -- ARABIC MATHEMATICAL TAILED DAD + (16#1EE5B#, 16#1EE5B#), -- ARABIC MATHEMATICAL TAILED 
GHAIN + (16#1EE5D#, 16#1EE5D#), -- ARABIC MATHEMATICAL TAILED DOTLESS NOON + (16#1EE5F#, 16#1EE5F#), -- ARABIC MATHEMATICAL TAILED DOTLESS QAF + (16#1EE61#, 16#1EE62#), -- ARABIC MATHEMATICAL STRETCHED BEH..ARABIC MATHEMATICAL STRETCHED JEEM + (16#1EE64#, 16#1EE64#), -- ARABIC MATHEMATICAL STRETCHED HEH + (16#1EE67#, 16#1EE6A#), -- ARABIC MATHEMATICAL STRETCHED HAH..ARABIC MATHEMATICAL STRETCHED KAF + (16#1EE6C#, 16#1EE72#), -- ARABIC MATHEMATICAL STRETCHED MEEM..ARABIC MATHEMATICAL STRETCHED QAF + (16#1EE74#, 16#1EE77#), -- ARABIC MATHEMATICAL STRETCHED SHEEN..ARABIC MATHEMATICAL STRETCHED KHAH + (16#1EE79#, 16#1EE7C#), -- ARABIC MATHEMATICAL STRETCHED DAD..ARABIC MATHEMATICAL STRETCHED DOTLESS BEH + (16#1EE7E#, 16#1EE7E#), -- ARABIC MATHEMATICAL STRETCHED DOTLESS FEH + (16#1EE80#, 16#1EE89#), -- ARABIC MATHEMATICAL LOOPED ALEF..ARABIC MATHEMATICAL LOOPED YEH + (16#1EE8B#, 16#1EE9B#), -- ARABIC MATHEMATICAL LOOPED LAM..ARABIC MATHEMATICAL LOOPED GHAIN + (16#1EEA1#, 16#1EEA3#), -- ARABIC MATHEMATICAL DOUBLE-STRUCK BEH..ARABIC MATHEMATICAL DOUBLE-STRUCK DAL + (16#1EEA5#, 16#1EEA9#), -- ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH + (16#1EEAB#, 16#1EEBB#), -- ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN + (16#1F100#, 16#1F10A#), -- DIGIT ZERO FULL STOP..DIGIT NINE COMMA + (16#1F110#, 16#1F12E#), -- PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED WZ + (16#1F130#, 16#1F14F#), -- SQUARED LATIN CAPITAL LETTER A..SQUARED WC + (16#1F16A#, 16#1F16C#), -- RAISED MC SIGN..RAISED MR SIGN + (16#1F190#, 16#1F190#), -- SQUARE DJ + (16#1F200#, 16#1F202#), -- SQUARE HIRAGANA HOKA..SQUARED KATAKANA SA + (16#1F210#, 16#1F23B#), -- SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D + (16#1F240#, 16#1F248#), -- TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 + (16#1F250#, 16#1F251#), -- CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT + 
(16#1FBF0#, 16#1FBF9#), -- SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE + (16#2F800#, 16#2FA1D#)); -- CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D + + pragma Warnings (On); + -- Temporary until pragma Warnings at start can be activated ??? + type Decomposition_Mapping is record Item : UTF_32; First_Char_Mapping : UTF_32; @@ -12001,6 +12395,15 @@ package body System.UTF_32 is return Non_Graphic (C); end Is_UTF_32_Non_Graphic; + -------------------- + -- Is_UTF_32_NFKC -- + -------------------- + + function Is_UTF_32_NFKC (U : UTF_32) return Boolean is + begin + return U < 160 or else Range_Search (U, UTF_32_NFKC_QC_No) = 0; + end Is_UTF_32_NFKC; + --------------------- -- Is_UTF_32_Other -- --------------------- diff --git a/gcc/ada/libgnat/s-utf_32.ads b/gcc/ada/libgnat/s-utf_32.ads index b8e4e3e903e..e3f0e003c81 100644 --- a/gcc/ada/libgnat/s-utf_32.ads +++ b/gcc/ada/libgnat/s-utf_32.ads @@ -189,6 +189,12 @@ package System.UTF_32 is -- letters to upper case using this routine. A corresponding routine to -- fold to lower case is also provided. + function Is_UTF_32_NFKC (U : UTF_32) return Boolean; + pragma Inline (Is_UTF_32_NFKC); + -- Return True if U could be present in a string normalized to + -- Normalization Form KC (as defined by Clause 21 of ISO/IEC 10646:2017), + -- otherwise return False. + function Is_UTF_32_Basic (U : UTF_32) return Boolean; pragma Inline (Is_UTF_32_Basic); -- Return True if U has no Decomposition Mapping in the code charts of diff --git a/gcc/ada/scng.adb b/gcc/ada/scng.adb index fd3dacc9af1..2bac3a8b09b 100644 --- a/gcc/ada/scng.adb +++ b/gcc/ada/scng.adb @@ -2485,10 +2485,17 @@ package body Scng is ("wide character not allowed in identifier", Wptr); end if; + -- AI12-0004: An identifier shall only contain characters + -- that may be present in Normalization Form KC.
+ + if not Is_UTF_32_NFKC (UTF_32 (Code)) then + Error_Msg + ("invalid wide character in identifier", Wptr); + -- If OK letter, store it folding to upper case. Note -- that we include the folded letter in the checksum. - if Is_UTF_32_Letter (Cat) then + elsif Is_UTF_32_Letter (Cat) then Code := Char_Code (UTF_32_To_Upper_Case (UTF_32 (Code))); Accumulate_Checksum (Code); -- 2.30.2