# Utilities to generate Unicode data for glibc from upstream Unicode data.
#
# Copyright (C) 2014-2022 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
This module contains utilities used by the scripts to generate
Unicode data for glibc from upstream Unicode data files.
'''

import sys
import re


# Common locale header.
COMMENT_HEADER = """
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file. The foregoing does not
% affect the license of the GNU C Library as a whole. It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.
"""

# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}

# Dictionary holding the entire contents of the EastAsianWidth.txt file
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
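
# Example lookups, for illustration only; the dictionaries are empty
# until the fill_* functions below have been run on the upstream data
# files.  The concrete values come from the Unicode data:
#
#     UNICODE_ATTRIBUTES[0x0041]['name']   # 'LATIN CAPITAL LETTER A'
#     UNICODE_ATTRIBUTES[0x0041]['lower']  # 0x0061
#     DERIVED_CORE_PROPERTIES[0x0041]      # includes 'Alphabetic' and 'Uppercase'
#     EAST_ASIAN_WIDTHS[0xFF21]            # 'F' (fullwidth)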

def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.

    '''
    UNICODE_ATTRIBUTES[code_point] = {
        'name': fields[1],          # Character name
        'category': fields[2],      # General category
        'combining': fields[3],     # Canonical combining classes
        'bidi': fields[4],          # Bidirectional category
        'decomposition': fields[5], # Character decomposition mapping
        'decdigit': fields[6],      # Decimal digit value
        'digit': fields[7],         # Digit value
        'numeric': fields[8],       # Numeric value
        'mirrored': fields[9],      # mirrored
        'oldname': fields[10],      # Old Unicode 1.0 name
        'comment': fields[11],      # comment
        # Uppercase mapping
        'upper': int(fields[12], 16) if fields[12] else None,
        # Lowercase mapping
        'lower': int(fields[13], 16) if fields[13] else None,
        # Titlecase mapping
        'title': int(fields[14], 16) if fields[14] else None,
    }

def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
    '''
    with open(filename, mode='r') as unicode_data_file:
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                    'f': filename, 'l': line})
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
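
# For the <CJK Ideograph, First>/<CJK Ideograph, Last> pair shown in the
# docstring above, fill_attributes() creates one UNICODE_ATTRIBUTES entry
# for every code point from 0x4E00 through 0x9FCC; each entry gets the
# name 'CJK Ideograph' and the remaining fields of the range's 'Last' line.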

def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    '''
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = re.match(
                r'^(?P<codepoint1>[0-9A-F]{4,6})'
                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
                + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
                line)
            if not match:
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                end = start
            for code_point in range(int(start, 16), int(end, 16)+1):
                prop = match.group('property')
                if code_point in DERIVED_CORE_PROPERTIES:
                    DERIVED_CORE_PROPERTIES[code_point].append(prop)
                else:
                    DERIVED_CORE_PROPERTIES[code_point] = [prop]

def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidth.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidth.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU
    '''
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = re.match(
                r'^(?P<codepoint1>[0-9A-F]{4,6})'
                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
                + r'\s*;\s*(?P<property>[a-zA-Z]+)',
                line)
            if not match:
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                end = start
            for code_point in range(int(start, 16), int(end, 16)+1):
                EAST_ASIAN_WIDTHS[code_point] = match.group('property')

def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['upper']):
        return UNICODE_ATTRIBUTES[code_point]['upper']
    else:
        return code_point

def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['lower']):
        return UNICODE_ATTRIBUTES[code_point]['lower']
    else:
        return code_point

def to_upper_turkish(code_point):
    '''Returns the code point of the Turkish uppercase version
    of the given code point'''
    if code_point == 0x0069:
        return 0x0130
    return to_upper(code_point)

def to_lower_turkish(code_point):
    '''Returns the code point of the Turkish lowercase version
    of the given code point'''
    if code_point == 0x0049:
        return 0x0131
    return to_lower(code_point)

def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['title']):
        return UNICODE_ATTRIBUTES[code_point]['title']
    else:
        return code_point
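
# Worked examples for the case mapping helpers above (assuming
# UNICODE_ATTRIBUTES has been filled from UnicodeData.txt; the values
# are the simple case mappings from that file):
#
#     to_upper(0x0061) == 0x0041          # 'a' → 'A'
#     to_title(0x01C6) == 0x01C5          # 'ǆ' → 'ǅ'
#     to_upper_turkish(0x0069) == 0x0130  # 'i' → 'İ'
#     to_lower_turkish(0x0049) == 0x0131  # 'I' → 'ı'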

def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase'''
    return (to_lower(code_point) != code_point
            or (code_point in DERIVED_CORE_PROPERTIES
                and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))

def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase'''
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return (to_upper(code_point) != code_point
            # <U00DF> is lowercase, but without simple to_upper mapping.
            or code_point == 0x00DF
            or (code_point in DERIVED_CORE_PROPERTIES
                and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))

def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic'''
    return ((code_point in DERIVED_CORE_PROPERTIES
             and
             'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
            or
            # Consider all the non-ASCII digits as alphabetic.
            # ISO C 99 forbids us to have them in category “digit”,
            # but we want iswalnum to return true on them.
            (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
             and not (code_point >= 0x0030 and code_point <= 0x0039)))

def is_digit(code_point):
    '''Checks whether the character with this code point is a digit'''
    if False:
        return (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
        # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
        # a zero. Must add <0> in front of them by hand.
    else:
        # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
        # takes it away:
        # 7.25.2.1.5:
        #    The iswdigit function tests for any wide character that
        #    corresponds to a decimal-digit character (as defined in 5.2.1).
        # 5.2.1:
        #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
        return (code_point >= 0x0030 and code_point <= 0x0039)

def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit'''
    return (code_point >= 0x0030 and code_point <= 0x0039)

def is_blank(code_point):
    '''Checks whether the character with this code point is blank'''
    return (code_point == 0x0009 # '\t'
            # Category Zs without mention of '<noBreak>'
            or (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
                and '<noBreak>' not in
                UNICODE_ATTRIBUTES[code_point]['decomposition']))
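
# Illustration of the <noBreak> rule above (values as in UnicodeData.txt):
# U+0020 SPACE is category Zs with an empty decomposition, so
# is_blank(0x0020) is true, while U+00A0 NO-BREAK SPACE is category Zs
# with the decomposition '<noBreak> 0020', so is_blank(0x00A0) is false.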

def is_space(code_point):
    '''Checks whether the character with this code point is a space'''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    return (code_point == 0x0020 # ' '
            or code_point == 0x000C # '\f'
            or code_point == 0x000A # '\n'
            or code_point == 0x000D # '\r'
            or code_point == 0x0009 # '\t'
            or code_point == 0x000B # '\v'
            # Categories Zl, Zp, and Zs without mention of "<noBreak>"
            or (UNICODE_ATTRIBUTES[code_point]['name']
                and
                (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
                 or
                 (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
                  and
                  '<noBreak>' not in
                  UNICODE_ATTRIBUTES[code_point]['decomposition']))))

def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
                 or
                 UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))

def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit'''
    if False:
        return (is_digit(code_point)
                or (code_point >= 0x0041 and code_point <= 0x0046)
                or (code_point >= 0x0061 and code_point <= 0x0066))
    else:
        # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
        # takes it away:
        # 7.25.2.1.12:
        #    The iswxdigit function tests for any wide character that
        #    corresponds to a hexadecimal-digit character (as defined
        #    in 6.4.4.1).
        # 6.4.4.1:
        #    hexadecimal-digit: one of
        #       0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
        return ((code_point >= 0x0030 and code_point <= 0x0039)
                or (code_point >= 0x0041 and code_point <= 0x0046)
                or (code_point >= 0x0061 and code_point <= 0x0066))

def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
            and not is_space(code_point))

def is_print(code_point):
    '''Checks whether the character with this code point is printable'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
            and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])

def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation'''
    if False:
        return (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
    else:
        # The traditional POSIX definition of punctuation is every graphic,
        # non-alphanumeric character.
        return (is_graph(code_point)
                and not is_alpha(code_point)
                and not is_digit(code_point))
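
# Under the POSIX definition used above, characters outside the Unicode
# P* categories also count as punctuation.  For example, '+' (U+002B,
# category Sm) is graphic but neither alphabetic nor a digit, so
# is_punct(0x002B) is true.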

def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character'''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and
            UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])

def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character'''
    return (is_combining(code_point)
            and
            int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))

def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.'''
    if code_point < 0x10000:
        return '<U{:04X}>'.format(code_point)
    else:
        return '<U{:08X}>'.format(code_point)

def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns the UCS symbol string for a code point range.

    Example:

    <U0041>..<U005A>
    '''
    return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
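
# For example, ucs_symbol(0x0041) returns '<U0041>',
# ucs_symbol(0x1F600) returns '<U0001F600>', and
# ucs_symbol_range(0x0041, 0x005A) returns '<U0041>..<U005A>'.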

def verifications():
    '''Tests whether the is_* functions observe the known restrictions'''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified."
        if (to_upper(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_upper(code_point)})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified."
        if (to_lower(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_lower(code_point)})
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class."
        if ((is_lower(code_point) or is_upper(code_point))
            and not is_alpha(code_point)):
            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
                'sym': ucs_symbol(code_point)})
        # alpha restriction: “No character specified for the keywords cntrl,
        # digit, punct or space shall be specified.”
        if (is_alpha(code_point) and is_cntrl(code_point)):
            sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is alpha and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is alpha and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_space(code_point)):
            sys.stderr.write('%(sym)s is alpha and space\n' %{
                'sym': ucs_symbol(code_point)})
        # space restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, graph or xdigit shall be specified.”
        # upper, lower, alpha already checked above.
        if (is_space(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is space and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is space and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is space and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # cntrl restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, punct, graph, print or xdigit shall be
        # specified.” upper, lower, alpha already checked above.
        if (is_cntrl(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is cntrl and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is cntrl and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_print(code_point)):
            sys.stderr.write('%(sym)s is cntrl and print\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # punct restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
        # be specified.” upper, lower, alpha, cntrl already checked above.
        if (is_punct(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is punct and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is punct and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and code_point == 0x0020):
            sys.stderr.write('%(sym)s is punct\n' %{
                'sym': ucs_symbol(code_point)})
        # graph restriction: “No character specified for the keyword cntrl
        # shall be specified.” Already checked above.

        # print restriction: “No character specified for the keyword cntrl
        # shall be specified.” Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there is more than one space character?!
        # I think susv2/xbd/locale.html should speak of “space characters”,
        # not “space character”.
        if (is_print(code_point)
            and not (is_graph(code_point) or is_space(code_point))):
            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
                'sym': ucs_symbol(code_point)})
        if (not is_print(code_point)
            and (is_graph(code_point) or code_point == 0x0020)):
            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
                'sym': ucs_symbol(code_point)})
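
if __name__ == '__main__':
    # Minimal self-test sketch, assuming the upstream data files have been
    # downloaded into the current directory under their standard names
    # (the generator scripts that import this module take these paths as
    # command line options instead).
    fill_attributes('UnicodeData.txt')
    fill_derived_core_properties('DerivedCoreProperties.txt')
    fill_east_asian_widths('EastAsianWidth.txt')
    # Report any violations of the POSIX LC_CTYPE restrictions to stderr.
    verifications()
    # Print a sample UCS symbol range, e.g. '<U0041>..<U005A>'.
    print(ucs_symbol_range(0x0041, 0x005A))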