1# Utilities to generate Unicode data for glibc from upstream Unicode data.
2#
3# Copyright (C) 2014-2022 Free Software Foundation, Inc.
4# This file is part of the GNU C Library.
5#
6# The GNU C Library is free software; you can redistribute it and/or
7# modify it under the terms of the GNU Lesser General Public
8# License as published by the Free Software Foundation; either
9# version 2.1 of the License, or (at your option) any later version.
10#
11# The GNU C Library is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14# Lesser General Public License for more details.
15#
16# You should have received a copy of the GNU Lesser General Public
17# License along with the GNU C Library; if not, see
18# <https://www.gnu.org/licenses/>.
19
20'''
21This module contains utilities used by the scripts to generate
22Unicode data for glibc from upstream Unicode data files.
23'''
24
25import sys
26import re
27
28
# Common locale header.
#
# Every line of the text starts with '%'.  The string begins with a
# newline (directly after the opening quotes) and ends with a newline,
# so it forms a block of complete lines of its own.
COMMENT_HEADER = """
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file.  The foregoing does not
% affect the license of the GNU C Library as a whole.  It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.
"""
38
# Dictionary holding the entire contents of the UnicodeData.txt file,
# filled by fill_attributes().
#
# Keys are code points as integers.  Each value is a dictionary with
# one entry per UnicodeData.txt field: 'upper', 'lower' and 'title'
# are integer code points, or None when the field is empty; all the
# other entries are kept as unparsed strings.
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#      …
# }
UNICODE_ATTRIBUTES = {}
60
# Dictionary holding the entire contents of the DerivedCoreProperties.txt file,
# filled by fill_derived_core_properties().
#
# Keys are code points as integers.  Each value is the list of the
# property names found for that code point, in the order in which
# they were read from the file.
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}
70
# Dictionary holding the entire contents of the EastAsianWidth.txt file,
# filled by fill_east_asian_widths().
#
# Keys are code points as integers, values are the width property
# strings read from the file.
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
77
def fill_attribute(code_point, fields):
    '''Records one parsed UnicodeData.txt line in UNICODE_ATTRIBUTES.

    code_point is the integer key to store the entry under.  fields is
    the list obtained by splitting the line at ';'; entries 1 to 14 are
    read, entry 0 (the code point column) is not used here.

    Fields 1 to 11 are stored unchanged as strings.  The three case
    mapping fields 12 to 14 are converted from hexadecimal to int, or
    stored as None when the field is empty.  An existing entry for
    code_point is replaced.

    '''
    def parse_mapping(hex_text):
        # An empty case mapping column means “no mapping”.
        return int(hex_text, 16) if hex_text else None

    # Keys of the string valued fields, in the order of their columns
    # in UnicodeData.txt, starting with column 1.
    text_keys = (
        'name',           # Character name
        'category',       # General category
        'combining',      # Canonical combining classes
        'bidi',           # Bidirectional category
        'decomposition',  # Character decomposition mapping
        'decdigit',       # Decimal digit value
        'digit',          # Digit value
        'numeric',        # Numeric value
        'mirrored',       # mirrored
        'oldname',        # Old Unicode 1.0 name
        'comment',        # comment
    )
    entry = {key: fields[column]
             for column, key in enumerate(text_keys, start=1)}
    entry['upper'] = parse_mapping(fields[12])  # Uppercase mapping
    entry['lower'] = parse_mapping(fields[13])  # Lowercase mapping
    entry['title'] = parse_mapping(fields[14])  # Titlecase mapping
    UNICODE_ATTRIBUTES[code_point] = entry
104
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    For such a pair, every code point from the first to the last one
    (inclusive) gets an entry whose name is the text between '<' and
    the comma ('CJK Ideograph' in the example above).

    Lines with general category 'Cs' (surrogates) are skipped.

    The process is terminated with exit status 1, after a message on
    stderr, if a line does not consist of exactly 15 fields or if the
    fields of a “Last” line differ from those of the preceding
    “First” line (this includes a “Last” line without a “First” line).
    '''
    with open(filename, mode='r') as unicode_data_file:
        # Fields of a pending “First” line, empty if there is none.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                    'f': filename, 'l': line})
                # sys.exit() instead of the builtin exit(): the latter
                # is only installed by the site module and is missing
                # for example when Python is started with -S.
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                # '<CJK Ideograph, First>' -> 'CJK Ideograph'
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself, both lines of a
                # range must agree in every field.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
152
def fill_derived_core_properties(filename):
    '''Reads the DerivedCoreProperties.txt file and adds its contents
    to the DERIVED_CORE_PROPERTIES dictionary.

    A line of DerivedCoreProperties.txt describes either a range of
    code points:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or one single code point:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    The property name is appended to the list of every code point
    covered by the line.  Lines of any other shape are skipped.

    '''
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z_]+)')
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            parsed = line_pattern.match(line)
            if parsed is None:
                continue
            first_text = parsed.group('codepoint1')
            # A line without “..” describes a range of length one.
            last_text = parsed.group('codepoint2') or first_text
            property_name = parsed.group('property')
            for code_point in range(int(first_text, 16),
                                    int(last_text, 16) + 1):
                DERIVED_CORE_PROPERTIES.setdefault(
                    code_point, []).append(property_name)
186
def fill_east_asian_widths(filename):
    '''Reads the EastAsianWidth.txt file and adds its contents
    to the EAST_ASIAN_WIDTHS dictionary.

    A line of that file describes either a range of code points:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or one single code point:

    A015;W           # Lm         YI SYLLABLE WU

    Every code point covered by the line is mapped to the width
    property of the line, replacing an earlier value if there is one.
    Lines of any other shape are skipped.
    '''
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            parsed = line_pattern.match(line)
            if parsed is None:
                continue
            first_text = parsed.group('codepoint1')
            # A line without “..” describes a range of length one.
            last_text = parsed.group('codepoint2') or first_text
            for code_point in range(int(first_text, 16),
                                    int(last_text, 16) + 1):
                EAST_ASIAN_WIDTHS[code_point] = parsed.group('property')
215
def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point

    The code point itself is returned if its UNICODE_ATTRIBUTES entry
    has an empty name or no uppercase mapping.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not attributes['name']:
        return code_point
    return attributes['upper'] or code_point
224
def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point

    The code point itself is returned if its UNICODE_ATTRIBUTES entry
    has an empty name or no lowercase mapping.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not attributes['name']:
        return code_point
    return attributes['lower'] or code_point
233
def to_upper_turkish(code_point):
    '''Returns the code point of the Turkish uppercase version
    of the given code point

    U+0069 is mapped to U+0130; every other code point is mapped
    like to_upper() does.
    '''
    return 0x0130 if code_point == 0x0069 else to_upper(code_point)
240
def to_lower_turkish(code_point):
    '''Returns the code point of the Turkish lowercase version
    of the given code point

    U+0049 is mapped to U+0131; every other code point is mapped
    like to_lower() does.
    '''
    return 0x0131 if code_point == 0x0049 else to_lower(code_point)
247
def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point

    The code point itself is returned if its UNICODE_ATTRIBUTES entry
    has an empty name or no titlecase mapping.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not attributes['name']:
        return code_point
    return attributes['title'] or code_point
256
def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase

    True if the character has a lowercase mapping different from
    itself or is listed as “Uppercase” in DERIVED_CORE_PROPERTIES.
    '''
    if to_lower(code_point) != code_point:
        return True
    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
262
def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase

    True if the character has an uppercase mapping different from
    itself, is U+00DF, or is listed as “Lowercase” in
    DERIVED_CORE_PROPERTIES.
    '''
    if to_upper(code_point) != code_point:
        return True
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if code_point == 0x00DF:
        return True
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A730 “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
274
def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic

    True if the character is listed as “Alphabetic” in
    DERIVED_CORE_PROPERTIES, or if it is a decimal digit (category
    'Nd') outside of the ASCII digits U+0030..U+0039.
    '''
    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, ()):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category “digit”,
    # but we want iswalnum to return true on them.
    decimal_digit = UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
    return decimal_digit and not 0x0030 <= code_point <= 0x0039
286
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit

    Only the ten ASCII digits U+0030..U+0039 are digits; the result
    is always a bool.
    '''
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    #
    # For that reason the general category 'Nd' is deliberately not
    # used here.  (If it ever were: U+0BE7..U+0BEF and U+1369..U+1371
    # are digit systems without a zero, <0> would have to be added in
    # front of them by hand.)
    return 0x0030 <= code_point <= 0x0039
303
def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit

    Only the ASCII digits U+0030..U+0039 are outdigits.
    '''
    return 0x0030 <= code_point <= 0x0039
307
def is_blank(code_point):
    '''Checks whether the character with this code point is blank

    The result is meant to be used as a truth value: for a character
    with an empty name, that empty name is what gets returned.
    '''
    if code_point == 0x0009: # '\t'
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Category Zs without mention of '<noBreak>'
    return (attributes['name']
            and attributes['category'] == 'Zs'
            and '<noBreak>' not in attributes['decomposition'])
316
def is_space(code_point):
    '''Checks whether the character with this code point is a space

    The result is meant to be used as a truth value: for a character
    with an empty name, that empty name is what gets returned.
    '''
    # ' ', '\f', '\n', '\r', '\t', '\v'
    if code_point in (0x0020, 0x000C, 0x000A, 0x000D, 0x0009, 0x000B):
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Categories Zl, Zp, and Zs without mention of "<noBreak>".
    #
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    return (attributes['name']
            and (attributes['category'] in ['Zl', 'Zp']
                 or (attributes['category'] in ['Zs']
                     and '<noBreak>' not in attributes['decomposition'])))
336
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character

    A character counts as control character if its name is
    '<control>' or its category is 'Zl' or 'Zp'.  The result is meant
    to be used as a truth value: for a character with an empty name,
    that empty name is what gets returned.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    return name == '<control>' or attributes['category'] in ['Zl', 'Zp']
344
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit

    Only the ASCII characters 0-9, A-F and a-f are hexadecimal
    digits; the result is always a bool.
    '''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return (0x0030 <= code_point <= 0x0039       # 0-9
            or 0x0041 <= code_point <= 0x0046    # A-F
            or 0x0061 <= code_point <= 0x0066)   # a-f
365
def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character

    A character is graphical if it has a name other than '<control>'
    and is not a space.  The result is meant to be used as a truth
    value: for a character with an empty name, that empty name is
    what gets returned.
    '''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    if not name:
        return name
    if name == '<control>':
        return False
    return not is_space(code_point)
372
def is_print(code_point):
    '''Checks whether the character with this code point is printable

    A character is printable if it has a name other than '<control>'
    and its category is neither 'Zl' nor 'Zp'.  The result is meant
    to be used as a truth value: for a character with an empty name,
    that empty name is what gets returned.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    if name == '<control>':
        return False
    return attributes['category'] not in ['Zl', 'Zp']
378
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation

    The result is meant to be used as a truth value; it is whatever
    is_graph() returned if that is false.
    '''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.  The Unicode general categories
    # starting with 'P' are deliberately not used.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
390
def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character

    The result is meant to be used as a truth value: for a character
    with an empty name, that empty name is what gets returned.
    '''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    return attributes['category'] in ['Mn', 'Mc', 'Me']
401
def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character

    That is a combining character whose canonical combining class is
    in the range 0..199.  If is_combining() gives a false result, that
    result is returned unchanged.
    '''
    combining = is_combining(code_point)
    if not combining:
        return combining
    combining_class = int(UNICODE_ATTRIBUTES[code_point]['combining'])
    return 0 <= combining_class < 200
408
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with at least four
    uppercase hexadecimal digits, all the others with at least eight:

    <U0041>
    <U0001F600>
    '''
    template = '<U{:04X}>' if code_point < 0x10000 else '<U{:08X}>'
    return template.format(code_point)
415
def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns the UCS symbol string for a code point range.

    The symbols of both ends of the range, as produced by
    ucs_symbol(), are joined by two dots.

    Example:

    <U0041>..<U005A>
    '''
    range_ends = (code_point_low, code_point_high)
    return '..'.join(ucs_symbol(code_point) for code_point in range_ends)
424
def verifications():
    '''Tests whether the is_* functions observe the known restrictions

    Goes through all code points in UNICODE_ATTRIBUTES in ascending
    order and writes one line to stderr for every restriction which
    is violated by a code point.  Violations are only reported;
    nothing is returned.
    '''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        # Each classification is computed once per code point;
        # only the truth value of the results is used below.
        symbol = {'sym': ucs_symbol(code_point)}
        upper_mapping = to_upper(code_point)
        lower_mapping = to_lower(code_point)
        cased = is_lower(code_point) or is_upper(code_point)
        alpha = is_alpha(code_point)
        digit = is_digit(code_point)
        xdigit = is_xdigit(code_point)
        space = is_space(code_point)
        cntrl = is_cntrl(code_point)
        graph = is_graph(code_point)
        printable = is_print(code_point)
        punct = is_punct(code_point)
        # toupper restriction: “Only characters specified for the keywords
        # lower and upper shall be specified.”
        if upper_mapping != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                    'sym': ucs_symbol(code_point),
                    'c': code_point,
                    'uc': upper_mapping})
        # tolower restriction: “Only characters specified for the keywords
        # lower and upper shall be specified.”
        if lower_mapping != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                    'sym': ucs_symbol(code_point),
                    'c': code_point,
                    'uc': lower_mapping})
        # alpha restriction: “Characters classified as either upper or lower
        # shall automatically belong to this class.”
        if cased and not alpha:
            sys.stderr.write(
                '%(sym)s is upper|lower but not alpha\n' % symbol)
        # alpha restriction: “No character specified for the keywords cntrl,
        # digit, punct or space shall be specified.”
        if alpha and cntrl:
            sys.stderr.write('%(sym)s is alpha and cntrl\n' % symbol)
        if alpha and digit:
            sys.stderr.write('%(sym)s is alpha and digit\n' % symbol)
        if alpha and punct:
            sys.stderr.write('%(sym)s is alpha and punct\n' % symbol)
        if alpha and space:
            sys.stderr.write('%(sym)s is alpha and space\n' % symbol)
        # space restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, graph or xdigit shall be specified.”
        # upper, lower, alpha already checked above.
        if space and digit:
            sys.stderr.write('%(sym)s is space and digit\n' % symbol)
        if space and graph:
            sys.stderr.write('%(sym)s is space and graph\n' % symbol)
        if space and xdigit:
            sys.stderr.write('%(sym)s is space and xdigit\n' % symbol)
        # cntrl restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, punct, graph, print or xdigit shall be
        # specified.”  upper, lower, alpha already checked above.
        if cntrl and digit:
            sys.stderr.write('%(sym)s is cntrl and digit\n' % symbol)
        if cntrl and punct:
            sys.stderr.write('%(sym)s is cntrl and punct\n' % symbol)
        if cntrl and graph:
            sys.stderr.write('%(sym)s is cntrl and graph\n' % symbol)
        if cntrl and printable:
            sys.stderr.write('%(sym)s is cntrl and print\n' % symbol)
        if cntrl and xdigit:
            sys.stderr.write('%(sym)s is cntrl and xdigit\n' % symbol)
        # punct restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
        # be specified.”  upper, lower, alpha, cntrl already checked above.
        if punct and digit:
            sys.stderr.write('%(sym)s is punct and digit\n' % symbol)
        if punct and xdigit:
            sys.stderr.write('%(sym)s is punct and xdigit\n' % symbol)
        if punct and code_point == 0x0020:
            sys.stderr.write('%(sym)s is punct\n' % symbol)
        # graph restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # print restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of “space characters”,
        # not “space character”.
        #
        # The symbol is produced by the ucs_symbol() function of this
        # module; there is no “unicode_utils” name in scope here.
        if printable and not (graph or space):
            sys.stderr.write(
                '%(sym)s is print but not graph|<space>\n' % symbol)
        if not printable and (graph or code_point == 0x0020):
            sys.stderr.write(
                '%(sym)s is graph|<space> but not print\n' % symbol)
528