1#!/usr/bin/python3
2#
3# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
4# Copyright (C) 2014-2022 Free Software Foundation, Inc.
5# This file is part of the GNU C Library.
6#
7# The GNU C Library is free software; you can redistribute it and/or
8# modify it under the terms of the GNU Lesser General Public
9# License as published by the Free Software Foundation; either
10# version 2.1 of the License, or (at your option) any later version.
11#
12# The GNU C Library is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15# Lesser General Public License for more details.
16#
17# You should have received a copy of the GNU Lesser General Public
18# License along with the GNU C Library; if not, see
19# <https://www.gnu.org/licenses/>.
20
21'''
22Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
23DerivedCoreProperties.txt files.
24
25To see how this script is used, call it with the “-h” option:
26
27    $ ./gen_unicode_ctype.py -h
28    … prints usage message …
29'''
30
31import argparse
32import time
33import re
34import unicode_utils
35
def code_point_ranges(is_class_function, code_points=None):
    '''Returns a list of ranges of code points for which is_class_function
    returns True.

    is_class_function is called with a single integer code point and
    its result is tested for truth.

    code_points is an optional iterable of integer code points to
    examine.  If it is omitted (None), the code points found in
    unicode_utils.UNICODE_ATTRIBUTES are examined.  The order of the
    code points does not matter and duplicates are ignored.

    Every element of the returned list is itself a list which contains
    either one code point, or the first and the last code point of a
    run of directly consecutive code points.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    if code_points is None:
        code_points = unicode_utils.UNICODE_ATTRIBUTES
    cp_ranges = []
    for code_point in sorted(set(code_points)):
        if not is_class_function(code_point):
            continue
        if cp_ranges and cp_ranges[-1][-1] == code_point - 1:
            # Grow the current run.  The slice assignment turns [first]
            # into [first, code_point] and [first, last] into
            # [first, code_point].
            cp_ranges[-1][1:] = [code_point]
        else:
            cp_ranges.append([code_point])
    return cp_ranges
56
def output_charclass(i18n_file, class_name, is_class_function):
    '''Write one LC_CTYPE character class section to i18n_file.

    The section starts with class_name followed by “ /”.  After that
    come the code points and code point ranges returned by
    code_point_ranges(is_class_function), separated by “;”, and finally
    an empty line.  Whenever appending the next item would make the
    current line longer than 75 characters, the line is written with a
    trailing “/” and a new line is started.

    Nothing at all is written if no code point belongs to the class.

    Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
       …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    ranges = code_point_ranges(is_class_function)
    if not ranges:
        return
    column_limit = 75
    indent = '   '
    i18n_file.write('%s /\n' % class_name)
    current = indent
    for cp_range in ranges:
        # Separate from the previous item, unless the line is still empty.
        if current.strip():
            current += ';'
        if len(cp_range) == 1:
            symbol = unicode_utils.ucs_symbol(cp_range[0])
        else:
            symbol = unicode_utils.ucs_symbol_range(cp_range[0], cp_range[-1])
        if len(current) + len(symbol) > column_limit:
            # The item does not fit any more: flush and continue the
            # list on a fresh line.
            i18n_file.write(current + '/\n')
            current = indent
        current += symbol
    if current.strip():
        i18n_file.write(current + '\n')
    i18n_file.write('\n')
90
def output_charmap(i18n_file, map_name, map_function):
    '''Write one LC_CTYPE character map section to i18n_file.

    The section starts with map_name followed by “ /”.  Every code
    point in unicode_utils.UNICODE_ATTRIBUTES, in ascending order, is
    passed to map_function, and each code point which is mapped to
    something different from itself is listed as a
    “(code point,mapped code point)” pair.  The pairs are separated by
    “;”.  Whenever appending the next pair would make the current line
    longer than 75 characters, the line is written with a trailing “/”
    and a new line is started.  The section ends with an empty line.

    The line with map_name is written even if there are no pairs.

    Example:

    toupper /
      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
      …
      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    column_limit = 75
    indent = '   '
    i18n_file.write('%s /\n' % map_name)
    current = indent
    for source in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        target = map_function(source)
        if target == source:
            # Code points which map to themselves are not listed.
            continue
        # Separate from the previous pair, unless the line is still empty.
        if current.strip():
            current += ';'
        pair = '({:s},{:s})'.format(
            unicode_utils.ucs_symbol(source),
            unicode_utils.ucs_symbol(target))
        if len(current) + len(pair) > column_limit:
            i18n_file.write(current + '/\n')
            current = indent
        current += pair
    if current.strip():
        i18n_file.write(current + '\n')
    i18n_file.write('\n')
124
def read_input_file(filename):
    '''Reads the original glibc i18n file and returns its head and tail
    as a tuple (head, tail) of two strings.

    Only the character classes in LC_CTYPE and the date stamp are to
    be replaced; everything else in the i18n file should stay as it
    is.  Reading the original file here makes it possible to write a
    complete result file, so that the generated data does not have to
    be pasted into the original file by hand.

    head contains all lines up to and including the first line which
    starts with “LC_CTYPE”.  A line in this part which starts with
    “date”, white space, and a quoted YYYY-MM-DD date is replaced by
    the same key followed by the current date.

    tail contains the first later line which starts with
    “translit_start” and all lines after it.  It is empty if there is
    no such line.  The lines between head and tail are dropped.
    '''
    date_stamp = re.compile(
        r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")')
    head_lines = []
    tail_lines = []
    with open(filename, mode='r') as i18n_file:
        for line in i18n_file:
            found = date_stamp.match(line)
            if found:
                line = '{:s}"{:s}"\n'.format(
                    found.group('key'), time.strftime('%Y-%m-%d'))
            head_lines.append(line)
            if line.startswith('LC_CTYPE'):
                break
        # The file object remembers its position, therefore this loop
        # continues right after the “LC_CTYPE” line.
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail_lines.append(line)
                # Everything which is left in the file belongs to the tail.
                tail_lines.extend(i18n_file)
                break
    return (''.join(head_lines), ''.join(tail_lines))
154
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    If head is not empty, it is written unchanged and unicode_version
    is not used.  Otherwise a default header is generated: the
    escape_char and comment_char lines, a comment naming
    unicode_version, an LC_IDENTIFICATION section which carries
    unicode_version and the current date, and the “LC_CTYPE” line.

    unicode_version must be a string.

    The choice depends only on head and not on the command line
    arguments, so the function also works when this module is imported
    and no command line has been parsed.
    '''
    if head:
        i18n_file.write(head)
        return
    default_head = (
        'escape_char /\n'
        'comment_char %\n'
        '\n'
        '% Generated automatically by gen_unicode_ctype.py '
        'for Unicode {version:s}.\n'
        '\n'
        'LC_IDENTIFICATION\n'
        'title     "Unicode {version:s} FDCC-set"\n'
        'source    "UnicodeData.txt, DerivedCoreProperties.txt"\n'
        'address   ""\n'
        'contact   ""\n'
        'email     "bug-glibc-locales@gnu.org"\n'
        'tel       ""\n'
        'fax       ""\n'
        'language  ""\n'
        'territory "Earth"\n'
        'revision  "{version:s}"\n'
        'date      "{date:s}"\n'
        'category  "i18n:2012";LC_CTYPE\n'
        'END LC_IDENTIFICATION\n'
        '\n'
        'LC_CTYPE\n')
    i18n_file.write(default_head.format(
        version=unicode_version, date=time.strftime('%Y-%m-%d')))
188
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    If tail is not empty, it is written unchanged.  Otherwise only the
    line “END LC_CTYPE” is written.

    The choice depends only on tail and not on the command line
    arguments, so the function also works when this module is imported
    and no command line has been parsed.
    '''
    i18n_file.write(tail if tail else 'END LC_CTYPE\n')
197
def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes and maps to i18n_file.

    The output consists of explanatory “%” comment lines (one of them
    names unicode_version), the character classes written by
    output_charclass() and the character maps written by
    output_charmap().

    If turkish is true, the “toupper” and “tolower” maps are generated
    with the Turkish variants of the case conversion functions, and an
    additional comment line says so.
    '''
    write = i18n_file.write
    write('% The following is the 14652 i18n fdcc-set LC_CTYPE category.\n')
    write('% It covers Unicode version {:s}.\n'.format(unicode_version))
    write('% The character classes and mapping tables were automatically\n')
    write('% generated using the gen_unicode_ctype.py program.\n\n')

    write('% The "upper" class reflects the uppercase characters '
          'of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    write('% The "lower" class reflects the lowercase characters '
          'of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    write('% The "alpha" class of the "i18n" FDCC-set is reflecting\n')
    write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    write('% The "digit" class must only contain the BASIC LATIN digits, '
          'says ISO C 99\n')
    write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)

    # No “outdigit” class is generated.  It only appears in the
    # comment lines below, which also give the reason.
    write('% The "outdigit" information is by default "0" to "9".  '
          'We don\'t have to\n')
    write('% provide it here since localedef will fill in the bits '
          'and it would\n')
    write('% prevent locales copying this file define their own values.\n')
    write('% outdigit /\n')
    write('%    <U0030>..<U0039>\n\n')

    for class_name, is_member in (('space', unicode_utils.is_space),
                                  ('cntrl', unicode_utils.is_cntrl),
                                  ('punct', unicode_utils.is_punct),
                                  ('graph', unicode_utils.is_graph),
                                  ('print', unicode_utils.is_print)):
        output_charclass(i18n_file, class_name, is_member)
    write('% The "xdigit" class must only contain the BASIC LATIN digits '
          'and A-F, a-f,\n')
    write('% says ISO C 99 (sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)

    if turkish:
        write('% The case conversions reflect Turkish conventions.\n')
        upper_map = unicode_utils.to_upper_turkish
        lower_map = unicode_utils.to_lower_turkish
    else:
        upper_map = unicode_utils.to_upper
        lower_map = unicode_utils.to_lower
    output_charmap(i18n_file, 'toupper', upper_map)
    output_charmap(i18n_file, 'tolower', lower_map)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)

    write('% The "combining" class reflects ISO/IEC 10646-1 annex B.1\n')
    write('% That is, all combining characters (level 2+3).\n')
    output_charclass(
        i18n_file, 'class "combining";', unicode_utils.is_combining)
    write('% The "combining_level3" class reflects ISO/IEC 10646-1 '
          'annex B.2\n')
    write('% That is, combining characters of level 3.\n')
    output_charclass(
        i18n_file, 'class "combining_level3";',
        unicode_utils.is_combining_level3)
261
if __name__ == "__main__":
    # Describe the command line.  All options which take a value are
    # declared with nargs='?' and without a “const”, so such an option
    # given without a value is stored as None.
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a Unicode conforming LC_CTYPE category from
        UnicodeData.txt and DerivedCoreProperties.txt files.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        type=str, nargs='?', default='UnicodeData.txt',
        help='The UnicodeData.txt file to read, default: %(default)s')
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        type=str, nargs='?', default='DerivedCoreProperties.txt',
        help='The DerivedCoreProperties.txt file to read, '
             'default: %(default)s')
    PARSER.add_argument(
        '-i', '--input_file',
        type=str, nargs='?',
        help='The original glibc/localedata/locales/i18n file.')
    PARSER.add_argument(
        '-o', '--output_file',
        type=str, nargs='?', default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s.  If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        type=str, nargs='?', required=True,
        help='The Unicode version of the input files used.')
    PARSER.add_argument(
        '--turkish',
        action='store_true',
        help='Use Turkish case conversions.')
    ARGS = PARSER.parse_args()

    # Hand the two data files to unicode_utils and run its
    # verifications before the output file is opened.
    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    unicode_utils.fill_derived_core_properties(
        ARGS.derived_core_properties_file)
    unicode_utils.verifications()

    # Head and tail stay empty unless an original i18n file was given;
    # output_head() and output_tail() then write their defaults.
    if ARGS.input_file:
        HEAD, TAIL = read_input_file(ARGS.input_file)
    else:
        HEAD = TAIL = ''
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
        output_tail(I18N_FILE, tail=TAIL)
324