#!/usr/bin/python3
#
# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
# Copyright (C) 2014-2022 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
DerivedCoreProperties.txt files.

To see how this script is used, call it with the “-h” option:

    $ ./gen_unicode_ctype.py -h
    … prints usage message …
'''

import argparse
import time
import re
import unicode_utils

def code_point_ranges(is_class_function):
    '''Returns a list of ranges of code points for which is_class_function
    returns True.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    cp_ranges = []
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        if is_class_function(code_point):
            if (cp_ranges
                and cp_ranges[-1][-1] == code_point - 1):
                if len(cp_ranges[-1]) == 1:
                    cp_ranges[-1].append(code_point)
                else:
                    cp_ranges[-1][-1] = code_point
            else:
                cp_ranges.append([code_point])
    return cp_ranges
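# A worked example of the grouping above (an illustration only, not part of
# the generator): with a predicate that accepts exactly the code points 65,
# 66, 67, 192 and 256, all of which are present once
# unicode_utils.fill_attributes() has run, code_point_ranges() returns
# [[65, 67], [192], [256]]; consecutive code points collapse into two-element
# [start, end] ranges, while isolated code points stay as one-element lists.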
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output an LC_CTYPE character class section.

    Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
    …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    if cp_ranges:
        i18n_file.write('%s /\n' %class_name)
        max_column = 75
        prefix = '   '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            if line.strip():
                line += ';'
            if len(code_point_range) == 1:
                range_string = unicode_utils.ucs_symbol(code_point_range[0])
            else:
                range_string = unicode_utils.ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            # Wrap lines at max_column characters; a trailing '/' (the
            # locale file's escape_char) continues the entry on the next line.
            if len(line+range_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += range_string
        if line.strip():
            i18n_file.write(line+'\n')
        i18n_file.write('\n')

def output_charmap(i18n_file, map_name, map_function):
    '''Output an LC_CTYPE character map section.

    Example:

    toupper /
      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
    …
      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    max_column = 75
    prefix = '  '
    line = prefix
    map_string = ''
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        # Only code points that actually change under the mapping are listed.
        if code_point != mapped:
            if line.strip():
                line += ';'
            map_string = '(' \
                         + unicode_utils.ucs_symbol(code_point) \
                         + ',' \
                         + unicode_utils.ucs_symbol(mapped) \
                         + ')'
            if len(line+map_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += map_string
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')

def read_input_file(filename):
    '''Reads the original glibc i18n file to get the original head
    and tail.

    We want to replace only the character classes in LC_CTYPE and the
    date stamp. All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.
    '''
    head = tail = ''
    with open(filename, mode='r') as i18n_file:
        for line in i18n_file:
            # Refresh the date stamp in LC_IDENTIFICATION while copying
            # the head of the original file.
            match = re.match(
                r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")',
                line)
            if match:
                line = match.group('key') \
                       + '"{:s}"\n'.format(time.strftime('%Y-%m-%d'))
            head = head + line
            if line.startswith('LC_CTYPE'):
                break
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail = line
                break
        for line in i18n_file:
            tail = tail + line
    return (head, tail)
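# A sketch of the (head, tail) split that read_input_file() produces for a
# typical glibc/localedata/locales/i18n file (the exact contents depend on
# the input file, so this is only an outline):
#
#   head: everything from the top of the file up to and including the
#         "LC_CTYPE" line, with the "date" line in LC_IDENTIFICATION
#         refreshed to today's date.
#   tail: everything from the "translit_start" line to the end of the file.
#
# output_tables() below writes the freshly generated character classes and
# maps between these two parts.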
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.
    '''
    if ARGS.input_file and head:
        i18n_file.write(head)
    else:
        i18n_file.write('escape_char /\n')
        i18n_file.write('comment_char %\n')
        i18n_file.write('\n')
        i18n_file.write('% Generated automatically by '
                        + 'gen_unicode_ctype.py '
                        + 'for Unicode {:s}.\n'.format(unicode_version))
        i18n_file.write('\n')
        i18n_file.write('LC_IDENTIFICATION\n')
        i18n_file.write('title "Unicode {:s} FDCC-set"\n'.format(
            unicode_version))
        i18n_file.write('source "UnicodeData.txt, '
                        + 'DerivedCoreProperties.txt"\n')
        i18n_file.write('address ""\n')
        i18n_file.write('contact ""\n')
        i18n_file.write('email "bug-glibc-locales@gnu.org"\n')
        i18n_file.write('tel ""\n')
        i18n_file.write('fax ""\n')
        i18n_file.write('language ""\n')
        i18n_file.write('territory "Earth"\n')
        i18n_file.write('revision "{:s}"\n'.format(unicode_version))
        i18n_file.write('date "{:s}"\n'.format(
            time.strftime('%Y-%m-%d')))
        i18n_file.write('category "i18n:2012";LC_CTYPE\n')
        i18n_file.write('END LC_IDENTIFICATION\n')
        i18n_file.write('\n')
        i18n_file.write('LC_CTYPE\n')

def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.
    '''
    if ARGS.input_file and tail:
        i18n_file.write(tail)
    else:
        i18n_file.write('END LC_CTYPE\n')

def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes to the output file.'''
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set '
                    + 'reflects\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file from defining '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', unicode_utils.is_space)
    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
    output_charclass(i18n_file, 'print', unicode_utils.is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
    if turkish:
        i18n_file.write('% The case conversions reflect '
                        + 'Turkish conventions.\n')
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
    else:
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";',
                     unicode_utils.is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file, 'class "combining_level3";',
                     unicode_utils.is_combining_level3)
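# For orientation, a summary of what output_tables() writes, mirroring the
# calls above without adding any behaviour: the character classes upper,
# lower, alpha, digit, space, cntrl, punct, graph, print, xdigit and blank,
# the case maps toupper, tolower and map "totitle";, and finally
# class "combining"; and class "combining_level3";. The outdigit entry is
# deliberately emitted only as a comment so that locales copying the
# generated file can define their own outdigit values.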
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a Unicode conforming LC_CTYPE category from
        UnicodeData.txt and DerivedCoreProperties.txt files.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s. If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    PARSER.add_argument(
        '--turkish',
        action='store_true',
        help='Use Turkish case conversions.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(
        ARGS.unicode_data_file)
    unicode_utils.fill_derived_core_properties(
        ARGS.derived_core_properties_file)
    unicode_utils.verifications()
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
        output_tail(I18N_FILE, tail=TAIL)
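# Example invocation; the data file names match the argparse defaults above,
# while "i18n" stands in for a local copy of glibc/localedata/locales/i18n
# and the Unicode version is only a placeholder, so adjust both to the
# release actually being imported:
#
#   ./gen_unicode_ctype.py --unicode_version 14.0.0 \
#       -u UnicodeData.txt -d DerivedCoreProperties.txt \
#       -i i18n -o i18n.new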