#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_compat file from a UnicodeData file.
# Copyright (C) 2015-2022 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
Generate a translit_compat file from UnicodeData.txt

To see how this script is used, call it with the “-h” option:

    $ ./gen_translit_compat -h
    … prints usage message …
'''

import argparse
import time
import unicode_utils

def read_input_file(filename):
    '''Reads the original glibc translit_compat file to get the
    original head and tail.

    We want to replace only the part of the file between
    “translit_start” and “translit_end”

    :param filename: path of the original translit_compat file
    :return: tuple (head, tail) — head includes the
             “translit_start” line, tail starts at “translit_end”
    '''
    head = tail = ''
    with open(filename, mode='r') as translit_file:
        # Everything up to and including the “translit_start” line
        # belongs to the head.
        for line in translit_file:
            head = head + line
            if line.startswith('translit_start'):
                break
        # Skip the generated middle part; the tail starts at the
        # “translit_end” line.
        for line in translit_file:
            if line.startswith('translit_end'):
                tail = line
                break
        for line in translit_file:
            tail = tail + line
    return (head, tail)

def output_head(translit_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “translit_start” line.

    If an original file was given on the command line, its head is
    copied verbatim; otherwise a default header is generated.
    '''
    if ARGS.input_file and head:
        translit_file.write(head)
    else:
        translit_file.write('escape_char /\n')
        translit_file.write('comment_char %\n')
        translit_file.write(unicode_utils.COMMENT_HEADER)
        translit_file.write('\n')
        translit_file.write('% Transliterations of compatibility characters ')
        translit_file.write('and ligatures.\n')
        translit_file.write('% Generated automatically from UnicodeData.txt '
                            + 'by gen_translit_compat.py '
                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
                            + 'for Unicode {:s}.\n'.format(unicode_version))
        translit_file.write('\n')
        translit_file.write('LC_CTYPE\n')
        translit_file.write('\n')
        translit_file.write('translit_start\n')

def output_tail(translit_file, tail=''):
    '''Write the tail of the output file

    If an original file was given on the command line, its tail is
    copied verbatim; otherwise a default tail is generated.
    '''
    if ARGS.input_file and tail:
        translit_file.write(tail)
    else:
        translit_file.write('translit_end\n')
        translit_file.write('\n')
        translit_file.write('END LC_CTYPE\n')

def compatibility_decompose(code_point):
    '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

    “The compatibility decomposition is formed by recursively applying
    the canonical and compatibility mappings, then applying the
    Canonical Ordering Algorithm.”

    We don’t do the canonical decomposition here because this is
    done in gen_translit_combining.py to generate translit_combining.

    And we ignore some of the possible compatibility formatting tags
    here. Some of them are used in other translit_* files, not
    translit_compat:

    <font>:   translit_font
    <circle>: translit_circle
    <wide>:   translit_wide
    <narrow>: translit_narrow
    <square>: translit_cjk_compat
    <fraction>: translit_fraction

    And we ignore

    <noBreak>, <initial>, <medial>, <final>, <isolated>

    because they seem to be not useful for transliteration.

    :return: list of decomposed code points, or [] if the code point
             has no compatibility decomposition handled here
    '''
    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
        code_point]['decomposition']
    compatibility_tags = (
        '<compat>', '<super>', '<sub>', '<vertical>')
    for compatibility_tag in compatibility_tags:
        if decomposition.startswith(compatibility_tag):
            # Strip the tag and the space following it.
            decomposition = decomposition[len(compatibility_tag)+1:]
            decomposed_code_points = [int(x, 16)
                                      for x in decomposition.split(' ')]
            if (len(decomposed_code_points) > 1
                    and decomposed_code_points[0] == 0x0020
                    and decomposed_code_points[1] >= 0x0300
                    and decomposed_code_points[1] <= 0x03FF):
                # Decomposes into a space followed by a combining character.
                # This is not useful for transliteration.
                return []
            # Recursively decompose each of the resulting code points;
            # code points without a further decomposition are kept as is.
            return_value = []
            for decomposed_code_point in decomposed_code_points:
                cd_code_points = compatibility_decompose(
                    decomposed_code_point)
                if cd_code_points:
                    return_value += cd_code_points
                else:
                    return_value.append(decomposed_code_point)
            return return_value
    return []

def special_decompose(code_point_list):
    '''
    Decompositions which are not in UnicodeData.txt at all but which
    were used in the original translit_compat file in glibc and
    which seem to make sense.  I want to keep the update of
    translit_compat close to the spirit of the original file,
    therefore I added this special decomposition rules here.

    :return: the special decomposition if one exists, otherwise the
             input list unchanged
    '''
    special_decompose_dict = {
        (0x03BC,): [0x0075], # μ → u
        (0x02BC,): [0x0027], # ʼ → '
    }
    return special_decompose_dict.get(
        tuple(code_point_list), code_point_list)

def special_ligature_decompose(code_point):
    '''
    Decompositions for ligatures which are not in UnicodeData.txt at
    all but which were used in the original translit_compat file in
    glibc and which seem to make sense.  I want to keep the update of
    translit_compat close to the spirit of the original file,
    therefore I added these special ligature decomposition rules here.

    :return: list of decomposed code points, or [code_point] if no
             special ligature decomposition exists
    '''
    special_ligature_decompose_dict = {
        0x00E6: [0x0061, 0x0065], # æ → ae
        0x00C6: [0x0041, 0x0045], # Æ → AE
        # These following 5 special ligature decompositions were
        # in the original glibc/localedata/locales/translit_compat file
        0x0152: [0x004F, 0x0045], # Œ → OE
        0x0153: [0x006F, 0x0065], # œ → oe
        0x05F0: [0x05D5, 0x05D5], # װ → וו
        0x05F1: [0x05D5, 0x05D9], # ױ → וי
        0x05F2: [0x05D9, 0x05D9], # ײ → יי
        # The following special ligature decompositions were
        # not in the original glibc/localedata/locales/translit_compat file
        # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
        # → U+041D CYRILLIC CAPITAL LETTER EN,
        #   U+0413 CYRILLIC CAPITAL LETTER GHE
        0x04A4: [0x041D, 0x0413], # Ҥ → НГ
        # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
        # → U+043D CYRILLIC SMALL LETTER EN,
        #   U+0433 CYRILLIC SMALL LETTER GHE
        0x04A5: [0x043D, 0x0433], # ҥ → нг
        # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
        # → U+0422 CYRILLIC CAPITAL LETTER TE,
        #   U+0426 CYRILLIC CAPITAL LETTER TSE
        0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
        # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
        # → U+0442 CYRILLIC SMALL LETTER TE,
        #   U+0446 CYRILLIC SMALL LETTER TSE
        0x04B5: [0x0442, 0x0446], # ҵ → тц
        # U+04d4 CYRILLIC CAPITAL LIGATURE A IE
        # → U+0410 CYRILLIC CAPITAL LETTER A
        #   U+0415;CYRILLIC CAPITAL LETTER IE
        0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
        # U+04D5 CYRILLIC SMALL LIGATURE A IE
        # → U+0430 CYRILLIC SMALL LETTER A,
        #   U+0435 CYRILLIC SMALL LETTER IE
        0x04D5: [0x0430, 0x0435], # ӕ → ае
        # I am not sure what to do with the following ligatures
        # maybe it makes no sense to decompose them:
        # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
        # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
        # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
        # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
        # U+fe20 COMBINING LIGATURE LEFT HALF
        # U+fe21 COMBINING LIGATURE RIGHT HALF
        # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
        # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
        # U+11176 MAHAJANI LIGATURE SHRI
        # U+1f670 SCRIPT LIGATURE ET ORNAMENT
        # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
        # U+1f672 LIGATURE OPEN ET ORNAMENT
        # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
    }
    if code_point in special_ligature_decompose_dict:
        return special_ligature_decompose_dict[code_point]
    else:
        return [code_point]

def output_transliteration(translit_file):
    '''Write the new transliteration to the output file'''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposed_code_points = [compatibility_decompose(code_point)]
        if not decomposed_code_points[0]:
            if special_decompose([code_point]) != [code_point]:
                decomposed_code_points[0] = special_decompose([code_point])
        else:
            # Keep applying the special decompositions until a fixed
            # point is reached; each intermediate result is appended so
            # that all variants are written as alternatives.
            special_decomposed_code_points = []
            while True:
                special_decomposed_code_points = special_decompose(
                    decomposed_code_points[-1])
                if (special_decomposed_code_points
                        != decomposed_code_points[-1]):
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                    continue
                special_decomposed_code_points = []
                for decomposed_code_point in decomposed_code_points[-1]:
                    special_decomposed_code_points += special_decompose(
                        [decomposed_code_point])
                if (special_decomposed_code_points
                        == decomposed_code_points[-1]):
                    break
                decomposed_code_points.append(
                    special_decomposed_code_points)
        if decomposed_code_points[0]:
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            # Write all alternative decompositions, separated by ‘;’.
            for index, code_points in enumerate(decomposed_code_points):
                if index > 0:
                    translit_file.write(';')
                translit_file.write('"')
                for decomposed_code_point in code_points:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
            translit_file.write('\n')
        elif 'LIGATURE' in name and 'ARABIC' not in name:
            decomposed_code_points = special_ligature_decompose(code_point)
            if decomposed_code_points[0] != code_point:
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
                translit_file.write('\n')
            else:
                print('Warning: unhandled ligature: {:x} {:s}'.format(
                    code_point, name))
    translit_file.write('\n')

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_compat file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help=''' The original glibc/localedata/locales/translit_compat
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_compat.new',
        help='''The new translit_compat file, default: %(default)s.  If the
        original glibc/localedata/locales/translit_compat file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)