#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_cjk_compat file from a UnicodeData file.
# Copyright (C) 2015-2022 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
Generate a translit_cjk_compat file from UnicodeData.txt

To see how this script is used, call it with the “-h” option:

    $ ./gen_translit_cjk_compat -h
    … prints usage message …
'''

import argparse
import time
import sys
import unicode_utils

def read_input_file(filename):
    '''Reads the original glibc translit_cjk_compat file to get the
    original head and tail.

    We want to replace only the part of the file between
    “translit_start” and “translit_end”

    Returns a tuple (head, tail) of strings: head is everything up to
    and including the first line starting with “translit_start” (the
    whole file if there is no such line), tail is everything from the
    first following line starting with “translit_end” to the end of
    the file (empty if there is no such line).
    '''
    head_lines = []
    tail_lines = []
    # Decode explicitly as UTF-8 so that reading does not depend on
    # the locale the script happens to be run in.
    with open(filename, mode='r', encoding='utf-8') as translit_file:
        # All three loops share the same file iterator, so each one
        # continues where the previous one stopped.
        for line in translit_file:
            head_lines.append(line)
            if line.startswith('translit_start'):
                break
        # Skip the old transliteration body.
        for line in translit_file:
            if line.startswith('translit_end'):
                tail_lines.append(line)
                break
        for line in translit_file:
            tail_lines.append(line)
    return (''.join(head_lines), ''.join(tail_lines))

def output_head(translit_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “translit_start” line.

    If an input file was given on the command line (global ARGS) and
    a non-empty head was read from it, that head is written unchanged;
    otherwise a default header mentioning the current date and
    unicode_version is generated.
    '''
    if ARGS.input_file and head:
        translit_file.write(head)
    else:
        translit_file.write('escape_char /\n')
        translit_file.write('comment_char %\n')
        translit_file.write(unicode_utils.COMMENT_HEADER)
        translit_file.write('\n')
        translit_file.write('% Transliterations of CJK compatibility ')
        translit_file.write('characters.\n')
        translit_file.write('% Generated automatically from UnicodeData.txt '
                            + 'by gen_translit_cjk_compat.py '
                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
                            + 'for Unicode {:s}.\n'.format(unicode_version))
        translit_file.write('\n')
        translit_file.write('LC_CTYPE\n')
        translit_file.write('\n')
        translit_file.write('translit_start\n')

def output_tail(translit_file, tail=''):
    '''Write the tail of the output file

    If an input file was given on the command line (global ARGS) and
    a non-empty tail was read from it, that tail is written unchanged;
    otherwise a default tail is generated.
    '''
    if ARGS.input_file and tail:
        translit_file.write(tail)
    else:
        translit_file.write('translit_end\n')
        translit_file.write('\n')
        translit_file.write('END LC_CTYPE\n')

def special_decompose(code_point_list):
    '''
    Decompositions which are not in UnicodeData.txt at all but which
    were used in the original translit_cjk_compat file in glibc and
    which seem to make sense.  I want to keep the update of
    translit_cjk_compat close to the spirit of the original file,
    therefore I added these special decomposition rules here.

    Returns the replacement list of code points if the whole
    code_point_list matches one of the rules, otherwise returns
    code_point_list itself.
    '''
    special_decompose_dict = {
        (0x2215,): [0x002F], # ∕ → /
        (0x00B2,): [0x005E, 0x0032], # ² → ^2
        (0x03BC,): [0x00B5], # μ → µ (GREEK SMALL LETTER MU → MICRO SIGN)
        (0x2113,): [0x006C], # ℓ → l
        (0x00B3,): [0x005E, 0x0033], # ³ → ^3
        (0x00B5,): [0x0075], # µ → u
        (0x03BC, 0x2113): [0x03BC, 0x006C], # μℓ → μl
        (0x0072, 0x0061, 0x0064, 0x2215, 0x0073, 0x00B2): [
            0x0072, 0x0061, 0x0064, 0x002F, 0x0073, 0x00B2],
        (0x006D, 0x2215, 0x0073, 0x00B2): [0x006D, 0x002F, 0x0073, 0x00B2],
    }
    return special_decompose_dict.get(tuple(code_point_list),
                                      code_point_list)

def output_transliteration(translit_file):
    '''Write the new transliteration to the output file

    Two groups of entries are written, each in code point order:

    1. Characters whose decomposition starts with “<square>”.  The
       decomposition from UnicodeData.txt is the first alternative;
       further alternatives are produced by applying
       special_decompose() repeatedly until nothing changes any more.
       Alternatives are separated by “;” and alternatives consisting
       of more than one code point are enclosed in double quotes.

    2. Characters whose name starts with “CJK COMPATIBILITY IDEOGRAPH”
       and which have a decomposition.  The decomposition must be
       exactly one code point, otherwise an error is reported on
       stderr and the script exits with status 1.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
            code_point]['decomposition']
        if decomposition.startswith('<square>'):
            # Drop the 8 character “<square>” tag and the single
            # character following it (presumably the separating space).
            decomposition = decomposition[9:]
            decomposed_code_points = [[int(x, 16)
                                       for x in decomposition.split(' ')]]
            if decomposed_code_points[0]:
                while True:
                    # First try a rule matching the whole sequence ...
                    special_decomposed_code_points = special_decompose(
                        decomposed_code_points[-1])
                    if (special_decomposed_code_points
                            != decomposed_code_points[-1]):
                        decomposed_code_points.append(
                            special_decomposed_code_points)
                        continue
                    # ... then try the rules on each code point
                    # individually.
                    special_decomposed_code_points = []
                    for decomposed_code_point in decomposed_code_points[-1]:
                        special_decomposed_code_points += special_decompose(
                            [decomposed_code_point])
                    # Stop as soon as neither step changed anything.
                    if (special_decomposed_code_points
                            == decomposed_code_points[-1]):
                        break
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                for index, alternative in enumerate(decomposed_code_points):
                    if index > 0:
                        translit_file.write(';')
                    if len(alternative) > 1:
                        translit_file.write('"')
                    for decomposed_code_point in alternative:
                        translit_file.write('{:s}'.format(
                            unicode_utils.ucs_symbol(decomposed_code_point)))
                    if len(alternative) > 1:
                        translit_file.write('"')
                translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
            code_point]['decomposition']
        if decomposition and name.startswith('CJK COMPATIBILITY IDEOGRAPH'):
            decomposed_code_points = [int(x, 16)
                                      for x in decomposition.split(' ')]
            if len(decomposed_code_points) != 1:
                sys.stderr.write(
                    'Unexpected decomposition length {:x} {:s} {:s}\n'.format(
                        code_point, name, decomposition))
                # sys.exit() is always available, whereas the bare
                # exit() builtin is only injected by the site module.
                sys.exit(1)
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for decomposed_code_point in decomposed_code_points:
                translit_file.write('{:s}'.format(
                    unicode_utils.ucs_symbol(decomposed_code_point)))
            translit_file.write('\n')
    translit_file.write('\n')

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_cjk_compat file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help=''' The original glibc/localedata/locales/translit_cjk_compat
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_cjk_compat.new',
        help='''The new translit_cjk_compat file, default: %(default)s.  If the
        original glibc/localedata/locales/translit_cjk_compat file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    # Write explicitly as UTF-8: the head and tail copied from the
    # input file may contain non-ASCII text, and the result must not
    # depend on the locale the script is run in.
    with open(ARGS.output_file, mode='w', encoding='utf-8') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)