#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_fraction file from a UnicodeData file.
# Copyright (C) 2015-2022 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
Generate a translit_fraction file from UnicodeData.txt

To see how this script is used, call it with the “-h” option:

    $ ./gen_translit_fraction -h
    … prints usage message …
'''

import argparse
import time
import unicode_utils

# Compatibility tag that marks a fraction decomposition in the
# decomposition field of UnicodeData.txt.
FRACTION_TAG = '<fraction>'

# Decompositions which are not in UnicodeData.txt at all but which
# were used in the original translit_fraction file in glibc.  Keys are
# tuples of code points, values are the replacement code points.
SPECIAL_DECOMPOSE_DICT = {
    (0x2044,): [0x002F], # ⁄ → /
}

def read_input_file(filename):
    '''Reads the original glibc translit_fraction file to get the
    original head and tail.

    We want to replace only the part of the file between
    “translit_start” and “translit_end”

    Returns a tuple (head, tail).  head is everything up to and
    including the first line starting with “translit_start”; tail is
    everything from the first following line starting with
    “translit_end” to the end of the file.  Either part is the empty
    string if the corresponding marker line is not found (head then
    contains the whole file).
    '''
    head_lines = []
    tail_lines = []
    # Read explicitly as UTF-8 so the result does not depend on the
    # locale the script happens to be run in.
    with open(filename, mode='r', encoding='utf-8') as translit_file:
        # The three loops share one file iterator, so each one
        # continues where the previous one stopped.
        for line in translit_file:
            head_lines.append(line)
            if line.startswith('translit_start'):
                break
        # Skip the old transliteration rules between the markers.
        for line in translit_file:
            if line.startswith('translit_end'):
                tail_lines.append(line)
                break
        tail_lines.extend(translit_file)
    return (''.join(head_lines), ''.join(tail_lines))

def output_head(translit_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “translit_start” line.

    translit_file is a writable text file object, unicode_version the
    Unicode version string mentioned in the generated comment.  If
    head is non-empty it is written unchanged; otherwise a default
    header is generated.
    '''
    # Only “head” is tested here, not the command line arguments: it
    # is non-empty only when an input file has been read, and this
    # keeps the function usable when the module is imported.
    if head:
        translit_file.write(head)
    else:
        translit_file.write('escape_char /\n')
        translit_file.write('comment_char %\n')
        translit_file.write(unicode_utils.COMMENT_HEADER)
        translit_file.write('\n')
        translit_file.write('% Transliterations of fractions.\n')
        translit_file.write('% Generated automatically from UnicodeData.txt '
                            + 'by gen_translit_fraction.py '
                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
                            + 'for Unicode {:s}.\n'.format(unicode_version))
        translit_file.write('% The replacements have been surrounded ')
        translit_file.write('with spaces, because fractions are\n')
        translit_file.write('% often preceded by a decimal number and ')
        translit_file.write('followed by a unit or a math symbol.\n')
        translit_file.write('\n')
        translit_file.write('LC_CTYPE\n')
        translit_file.write('\n')
        translit_file.write('translit_start\n')

def output_tail(translit_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    starting with the “translit_end” line.

    If tail is non-empty it is written unchanged; otherwise a default
    tail is generated.
    '''
    if tail:
        translit_file.write(tail)
    else:
        translit_file.write('translit_end\n')
        translit_file.write('\n')
        translit_file.write('END LC_CTYPE\n')

def special_decompose(code_point_list):
    '''
    Decompositions which are not in UnicodeData.txt at all but which
    were used in the original translit_fraction file in glibc and
    which seem to make sense.  I want to keep the update of
    translit_fraction close to the spirit of the original file,
    therefore I added these special decomposition rules here.

    Returns the replacement list of code points if code_point_list
    has a special decomposition, otherwise code_point_list itself.
    '''
    replacement = SPECIAL_DECOMPOSE_DICT.get(tuple(code_point_list))
    if replacement is None:
        return code_point_list
    # Hand out a copy so that callers cannot modify the table.
    return list(replacement)

def _expand_special_decompositions(code_points):
    '''Return the list of alternative replacements for code_points.

    The first alternative is code_points itself.  Each further
    alternative is obtained from the previous one by applying
    special_decompose(), first to the sequence as a whole and, if
    that changes nothing, to every code point on its own.  The
    expansion stops when neither changes anything.
    '''
    alternatives = [code_points]
    while True:
        current = alternatives[-1]
        whole_sequence = special_decompose(current)
        if whole_sequence != current:
            alternatives.append(whole_sequence)
            continue
        per_code_point = []
        for code_point in current:
            per_code_point += special_decompose([code_point])
        if per_code_point == current:
            break
        alternatives.append(per_code_point)
    return alternatives

def _format_alternative(code_points):
    '''Return code_points as a string of UCS symbols, enclosed in
    double quotes if there is more than one code point.'''
    symbols = ''.join(
        '{:s}'.format(unicode_utils.ucs_symbol(code_point))
        for code_point in code_points)
    if len(code_points) > 1:
        return '"' + symbols + '"'
    return symbols

def output_transliteration(translit_file):
    '''Write the new transliteration to the output file

    For every code point in unicode_utils.UNICODE_ATTRIBUTES whose
    decomposition starts with “<fraction>”, a comment line with the
    character name is written, followed by a rule line containing the
    character and its replacements separated by “;”.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        attributes = unicode_utils.UNICODE_ATTRIBUTES[code_point]
        name = attributes['name']
        decomposition = attributes['decomposition']
        if not decomposition.startswith(FRACTION_TAG):
            continue
        # Drop the tag and the single space which follows it.
        decomposition = decomposition[len(FRACTION_TAG) + 1:]
        first_alternative = [int(x, 16) for x in decomposition.split(' ')]
        if first_alternative:
            # Surround the replacement with spaces, see the comment
            # written by output_head().
            first_alternative = [0x0020] + first_alternative + [0x0020]
        alternatives = _expand_special_decompositions(first_alternative)
        translit_file.write('% {:s}\n'.format(name))
        translit_file.write('{:s} '.format(
            unicode_utils.ucs_symbol(code_point)))
        translit_file.write(';'.join(
            _format_alternative(alternative)
            for alternative in alternatives))
        translit_file.write('\n')
    translit_file.write('\n')

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_fraction file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help=''' The original glibc/localedata/locales/translit_fraction
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_fraction.new',
        help='''The new translit_fraction file, default: %(default)s.  If the
        original glibc/localedata/locales/translit_fraction file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    # Write explicitly as UTF-8: a head or tail copied from the input
    # file may contain non-ASCII characters.
    with open(ARGS.output_file, mode='w', encoding='utf-8') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)