1#!/usr/bin/python3
2# -*- coding: utf-8 -*-
3#
4# Generate a translit_fraction file from a UnicodeData file.
5# Copyright (C) 2015-2022 Free Software Foundation, Inc.
6# This file is part of the GNU C Library.
7#
8# The GNU C Library is free software; you can redistribute it and/or
9# modify it under the terms of the GNU Lesser General Public
10# License as published by the Free Software Foundation; either
11# version 2.1 of the License, or (at your option) any later version.
12#
13# The GNU C Library is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16# Lesser General Public License for more details.
17#
18# You should have received a copy of the GNU Lesser General Public
19# License along with the GNU C Library; if not, see
20# <https://www.gnu.org/licenses/>.
21
22'''
23Generate a translit_fraction file from UnicodeData.txt
24
25To see how this script is used, call it with the “-h” option:
26
27    $ ./gen_translit_fraction -h
28    … prints usage message …
29'''
30
31import argparse
32import time
33import unicode_utils
34
def read_input_file(filename):
    '''Split an existing translit_fraction file into head and tail.

    Only the part between “translit_start” and “translit_end” is
    meant to be regenerated, so the surrounding text is returned to
    the caller for copying into the new file.

    Returns a tuple (head, tail): head is everything up to and
    including the first line starting with “translit_start” (the
    whole file if there is no such line); tail is everything from
    the next line starting with “translit_end” to the end of the file
    (empty if there is no such line).
    '''
    head_lines = []
    tail_lines = []
    with open(filename, mode='r') as source:
        # Phase 1: collect the head, marker line included.
        for text in source:
            head_lines.append(text)
            if text.startswith('translit_start'):
                break
        # Phase 2: drop the old body until the end marker shows up.
        for text in source:
            if text.startswith('translit_end'):
                tail_lines.append(text)
                break
        # Phase 3: whatever is left in the file belongs to the tail.
        tail_lines.extend(source)
    return (''.join(head_lines), ''.join(tail_lines))
55
def output_head(translit_file, unicode_version, head=''):
    '''Emit everything that precedes the transliteration entries.

    If the script was given an input file (module-level ARGS) and a
    non-empty head was passed in, that head is written unchanged.
    Otherwise a default header is generated which carries the current
    date and unicode_version and ends with the “translit_start” line.
    '''
    if ARGS.input_file and head:
        translit_file.write(head)
        return

    emit = translit_file.write
    emit('escape_char /\n')
    emit('comment_char %\n')
    emit(unicode_utils.COMMENT_HEADER)
    emit('\n')
    emit('% Transliterations of fractions.\n')
    generated_on = time.strftime('%Y-%m-%d')
    emit('% Generated automatically from UnicodeData.txt '
         'by gen_translit_fraction.py '
         + 'on {:s} '.format(generated_on)
         + 'for Unicode {:s}.\n'.format(unicode_version))
    emit('% The replacements have been surrounded '
         'with spaces, because fractions are\n')
    emit('% often preceded by a decimal number and '
         'followed by a unit or a math symbol.\n')
    emit('\n')
    emit('LC_CTYPE\n')
    emit('\n')
    emit('translit_start\n')
def output_tail(translit_file, tail=''):
    '''Emit everything from the “translit_end” line onwards.

    A non-empty tail is written unchanged when the script was given
    an input file (module-level ARGS); otherwise the default closing
    lines are written.
    '''
    if not (ARGS.input_file and tail):
        # No usable tail from an original file: fall back to the
        # standard closing lines.
        tail = ('translit_end\n'
                '\n'
                'END LC_CTYPE\n')
    translit_file.write(tail)
89
def special_decompose(code_point_list):
    '''
    Apply the hand-written decomposition rules to a code point sequence.

    These rules are not in UnicodeData.txt at all, but they were used
    in the original translit_fraction file in glibc and seem to make
    sense.  They are kept here so that the regenerated file stays
    close to the spirit of the original.

    Returns the replacement list if the complete sequence matches a
    rule; otherwise returns code_point_list itself, unchanged.
    '''
    replacements = {
        (0x2044,): [0x002F], # ⁄ → /
    }
    # The lookup key is the whole sequence, so only exact matches hit.
    return replacements.get(tuple(code_point_list), code_point_list)
105
def _expand_special_decompositions(code_points):
    '''Return the list of replacement alternatives for code_points.

    The first alternative is code_points itself.  Every further
    alternative is obtained by applying special_decompose() to the
    previous one, first to the sequence as a whole and then to each
    code point individually, until neither changes anything.
    '''
    alternatives = [code_points]
    while True:
        current = alternatives[-1]
        # A rule that matches the complete sequence is tried first.
        as_whole = special_decompose(current)
        if as_whole != current:
            alternatives.append(as_whole)
            continue
        # Otherwise try the rules on every code point on its own.
        per_code_point = []
        for single in current:
            per_code_point += special_decompose([single])
        if per_code_point == current:
            break
        alternatives.append(per_code_point)
    return alternatives

def output_transliteration(translit_file):
    '''Write the new transliteration to the output file

    Walks unicode_utils.UNICODE_ATTRIBUTES in code point order and
    handles every entry whose decomposition starts with “<fraction>”.
    For each such entry a comment line with the character name is
    written, followed by a line with the character and its
    replacement alternatives separated by “;”.  The decomposition is
    surrounded by U+0020 on both sides; alternatives consisting of
    more than one code point are put in double quotes.

    Entries whose “<fraction>” tag is not followed by any code point
    are skipped.
    '''
    fraction_tag = '<fraction>'
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        attributes = unicode_utils.UNICODE_ATTRIBUTES[code_point]
        decomposition = attributes['decomposition']
        if not decomposition.startswith(fraction_tag):
            continue
        # split() without an argument tolerates irregular spacing and
        # yields an empty list (instead of ['']) when nothing follows
        # the tag, so int() never sees an empty string.
        fraction_code_points = [
            int(field, 16)
            for field in decomposition[len(fraction_tag):].split()]
        if not fraction_code_points:
            continue
        alternatives = _expand_special_decompositions(
            [0x0020] + fraction_code_points + [0x0020])
        translit_file.write('% {:s}\n'.format(attributes['name']))
        translit_file.write('{:s} '.format(
            unicode_utils.ucs_symbol(code_point)))
        replacements = []
        for alternative in alternatives:
            symbols = ''.join(
                unicode_utils.ucs_symbol(part) for part in alternative)
            # Multi code point replacements are written as quoted strings.
            if len(alternative) > 1:
                symbols = '"' + symbols + '"'
            replacements.append(symbols)
        translit_file.write(';'.join(replacements))
        translit_file.write('\n')
    translit_file.write('\n')
153
if __name__ == "__main__":
    # Command line entry point: parse the options, load UnicodeData.txt
    # and write the regenerated translit_fraction file.
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_fraction file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help=''' The original glibc/localedata/locales/translit_fraction
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_fraction.new',
        help='''The new translit_fraction file, default: %(default)s.  If the
        original glibc/localedata/locales/translit_fraction file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    # ARGS has to stay a module-level name: output_head() and
    # output_tail() read ARGS.input_file directly.
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    # Without an input file the head and tail stay empty and the
    # default header and footer are generated instead.
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)
199