1#!/usr/bin/python3
2# -*- coding: utf-8 -*-
3#
4# Generate a translit_compat file from a UnicodeData file.
5# Copyright (C) 2015-2022 Free Software Foundation, Inc.
6# This file is part of the GNU C Library.
7#
8# The GNU C Library is free software; you can redistribute it and/or
9# modify it under the terms of the GNU Lesser General Public
10# License as published by the Free Software Foundation; either
11# version 2.1 of the License, or (at your option) any later version.
12#
13# The GNU C Library is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16# Lesser General Public License for more details.
17#
18# You should have received a copy of the GNU Lesser General Public
19# License along with the GNU C Library; if not, see
20# <https://www.gnu.org/licenses/>.
21
22'''
23Generate a translit_compat file from UnicodeData.txt
24
25To see how this script is used, call it with the “-h” option:
26
27    $ ./gen_translit_compat -h
28    … prints usage message …
29'''
30
31import argparse
32import time
33import unicode_utils
34
def read_input_file(filename):
    '''Read the original glibc translit_compat file.

    Only the head (everything up to and including the
    “translit_start” line) and the tail (from the “translit_end”
    line to the end of the file) are kept; the part in between is
    discarded because it will be regenerated.

    Returns a (head, tail) tuple of strings.
    '''
    head_lines = []
    tail_lines = []
    with open(filename, mode='r') as translit_file:
        # Collect everything up to and including “translit_start”.
        for line in translit_file:
            head_lines.append(line)
            if line.startswith('translit_start'):
                break
        # Skip the old generated body, keep the “translit_end” line.
        for line in translit_file:
            if line.startswith('translit_end'):
                tail_lines.append(line)
                break
        # Everything after “translit_end” belongs to the tail.
        tail_lines.extend(translit_file)
    return (''.join(head_lines), ''.join(tail_lines))
55
def output_head(translit_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “translit_start” line.

    If an original file was given on the command line and a non-empty
    head was extracted from it, that head is copied verbatim;
    otherwise a fresh header is generated.
    '''
    if ARGS.input_file and head:
        translit_file.write(head)
        return
    generation_date = time.strftime('%Y-%m-%d')
    translit_file.writelines([
        'escape_char /\n',
        'comment_char %\n',
        unicode_utils.COMMENT_HEADER,
        '\n',
        '% Transliterations of compatibility characters '
        + 'and ligatures.\n',
        '% Generated automatically from UnicodeData.txt '
        + 'by gen_translit_compat.py '
        + 'on {:s} '.format(generation_date)
        + 'for Unicode {:s}.\n'.format(unicode_version),
        '\n',
        'LC_CTYPE\n',
        '\n',
        'translit_start\n',
    ])
77
def output_tail(translit_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    from the “translit_end” line to the end.

    Copies the tail of the original file verbatim if one was given,
    otherwise generates a minimal ending.
    '''
    if ARGS.input_file and tail:
        translit_file.write(tail)
        return
    translit_file.writelines([
        'translit_end\n',
        '\n',
        'END LC_CTYPE\n',
    ])
86
def compatibility_decompose(code_point):
    '''Return the recursive compatibility decomposition of code_point
    as a list of code points, or [] if there is none we care about.

    http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

    “The compatibility decomposition is formed by recursively applying
    the canonical and compatibility mappings, then applying the
    Canonical Ordering Algorithm.”

    We don’t do the canonical decomposition here because this is
    done in gen_translit_combining.py to generate translit_combining.

    And we ignore some of the possible compatibility formatting tags
    here. Some of them are used in other translit_* files, not
    translit_compat:

    <font>:   translit_font
    <circle>: translit_circle
    <wide>:   translit_wide
    <narrow>: translit_narrow
    <square>: translit_cjk_compat
    <fraction>: translit_fraction

    And we ignore

    <noBreak>, <initial>, <medial>, <final>, <isolated>

    because they seem to be not useful for transliteration.
    '''
    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
        code_point]['decomposition']
    for tag in ('<compat>', '<super>', '<sub>', '<vertical>'):
        if not decomposition.startswith(tag):
            continue
        # Strip “<tag> ” and parse the remaining hexadecimal fields.
        mapped_code_points = [int(field, 16)
                              for field in
                              decomposition[len(tag)+1:].split(' ')]
        if (len(mapped_code_points) > 1
                and mapped_code_points[0] == 0x0020
                and 0x0300 <= mapped_code_points[1] <= 0x03FF):
            # Decomposes into a space followed by a combining character.
            # This is not useful for transliteration.
            return []
        result = []
        for mapped_code_point in mapped_code_points:
            recursive = compatibility_decompose(mapped_code_point)
            # Keep the code point itself when it decomposes no further.
            result += recursive if recursive else [mapped_code_point]
        return result
    return []
141
def special_decompose(code_point_list):
    '''Apply extra decompositions not found in UnicodeData.txt.

    These were used in the original translit_compat file in glibc
    and seem to make sense.  To keep the update of translit_compat
    close to the spirit of the original file, these special
    decomposition rules are added here.

    Returns the decomposition for a matching code point sequence,
    otherwise the input list unchanged.
    '''
    special_decompose_dict = {
        (0x03BC,): [0x0075], # μ → u
        (0x02BC,): [0x0027], # ʼ → '
    }
    return special_decompose_dict.get(
        tuple(code_point_list), code_point_list)
158
def special_ligature_decompose(code_point):
    '''Decompose ligatures which have no decomposition in
    UnicodeData.txt at all but which were used in the original
    translit_compat file in glibc and which seem to make sense.
    To keep the update of translit_compat close to the spirit of
    the original file, these special ligature decomposition rules
    are added here.

    Returns a list of code points: the decomposition if one is
    known, otherwise a one-element list containing code_point
    itself.
    '''
    special_ligature_decompose_dict = {
        0x00E6: [0x0061, 0x0065], # æ → ae
        0x00C6: [0x0041, 0x0045], # Æ → AE
        # These following 5 special ligature decompositions were
        # in the original glibc/localedata/locales/translit_compat file
        0x0152: [0x004F, 0x0045], # Œ → OE
        0x0153: [0x006F, 0x0065], # œ → oe
        0x05F0: [0x05D5, 0x05D5], # װ → וו
        0x05F1: [0x05D5, 0x05D9], # ױ → וי
        0x05F2: [0x05D9, 0x05D9], # ײ → יי
        # The following special ligature decompositions were
        # not in the original glibc/localedata/locales/translit_compat file
        # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
        # → U+041D CYRILLIC CAPITAL LETTER EN,
        #   U+0413 CYRILLIC CAPITAL LETTER GHE
        0x04A4: [0x041D, 0x0413], # Ҥ → НГ
        # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
        # → U+043D CYRILLIC SMALL LETTER EN,
        #   U+0433 CYRILLIC SMALL LETTER GHE
        0x04A5: [0x043D, 0x0433], # ҥ → нг
        # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
        # → U+0422 CYRILLIC CAPITAL LETTER TE,
        #   U+0426 CYRILLIC CAPITAL LETTER TSE
        0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
        # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
        # → U+0442 CYRILLIC SMALL LETTER TE,
        #   U+0446 CYRILLIC SMALL LETTER TSE
        0x04B5: [0x0442, 0x0446], # ҵ → тц
        # U+04d4 CYRILLIC CAPITAL LIGATURE A IE
        # → U+0410 CYRILLIC CAPITAL LETTER A
        #   U+0415;CYRILLIC CAPITAL LETTER IE
        0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
        # U+04D5 CYRILLIC SMALL LIGATURE A IE
        # → U+0430 CYRILLIC SMALL LETTER A,
        #   U+0435 CYRILLIC SMALL LETTER IE
        0x04D5: [0x0430, 0x0435], # ӕ → ае
        # I am not sure what to do with the following ligatures
        # maybe it makes no sense to decompose them:
        # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
        # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
        # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
        # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
        # U+fe20 COMBINING LIGATURE LEFT HALF
        # U+fe21 COMBINING LIGATURE RIGHT HALF
        # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
        # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
        # U+11176 MAHAJANI LIGATURE SHRI
        # U+1f670 SCRIPT LIGATURE ET ORNAMENT
        # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
        # U+1f672 LIGATURE OPEN ET ORNAMENT
        # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
    }
    return special_ligature_decompose_dict.get(code_point, [code_point])
224
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file.

    For every code point in unicode_utils.UNICODE_ATTRIBUTES that has
    a usable compatibility decomposition (or a special decomposition),
    write a “% NAME” comment line followed by a line mapping the UCS
    symbol of the code point to one or more quoted replacement
    strings, separated by “;”.  Code points whose name contains
    “LIGATURE” (but not “ARABIC”) and which have no decomposition are
    handled via special_ligature_decompose(); unhandled ligatures are
    reported on stdout.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        # decomposed_code_points is a list of decomposition “stages”:
        # each element is a list of code points, and each stage is
        # later written as one quoted alternative on the output line.
        decomposed_code_points = [compatibility_decompose(code_point)]
        if not decomposed_code_points[0]:
            # No compatibility decomposition; fall back to the special
            # decomposition table (e.g. μ → u).
            if special_decompose([code_point]) != [code_point]:
                decomposed_code_points[0] = special_decompose([code_point])
        else:
            special_decomposed_code_points = []
            # Repeatedly apply special_decompose() to the last stage —
            # first to the stage as a whole sequence, then to each of
            # its code points individually — appending each new stage,
            # until a fixed point is reached.
            while True:
                special_decomposed_code_points = special_decompose(
                    decomposed_code_points[-1])
                if (special_decomposed_code_points
                        != decomposed_code_points[-1]):
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                    continue
                special_decomposed_code_points = []
                for decomposed_code_point in decomposed_code_points[-1]:
                    special_decomposed_code_points += special_decompose(
                        [decomposed_code_point])
                if (special_decomposed_code_points
                        == decomposed_code_points[-1]):
                    break
                decomposed_code_points.append(
                    special_decomposed_code_points)
        if decomposed_code_points[0]:
            # Write one quoted alternative per decomposition stage.
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for index in range(0, len(decomposed_code_points)):
                if index > 0:
                    translit_file.write(';')
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points[index]:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
            translit_file.write('\n')
        elif 'LIGATURE' in name and 'ARABIC' not in name:
            # Ligatures without any decomposition in UnicodeData.txt:
            # use the hand-maintained special table instead.
            decomposed_code_points = special_ligature_decompose(code_point)
            if decomposed_code_points[0] != code_point:
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
                translit_file.write('\n')
            else:
                print('Warning: unhandled ligature: {:x} {:s}'.format(
                    code_point, name))
    translit_file.write('\n')
282
if __name__ == "__main__":
    # Command-line interface: read UnicodeData.txt (and optionally the
    # original translit_compat file) and write a new translit_compat.
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_compat file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help=''' The original glibc/localedata/locales/translit_compat
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_compat.new',
        help='''The new translit_compat file, default: %(default)s.  If the
        original glibc/localedata/locales/translit_compat file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    # Populate unicode_utils.UNICODE_ATTRIBUTES from UnicodeData.txt;
    # the generator functions below read from that global table.
    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    # Write header, regenerated transliteration body, and tail.
    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)
328