1#!/usr/bin/python3
2# Generate the locale/C-translit.h file.
3# Copyright (C) 2018-2022 Free Software Foundation, Inc.
4# This file is part of the GNU C Library.
5#
6# The GNU C Library is free software; you can redistribute it and/or
7# modify it under the terms of the GNU Lesser General Public
8# License as published by the Free Software Foundation; either
9# version 2.1 of the License, or (at your option) any later version.
10#
11# The GNU C Library is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14# Lesser General Public License for more details.
15#
16# You should have received a copy of the GNU Lesser General Public
17# License along with the GNU C Library; if not, see
18# <https://www.gnu.org/licenses/>.
19
20import re
21import sys
22
23
class StringLiteral:
    r"""Source of a string literal and its decomposition into code points.

    The argument is the text found between the double quotes of a C
    string literal.  Three escapes are understood: ``\\``, ``\"`` and
    ``\x`` followed by hexadecimal digits.  As in C, a ``\x`` escape
    extends over every hexadecimal digit that follows it.

    Attributes:
      source   the literal text exactly as it was passed in
      decoded  tuple of integer code points after resolving the escapes
    """

    # Characters that continue a \x escape.
    _HEX_DIGITS = "0123456789abcdefABCDEF"

    def __init__(self, s):
        r"""Decode S into code points.

        Raises ValueError if a backslash is followed by anything other
        than ``\``, ``"`` or ``x``, or if S ends with a lone backslash.
        A ``\x`` that is not followed by any hexadecimal digit is not
        diagnosed here; it decodes to code point 0.
        """
        # States:
        #  0 regular character sequence
        #  1 backslash seen
        #  2 in hexadecimal escape sequence
        state = 0
        result = []
        for ch in s:
            if state == 2:
                if ch in self._HEX_DIGITS:
                    # Accumulate into the slot opened when 'x' was seen.
                    result[-1] = result[-1] * 16 + int(ch, 16)
                    continue
                # The first non-hex character terminates the escape.  It
                # is an ordinary character of the literal and must be
                # handled as such below rather than being discarded.
                state = 0
            if state == 0:
                if ch == '\\':
                    state = 1
                else:
                    result.append(ord(ch))
            elif state == 1:
                if ch in "\\\"":
                    result.append(ord(ch))
                    state = 0
                elif ch == 'x':
                    state = 2
                    # Placeholder that the following hex digits fill in.
                    result.append(0)
                else:
                    raise ValueError("invalid character {!r} in {!r}".format(
                        ch, s))
        if state == 1:
            raise ValueError("trailing backslash in {!r}".format(s))

        self.source = s
        self.decoded = tuple(result)
62
63
class Translit:
    """Pair of transliteration and source.

    Attributes:
      codepoints   StringLiteral built from the first quoted field
      replacement  StringLiteral built from the second quoted field
    """

    # One input line: a non-empty quoted field made of plain characters
    # and \x escapes, whitespace, a possibly empty quoted field made of
    # plain characters and the escapes \" and \\, then an optional
    # trailing '#' comment.
    __RE_TRANSLIT = re.compile(
        r'^"((?:[^"\\]|\\x[0-9a-fA-F])+)"\s+'
        r'"((?:[^"\\]|\\["\\])*)"\s*(?:#.*)?$')

    def __init__(self, line, lineno=None):
        """Parse LINE into its two string literals.

        LINENO is the zero-based index of LINE in the input and is only
        used for the error message.  If it is omitted, a module-level
        variable named 'lineno' is consulted instead, because that is
        where this constructor used to take the value from; if neither
        is available the message omits the line number.

        Raises IOError if LINE does not have the expected syntax, and
        propagates ValueError from StringLiteral.
        """
        match = self.__RE_TRANSLIT.match(line)
        if not match:
            if lineno is None:
                # Keep working with callers that rely on the global.
                lineno = globals().get("lineno")
            if lineno is None:
                raise IOError("invalid line: {!r}".format(line))
            raise IOError("invalid line {}: {!r}".format(
                lineno + 1, line))
        codepoints, replacement = match.groups()
        self.codepoints = StringLiteral(codepoints)
        self.replacement = StringLiteral(replacement)
79
80
# Translit objects, in the order they appear in the input.
translits = []

# Parse standard input one line at a time.  The loop index is kept in
# the module-level name 'lineno' because Translit's error message for
# an invalid line reads it.
previous_key = None
for lineno, raw_line in enumerate(sys.stdin):
    line = raw_line.strip()
    # Blank lines and '#' comment lines carry no data.
    if line == "" or line.startswith('#'):
        continue
    entry = Translit(line)
    key = entry.codepoints.decoded
    # The decoded code point sequences must be strictly increasing.
    if previous_key is not None and key <= previous_key:
        raise IOError("unexpected codepoint {!r} on line {}: {!r}".format(
            key, lineno + 1, line))
    translits.append(entry)
    previous_key = key
97
# Emit the generated C source on standard output.
write = sys.stdout.write
write("#include <stdint.h>\n")
write("#define NTRANSLIT {}\n".format(len(translits)))

# translit_from_idx: for each entry, the running offset obtained by
# summing (number of decoded code points + 1) over the entries before it.
write("static const uint32_t translit_from_idx[] =\n{\n  ")
column = 2
offset = 0
for index, entry in enumerate(translits):
    if index > 0:
        # Separate from the previous number; start a new output line
        # when the next field would reach column 79.
        if column + 7 < 79:
            write(", ")
            column += 2
        else:
            write(",\n  ")
            column = 2
    write("{:4}".format(offset))
    column += 4
    offset += len(entry.codepoints.decoded) + 1
write("\n};\n")
118
# translit_from_tbl: the source literals of all entries, written as
# adjacent wide string literals with an L"\0" literal between entries.
write("static const wchar_t translit_from_tbl[] =\n ")
column = 1
for index, entry in enumerate(translits):
    text = entry.codepoints.source
    if index > 0:
        # Separator between the previous entry and this one.
        if column + 6 >= 79:
            write("\n ")
            column = 1
        write(" L\"\\0\"")
        column += 6
    # Stay on the current output line if it has just been started or
    # if the literal still fits; otherwise wrap first.
    if column <= 2 or column + len(text) + 4 < 79:
        write(" ")
        column += 1
    else:
        write("\n  ")
        column = 2
    write("L\"{}\"".format(text))
    column += len(text) + 3
write(";\n")
140
# translit_to_idx: for each entry, the running offset obtained by
# summing (number of decoded replacement code points + 2) over the
# entries before it.
write("static const uint32_t translit_to_idx[] =\n{\n  ")
column = 2
offset = 0
for index, entry in enumerate(translits):
    if index > 0:
        # Separate from the previous number; start a new output line
        # when the next field would reach column 79.
        if column + 7 < 79:
            write(", ")
            column += 2
        else:
            write(",\n  ")
            column = 2
    write("{:4}".format(offset))
    column += 4
    offset += len(entry.replacement.decoded) + 2
write("\n};\n")
156
# translit_to_tbl: the replacement literals of all entries, each one
# written with a trailing \0 inside the literal and with an L"\0"
# literal between entries.
write("static const wchar_t translit_to_tbl[] =\n ")
column = 1
for index, entry in enumerate(translits):
    text = entry.replacement.source
    if index > 0:
        # Separator between the previous entry and this one.
        if column + 6 >= 79:
            write("\n ")
            column = 1
        write(" L\"\\0\"")
        column += 6
    # Stay on the current output line if it has just been started or
    # if the literal still fits; otherwise wrap first.
    if column <= 2 or column + len(text) + 6 < 79:
        write(" ")
        column += 1
    else:
        write("\n  ")
        column = 2
    write("L\"{}\\0\"".format(text))
    column += len(text) + 5
write(";\n")
178