#!/usr/bin/python3
# Generate the locale/C-translit.h file.
# Copyright (C) 2018-2022 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

import re
import sys


class StringLiteral:
    """Source of a string literal and its decomposition into code points.

    Attributes:
      source: the literal text exactly as passed in (escapes untouched).
      decoded: tuple of integer code points after processing the escapes.
    """

    def __init__(self, s):
        r"""Decode S, the contents of a string literal without its quotes.

        The recognised escapes are ``\\``, ``\"`` and ``\x`` followed by
        one or more hexadecimal digits.

        Raises:
          ValueError: on an unknown escape, a ``\x`` without digits, or
            a trailing backslash.
        """
        # States:
        # 0 regular character sequence
        # 1 backslash seen
        # 2 in hexadecimal escape sequence
        state = 0
        result = []
        ndigits = 0  # Hex digits consumed by the current \x escape.
        for ch in s:
            if state == 2:
                if ch in "0123456789abcdefABCDEF":
                    result[-1] = result[-1] * 16 + int(ch, 16)
                    ndigits += 1
                    continue
                if ndigits == 0:
                    raise ValueError(
                        "missing hexadecimal digits in {!r}".format(s))
                # The escape ends here.  Fall through so that CH itself is
                # processed as a regular character instead of being lost.
                state = 0
            if state == 0:
                if ch == '\\':
                    state = 1
                else:
                    result.append(ord(ch))
            else:
                # state == 1: CH selects the kind of escape.
                if ch in "\\\"":
                    result.append(ord(ch))
                    state = 0
                elif ch == 'x':
                    state = 2
                    ndigits = 0
                    result.append(0)
                else:
                    raise ValueError("invalid character {!r} in {!r}".format(
                        ch, s))
        if state == 1:
            raise ValueError("trailing backslash in {!r}".format(s))
        if state == 2 and ndigits == 0:
            raise ValueError("missing hexadecimal digits in {!r}".format(s))

        self.source = s
        self.decoded = tuple(result)


class Translit:
    """Pair of transliteration and source.

    Attributes:
      codepoints: StringLiteral for the first quoted string on the line.
      replacement: StringLiteral for the second quoted string on the line.
    """

    # Two double-quoted strings separated by white space, optionally
    # followed by a '#' comment.  The first may contain \x escapes, the
    # second only \" and \\ escapes.
    __RE_TRANSLIT = re.compile(
        r'^"((?:[^"\\]|\\x[0-9a-fA-F])+)"\s+'
        r'"((?:[^"\\]|\\["\\])*)"\s*(?:#.*)?$')

    def __init__(self, line, lineno=None):
        """Parse LINE, one stripped, non-comment input line.

        LINENO, when given, is the 1-based line number reported in the
        error message.

        Raises:
          IOError: if LINE does not have the expected format.
          ValueError: if one of the string literals cannot be decoded.
        """
        match = self.__RE_TRANSLIT.match(line)
        if not match:
            if lineno is None:
                raise IOError("invalid line: {!r}".format(line))
            raise IOError("invalid line {}: {!r}".format(lineno, line))
        codepoints, replacement = match.groups()
        self.codepoints = StringLiteral(codepoints)
        self.replacement = StringLiteral(replacement)


def read_translits(lines):
    """Return the list of Translit objects parsed from the iterable LINES.

    Empty lines and lines starting with '#' are skipped.

    Raises:
      IOError: if a line is malformed, or if its decoded code points are
        not strictly greater than those of the preceding entry.
    """
    result = []
    for lineno, line in enumerate(lines, 1):
        line = line.strip()
        # Skip empty lines and comments.
        if (not line) or line[0] == '#':
            continue
        translit = Translit(line, lineno)
        # Check ordering of codepoints.
        if result \
           and translit.codepoints.decoded <= result[-1].codepoints.decoded:
            raise IOError("unexpected codepoint {!r} on line {}: {!r}".format(
                translit.codepoints.decoded, lineno, line))
        result.append(translit)
    return result


# List of Translit objects.
translits = []

# Read transliterations from standard input.
if __name__ == "__main__":
    translits = read_translits(sys.stdin)

# Generate the C sources.
# Column that the generated lines are kept below.
_LINE_LIMIT = 79


def _write_index_table(write, name, sizes):
    """Emit the uint32_t array NAME holding the running offsets of SIZES.

    Element I of the array is the sum of SIZES[0..I-1], so the first
    element is always 0.  WRITE is called with each piece of output text.
    """
    write("static const uint32_t {}[] =\n{{\n  ".format(name))
    col = 2
    total = 0
    for size in sizes:
        if total > 0:
            # ", NNNN" and a possible trailing comma need 7 columns.
            if col + 7 >= _LINE_LIMIT:
                write(",\n  ")
                col = 2
            else:
                write(", ")
                col += 2
        write("{:4}".format(total))
        total += size
        col += 4
    write("\n};\n")


def _write_string_table(write, name, sources, terminator):
    """Emit the wchar_t array NAME as a sequence of adjacent wide literals.

    Each element of SOURCES becomes one literal with TERMINATOR (literal
    source text, possibly empty) appended inside the quotes.  Consecutive
    entries are separated by an extra L"\\0" literal.  WRITE is called
    with each piece of output text.
    """
    write("static const wchar_t {}[] =\n ".format(name))
    col = 1
    first = True
    for source in sources:
        if first:
            first = False
        else:
            # The separator is six characters wide including its
            # leading space.
            if col + 6 >= _LINE_LIMIT:
                write("\n ")
                col = 1
            write(' L"\\0"')
            col += 6
        literal = 'L"{}{}"'.format(source, terminator)
        # One extra column is reserved for the space before the literal.
        if col > 2 and col + len(literal) + 1 >= _LINE_LIMIT:
            write("\n  ")
            col = 2
        else:
            write(" ")
            col += 1
        write(literal)
        col += len(literal)
    write(";\n")


def generate_c_source(translits, write):
    """Write the C tables for TRANSLITS by calling WRITE with text pieces.

    TRANSLITS is a sequence of objects with `codepoints` and
    `replacement` attributes, each providing `source` (literal text) and
    `decoded` (tuple of code points).
    """
    write("#include <stdint.h>\n")
    write("#define NTRANSLIT {}\n".format(len(translits)))

    # Each "from" entry occupies its code points plus the separating NUL.
    _write_index_table(
        write, "translit_from_idx",
        [len(t.codepoints.decoded) + 1 for t in translits])
    _write_string_table(
        write, "translit_from_tbl",
        [t.codepoints.source for t in translits], "")

    # Each "to" entry carries its own NUL in addition to the separating
    # NUL, hence two extra elements per entry.
    _write_index_table(
        write, "translit_to_idx",
        [len(t.replacement.decoded) + 2 for t in translits])
    _write_string_table(
        write, "translit_to_tbl",
        [t.replacement.source for t in translits], "\\0")


write = sys.stdout.write
if __name__ == "__main__":
    generate_c_source(translits, write)