#! /usr/bin/python3
# Copyright (C) 2019-2022 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

"""Verifies that installed headers do not use any obsolete constructs:
 * legacy BSD typedefs superseded by <stdint.h>:
      ushort uint ulong u_char u_short u_int u_long u_intNN_t quad_t u_quad_t
   (sys/types.h is allowed to _define_ these types, but not to use them
    to define anything else).
"""

import argparse
import collections
import re
import sys

# Simplified lexical analyzer for C preprocessing tokens.
# Does not implement trigraphs.
# Does not implement backslash-newline in the middle of any lexical
# item other than a string literal.
# Does not implement universal-character-names in identifiers.
# Treats prefixed strings (e.g. L"...") as two tokens (L and "...")
# Accepts non-ASCII characters only within comments and strings.

# Caution: The order of the outermost alternation matters.
# STRING must be before BAD_STRING, CHARCONST before BAD_CHARCONST,
# BLOCK_COMMENT before BAD_BLOCK_COM before PUNCTUATOR, and OTHER must
# be last.
# Caution: There should be no capturing groups other than the named
# captures in the outermost alternation.

# For reference, these are all of the C punctuators as of C11:
#   [ ] ( ) { } , ; ? ~
#   ! != * *= / /= ^ ^= = ==
#   # ##
#   % %= %> %: %:%:
#   & &= &&
#   | |= ||
#   + += ++
#   - -= -- ->
#   . ...
#   : :>
#   < <% <: << <<= <=
#   > >= >> >>=

# The BAD_* tokens are not part of the official definition of pp-tokens;
# they match unclosed strings, character constants, and block comments,
# so that the regex engine doesn't have to backtrack all the way to the
# beginning of a broken construct and then emit dozens of junk tokens.

PP_TOKEN_RE_ = re.compile(r"""
    (?P<STRING>        \"(?:[^\"\\\r\n]|\\(?:[\r\n -~]|\r\n))*\")
   |(?P<BAD_STRING>    \"(?:[^\"\\\r\n]|\\[ -~])*)
   |(?P<CHARCONST>     \'(?:[^\'\\\r\n]|\\(?:[\r\n -~]|\r\n))*\')
   |(?P<BAD_CHARCONST> \'(?:[^\'\\\r\n]|\\[ -~])*)
   |(?P<BLOCK_COMMENT> /\*(?:\*(?!/)|[^*])*\*/)
   |(?P<BAD_BLOCK_COM> /\*(?:\*(?!/)|[^*])*\*?)
   |(?P<LINE_COMMENT>  //[^\r\n]*)
   |(?P<IDENT>         [_a-zA-Z][_a-zA-Z0-9]*)
   |(?P<PP_NUMBER>     \.?[0-9](?:[0-9a-df-oq-zA-DF-OQ-Z_.]|[eEpP][+-]?)*)
   |(?P<PUNCTUATOR>
       [,;?~(){}\[\]]
     | [!*/^=]=?
     | \#\#?
     | %(?:[=>]|:(?:%:)?)?
     | &[=&]?
     |\|[=|]?
     |\+[=+]?
     | -[=->]?
     |\.(?:\.\.)?
     | :>?
     | <(?:[%:]|<(?:=|<=?)?)?
     | >(?:=|>=?)?)
   |(?P<ESCNL>         \\(?:\r|\n|\r\n))
   |(?P<WHITESPACE>    [ \t\n\r\v\f]+)
   |(?P<OTHER>         .)
""", re.DOTALL | re.VERBOSE)
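
# A minimal sketch of how PP_TOKEN_RE_ carves up source text.  This
# helper is an illustrative addition, not part of the checker, and is
# never called; the sample input is arbitrary.  finditer is equivalent
# to the match loop in tokenize_c below (minus the #include special
# case) because OTHER lets the pattern match at every position.
def _demo_pp_token_re():
    for mo in PP_TOKEN_RE_.finditer("u_int x; /* old */"):
        print(mo.lastgroup, repr(mo.group()))
    # Prints, in order: IDENT 'u_int', WHITESPACE ' ', IDENT 'x',
    # PUNCTUATOR ';', WHITESPACE ' ', BLOCK_COMMENT '/* old */'.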

HEADER_NAME_RE_ = re.compile(r"""
    < [^>\r\n]+ >
  | " [^"\r\n]+ "
""", re.DOTALL | re.VERBOSE)

ENDLINE_RE_ = re.compile(r"""\r|\n|\r\n""")

# based on the sample code in the Python re documentation
Token_ = collections.namedtuple("Token", (
    "kind", "text", "line", "column", "context"))
Token_.__doc__ = """
   One C preprocessing token, comment, or chunk of whitespace.
   'kind' identifies the token type, which will be one of:
       STRING, CHARCONST, BLOCK_COMMENT, LINE_COMMENT, IDENT,
       PP_NUMBER, PUNCTUATOR, ESCNL, WHITESPACE, HEADER_NAME,
       or OTHER.  The BAD_* alternatives in PP_TOKEN_RE_ are
       handled within tokenize_c, below.

   'text' is the sequence of source characters making up the token;
       no decoding whatsoever is performed.

   'line' and 'column' give the position of the first character of the
       token within the source file.  They are both 1-based.

   'context' indicates whether or not this token occurred within a
       preprocessing directive; it will be None for running text,
       '<null>' for the leading '#' of a directive line (because '#'
       all by itself on a line is a "null directive"), or the name of
       the directive for tokens within a directive line, starting with
       the IDENT for the name itself.
"""

def tokenize_c(file_contents, reporter):
    """Yield a series of Token objects, one for each preprocessing
       token, comment, or chunk of whitespace within FILE_CONTENTS.
       The REPORTER object is expected to have one method,
       reporter.error(token, message), which will be called to
       indicate a lexical error at the position of TOKEN.
       If MESSAGE contains the four-character sequence '{!r}', the
       reporter is expected to replace it with repr(token.text).
    """

    Token = Token_
    PP_TOKEN_RE = PP_TOKEN_RE_
    ENDLINE_RE = ENDLINE_RE_
    HEADER_NAME_RE = HEADER_NAME_RE_

    line_num = 1
    line_start = 0
    pos = 0
    limit = len(file_contents)
    directive = None
    at_bol = True
    while pos < limit:
        if directive == "include":
            mo = HEADER_NAME_RE.match(file_contents, pos)
            if mo:
                kind = "HEADER_NAME"
                directive = "after_include"
            else:
                mo = PP_TOKEN_RE.match(file_contents, pos)
                kind = mo.lastgroup
                if kind != "WHITESPACE":
                    directive = "after_include"
        else:
            mo = PP_TOKEN_RE.match(file_contents, pos)
            kind = mo.lastgroup

        text = mo.group()
        line = line_num
        column = mo.start() - line_start
        adj_line_start = 0
        # only these kinds can contain a newline
        if kind in ("WHITESPACE", "BLOCK_COMMENT", "LINE_COMMENT",
                    "STRING", "CHARCONST", "BAD_BLOCK_COM", "ESCNL"):
            for tmo in ENDLINE_RE.finditer(text):
                line_num += 1
                adj_line_start = tmo.end()
            if adj_line_start:
                line_start = mo.start() + adj_line_start

        # Track whether or not we are scanning a preprocessing directive.
        if kind == "LINE_COMMENT" or (kind == "WHITESPACE" and adj_line_start):
            at_bol = True
            directive = None
        else:
            if kind == "PUNCTUATOR" and text == "#" and at_bol:
                directive = "<null>"
            elif kind == "IDENT" and directive == "<null>":
                directive = text
            at_bol = False

        # Report ill-formed tokens and rewrite them as their well-formed
        # equivalents, so downstream processing doesn't have to know about them.
        # (Rewriting instead of discarding provides better error recovery.)
        if kind == "BAD_BLOCK_COM":
            reporter.error(Token("BAD_BLOCK_COM", "", line, column+1, ""),
                           "unclosed block comment")
            text += "*/"
            kind = "BLOCK_COMMENT"
        elif kind == "BAD_STRING":
            reporter.error(Token("BAD_STRING", "", line, column+1, ""),
                           "unclosed string")
            text += "\""
            kind = "STRING"
        elif kind == "BAD_CHARCONST":
            reporter.error(Token("BAD_CHARCONST", "", line, column+1, ""),
                           "unclosed char constant")
            text += "'"
            kind = "CHARCONST"

        tok = Token(kind, text, line, column+1,
                    "include" if directive == "after_include" else directive)
        # Do not complain about OTHER tokens inside macro definitions.
        # $ and @ appear in macros defined by headers intended to be
        # included from assembly language, e.g. sysdeps/mips/sys/asm.h.
        if kind == "OTHER" and directive != "define":
            reporter.error(tok, "stray {!r} in program")

        yield tok
        pos = mo.end()
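
# Illustrative sketch of the stream tokenize_c produces for a directive
# line.  _DemoReporter and _demo_tokenize are additions for exposition
# only and are never called; _DemoReporter stands in for the real
# reporter object (HeaderChecker, defined below).
class _DemoReporter:
    def error(self, tok, message):
        print("{}:{}: {}".format(tok.line, tok.column,
                                 message.format(tok.text)))

def _demo_tokenize():
    for tok in tokenize_c("#include <stdio.h>\nint x;\n", _DemoReporter()):
        if tok.kind != "WHITESPACE":
            print(tok.kind, repr(tok.text), tok.context)
    # The '#' comes out with context '<null>', 'include' and the
    # HEADER_NAME '<stdio.h>' with context 'include', and the tokens
    # of 'int x;' with context None.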

#
# Base and generic classes for individual checks.
#

class ConstructChecker:
    """Scan a stream of C preprocessing tokens and possibly report
       problems with them.  The REPORTER object passed to __init__ has
       one method, reporter.error(token, message), which should be
       called to indicate a problem detected at the position of TOKEN.
       If MESSAGE contains the four-character sequence '{!r}' then that
       will be replaced with a textual representation of TOKEN.
    """
    def __init__(self, reporter):
        self.reporter = reporter

    def examine(self, tok):
        """Called once for each token in a header file.
           Call self.reporter.error if a problem is detected.
        """
        raise NotImplementedError

    def eof(self):
        """Called once at the end of the stream.  Subclasses need only
           override this if it might have something to do."""
        pass

class NoCheck(ConstructChecker):
    """Generic checker class which doesn't do anything.  Substitute this
       class for a real checker when a particular check should be skipped
       for some file."""

    def examine(self, tok):
        pass

#
# Check for obsolete type names.
#

# The obsolete type names we're looking for:
OBSOLETE_TYPE_RE_ = re.compile(r"""\A
  (__)?
  (   quad_t
    | u(?: short | int | long
         | _(?: char | short | int(?:[0-9]+_t)? | long | quad_t )))
\Z""", re.VERBOSE)
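
# A quick illustration of what OBSOLETE_TYPE_RE_ accepts (an addition,
# never called): group 1 captures an optional '__' prefix and group 2
# the type name itself.  Note that the modern uintN_t names of
# <stdint.h> do not match, only the BSD u_intN_t spellings.
def _demo_obsolete_type_re():
    for name in ("u_int32_t", "__quad_t", "ushort", "uint32_t", "size_t"):
        m = OBSOLETE_TYPE_RE_.match(name)
        print(name, "->", m.groups() if m else "no match")
    # u_int32_t -> (None, 'u_int32_t')   __quad_t -> ('__', 'quad_t')
    # ushort    -> (None, 'ushort')      uint32_t, size_t -> no match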

class ObsoleteNotAllowed(ConstructChecker):
    """Don't allow any use of the obsolete typedefs."""
    def examine(self, tok):
        if OBSOLETE_TYPE_RE_.match(tok.text):
            self.reporter.error(tok, "use of {!r}")

class ObsoletePrivateDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the private versions of the
       obsolete typedefs; that is, 'typedef [anything] __obsolete;'
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        self.in_typedef = False
        self.prev_token = None

    def examine(self, tok):
        # bits/types.h hides 'typedef' in a macro sometimes.
        if (tok.kind == "IDENT"
            and tok.text in ("typedef", "__STD_TYPE")
            and tok.context is None):
            self.in_typedef = True
        elif tok.kind == "PUNCTUATOR" and tok.text == ";" and self.in_typedef:
            self.in_typedef = False
            if self.prev_token.kind == "IDENT":
                m = OBSOLETE_TYPE_RE_.match(self.prev_token.text)
                if m and m.group(1) != "__":
                    self.reporter.error(self.prev_token, "use of {!r}")
            self.prev_token = None
        else:
            self._check_prev()

        self.prev_token = tok

    def eof(self):
        self._check_prev()

    def _check_prev(self):
        if (self.prev_token is not None
            and self.prev_token.kind == "IDENT"
            and OBSOLETE_TYPE_RE_.match(self.prev_token.text)):
            self.reporter.error(self.prev_token, "use of {!r}")

class ObsoletePublicDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the public versions of the obsolete
       typedefs.  Only specific forms of definition are allowed:

           typedef __obsolete obsolete;  // identifiers must agree
           typedef __uintN_t u_intN_t;   // N must agree
           typedef unsigned long int ulong;
           typedef unsigned short int ushort;
           typedef unsigned int uint;
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        self.typedef_tokens = []

    def examine(self, tok):
        if tok.kind in ("WHITESPACE", "BLOCK_COMMENT",
                        "LINE_COMMENT", "NL", "ESCNL"):
            pass

        elif (tok.kind == "IDENT" and tok.text == "typedef"
              and tok.context is None):
            if self.typedef_tokens:
                self.reporter.error(tok, "typedef inside typedef")
                self._reset()
            self.typedef_tokens.append(tok)

        elif tok.kind == "PUNCTUATOR" and tok.text == ";":
            self._finish()

        elif self.typedef_tokens:
            self.typedef_tokens.append(tok)

    def eof(self):
        self._reset()

    def _reset(self):
        while self.typedef_tokens:
            tok = self.typedef_tokens.pop(0)
            if tok.kind == "IDENT" and OBSOLETE_TYPE_RE_.match(tok.text):
                self.reporter.error(tok, "use of {!r}")

    def _finish(self):
        if not self.typedef_tokens: return
        if self.typedef_tokens[-1].kind == "IDENT":
            m = OBSOLETE_TYPE_RE_.match(self.typedef_tokens[-1].text)
            if m:
                if self._permissible_public_definition(m):
                    self.typedef_tokens.clear()
        self._reset()

    def _permissible_public_definition(self, m):
        if m.group(1) == "__": return False
        name = m.group(2)
        toks = self.typedef_tokens
        ntok = len(toks)
        if ntok == 3 and toks[1].kind == "IDENT":
            defn = toks[1].text
            n = OBSOLETE_TYPE_RE_.match(defn)
            if n and n.group(1) == "__" and n.group(2) == name:
                return True

            if (name[:5] == "u_int" and name[-2:] == "_t"
                and defn[:6] == "__uint" and defn[-2:] == "_t"
                and name[5:-2] == defn[6:-2]):
                return True

            return False

        if (name == "ulong" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "long"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        if (name == "ushort" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "short"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        if (name == "uint" and ntok == 4
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "int"):
            return True

        return False
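
# For illustration (not executed): under the rules above, sys/types.h
# may contain
#     typedef __u_char u_char;
#     typedef __uint32_t u_int32_t;
#     typedef unsigned int uint;
# whereas 'typedef __u_int u_long;' (names disagree) and
# 'typedef unsigned long ulong;' (missing 'int') would be reported.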

def ObsoleteTypedefChecker(reporter, fname):
    """Factory: produce an instance of the appropriate
       obsolete-typedef checker for FNAME."""

    # The obsolete rpc/ and rpcsvc/ headers are allowed to use the
    # obsolete types, because it would be more trouble than it's
    # worth to remove them from headers that we intend to stop
    # installing eventually anyway.
    if (fname.startswith("rpc/")
        or fname.startswith("rpcsvc/")
        or "/rpc/" in fname
        or "/rpcsvc/" in fname):
        return NoCheck(reporter)

    # bits/types.h is allowed to define the __-versions of the
    # obsolete types.
    if (fname == "bits/types.h"
        or fname.endswith("/bits/types.h")):
        return ObsoletePrivateDefinitionsAllowed(reporter)

    # sys/types.h is allowed to use the __-versions of the
    # obsolete types, but only to define the unprefixed versions.
    if (fname == "sys/types.h"
        or fname.endswith("/sys/types.h")):
        return ObsoletePublicDefinitionsAllowed(reporter)

    return ObsoleteNotAllowed(reporter)

#
# Master control
#

class HeaderChecker:
    """Perform all of the checks on each header.  This is also the
       "reporter" object expected by tokenize_c and ConstructChecker.
    """
    def __init__(self):
        self.fname = None
        self.status = 0

    def error(self, tok, message):
        self.status = 1
        if '{!r}' in message:
            message = message.format(tok.text)
        sys.stderr.write("{}:{}:{}: error: {}\n".format(
            self.fname, tok.line, tok.column, message))

    def check(self, fname):
        self.fname = fname
        try:
            with open(fname, "rt", encoding="utf-8") as fp:
                contents = fp.read()
        except OSError as e:
            sys.stderr.write("{}: {}\n".format(fname, e.strerror))
            self.status = 1
            return

        typedef_checker = ObsoleteTypedefChecker(self, self.fname)

        for tok in tokenize_c(contents, self):
            typedef_checker.examine(tok)
        # Give the checker a chance to flush state still held at end of
        # file (e.g. a typedef not terminated by a semicolon).
        typedef_checker.eof()

def main():
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("headers", metavar="header", nargs="+",
                    help="one or more headers to scan for obsolete constructs")
    args = ap.parse_args()

    checker = HeaderChecker()
    for fname in args.headers:
        # Headers whose installed name begins with "finclude/" contain
        # Fortran, not C, and this program should completely ignore them.
        if not (fname.startswith("finclude/") or "/finclude/" in fname):
            checker.check(fname)
    sys.exit(checker.status)

main()
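
# Example invocation (the header paths are illustrative):
#   python3 check-obsolete-constructs.py sys/types.h bits/types.h
# Diagnostics go to stderr in file:line:column format, and the exit
# status is 1 if any problem was reported.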