#! /usr/bin/python3
# Copyright (C) 2019-2022 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

"""Verifies that installed headers do not use any obsolete constructs:
 * legacy BSD typedefs superseded by <stdint.h>:
   ushort uint ulong u_char u_short u_int u_long u_intNN_t quad_t u_quad_t
   (sys/types.h is allowed to _define_ these types, but not to use them
    to define anything else).
"""

import argparse
import collections
import re
import sys

# Simplified lexical analyzer for C preprocessing tokens.
# Does not implement trigraphs.
# Does not implement backslash-newline in the middle of any lexical
#   item other than a string literal.
# Does not implement universal-character-names in identifiers.
# Treats prefixed strings (e.g. L"...") as two tokens (L and "...").
# Accepts non-ASCII characters only within comments and strings.

# Caution: The order of the outermost alternation matters.
# STRING must be before BAD_STRING, CHARCONST before BAD_CHARCONST,
# BLOCK_COMMENT before BAD_BLOCK_COM before PUNCTUATOR, and OTHER must
# be last.
# Caution: There should be no capturing groups other than the named
# captures in the outermost alternation.

# For reference, these are all of the C punctuators as of C11:
#   [ ] ( ) { } , ; ? ~
#   ! != * *= / /= ^ ^= = ==
#   # ##
#   % %= %> %: %:%:
#   & &= &&
#   | |= ||
#   + += ++
#   - -= -- ->
#   . ...
#   : :>
#   < <% <: << <<= <=
#   > >= >> >>=

# The BAD_* tokens are not part of the official definition of pp-tokens;
# they match unclosed strings, character constants, and block comments,
# so that the regex engine doesn't have to backtrack all the way to the
# beginning of a broken construct and then emit dozens of junk tokens.
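# For example, given the ill-formed line
#     const char *s = "unterminated;
# BAD_STRING consumes everything from the opening quote to the end of
# the line as a single token; tokenize_c (below) reports it and rewrites
# it as a closed STRING, so downstream checkers see well-formed input.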

PP_TOKEN_RE_ = re.compile(r"""
    (?P<STRING>        \"(?:[^\"\\\r\n]|\\(?:[\r\n -~]|\r\n))*\")
   |(?P<BAD_STRING>    \"(?:[^\"\\\r\n]|\\[ -~])*)
   |(?P<CHARCONST>     \'(?:[^\'\\\r\n]|\\(?:[\r\n -~]|\r\n))*\')
   |(?P<BAD_CHARCONST> \'(?:[^\'\\\r\n]|\\[ -~])*)
   |(?P<BLOCK_COMMENT> /\*(?:\*(?!/)|[^*])*\*/)
   |(?P<BAD_BLOCK_COM> /\*(?:\*(?!/)|[^*])*\*?)
   |(?P<LINE_COMMENT>  //[^\r\n]*)
   |(?P<IDENT>         [_a-zA-Z][_a-zA-Z0-9]*)
   |(?P<PP_NUMBER>     \.?[0-9](?:[0-9a-df-oq-zA-DF-OQ-Z_.]|[eEpP][+-]?)*)
   |(?P<PUNCTUATOR>
       [,;?~(){}\[\]]
     | [!*/^=]=?
     | \#\#?
     | %(?:[=>]|:(?:%:)?)?
     | &[=&]?
     |\|[=|]?
     |\+[=+]?
     | -[=->]?
     |\.(?:\.\.)?
     | :>?
     | <(?:[%:]|<(?:=|<=?)?)?
     | >(?:=|>=?)?)
   |(?P<ESCNL>         \\(?:\r\n|\r|\n))
   |(?P<WHITESPACE>    [ \t\n\r\v\f]+)
   |(?P<OTHER>         .)
""", re.DOTALL | re.VERBOSE)

HEADER_NAME_RE_ = re.compile(r"""
    < [^>\r\n]+ >
  | " [^"\r\n]+ "
""", re.DOTALL | re.VERBOSE)

# \r\n must come first in the alternation so that a CRLF line ending
# is counted once, not twice.
ENDLINE_RE_ = re.compile(r"""\r\n|\r|\n""")

# based on the sample code in the Python re documentation
Token_ = collections.namedtuple("Token", (
    "kind", "text", "line", "column", "context"))
Token_.__doc__ = """
   One C preprocessing token, comment, or chunk of whitespace.
   'kind' identifies the token type, which will be one of:
       STRING, CHARCONST, BLOCK_COMMENT, LINE_COMMENT, IDENT,
       PP_NUMBER, PUNCTUATOR, ESCNL, WHITESPACE, HEADER_NAME,
       or OTHER.  The BAD_* alternatives in PP_TOKEN_RE_ are
       handled within tokenize_c, below.

   'text' is the sequence of source characters making up the token;
       no decoding whatsoever is performed.

   'line' and 'column' give the position of the first character of the
      token within the source file.  They are both 1-based.

   'context' indicates whether or not this token occurred within a
      preprocessing directive; it will be None for running text,
      '<null>' for the leading '#' of a directive line (because '#'
      all by itself on a line is a "null directive"), or the name of
      the directive for tokens within a directive line, starting with
      the IDENT for the name itself.
"""

def tokenize_c(file_contents, reporter):
    """Yield a series of Token objects, one for each preprocessing
       token, comment, or chunk of whitespace within FILE_CONTENTS.
       The REPORTER object is expected to have one method,
       reporter.error(token, message), which will be called to
       indicate a lexical error at the position of TOKEN.
       If MESSAGE contains the four-character sequence '{!r}', that
       is expected to be replaced by repr(token.text).
    """

    Token = Token_
    PP_TOKEN_RE = PP_TOKEN_RE_
    ENDLINE_RE = ENDLINE_RE_
    HEADER_NAME_RE = HEADER_NAME_RE_

    line_num = 1
    line_start = 0
    pos = 0
    limit = len(file_contents)
    directive = None
    at_bol = True
    while pos < limit:
        if directive == "include":
            mo = HEADER_NAME_RE.match(file_contents, pos)
            if mo:
                kind = "HEADER_NAME"
                directive = "after_include"
            else:
                mo = PP_TOKEN_RE.match(file_contents, pos)
                kind = mo.lastgroup
                if kind != "WHITESPACE":
                    directive = "after_include"
        else:
            mo = PP_TOKEN_RE.match(file_contents, pos)
            kind = mo.lastgroup

        text = mo.group()
        line = line_num
        column = mo.start() - line_start
        adj_line_start = 0
        # only these kinds can contain a newline
        if kind in ("WHITESPACE", "BLOCK_COMMENT", "LINE_COMMENT",
                    "STRING", "CHARCONST", "BAD_BLOCK_COM", "ESCNL"):
            for tmo in ENDLINE_RE.finditer(text):
                line_num += 1
                adj_line_start = tmo.end()
            if adj_line_start:
                line_start = mo.start() + adj_line_start

        # Track whether or not we are scanning a preprocessing directive.
        if kind == "LINE_COMMENT" or (kind == "WHITESPACE" and adj_line_start):
            at_bol = True
            directive = None
        else:
            if kind == "PUNCTUATOR" and text == "#" and at_bol:
                directive = "<null>"
            elif kind == "IDENT" and directive == "<null>":
                directive = text
            at_bol = False

        # Report ill-formed tokens and rewrite them as their well-formed
        # equivalents, so downstream processing doesn't have to know about them.
        # (Rewriting instead of discarding provides better error recovery.)
        if kind == "BAD_BLOCK_COM":
            reporter.error(Token("BAD_BLOCK_COM", "", line, column+1, ""),
                           "unclosed block comment")
            text += "*/"
            kind = "BLOCK_COMMENT"
        elif kind == "BAD_STRING":
            reporter.error(Token("BAD_STRING", "", line, column+1, ""),
                           "unclosed string")
            text += "\""
            kind = "STRING"
        elif kind == "BAD_CHARCONST":
            reporter.error(Token("BAD_CHARCONST", "", line, column+1, ""),
                           "unclosed char constant")
            text += "'"
            kind = "CHARCONST"

        tok = Token(kind, text, line, column+1,
                    "include" if directive == "after_include" else directive)
        # Do not complain about OTHER tokens inside macro definitions.
        # $ and @ appear in macros defined by headers intended to be
        # included from assembly language, e.g. sysdeps/mips/sys/asm.h.
        if kind == "OTHER" and directive != "define":
            reporter.error(tok, "stray {!r} in program")

        yield tok
        pos = mo.end()
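
# tokenize_c requires nothing of REPORTER beyond the error() method.
# A minimal stand-in (hypothetical, for illustration only; HeaderChecker
# at the bottom of this file is the real reporter) could look like:
#
#     class StderrReporter:
#         def error(self, tok, message):
#             if '{!r}' in message:
#                 message = message.format(tok.text)
#             sys.stderr.write("{}:{}: {}\n".format(tok.line, tok.column,
#                                                   message))
#
#     for tok in tokenize_c(contents, StderrReporter()):
#         ...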

#
# Base and generic classes for individual checks.
#

class ConstructChecker:
    """Scan a stream of C preprocessing tokens and possibly report
       problems with them.  The REPORTER object passed to __init__ has
       one method, reporter.error(token, message), which should be
       called to indicate a problem detected at the position of TOKEN.
       If MESSAGE contains the four-character sequence '{!r}' then that
       will be replaced with a textual representation of TOKEN.
    """
    def __init__(self, reporter):
        self.reporter = reporter

    def examine(self, tok):
        """Called once for each token in a header file.
           Call self.reporter.error if a problem is detected.
        """
        raise NotImplementedError

    def eof(self):
        """Called once at the end of the stream.  Subclasses need only
           override this if it might have something to do."""
        pass

class NoCheck(ConstructChecker):
    """Generic checker class which doesn't do anything.  Substitute this
       class for a real checker when a particular check should be skipped
       for some file."""

    def examine(self, tok):
        pass

#
# Check for obsolete type names.
#

# The obsolete type names we're looking for:
OBSOLETE_TYPE_RE_ = re.compile(r"""\A
  (__)?
  (   quad_t
    | u(?: short | int | long
         | _(?: char | short | int(?:[0-9]+_t)? | long | quad_t )))
\Z""", re.VERBOSE)

class ObsoleteNotAllowed(ConstructChecker):
    """Don't allow any use of the obsolete typedefs."""
    def examine(self, tok):
        if OBSOLETE_TYPE_RE_.match(tok.text):
            self.reporter.error(tok, "use of {!r}")

class ObsoletePrivateDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the private versions of the
       obsolete typedefs; that is, 'typedef [anything] __obsolete;'
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        self.in_typedef = False
        self.prev_token = None

    def examine(self, tok):
        # bits/types.h hides 'typedef' in a macro sometimes.
        if (tok.kind == "IDENT"
            and tok.text in ("typedef", "__STD_TYPE")
            and tok.context is None):
            self.in_typedef = True
        elif tok.kind == "PUNCTUATOR" and tok.text == ";" and self.in_typedef:
            self.in_typedef = False
            if self.prev_token.kind == "IDENT":
                m = OBSOLETE_TYPE_RE_.match(self.prev_token.text)
                if m and m.group(1) != "__":
                    self.reporter.error(self.prev_token, "use of {!r}")
            self.prev_token = None
        else:
            self._check_prev()

        self.prev_token = tok

    def eof(self):
        self._check_prev()

    def _check_prev(self):
        if (self.prev_token is not None
            and self.prev_token.kind == "IDENT"
            and OBSOLETE_TYPE_RE_.match(self.prev_token.text)):
            self.reporter.error(self.prev_token, "use of {!r}")

class ObsoletePublicDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the public versions of the obsolete
       typedefs.  Only specific forms of definition are allowed:

           typedef __obsolete obsolete;  // identifiers must agree
           typedef __uintN_t u_intN_t;   // N must agree
           typedef unsigned long int ulong;
           typedef unsigned short int ushort;
           typedef unsigned int uint;
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        self.typedef_tokens = []

    def examine(self, tok):
        if tok.kind in ("WHITESPACE", "BLOCK_COMMENT",
                        "LINE_COMMENT", "NL", "ESCNL"):
            pass

        elif (tok.kind == "IDENT" and tok.text == "typedef"
              and tok.context is None):
            if self.typedef_tokens:
                self.reporter.error(tok, "typedef inside typedef")
                self._reset()
            self.typedef_tokens.append(tok)

        elif tok.kind == "PUNCTUATOR" and tok.text == ";":
            self._finish()

        elif self.typedef_tokens:
            self.typedef_tokens.append(tok)

    def eof(self):
        self._reset()

    def _reset(self):
        while self.typedef_tokens:
            tok = self.typedef_tokens.pop(0)
            if tok.kind == "IDENT" and OBSOLETE_TYPE_RE_.match(tok.text):
                self.reporter.error(tok, "use of {!r}")

    def _finish(self):
        if not self.typedef_tokens: return
        if self.typedef_tokens[-1].kind == "IDENT":
            m = OBSOLETE_TYPE_RE_.match(self.typedef_tokens[-1].text)
            if m:
                if self._permissible_public_definition(m):
                    self.typedef_tokens.clear()
        self._reset()

    def _permissible_public_definition(self, m):
        if m.group(1) == "__": return False
        name = m.group(2)
        toks = self.typedef_tokens
        ntok = len(toks)
        if ntok == 3 and toks[1].kind == "IDENT":
            defn = toks[1].text
            n = OBSOLETE_TYPE_RE_.match(defn)
            if n and n.group(1) == "__" and n.group(2) == name:
                return True

            if (name[:5] == "u_int" and name[-2:] == "_t"
                and defn[:6] == "__uint" and defn[-2:] == "_t"
                and name[5:-2] == defn[6:-2]):
                return True

            return False

        if (name == "ulong" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "long"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        if (name == "ushort" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "short"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        if (name == "uint" and ntok == 4
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "int"):
            return True

        return False

def ObsoleteTypedefChecker(reporter, fname):
    """Factory: produce an instance of the appropriate
       obsolete-typedef checker for FNAME."""

    # The obsolete rpc/ and rpcsvc/ headers are allowed to use the
    # obsolete types, because it would be more trouble than it's
    # worth to remove them from headers that we intend to stop
    # installing eventually anyway.
    if (fname.startswith("rpc/")
        or fname.startswith("rpcsvc/")
        or "/rpc/" in fname
        or "/rpcsvc/" in fname):
        return NoCheck(reporter)

    # bits/types.h is allowed to define the __-versions of the
    # obsolete types.
    if (fname == "bits/types.h"
        or fname.endswith("/bits/types.h")):
        return ObsoletePrivateDefinitionsAllowed(reporter)

    # sys/types.h is allowed to use the __-versions of the
    # obsolete types, but only to define the unprefixed versions.
    if (fname == "sys/types.h"
        or fname.endswith("/sys/types.h")):
        return ObsoletePublicDefinitionsAllowed(reporter)

    return ObsoleteNotAllowed(reporter)

#
# Master control
#

class HeaderChecker:
    """Perform all of the checks on each header.  This is also the
       "reporter" object expected by tokenize_c and ConstructChecker.
    """
    def __init__(self):
        self.fname = None
        self.status = 0

    def error(self, tok, message):
        self.status = 1
        if '{!r}' in message:
            message = message.format(tok.text)
        sys.stderr.write("{}:{}:{}: error: {}\n".format(
            self.fname, tok.line, tok.column, message))
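
    # Diagnostics come out in the usual file:line:column style, e.g.
    # (with an illustrative file name and position):
    #     sys/foo.h:12:5: error: use of 'u_int'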

    def check(self, fname):
        self.fname = fname
        try:
            with open(fname, "rt", encoding="utf-8") as fp:
                contents = fp.read()
        except OSError as e:
            sys.stderr.write("{}: {}\n".format(fname, e.strerror))
            self.status = 1
            return

        typedef_checker = ObsoleteTypedefChecker(self, self.fname)

        for tok in tokenize_c(contents, self):
            typedef_checker.examine(tok)
        # Give the checker a chance to flush any state still pending at
        # end of file (see ConstructChecker.eof above).
        typedef_checker.eof()

def main():
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("headers", metavar="header", nargs="+",
                    help="one or more headers to scan for obsolete constructs")
    args = ap.parse_args()

    checker = HeaderChecker()
    for fname in args.headers:
        # Headers whose installed name begins with "finclude/" contain
        # Fortran, not C, and this program should completely ignore them.
        if not (fname.startswith("finclude/") or "/finclude/" in fname):
            checker.check(fname)
    sys.exit(checker.status)

main()