This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Improved check-localedef script


Here is an improved version of the check-localedef script I posted the
other week.  It now takes only about 1.5 seconds to process all the
files in localedata/locales/ (down from seven seconds with the old
parser), which is fast enough that I think it would be reasonable to
run it during 'make check'.  Many bugs have been fixed as well; in
particular, the "can we encode this string in the charset that the
file is annotated with" test now actually _runs_...

... and finds dozens and dozens of errors. The full list is attached,
but here's a small sample:

localedata/locales/ur_PK... (charset: cp1256)
  localedata/locales/ur_PK:114: string not representable in cp1256:
      062C 0646 0648 0631 06CC
  localedata/locales/ur_PK:115: string not representable in cp1256:
      0641 0631 0648 0631 06CC
  localedata/locales/ur_PK:117: string not representable in cp1256:
      0627 067E 0631 06CC 0644

These are the abmon strings, so I think it really would be a problem...
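
(For the record: cp1256 is the Windows Arabic code page, and at least
in these strings the offending character is U+06CC ARABIC LETTER FARSI
YEH, which cp1256 simply doesn't have.  You can reproduce the failure
by hand; for the line-114 string above, Python reports something like

    >>> "\u062C\u0646\u0648\u0631\u06CC".encode("cp1256")
    UnicodeEncodeError: 'charmap' codec can't encode character
    '\u06cc' in position 4: character maps to <undefined>

which is exactly the condition the script's representability check
trips on.)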

zw
#!/usr/bin/python3
# Validate locale definitions.
# Copyright (C) 2017 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

"""Validate locale definition files in ways that are too complicated
or too expensive to code into localedef.  This script is run over all
locale definitions as part of 'make check', when Python 3 is available.

Currently this performs two checks on each string within each file on
the command line: it must be in either Unicode NFC or NFD (we don't
care which), and it must be representable in the legacy character
set(s) declared in an annotation (e.g. % Charset: ISO-8859-5, KOI8-R).

It also performs several checks on the overall syntax of the file:

Outside of comments, the only characters allowed are the ASCII graphic
characters (U+0021 through U+007E inclusive), U+0020 SPACE, U+0009
HORIZONTAL TAB, and U+000A NEW LINE; in particular, the other
characters counted as "whitespace" in the POSIX locale are NOT
allowed.  Inside comments, this rule is relaxed to permit most Unicode
characters (see INAPPROPRIATE_UNICODE); we might in the future start
allowing "raw" Unicode text in strings as well.

Byte escapes (/xxx, where / is the escape character) are only to be
used to escape newline, ", <, >, and the escape character itself. All
other characters that can't be written directly should be written as
<Unnnn> instead.

The escape_char and comment_char directives' arguments are
sanity-checked: both take a single character, which must be an ASCII
graphic character and may not be any of , ; < > ".  Finally, the
escape character and the comment character may not be the same.

"..." strings and <...> symbols must be properly closed before the end
of the line.  Hard tabs are not permitted inside strings (write
<U0009> if you really mean to put a tab inside a string) and if
escape-newline is used to continue a string onto the next line, the
first character on the next line may not be a space (write <U0020> if
you really mean to do that).

"""

import argparse
import codecs
import contextlib
import functools
import itertools
import re
import sys
import unicodedata

class ErrorLogger:
    """Object responsible for all error message output; keeps track of
       things like the file currently being processed, and whether any
       errors have so far been encountered."""
    def __init__(self, ofp, verbose):
        self.ofp     = ofp
        self.verbose = verbose
        self.status  = 0
        self.fname   = None
        self.fstatus = 0
        self.tblib   = None
        self.twlib   = None

    def begin_file(self, fname):
        self.fname   = fname
        self.fstatus = 0
        if self.verbose:
            self.ofp.write(self.fname)
            self.ofp.write("...")

    def end_file(self):
        if self.fstatus:
            self.status = 1
        elif self.verbose:
            self.ofp.write(" OK\n")

    def error(self, lno, message, *args):
        if self.verbose:
            if self.fstatus == 0:
                self.ofp.write("\n")
            self.ofp.write("  ")
        if args:
            message = message.format(*args)
        self.ofp.write("{}:{}: {}\n".format(self.fname, lno, message))

        self.fstatus = 1

    def oserror(self, filename, errmsg):
        # If all these things are true, the last thing printed was the
        # filename that provoked an OS error (e.g. we failed to open the
        # file we're logging for) so just print the error message.
        if self.verbose and self.fname == filename and self.fstatus == 0:
            self.ofp.write(errmsg)
            self.ofp.write("\n")
        else:
            if self.verbose:
                if self.fstatus == 0:
                    self.ofp.write("\n")
                self.ofp.write("  ")
            self.ofp.write("{}: {}\n".format(filename, errmsg))

        self.fstatus = 1

    def exception(self):
        exi = sys.exc_info()

        # The traceback module is lazily loaded since this method should
        # only need to be called if there's a bug in this program.
        if self.tblib is None:
            import traceback
            self.tblib = traceback

        if self.verbose:
            if self.fstatus == 0:
                self.ofp.write("\n")
            prefix = "  "
        else:
            prefix = ""
            self.ofp.write("{}: error:\n".format(self.fname))

        for msg in self.tblib.format_exception(*exi):
            for m in msg.split("\n"):
                if m:
                    self.ofp.write(prefix)
                    self.ofp.write(m)
                    self.ofp.write("\n")

        self.fstatus = 1

    def dump_codepoints(self, label, s):

        # The textwrap module is lazily loaded since this method should
        # only need to be called if there's a problem with the locale data.
        if self.twlib is None:
            import textwrap
            self.twlib = textwrap

        codepoints = [ord(c) for c in s]
        if any(c > 0xFFFF for c in codepoints):
            form = "06X"
        else:
            form = "04X"
        dumped = " ".join(format(c, form) for c in codepoints)
        if self.verbose:
            label = "  " + label
        self.ofp.write(self.twlib.fill(dumped, width=78,
                                       initial_indent=label,
                                       subsequent_indent=" "*len(label)))
        self.ofp.write("\n")

@contextlib.contextmanager
def logging_for_file(log, fname):
    try:
        log.begin_file(fname)
        yield
    except OSError as e:
        log.oserror(e.filename, e.strerror)
    except Exception:
        log.exception()
    finally:
        log.end_file()

# A strict definition of 'inappropriate character', meant for
# everything outside comments: all characters _except_ the ASCII
# graphic characters, space, tab, and newline.
class InappropriateASCII:
    def __contains__(self, c):
        return not ("!" <= c <= "~" or c in (" ", "\t", "\n"))
INAPPROPRIATE_ASCII = InappropriateASCII()
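# For example:
#     >>> "\r" in INAPPROPRIATE_ASCII
#     True
#     >>> "a" in INAPPROPRIATE_ASCII
#     False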

# A relaxed definition of 'inappropriate character', meant for
# comments only: arbitrary Unicode characters are allowed, but not
# the legacy control characters (except TAB), nor the Unicode NIH
# line-breaking characters, nor bare surrogates, nor noncharacters.
# Private-use, not-yet-assigned, and format controls (Cf) are fine,
# except that BYTE ORDER MARK (U+FEFF) is not allowed.  OBJECT
# REPLACEMENT CHARACTER (U+FFFC) and REPLACEMENT CHARACTER (U+FFFD)
# are officially "symbols", but we weed them out as well, because
# their presence in a locale file means something has gone wrong
# somewhere.
INAPPROPRIATE_UNICODE = frozenset(chr(c) for c in itertools.chain(
    range(0x0000, 0x0009),
    range(0x000A, 0x0020),
    range(0x007F, 0x00A0),
    range(0xD800, 0xE000),
    range(0xFDD0, 0xFDF0),
    (i * 0x10000 + 0xFFFE for i in range(0x11)),
    (i * 0x10000 + 0xFFFF for i in range(0x11)),
    (0x2028, 0x2029, 0xFEFF, 0xFFFC, 0xFFFD)
))
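# For example, "\uFFFE" (a noncharacter) and "\u2028" (LINE SEPARATOR)
# are members of this set, while ordinary non-ASCII text is not:
#     >>> "\uFFFE" in INAPPROPRIATE_UNICODE
#     True
#     >>> "\u00E9" in INAPPROPRIATE_UNICODE   # e-acute
#     False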

@functools.lru_cache(maxsize=32)
def compile_token_re(escape_char, comment_char):

    graphic = set(chr(c) for c in range(0x21, 0x7F))
    special = { escape_char, comment_char, ',', ';', '<', '>', '"' }
    wordchar = ''.join((x if x not in ('-', '\\', ']') else '\\' + x)
                       for x in sorted(graphic - special))

    # Note: POSIX specifically says that comments are _not_ continued
    # onto the next line by the escape_char.
    abstract_token_re = r"""(?msx)
             (?P<COMMA>    ,                        )
      |      (?P<SEMI>     ;                        )
      |      (?P<NEWLINE>  \n                       )
      |      (?P<WHITE>    [ \t]+                   )
      |      (?P<WORD>     (?:[{wordchar}]|{ec}.)+  )
      | "    (?P<STRING>   (?:[^"\n{ec}]|{ec}.)*    ) (?:"|$)
      | <    (?P<SYMBOL>   (?:[^>\n{ec}]|{ec}.)*    ) (?:>|$)
      | {cc} (?P<COMMENT>  [^\n]*                   )
      |      (?P<BAD>      .                        )
    """

    return re.compile(abstract_token_re.format(
        wordchar = wordchar,
        ec       = re.escape(escape_char),
        cc       = re.escape(comment_char)))

@functools.lru_cache(maxsize=32)
def compile_esc_re(escape_char):
    return re.compile(r"""(?six)
        {ec} (?P<ESC> [0-7]{{1,3}} | d[0-9]{{1,3}} | x[0-9a-f]{{1,2}} | . )
      |   <U (?P<UNI> [0-9a-f]{{1,8}} ) >
    """.format(ec=re.escape(escape_char)))

directive_re = re.compile(
    r"[ \t]*(comment|escape)_char[ \t]*([^\n\t ]*)(?:[ \t][^\n]*)?(\n|\Z)")

charset_re = re.compile(r"(?i)[ \t]*charset:[ \t]*(.+)$")
charset_split_re = re.compile(r"[,; \t][ \t]*")

def add_charsets(line, lno, charsets, log):
    m = charset_re.match(line)
    if not m:
        return

    for cs in charset_split_re.split(m.group(1)):
        try:
            co = codecs.lookup(cs)
            if co.name not in charsets:
                charsets[co.name] = co

        except LookupError:
            log.error(lno, "unknown charset {!r}", cs)

def scan_localedef(fp, log):
    """Scan through a locale definition file, FP.  Returns a list of
       all strings appearing in the file, as 2-tuples (lno, string),
       and a dict containing codecs for all the character-set annotations.
       May also emit error messages.
       Assumes that log.begin_file() has been called for the file FP.
    """
    strings = []
    charsets = {}
    escape_char = '\\'
    comment_char = '#'
    lno = 1
    data = fp.read()

    def decode_and_diagnose_esc(m):
        g = m.lastgroup
        c = m.group(g)
        if g == "UNI":
            return chr(int(c, 16))
        else:
            if c == '\n':
                # Look one past the end of the match.  Is it whitespace?
                loc = m.end(g)
                if len(m.string) > loc and m.string[loc] in " \t":
                    log.error(lno, "leading whitespace in string "
                              "after escaped newline")
                return ''
            if c not in '<>"' and c != escape_char:
                log.error(lno, "inappropriate escape sequence '{}'",
                          m.group(0))

            if len(c) == 1 and c not in "01234567":
                return c

            p = c[0]
            if p in ('d', 'D'):
                base = 10
                c = c[1:]
            elif p in ('x', 'X'):
                base = 16
                c = c[1:]
            else:
                base = 8
            return chr(int(c, base))
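
    # (With the default escape character, decode_and_diagnose_esc maps
    # "\101", "\d065", and "\x41" all to "A", diagnosing each of them
    # as an inappropriate escape along the way, while "<U0041>" decodes
    # to "A" without complaint.)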


    # We only recognize the 'escape_char' and 'comment_char' directives
    # if they appear (in either order) on the very first one or two lines
    # in the file.
    for _ in range(2):
        m = directive_re.match(data)
        if not m: break

        data = data[m.end():]
        which = m.group(1)
        arg = m.group(2)
        if len(arg) == 0:
            log.error(lno, "missing argument to {}_char directive", which)
        elif len(arg) != 1:
            log.error(lno, "argument to {}_char must be a single character",
                      which)
        elif not ("!" <= arg <= "~" and arg not in ',;<>"'):
            log.error(lno, "{}_char may not be set to {!r}", which, arg)
        elif which == "comment":
            comment_char = arg
        else:
            escape_char = arg

        # Count the directive's newline only after diagnosing, so that
        # any error above is reported on the directive's own line.
        if m.group(3) == '\n': lno += 1

    if comment_char == escape_char:
        log.error(lno, "comment_char and escape_char both set to {!r}",
                  comment_char)
        escape_char = '\\'
        comment_char = '#'

    token_re = compile_token_re(escape_char, comment_char)
    esc_re = compile_esc_re(escape_char)

    for m in token_re.finditer(data):
        kind = m.lastgroup

        if kind == "NEWLINE":
            lno += 1

        elif kind == "COMMENT":
            add_charsets(m.group(kind), lno, charsets, log)

        elif kind == "WORD":
            value = m.group(kind)
            if value == "comment_char" or value == "escape_char":
                log.error(lno,
                          "{} directive must be at the top of the file",
                          value)

            # Run this for its diagnostic output only.
            for xm in esc_re.finditer(value):
                decode_and_diagnose_esc(xm)

            lno += value.count('\n')

        elif kind == "SYMBOL":
            value = m.group(kind)

            # Check for close quote.
            end = m.end(kind)
            if len(data) == end or data[end] != '>':
                log.error(lno, "missing close '>' character")

            # Run this for its diagnostic output only.
            for xm in esc_re.finditer(value):
                decode_and_diagnose_esc(xm)

            lno += value.count('\n')

        elif kind == "STRING":
            value = m.group(kind)

            # Diagnose hard tabs (not written as <U0009>) embedded in
            # strings.
            if "\t" in value:
                log.error(lno, "hard tab character in string")

            # Check for close quote.
            end = m.end(kind)
            if len(data) == end or data[end] != '"':
                log.error(lno, "missing close '\"' character")

            s = esc_re.sub(decode_and_diagnose_esc, value)
            if s:
                strings.append((lno, s))

            lno += value.count('\n')

        #else: other token types are currently ignored

    return strings, charsets


unicode_symbol_re = re.compile("(?i)<U([0-9a-f]+)>")
def decode_unicode_symbols(s, lno, log):
    """Convert <Uxxxx> tokens to the corresponding characters.
       Other symbolic names are left untouched."""
    try:
        return unicode_symbol_re.sub(lambda c: chr(int(c.group(1), 16)), s)
    except (UnicodeError, ValueError) as e:
        log.error(lno, "invalid <Uxxxx> token in string: {}", str(e))
        return s

def process(fp, log):
    strings, charsets = scan_localedef(fp, log)

    if log.verbose and not log.fstatus and charsets:
        log.ofp.write(" (charset{}: {})".format(
            "s" if len(charsets) > 1 else "",
            " ".join(sorted(charsets.keys()))))

    for lno, s in strings:
        nfc_s = unicodedata.normalize("NFC", s)
        nfd_s = unicodedata.normalize("NFD", s)
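        # (For example, "e" followed by U+0301 COMBINING ACUTE ACCENT is
        # valid NFD, and precomposed U+00E9 is valid NFC, so either
        # spelling passes; a string that mixes precomposed and
        # decomposed accents matches neither normalization and gets
        # flagged.)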
        if s != nfd_s and s != nfc_s:
            log.error(lno, "string not normalized:")
            log.dump_codepoints("  source: ", s)
            if nfc_s == nfd_s:
                log.dump_codepoints("  nf[cd]: ", nfc_s)
            else:
                log.dump_codepoints("     nfc: ", nfc_s)
                log.dump_codepoints("     nfd: ", nfd_s)

        for charset, codec in sorted(charsets.items()):
            # It's not necessary to do this test for UTF-8.
            if charset != "utf-8":
                try:
                    _ = codec.encode(s)
                except UnicodeEncodeError:
                    log.error(lno, "string not representable in {}:", charset)
                    log.dump_codepoints("    ", s)

def process_files(args):
    logger = ErrorLogger(sys.stderr, args.verbose)

    for f in args.files:
        with logging_for_file(logger, f), \
             open(f, "rt", encoding=args.encoding) as fp:
            process(fp, logger)

    return logger.status

def main():
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("-v", "--verbose", action="store_true")
    ap.add_argument("-e", "--encoding", default="utf-8")
    ap.add_argument("files", nargs="+")
    args = ap.parse_args()
    sys.exit(process_files(args))

if __name__ == '__main__':
    main()
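
To try it by hand, point it at the locale sources; for instance,
assuming the script is saved as check-localedef.py:

    python3 check-localedef.py -v localedata/locales/*

-v produces the per-file progress report shown in the sample above;
without it, only the error messages are printed.  -e/--encoding
overrides the assumed encoding of the input files (default utf-8).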

Attachment: check-localedef.errs
Description: Binary data

