This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

localedata linting revised again


I've revised my localedata linter to use iconv instead of Python's
built-in codecs, and to only complain about strings being
unrepresentable if transliteration doesn't help.

All of the remaining complaints are about strings that aren't NFC
(full list at bottom of this message).  Most, but not all, of these
appear to be LC_COLLATE specifications for decomposed accented
characters, which I would have expected to be handled generically for
all languages (if there is a canonical equivalence between two
codepoint sequences, then it seems intuitively obvious to me that they
should always be treated the same for collation, perhaps with the
actual code points used as a tiebreaker).  But given the contents of
the various files, apparently it isn't, and I think that's a bug.

zw

---

localedata/locales/as_IN:120: string not normalized:
  source: 09B9 09DF
     nfc: 09B9 09AF 09BC
localedata/locales/as_IN:121: string not normalized:
  source: 09A8 09B9 09DF
     nfc: 09A8 09B9 09AF 09BC
localedata/locales/da_DK:144: string not normalized:
  source: 0041 0308
     nfc: 00C4
localedata/locales/da_DK:146: string not normalized:
  source: 0041 030A
     nfc: 00C5
localedata/locales/da_DK:150: string not normalized:
  source: 004F 0308
     nfc: 00D6
localedata/locales/da_DK:154: string not normalized:
  source: 0061 0308
     nfc: 00E4
localedata/locales/da_DK:156: string not normalized:
  source: 0061 030A
     nfc: 00E5
localedata/locales/da_DK:160: string not normalized:
  source: 006F 0308
     nfc: 00F6
localedata/locales/de_DE:50: string not normalized:
  source: 0041 0308
     nfc: 00C4
localedata/locales/de_DE:52: string not normalized:
  source: 004F 0308
     nfc: 00D6
localedata/locales/de_DE:54: string not normalized:
  source: 0055 0308
     nfc: 00DC
localedata/locales/de_DE:56: string not normalized:
  source: 0061 0308
     nfc: 00E4
localedata/locales/de_DE:58: string not normalized:
  source: 006F 0308
     nfc: 00F6
localedata/locales/de_DE:60: string not normalized:
  source: 0075 0308
     nfc: 00FC
localedata/locales/de_DE:64: string not normalized:
  source: 0041 030A
     nfc: 00C5
localedata/locales/de_DE:66: string not normalized:
  source: 0061 030A
     nfc: 00E5
localedata/locales/fa_IR:106: string not normalized:
  source: 0627 0653
     nfc: 0622
localedata/locales/fa_IR:107: string not normalized:
  source: 0627 0654
     nfc: 0623
localedata/locales/fa_IR:108: string not normalized:
  source: 0627 0655
     nfc: 0625
localedata/locales/fa_IR:109: string not normalized:
  source: 0648 0654
     nfc: 0624
localedata/locales/fa_IR:111: string not normalized:
  source: 064A 0654
     nfc: 0626
localedata/locales/hu_HU:460: string not normalized:
  source: 0041 0301
     nfc: 00C1
localedata/locales/hu_HU:461: string not normalized:
  source: 0045 0301
     nfc: 00C9
localedata/locales/hu_HU:462: string not normalized:
  source: 0049 0301
     nfc: 00CD
localedata/locales/hu_HU:463: string not normalized:
  source: 004F 0301
     nfc: 00D3
localedata/locales/hu_HU:464: string not normalized:
  source: 004F 0308
     nfc: 00D6
localedata/locales/hu_HU:465: string not normalized:
  source: 004F 030B
     nfc: 0150
localedata/locales/hu_HU:466: string not normalized:
  source: 0055 0301
     nfc: 00DA
localedata/locales/hu_HU:467: string not normalized:
  source: 0055 0308
     nfc: 00DC
localedata/locales/hu_HU:468: string not normalized:
  source: 0055 030B
     nfc: 0170
localedata/locales/hu_HU:470: string not normalized:
  source: 0061 0301
     nfc: 00E1
localedata/locales/hu_HU:471: string not normalized:
  source: 0065 0301
     nfc: 00E9
localedata/locales/hu_HU:472: string not normalized:
  source: 0069 0301
     nfc: 00ED
localedata/locales/hu_HU:473: string not normalized:
  source: 006F 0301
     nfc: 00F3
localedata/locales/hu_HU:474: string not normalized:
  source: 006F 0308
     nfc: 00F6
localedata/locales/hu_HU:475: string not normalized:
  source: 006F 030B
     nfc: 0151
localedata/locales/hu_HU:476: string not normalized:
  source: 0075 0301
     nfc: 00FA
localedata/locales/hu_HU:477: string not normalized:
  source: 0075 0308
     nfc: 00FC
localedata/locales/hu_HU:478: string not normalized:
  source: 0075 030B
     nfc: 0171
localedata/locales/ig_NG:103: string not normalized:
  source: 0049 0323
     nfc: 1ECA
localedata/locales/ig_NG:104: string not normalized:
  source: 0069 0323
     nfc: 1ECB
localedata/locales/ig_NG:105: string not normalized:
  source: 0049 0323 0301
     nfc: 1ECA 0301
localedata/locales/ig_NG:106: string not normalized:
  source: 0069 0323 0301
     nfc: 1ECB 0301
localedata/locales/ig_NG:107: string not normalized:
  source: 0049 0323 0300
     nfc: 1ECA 0300
localedata/locales/ig_NG:108: string not normalized:
  source: 0069 0323 0300
     nfc: 1ECB 0300
localedata/locales/ig_NG:114: string not normalized:
  source: 004F 0323
     nfc: 1ECC
localedata/locales/ig_NG:115: string not normalized:
  source: 006F 0323
     nfc: 1ECD
localedata/locales/ig_NG:116: string not normalized:
  source: 004F 0323 0301
     nfc: 1ECC 0301
localedata/locales/ig_NG:117: string not normalized:
  source: 006F 0323 0301
     nfc: 1ECD 0301
localedata/locales/ig_NG:118: string not normalized:
  source: 004F 0323 0300
     nfc: 1ECC 0300
localedata/locales/ig_NG:119: string not normalized:
  source: 006F 0323 0300
     nfc: 1ECD 0300
localedata/locales/ig_NG:130: string not normalized:
  source: 0055 0323
     nfc: 1EE4
localedata/locales/ig_NG:131: string not normalized:
  source: 0075 0323
     nfc: 1EE5
localedata/locales/ig_NG:132: string not normalized:
  source: 0055 0323 0301
     nfc: 1EE4 0301
localedata/locales/ig_NG:133: string not normalized:
  source: 0075 0323 0301
     nfc: 1EE5 0301
localedata/locales/ig_NG:134: string not normalized:
  source: 0075 0323 0300
     nfc: 1EE5 0300
localedata/locales/ig_NG:135: string not normalized:
  source: 0055 0323 0300
     nfc: 1EE4 0300
localedata/locales/ig_NG:141: string not normalized:
  source: 004E 0307
     nfc: 1E44
localedata/locales/ig_NG:142: string not normalized:
  source: 006E 0307
     nfc: 1E45
localedata/locales/ig_NG:144: string not normalized:
  source: 0041 0301
     nfc: 00C1
localedata/locales/ig_NG:145: string not normalized:
  source: 0061 0301
     nfc: 00E1
localedata/locales/ig_NG:147: string not normalized:
  source: 0045 0301
     nfc: 00C9
localedata/locales/ig_NG:148: string not normalized:
  source: 0065 0301
     nfc: 00E9
localedata/locales/ig_NG:150: string not normalized:
  source: 0049 0301
     nfc: 00CD
localedata/locales/ig_NG:151: string not normalized:
  source: 0069 0301
     nfc: 00ED
localedata/locales/ig_NG:153: string not normalized:
  source: 004F 0301
     nfc: 00D3
localedata/locales/ig_NG:154: string not normalized:
  source: 006F 0301
     nfc: 00F3
localedata/locales/ig_NG:156: string not normalized:
  source: 0055 0301
     nfc: 00DA
localedata/locales/ig_NG:157: string not normalized:
  source: 0075 0301
     nfc: 00FA
localedata/locales/ig_NG:159: string not normalized:
  source: 0041 0300
     nfc: 00C0
localedata/locales/ig_NG:160: string not normalized:
  source: 0061 0300
     nfc: 00E0
localedata/locales/ig_NG:162: string not normalized:
  source: 0045 0300
     nfc: 00C8
localedata/locales/ig_NG:163: string not normalized:
  source: 0065 0300
     nfc: 00E8
localedata/locales/ig_NG:165: string not normalized:
  source: 0049 0300
     nfc: 00CC
localedata/locales/ig_NG:166: string not normalized:
  source: 0069 0300
     nfc: 00EC
localedata/locales/ig_NG:168: string not normalized:
  source: 004F 0300
     nfc: 00D2
localedata/locales/ig_NG:169: string not normalized:
  source: 006F 0300
     nfc: 00F2
localedata/locales/ig_NG:171: string not normalized:
  source: 0055 0300
     nfc: 00D9
localedata/locales/ig_NG:172: string not normalized:
  source: 0075 0300
     nfc: 00F9
localedata/locales/ik_CA:77: string not normalized:
  source: 004C 0323
     nfc: 1E36
localedata/locales/ik_CA:78: string not normalized:
  source: 006C 0323
     nfc: 1E37
localedata/locales/lb_LU:50: string not normalized:
  source: 0041 0308
     nfc: 00C4
localedata/locales/lb_LU:52: string not normalized:
  source: 004F 0308
     nfc: 00D6
localedata/locales/lb_LU:54: string not normalized:
  source: 0055 0308
     nfc: 00DC
localedata/locales/lb_LU:56: string not normalized:
  source: 0045 0308
     nfc: 00CB
localedata/locales/lb_LU:58: string not normalized:
  source: 0061 0308
     nfc: 00E4
localedata/locales/lb_LU:60: string not normalized:
  source: 006F 0308
     nfc: 00F6
localedata/locales/lb_LU:62: string not normalized:
  source: 0075 0308
     nfc: 00FC
localedata/locales/lb_LU:64: string not normalized:
  source: 0065 0308
     nfc: 00EB
localedata/locales/mni_IN:67: string not normalized:
  source: 09DF 09C1 09AE
     nfc: 09AF 09BC 09C1 09AE
localedata/locales/mni_IN:76: string not normalized:
  source: 09DF 09C1 09AE 09B6 0995 09C8 09B6 09BE
     nfc: 09AF 09BC 09C1 09AE 09B6 0995 09C8 09B6 09BE
localedata/locales/nb_NO:137: string not normalized:
  source: 0041 0308
     nfc: 00C4
localedata/locales/nb_NO:139: string not normalized:
  source: 0041 030A
     nfc: 00C5
localedata/locales/nb_NO:143: string not normalized:
  source: 004F 0308
     nfc: 00D6
localedata/locales/nb_NO:147: string not normalized:
  source: 0061 0308
     nfc: 00E4
localedata/locales/nb_NO:149: string not normalized:
  source: 0061 030A
     nfc: 00E5
localedata/locales/nb_NO:153: string not normalized:
  source: 006F 0308
     nfc: 00F6
localedata/locales/pa_IN:95: string not normalized:
  source: 0A36 0A41 0A71 0A15 0A30
     nfc: 0A38 0A3C 0A41 0A71 0A15 0A30
localedata/locales/pa_IN:96: string not normalized:
  source: 0A36 0A28 0A3F 0A71 0A1A 0A30
     nfc: 0A38 0A3C 0A28 0A3F 0A71 0A1A 0A30
localedata/locales/pa_IN:104: string not normalized:
  source: 0A36 0A41 0A71 0A15 0A30 0A35 0A3E 0A30
     nfc: 0A38 0A3C 0A41 0A71 0A15 0A30 0A35 0A3E 0A30
localedata/locales/pa_IN:105: string not normalized:
  source: 0A36 0A28 0A3F 0A71 0A1A 0A30 0A35 0A3E 0A30
     nfc: 0A38 0A3C 0A28 0A3F 0A71 0A1A 0A30 0A35 0A3E 0A30
localedata/locales/pa_IN:139: string not normalized:
  source: 0A36 0A3E 0A2E
     nfc: 0A38 0A3C 0A3E 0A2E
localedata/locales/sgs_LT:85: string not normalized:
  source: 0070 0065 0074 006E 0069 0304 010D 0117
     nfc: 0070 0065 0074 006E 012B 010D 0117
localedata/locales/sgs_LT:117: string not normalized:
  source: 0074 0227 0304 0070
     nfc: 0074 01E1 0070
localedata/locales/sgs_LT:118: string not normalized:
  source: 006E 0065 0304
     nfc: 006E 0113
localedata/locales/sv_SE:121: string not normalized:
  source: 0041 0308
     nfc: 00C4
localedata/locales/sv_SE:123: string not normalized:
  source: 0041 030A
     nfc: 00C5
localedata/locales/sv_SE:127: string not normalized:
  source: 004F 0308
     nfc: 00D6
localedata/locales/sv_SE:131: string not normalized:
  source: 0061 0308
     nfc: 00E4
localedata/locales/sv_SE:133: string not normalized:
  source: 0061 030A
     nfc: 00E5
localedata/locales/sv_SE:137: string not normalized:
  source: 006F 0308
     nfc: 00F6
localedata/locales/wa_BE:67: string not normalized:
  source: 0041 030A
     nfc: 00C5
localedata/locales/wa_BE:68: string not normalized:
  source: 0061 030A
     nfc: 00E5
localedata/locales/yo_NG:79: string not normalized:
  source: 0045 0323
     nfc: 1EB8
localedata/locales/yo_NG:80: string not normalized:
  source: 0065 0323
     nfc: 1EB9
localedata/locales/yo_NG:85: string not normalized:
  source: 0045 0323 0301
     nfc: 1EB8 0301
localedata/locales/yo_NG:86: string not normalized:
  source: 0065 0323 0301
     nfc: 1EB9 0301
localedata/locales/yo_NG:87: string not normalized:
  source: 0045 0323 0300
     nfc: 1EB8 0300
localedata/locales/yo_NG:88: string not normalized:
  source: 0065 0323 0300
     nfc: 1EB9 0300
localedata/locales/yo_NG:90: string not normalized:
  source: 004F 0323
     nfc: 1ECC
localedata/locales/yo_NG:91: string not normalized:
  source: 006F 0323
     nfc: 1ECD
localedata/locales/yo_NG:96: string not normalized:
  source: 004F 0323 0301
     nfc: 1ECC 0301
localedata/locales/yo_NG:97: string not normalized:
  source: 006F 0323 0301
     nfc: 1ECD 0301
localedata/locales/yo_NG:98: string not normalized:
  source: 004F 0323 0300
     nfc: 1ECC 0300
localedata/locales/yo_NG:99: string not normalized:
  source: 006F 0323 0300
     nfc: 1ECD 0300
localedata/locales/yo_NG:101: string not normalized:
  source: 0053 0323
     nfc: 1E62
localedata/locales/yo_NG:102: string not normalized:
  source: 0073 0323
     nfc: 1E63
localedata/locales/yo_NG:104: string not normalized:
  source: 0041 0301
     nfc: 00C1
localedata/locales/yo_NG:105: string not normalized:
  source: 0061 0301
     nfc: 00E1
localedata/locales/yo_NG:106: string not normalized:
  source: 0045 0301
     nfc: 00C9
localedata/locales/yo_NG:107: string not normalized:
  source: 0065 0301
     nfc: 00E9
localedata/locales/yo_NG:108: string not normalized:
  source: 0049 0301
     nfc: 00CD
localedata/locales/yo_NG:109: string not normalized:
  source: 0069 0301
     nfc: 00ED
localedata/locales/yo_NG:110: string not normalized:
  source: 004D 0301
     nfc: 1E3E
localedata/locales/yo_NG:111: string not normalized:
  source: 006D 0301
     nfc: 1E3F
localedata/locales/yo_NG:112: string not normalized:
  source: 004E 0301
     nfc: 0143
localedata/locales/yo_NG:113: string not normalized:
  source: 006E 0301
     nfc: 0144
localedata/locales/yo_NG:114: string not normalized:
  source: 004F 0301
     nfc: 00D3
localedata/locales/yo_NG:115: string not normalized:
  source: 006F 0301
     nfc: 00F3
localedata/locales/yo_NG:116: string not normalized:
  source: 0055 0301
     nfc: 00DA
localedata/locales/yo_NG:117: string not normalized:
  source: 0075 0301
     nfc: 00FA
localedata/locales/yo_NG:119: string not normalized:
  source: 0041 0300
     nfc: 00C0
localedata/locales/yo_NG:120: string not normalized:
  source: 0061 0300
     nfc: 00E0
localedata/locales/yo_NG:121: string not normalized:
  source: 0045 0300
     nfc: 00C8
localedata/locales/yo_NG:122: string not normalized:
  source: 0065 0300
     nfc: 00E8
localedata/locales/yo_NG:123: string not normalized:
  source: 0049 0300
     nfc: 00CC
localedata/locales/yo_NG:124: string not normalized:
  source: 0069 0300
     nfc: 00EC
localedata/locales/yo_NG:127: string not normalized:
  source: 004E 0300
     nfc: 01F8
localedata/locales/yo_NG:128: string not normalized:
  source: 006E 0300
     nfc: 01F9
localedata/locales/yo_NG:129: string not normalized:
  source: 004F 0300
     nfc: 00D2
localedata/locales/yo_NG:130: string not normalized:
  source: 006F 0300
     nfc: 00F2
localedata/locales/yo_NG:131: string not normalized:
  source: 0055 0300
     nfc: 00D9
localedata/locales/yo_NG:132: string not normalized:
  source: 0075 0300
     nfc: 00F9
localedata/locales/yo_NG:139: string not normalized:
  source: 0041 0302
     nfc: 00C2
localedata/locales/yo_NG:140: string not normalized:
  source: 0061 0302
     nfc: 00E2
localedata/locales/yo_NG:141: string not normalized:
  source: 0045 0302
     nfc: 00CA
localedata/locales/yo_NG:142: string not normalized:
  source: 0065 0302
     nfc: 00EA
localedata/locales/yo_NG:143: string not normalized:
  source: 1EB8 0302
     nfc: 1EC6
localedata/locales/yo_NG:144: string not normalized:
  source: 1EB9 0302
     nfc: 1EC7
localedata/locales/yo_NG:145: string not normalized:
  source: 0049 0302
     nfc: 00CE
localedata/locales/yo_NG:146: string not normalized:
  source: 0069 0302
     nfc: 00EE
localedata/locales/yo_NG:147: string not normalized:
  source: 004F 0302
     nfc: 00D4
localedata/locales/yo_NG:148: string not normalized:
  source: 006F 0302
     nfc: 00F4
localedata/locales/yo_NG:149: string not normalized:
  source: 1ECC 0302
     nfc: 1ED8
localedata/locales/yo_NG:150: string not normalized:
  source: 1ECD 0302
     nfc: 1ED9
localedata/locales/yo_NG:151: string not normalized:
  source: 0055 0302
     nfc: 00DB
localedata/locales/yo_NG:152: string not normalized:
  source: 0075 0302
     nfc: 00FB
localedata/locales/yo_NG:154: string not normalized:
  source: 0041 030C
     nfc: 01CD
localedata/locales/yo_NG:155: string not normalized:
  source: 0061 030C
     nfc: 01CE
localedata/locales/yo_NG:156: string not normalized:
  source: 0045 030C
     nfc: 011A
localedata/locales/yo_NG:157: string not normalized:
  source: 0065 030C
     nfc: 011B
localedata/locales/yo_NG:160: string not normalized:
  source: 0049 030C
     nfc: 01CF
localedata/locales/yo_NG:161: string not normalized:
  source: 0069 030C
     nfc: 01D0
localedata/locales/yo_NG:162: string not normalized:
  source: 004F 030C
     nfc: 01D1
localedata/locales/yo_NG:163: string not normalized:
  source: 006F 030C
     nfc: 01D2
localedata/locales/yo_NG:166: string not normalized:
  source: 0055 030C
     nfc: 01D3
localedata/locales/yo_NG:167: string not normalized:
  source: 0075 030C
     nfc: 01D4
#!/usr/bin/python3
# Validate locale definitions.
# Copyright (C) 2017 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

"""Validate locale definition files in ways that are too complicated
or too expensive to code into localedef.  This script is run over all
locale definitions as part of 'make check', when Python 3 is available.

Currently this performs two checks on each string within each file on
the command line: it must be in Unicode NFC, and it must be
representable in the legacy character set(s) declared in the SUPPORTED
file.

It also performs several checks on the overall syntax of the file:

Outside of comments, the only characters allowed are the ASCII graphic
characters (U+0021 through U+007E inclusive), U+0020 SPACE, U+0009
HORIZONTAL TAB, and U+000A NEW LINE; in particular, the other
characters counted as "whitespace" in the POSIX locale are NOT
allowed.  Inside comments, this rule is relaxed to permit most Unicode
characters (see inappropriate_unicode); we might in the future start
allowing "raw" Unicode text in strings as well.

Byte escapes (/xxx, where / is the escape character) are only to be
used to escape newline, ", <, >, and the escape character itself. All
other characters that can't be written directly should be written as
<Unnnn> instead.

The escape_char and comment_char directives' arguments are
sanity-checked: both take a single character, which must be an ASCII
graphic character and may not be any of , ; < > ".  Finally, the
escape character and the comment character may not be the same.

"..." strings and <...> symbols must be properly closed before the end
of the line.  Hard tabs are not permitted inside strings (write
<U0009> if you really mean to put a tab inside a string) and if
escape-newline is used to continue a string onto the next line, the
first character on the next line may not be a space (write <U0020> if
you really mean to do that).

"""

import argparse
import ctypes
import ctypes.util
import codecs
import contextlib
import errno
import functools
import itertools
import os
import re
import sys
import unicodedata

class ErrorLogger:
    """Object responsible for all error message output; keeps track of
       things like the file currently being processed, and whether any
       errors have so far been encountered."""
    def __init__(self, ofp, verbose):
        self.ofp     = ofp
        self.verbose = verbose
        self.status  = 0
        self.fname   = None
        self.fstatus = 0
        self.tblib   = None
        self.twlib   = None

    def begin_file(self, fname):
        self.fname   = fname
        self.fstatus = 0
        if self.verbose:
            self.ofp.write(self.fname)
            self.ofp.write("...")

    def end_file(self):
        if self.fstatus:
            self.status = 1
        elif self.verbose:
            self.ofp.write(" OK\n")

    def error(self, lno, message, *args):
        if self.verbose:
            if self.fstatus == 0:
                self.ofp.write("\n")
            self.ofp.write("  ")
        if args:
            message = message.format(*args)
        self.ofp.write("{}:{}: {}\n".format(self.fname, lno, message))

        self.fstatus = 1

    def oserror(self, filename, errmsg):
        # If all these things are true, the last thing printed was the
        # filename that provoked an OS error (e.g. we failed to open the
        # file we're logging for) so just print the error message.
        if self.verbose and self.fname == filename and self.fstatus == 0:
            self.ofp.write(errmsg)
            self.ofp.write("\n")
        else:
            if self.verbose:
                if self.fstatus == 0:
                    self.ofp.write("\n")
                self.ofp.write("  ")
            self.ofp.write("{}: {}\n".format(filename, errmsg))

        self.fstatus = 1

    def exception(self):
        exi = sys.exc_info()

        # The traceback module is lazily loaded since this method should
        # only need to be called if there's a bug in this program.
        if self.tblib is None:
            import traceback
            self.tblib = traceback

        if self.verbose:
            if self.fstatus == 0:
                self.ofp.write("\n")
            prefix = "  "
        else:
            prefix = ""
            self.ofp.write("{}: error:\n".format(self.fname))

        for msg in self.tblib.format_exception(*exi):
            for m in msg.split("\n"):
                if m:
                    self.ofp.write(prefix)
                    self.ofp.write(m)
                    self.ofp.write("\n")

        self.fstatus = 1

    def dump_codepoints(self, label, s):

        # The textwrap module is lazily loaded since this method should
        # only need to be called if there's a problem with the locale data.
        if self.twlib is None:
            import textwrap
            self.twlib = textwrap

        codepoints = [ord(c) for c in s]
        if any(c > 0xFFFF for c in codepoints):
            form = "06X"
        else:
            form = "04X"
        dumped = " ".join(format(c, form) for c in codepoints)
        if self.verbose:
            label = "  " + label
        self.ofp.write(self.twlib.fill(dumped, width=78,
                                       initial_indent=label,
                                       subsequent_indent=" "*len(label)))
        self.ofp.write("\n")

    def dump_bytes(self, label, bs):
        # The textwrap module is lazily loaded since this method should
        # only need to be called if there's a problem with the locale data.
        if self.twlib is None:
            import textwrap
            self.twlib = textwrap

        dumped = " ".join(format(c, "02X") for c in bs)
        if self.verbose:
            label = "  " + label
        self.ofp.write(self.twlib.fill(dumped, width=78,
                                       initial_indent=label,
                                       subsequent_indent=" "*len(label)))
        self.ofp.write("\n")

@contextlib.contextmanager
def logging_for_file(log, fname):
    """Context manager wrapping the processing of one file FNAME.

    OSErrors raised in the body are reported via log.oserror; any other
    exception is treated as a bug in this program and dumped with a
    traceback via log.exception.  Either way the file is always closed
    out through log.end_file.
    """
    try:
        log.begin_file(fname)
        yield
    except OSError as exc:
        log.oserror(exc.filename, exc.strerror)
    except Exception:
        log.exception()
    finally:
        log.end_file()

#
# In order to match the behavior of localedef itself, we use ctypes to
# call iconv(3) from the C library, rather than using Python's codecs.
# This gets a bit ugly...
#

def prepare_raw_iconv(cls):
    """Class decorator: attach raw ctypes bindings for iconv(3) to CLS.

    Searches libc, then libiconv, for iconv_open/iconv/iconv_close,
    gives each the proper ctypes signature, and stores them on CLS
    along with the failure sentinels OPEN_FAILURE and ICONV_FAILURE.
    Raises ImportError if no library provides all three functions.
    """
    # Local aliases for the ctypes names used repeatedly below.
    c_char       = ctypes.c_char
    c_char_p     = ctypes.c_char_p
    c_int        = ctypes.c_int
    c_size_t     = ctypes.c_size_t
    c_void_p     = ctypes.c_void_p
    POINTER      = ctypes.POINTER
    get_errno    = ctypes.get_errno
    CDLL         = ctypes.CDLL
    find_library = ctypes.util.find_library
    strerror     = os.strerror

    # assumes iconv_t == void *
    class iconv_t(c_void_p): pass

    # it's simpler not to use the errcheck mechanism for iconv_open and iconv
    # (see below)

    # (size_t)-1, iconv()'s in-band error return.  Because iconv's
    # restype is plain c_size_t, its results arrive as Python ints and
    # may safely be compared against this value with ==.
    ICONV_FAILURE = c_size_t(-1).value
    def get_iconv(lib):
        # Return lib.iconv with argument/result types configured.
        iconv = lib.iconv
        # The buffers passed to iconv are not necessarily nul-terminated,
        # so it is inappropriate to use c_char_p.
        iconv.argtypes = [iconv_t,
                          POINTER(POINTER(c_char)),
                          POINTER(c_size_t),
                          POINTER(POINTER(c_char)),
                          POINTER(c_size_t)]
        iconv.restype = c_size_t
        return iconv

    # (iconv_t)-1, iconv_open()'s error return.  NOTE: iconv_t is a
    # ctypes subclass, so iconv_open's results are returned as iconv_t
    # instances, and ctypes instances compare by identity; callers must
    # compare .value against this sentinel, not the instance itself.
    ICONV_OPEN_FAILURE = iconv_t(-1)
    def get_iconv_open(lib):
        # Return lib.iconv_open with argument/result types configured.
        iconv_open = lib.iconv_open
        # These arguments are 'const' but ctypes does not expose that.
        iconv_open.argtypes = [c_char_p, c_char_p]
        iconv_open.restype  = iconv_t
        return iconv_open

    def errcheck_iconv_close(result, fn, args):
        # iconv_close returns 0 on success, nonzero with errno set on error.
        if result == 0: return
        err = get_errno()
        raise OSError(err, strerror(err))
    def get_iconv_close(lib):
        # Return lib.iconv_close, configured to raise OSError on failure.
        iconv_close = lib.iconv_close
        iconv_close.argtypes = [iconv_t]
        iconv_close.restype  = c_int
        iconv_close.errcheck = errcheck_iconv_close
        return iconv_close

    # On glibc systems iconv lives in libc itself; elsewhere it may be
    # in a separate libiconv.  Try both in turn.
    for libname in ["c", "iconv"]:
        try:
            lib = CDLL(find_library(libname), use_errno = True)
            cls.iconv_open = get_iconv_open(lib)
            cls.iconv_close = get_iconv_close(lib)
            cls.iconv = get_iconv(lib)
            cls.OPEN_FAILURE = ICONV_OPEN_FAILURE
            cls.ICONV_FAILURE = ICONV_FAILURE
            return cls

        except AttributeError:
            # This library loaded but lacks the symbols; try the next one.
            continue

    raise ImportError("iconv primitives not found")

# Namespace class carrying the raw iconv(3) bindings (iconv_open,
# iconv, iconv_close, OPEN_FAILURE, ICONV_FAILURE); the attributes are
# installed by the prepare_raw_iconv decorator at import time.
@prepare_raw_iconv
class RawIconv: pass

#
# IconvConverter is a somewhat more ergonomic interface to iconv.
# Caution: the EILSEQ logic assumes that the source encoding is
# fixed-width ('from_width' bytes per character) and the destination
# encoding is multibyte ASCII-compatible (specifically, 0x1A == U+001A
# regardless of surrounding context).
#

class IconvConverter:
    """Converts byte strings from one encoding to another via iconv(3).

    Invalid input sequences are replaced with the single byte 0x1A
    (U+001A SUBSTITUTE) rather than raising.  Caution: the EILSEQ
    recovery logic assumes that the source encoding is fixed-width
    ('from_width' bytes per character) and the destination encoding is
    multibyte ASCII-compatible (specifically, 0x1A == U+001A regardless
    of surrounding context).
    """
    def __init__(self, to_encoding, from_encoding, from_width):
        self.from_width    = from_width
        self.from_encoding = from_encoding
        self.to_encoding   = to_encoding
        # Mark the descriptor as absent up front so that close() (and
        # hence __del__) is safe even if iconv_open fails below.
        self.cd = None
        cd = RawIconv.iconv_open(to_encoding.encode("ascii"),
                                 from_encoding.encode("ascii"))
        # iconv_open's restype is a c_void_p *subclass*, so the result
        # is a ctypes instance and == against the sentinel would
        # compare object identity (always false); the failure check
        # must compare the underlying .value.
        if cd.value == RawIconv.OPEN_FAILURE.value:
            err = ctypes.get_errno()
            raise OSError(err, os.strerror(err), from_encoding, -1, to_encoding)
        self.cd = cd

    def __del__(self):
        self.close()

    def close(self):
        """Release the iconv descriptor.  Safe to call repeatedly, and
           safe on a partially-constructed instance."""
        cd = getattr(self, "cd", None)
        if cd is not None:
            RawIconv.iconv_close(cd)
            self.cd = None

    def __call__(self, ibuf):
        """Convert the byte string IBUF; return the converted bytes.

        Raises OSError for conversion errors other than EILSEQ (which
        is handled by substitution, see class docstring).
        """
        c_size_t   = ctypes.c_size_t
        c_char_p   = ctypes.c_char_p
        c_void_p   = ctypes.c_void_p
        c_char     = ctypes.c_char
        POINTER    = ctypes.POINTER
        c_char_ptr = POINTER(ctypes.c_char)
        addressof  = ctypes.addressof
        byref      = ctypes.byref
        cast       = ctypes.cast
        get_errno  = ctypes.get_errno
        resize     = ctypes.resize

        # Pointer-arithmetic helpers: ctypes pointers cannot be
        # incremented in place, so round-trip through integers.
        PVALUE     = lambda p: cast(p, c_void_p).value
        INCPTR     = lambda p, n: cast(c_void_p(PVALUE(p) + n), c_char_ptr)
        INCSIZE    = lambda s, n: c_size_t(s.value + n)

        iconv         = RawIconv.iconv
        ICONV_FAILURE = RawIconv.ICONV_FAILURE
        E2BIG         = errno.E2BIG
        EILSEQ        = errno.EILSEQ

        iptr  = cast(c_char_p(ibuf), c_char_ptr)
        ileft = c_size_t(len(ibuf))

        obuf  = ctypes.create_string_buffer(len(ibuf))
        optr  = cast(obuf, c_char_ptr)
        olen  = c_size_t(len(ibuf))
        oleft = c_size_t(olen.value)

        def grow_obuf():
            nonlocal olen, optr, oleft
            # Double the output buffer.  The resize call may move obuf,
            # so we have to recalculate optr afterward from its old
            # offset into the buffer.
            ooff = PVALUE(optr) - addressof(obuf)
            olen = c_size_t(olen.value * 2)
            resize(obuf, olen.value)
            optr = cast(c_void_p(addressof(obuf) + ooff), c_char_ptr)
            oleft = c_size_t(olen.value - ooff)


        while True:
            rv = iconv(self.cd,
                       byref(iptr), byref(ileft),
                       byref(optr), byref(oleft))

            if rv != ICONV_FAILURE:
                # Success: all input has been consumed and converted.
                assert ileft.value == 0
                return obuf.raw[:(PVALUE(optr) - addressof(obuf))]

            err = get_errno()
            if err == E2BIG:
                # Output buffer full: double it and keep converting.
                grow_obuf()

            elif err == EILSEQ:
                # Unconvertible character: skip 'from_width' bytes of
                # input (exactly one source character, by assumption).
                iptr = INCPTR(iptr, self.from_width)
                ileft = INCSIZE(ileft, -self.from_width)

                # Insert a U+001A SUBSTITUTE into the output buffer.
                # (We assume that the destination encoding is a
                # multibyte, ASCII-compatible encoding.)
                if oleft.value <= 1:
                    grow_obuf()
                obuf[PVALUE(optr) - addressof(obuf)] = 0x1A
                optr = INCPTR(optr, 1)
                oleft = INCSIZE(oleft, -1)

            else:
                # Anything else (e.g. EINVAL, truncated input) is fatal.
                raise OSError(err, os.strerror(err),
                              self.from_encoding, -1, self.to_encoding)

#
# And this is the interface the high-level code actually uses.
# It's instantiated with a "destination" encoding, and you feed it
# strings.
#

class TranscodingChecker:
    """Checks whether strings are representable in a target encoding.

    Instantiate with the destination encoding, then call the instance
    on each string.  Returns (True,) when the string is acceptable, or
    (False, substituted_bytes, transliterated_bytes) when it is not.
    """
    # UTF-32 is used internally, despite the overhead, to ensure that
    # no matter what characters are involved, IconvConverter's "skip a
    # fixed number of bytes on error" strategy will work.
    TO_UTF_32 = codecs.lookup("UTF-32BE")

    def __init__(self, encoding):
        self.encoding = encoding
        self.conv_subst = IconvConverter(encoding, "UTF-32BE", 4)
        self.conv_trans = IconvConverter(encoding + "//TRANSLIT",
                                         "UTF-32BE", 4)

    def __call__(self, text):
        ucs4 = self.TO_UTF_32.encode(text)[0]
        translit = self.conv_trans(ucs4)
        plain = self.conv_subst(ucs4)

        # A 0x1A byte in the transliterated output means some character
        # of the original could not be encoded in the target encoding
        # even with transliteration enabled: fail.
        if b'\x1a' in translit:
            return (False, plain, translit)

        # No 0x1A bytes in the transliterated output, and both
        # conversions agree: nothing required transliteration.  Success.
        if translit == plain:
            return (True,)

        # No 0x1A bytes in either output, yet the two conversions
        # disagree: something is wrong with the conversion to this
        # character encoding.  Fail.
        if b'\x1a' not in plain:
            return (False, plain, translit)

        # Reaching here, the plain conversion substituted some
        # characters that the transliterating conversion handled.  As a
        # special case, conversion to UTF-8 should never require
        # transliteration, so that is an automatic failure.
        if self.encoding == "UTF-8":
            return (False, plain, translit)

        # Verify that each 0x1A in the plain output corresponds to a
        # transliteration sequence in the other output and that nothing
        # else was lost: the transliterated output must match the plain
        # output with each 0x1A standing for an arbitrary run of bytes.
        chunks = [re.escape(piece) for piece in plain.split(b'\x1a')]
        pattern = b'\\A' + b'.*?'.join(chunks) + b'\\Z'
        if re.match(pattern, translit, re.DOTALL):
            return (True,)
        return (False, plain, translit)


#
# Regular expressions used by the parser.
#
def re_escape_for_cc(x):
    """Escape X with a backslash if it is special inside a regex
       character class; otherwise return it unchanged."""
    if x in '-\\^]':
        return '\\' + x
    return x

def make_cc(chars, inverse=False):
    """Build a regex character class matching any character in CHARS
       (or, when INVERSE is true, any character NOT in CHARS)."""
    body = ''.join(map(re_escape_for_cc, sorted(chars)))
    opener = '[^' if inverse else '['
    return opener + body + ']'

# The ASCII graphic characters: U+0021 '!' through U+007E '~' (no space).
graphic_chars = set(chr(c) for c in range(0x21, 0x7F))

# In most contexts, the only characters that should appear in a localedef
# file are the ASCII graphic characters plus space.  Tab and newline are
# also permitted, but only _between_ tokens, not _inside_ them.
ok_ascii_chars = graphic_chars | {' '}

# In comments only, arbitrary Unicode characters are allowed, but not
# the legacy control characters (except \t and \n), nor the Unicode
# NIH line-breaking characters, nor bare surrogates, nor noncharacters.
#
# Private-use, not-yet-assigned, and format controls (Cf) are fine,
# except that BYTE ORDER MARK (U+FEFF) is not allowed.
#
# OBJECT REPLACEMENT CHARACTER (U+FFFC) and REPLACEMENT CHARACTER (U+FFFD)
# are officially "symbols", but we disallow them as well, because
# their presence in a locale file means something has gone wrong
# somewhere.
inappropriate_unicode = re.compile(make_cc(chr(c) for c in itertools.chain(
    range(0x0000, 0x0009),  # C0 controls below TAB
    range(0x000B, 0x0020),  # C0 controls above LF
    range(0x007F, 0x00A0),  # DEL and the C1 controls
    range(0xD800, 0xE000),  # surrogate code points
    range(0xFDD0, 0xFDF0),  # the U+FDD0..U+FDEF noncharacter block
    (i * 0x10000 + 0xFFFE for i in range(0x11)),  # U+xxFFFE of all 17 planes
    (i * 0x10000 + 0xFFFF for i in range(0x11)),  # U+xxFFFF of all 17 planes
    (0x2028, 0x2029, 0xFEFF, 0xFFFC, 0xFFFD)  # LS, PS, BOM, OBJ/REPLACEMENT
)))

@functools.lru_cache(maxsize=32)
def compile_token_re(escape_char, comment_char):
    """Return (cached) the tokenizer regex for a localedef file that
       uses ESCAPE_CHAR and COMMENT_CHAR."""

    # Characters with syntactic meaning may not appear in a bare word.
    reserved = {escape_char, comment_char, ',', ';', '<', '>', '"'}

    # Note: POSIX specifically says that comments are _not_ continued
    # onto the next line by the escape_char.
    template = r"""(?msx)
             (?P<COMMA>    ,                        )
      |      (?P<SEMI>     ;                        )
      |      (?P<NEWLINE>  \n                       )
      |      (?P<WHITE>    [ \t]+                   )
      |      (?P<WORD>     (?:{wordchars}|{ec}.)+  )
      | "    (?P<STRING>   (?:[^"\n{ec}]|{ec}.)*    ) (?:"|$)
      | <    (?P<SYMBOL>   (?:[^>\n{ec}]|{ec}.)*    ) (?:>|$)
      | {cc} (?P<COMMENT>  [^\n]*                   )
      |      (?P<BAD>      .                        )
    """

    return re.compile(template.format(
        wordchars=make_cc(graphic_chars - reserved),
        ec=re.escape(escape_char),
        cc=re.escape(comment_char)))

@functools.lru_cache(maxsize=32)
def compile_esc_re(escape_char):
    """Return (cached) a regex matching either an ESCAPE_CHAR escape
       sequence (octal, \\dNNN decimal, \\xNN hex, or a single escaped
       character) or a <Uxxxxxxxx> codepoint token."""
    pattern = r"""(?six)
        {ec} (?P<ESC> [0-7]{{1,3}} | d[0-9]{{1,3}} | x[0-9a-f]{{1,2}} | . )
      |   <U (?P<UNI> [0-9a-f]{{1,8}} ) >
    """.format(ec=re.escape(escape_char))
    return re.compile(pattern)

@functools.lru_cache(maxsize=32)
def compile_inappropriate_ascii_or_esc_re(escape_char):
    """Returns a regex that matches any 'inappropriate' (non-)ASCII character
       and also any escape sequence _other_ than the ones we want to allow
       (namely \\, \<, \>, \").  It _does_ match \-newline even though
       \-newline is allowed; this is because we need to diagnose \-newline
       followed immediately by whitespace.  It does also match \\ even though
       that is allowed; this is to force the regex engine _not_ to match \f
       from \\foo.
    """
    bad_char = make_cc(ok_ascii_chars, inverse=True)
    not_ok_after_ec = make_cc(('<', '>', '"', '\n'), inverse=True)
    pattern = r"""(?six)
      {ec} (?: [0-7]{{1,3}} | d[0-9]{{1,3}} | x[0-9a-f]{{1,2}}
             | \n .? | {after_ec} )
    | {inappropriate_char}
    """.format(ec=re.escape(escape_char),
               after_ec=not_ok_after_ec,
               inappropriate_char=bad_char)
    return re.compile(pattern)

# Matches one 'comment_char' or 'escape_char' directive line: optional
# leading blanks, the directive name, blanks, the argument (a run of
# non-blank characters, possibly empty), then anything else up to (and
# including) the end of the line.
directive_re = re.compile(
    r"[ \t]*(comment|escape)_char[ \t]*([^\n\t ]*)(?:[ \t][^\n]*)?(\n|\Z)")

def scan_localedef(fp, log):
    """Scan through a locale definition file, FP.  Returns a list of
       all strings appearing in the file, as 2-tuples (lno, string).
       May also emit error messages via LOG.
       Assumes that log.begin_file() has been called for the file FP.
    """
    strings = []
    escape_char = '\\'
    comment_char = '#'
    lno = 1
    data = fp.read()

    def decode_escapes(m):
        """re.sub replacement callback: decode one escape sequence or
           <Uxxxxxxxx> token to the character it denotes.  Must always
           return a string -- re.sub raises TypeError for a None
           replacement -- so on a malformed sequence we log an error
           and substitute the empty string."""
        g = m.lastgroup
        c = m.group(g)
        if g == "UNI":
            try:
                return chr(int(c, 16))
            except (UnicodeError, ValueError):
                log.error(lno, "invalid token '<U{}>' in string", c)
                return ''

        # escape_char followed by a literal newline is a line
        # continuation; it contributes nothing to the string.
        if c == '\n':
            return ''

        # A single escaped character, other than an octal digit,
        # stands for itself.
        if len(c) == 1 and c not in "01234567":
            return c

        # Numeric escapes: \dNNN is decimal, \xNN is hex, \NNN is octal.
        p = c[0]
        if p in ('d', 'D'):
            base = 10
            digits = c[1:]
        elif p in ('x', 'X'):
            base = 16
            digits = c[1:]
        else:
            base = 8
            digits = c
        try:
            return chr(int(digits, base))
        except ValueError:
            # (The lno argument used to be missing from this call, so
            # the message was silently consumed as the line number.)
            log.error(lno, "invalid escape sequence '{!r}' in string",
                      escape_char + c)
            return ''

    def diagnose_syntax(m, kind, closer):
        """Diagnose inappropriate characters and escape sequences in the
           token captured by match group KIND of M.  If CLOSER is not
           None, also verify the token was properly terminated.  Keeps
           LNO in sync across escaped newlines inside the token."""
        nonlocal lno

        value = m.group(kind)
        if closer:
            # Check for close quote.  The token regex allows an
            # unterminated string/symbol at end of line or file.
            end = m.end(kind)
            if len(m.string) == end or m.string[end] != closer:
                log.error(lno, "missing close '{}' character", closer)

        for c in inappropriate_ascii_or_escs(value):
            if len(c) == 1:
                log.error(lno, "inappropriate character {!r}", c)
            elif c[1] == '\n':
                # Escaped newline (continuation): allowed, but leading
                # whitespace on the continuation line is not.
                lno += 1
                if len(c) > 2:
                    assert len(c) == 3
                    d = c[2]
                    if d == ' ' or d == '\t':
                        log.error(lno, "leading whitespace "
                                  "after escaped newline")
                    elif d == '\n':
                        lno += 1
                    elif d not in graphic_chars:
                        log.error(lno, "inappropriate character {!r}", d)
            elif c[1] != escape_char:
                # Doubled escape_char is fine; it's matched only so the
                # regex engine cannot misparse a following character.
                log.error(lno, "inappropriate escape sequence {!r}", c)

    # We only recognize the 'escape_char' and 'comment_char' directives
    # if they appear (in either order) on the very first one or two lines
    # in the file.
    for _ in range(2):
        m = directive_re.match(data)
        if not m:
            break

        # Remember the line the directive itself is on: lno must be
        # advanced past the directive's newline before we consume it,
        # but diagnostics should point at the directive's own line.
        dir_lno = lno
        if m.group(3) == '\n':
            lno += 1
        data = data[m.end():]
        which = m.group(1)
        arg = m.group(2)
        if len(arg) == 0:
            log.error(dir_lno, "missing argument to {}_char directive", which)
        elif len(arg) != 1:
            log.error(dir_lno,
                      "argument to {}_char must be a single character", which)
        elif not ("!" <= arg <= "~" and arg not in ',;<>"'):
            log.error(dir_lno, "{}_char may not be set to {!r}", which, arg)
        elif which == "comment":
            comment_char = arg
        else:
            escape_char = arg

    if comment_char == escape_char:
        # Fall back to the defaults so we can still tokenize.
        # (The lno argument used to be missing from this call.)
        log.error(lno, "comment_char and escape_char both set to {}",
                  comment_char)
        escape_char = '\\'
        comment_char = '#'

    token_iter = compile_token_re(escape_char, comment_char).finditer
    esc_sub = compile_esc_re(escape_char).sub
    inappropriate_ascii_or_escs = \
        compile_inappropriate_ascii_or_esc_re(escape_char).findall
    inappropriate_unicodes = inappropriate_unicode.findall

    for m in token_iter(data):
        kind = m.lastgroup

        if kind == "NEWLINE":
            lno += 1

        elif kind == "BAD":
            log.error(lno, "inappropriate character {!r}", m.group(kind))

        elif kind == "COMMENT":
            # Comments may contain arbitrary Unicode, minus a small
            # blacklist (see inappropriate_unicode above).
            for c in inappropriate_unicodes(m.group(kind)):
                log.error(lno, "inappropriate character {!r}", c)

        elif kind == "WORD":
            diagnose_syntax(m, kind, None)

        elif kind == "SYMBOL":
            diagnose_syntax(m, kind, '>')

        elif kind == "STRING":
            diagnose_syntax(m, kind, '"')
            value = m.group(kind)

            # Decode escapes so later checks see the actual characters.
            s = esc_sub(decode_escapes, value)
            if s:
                strings.append((lno, s))

        #else: other token types are currently ignored

    return strings

def process(fp, log, charsets):
    """Lint one locale definition file FP: report strings that are not
       NFC-normalized, and strings not representable (even with
       transliteration) in any of the character sets in CHARSETS, an
       iterable of (name, checker) pairs."""
    for lno, s in scan_localedef(fp, log):
        normalized = unicodedata.normalize("NFC", s)
        if normalized != s:
            log.error(lno, "string not normalized:")
            log.dump_codepoints("  source: ", s)
            log.dump_codepoints("     nfc: ", normalized)

        for charset, checker in charsets:
            result = checker(normalized)
            if result[0]:
                continue
            log.error(lno, "string not representable in {}:", charset)
            log.dump_codepoints("    ", normalized)
            log.error(lno, "  without transliteration:")
            log.dump_bytes("    ", result[1])
            log.error(lno, "  with transliteration:")
            log.dump_bytes("    ", result[2])

def scan_supported_locales(fp, log):
    """Read a SUPPORTED-locales file from FP.  Returns a dictionary
       mapping each base locale name to a set of (charset-name, checker)
       pairs for the character sets that locale should support.
       Unknown character sets are diagnosed via LOG.
    """
    charsets = {}
    checkers = {}
    # Strips the '.encoding' component, if any, from an extended locale
    # name while preserving any trailing '@variation' component.
    split_xlocale = re.compile(r"^([^.]*)[^@]*(.*)$")

    for z_lno, line in enumerate(fp):
        if line.startswith("#"):
            continue
        if line == "SUPPORTED-LOCALES=\\\n":
            continue

        fields = line.split()
        if not fields:
            # Blank or whitespace-only line.  (Previously this crashed:
            # line.split()[0] raised IndexError on such lines.)
            continue
        locale_code = fields[0]

        # Everything after the first slash names the character set.
        xlocale, _, charset = locale_code.partition('/')
        charset = charset.upper()

        # 'xlocale' is in three pieces, of which two are optional:
        # base_locale [.encoding] [@variation]
        # The [.encoding] part needs to be removed, but the [@variation]
        # part should remain.
        locale = split_xlocale.sub(r"\1\2", xlocale)

        charsets.setdefault(locale, set())

        if charset in checkers:
            # Reuse the checker we already built for this character set.
            charsets[locale].add((charset, checkers[charset]))
        else:
            try:
                co = TranscodingChecker(charset)
            except Exception as e:
                log.error(z_lno + 1, "unknown charset {!r} for {}: {}",
                          charset, locale, e)
                continue

            checkers[charset] = co
            charsets[locale].add((charset, co))

    return charsets

def process_files(args):
    """Lint every requested locale definition file.  ARGS is the parsed
       command-line namespace.  Returns an exit status for sys.exit()."""
    logger = ErrorLogger(sys.stderr, args.verbose)

    charsets = {}
    if args.supported:
        with logging_for_file(logger, args.supported), \
             open(args.supported, "rt", encoding=args.encoding) as fp:
            charsets = scan_supported_locales(fp, logger)

    # With no explicit file list, check every locale named in the
    # SUPPORTED file.  Both branches produce a set, so there is no need
    # to re-wrap it below (the old code did sorted(set(files))).
    if args.files:
        files = set(args.files)
    else:
        files = set(charsets.keys())

    unsupported = []
    for f in sorted(files):
        cs = charsets.get(os.path.basename(f), [])
        if args.supported and not cs:
            unsupported.append(f)

        # Bare locale names are resolved relative to --locales-path.
        if args.locales_path and "/" not in f:
            f = os.path.join(args.locales_path, f)

        with logging_for_file(logger, f), \
             open(f, "rt", encoding=args.encoding) as fp:
            process(fp, logger, cs)

    if unsupported:
        sys.stderr.write("note: locales not in {}: {}\n"
                         .format(args.supported, " ".join(unsupported)))

    return logger.status

def main():
    """Parse the command line and run the localedata linter."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-e", "--source-encoding", default="UTF-8",
                        dest="encoding")
    parser.add_argument("-f", "--supported-locales-file", dest="supported")
    parser.add_argument("-p", "--locales-path")
    parser.add_argument("files", nargs="*")
    opts = parser.parse_args()

    # Without -f there is nothing to infer the file list from.
    if not (opts.files or opts.supported):
        parser.error("must provide either -f or locale definitions")

    sys.exit(process_files(opts))

# Only run when executed as a script, not when imported (e.g. by tests).
if __name__ == "__main__":
    main()

# Local Variables:
# indent-tabs-mode: nil
# End:

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]