This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

python script to sync cldr db w/glibc localedata

From: Mike Frysinger <vapier at gentoo dot org>
To: libc-alpha at sourceware dot org
Date: Tue, 9 Feb 2016 23:26:15 -0500
Subject: python script to sync cldr db w/glibc localedata
Authentication-results: sourceware.org; auth=none

here's the current version of my script.  it updates most, but not all,
fields.  the format ones are pretty hard to extract, and the diff to the
current glibc db is significant.  i'll post this as a proper patch once
master opens for 2.24 and i can land the pending locale updates.
-mike

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Written by Mike Frysinger <vapier@gentoo.org> for much great glory.

"""Helper tool for importing current CLDR data.

See http://cldr.unicode.org/ for more details."""

# TODO: Need to handle copy directives better so we can see when a value
# has changed for a specific locale, but it's copying the (wrong) values
# from others.
# TODO: Add missing fields.
# TODO: Add support for updating locale/iso-3166.def via supplementalData.xml.
# TODO: Add support for updating locale/iso-4217.def.

from __future__ import print_function

import argparse
import datetime
import errno
import logging
import os
import re
import subprocess
import sys
import time
from xml.etree import ElementTree


# Where to store CLDR/etc... data files we fetch.  This is a %-format
# template: Cldr.__init__ fills in %(version)s with the requested version.
DEFAULT_WORKING_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'cldr-%(version)s')

# Whether we should clean up newlines/comments.  When false,
# Locale._trim_extra_lines returns its input untouched and
# LocaleCategory.__str__ adds no extra blank-line padding.
REWRITE_STYLE = False


def u_encode(text):
    """Render every character of |text| as a <U####> code point escape."""
    escaped = ['<U%04X>' % ord(char) for char in text]
    return ''.join(escaped)


_U_MATCH = re.compile(r'<U([0-9A-Fa-f]+)>')
def u_decode(text):
    """Expand each <U####> escape found in |text| into its character."""
    def _to_char(match):
        # The captured group is the hexadecimal code point.
        code_point = int(match.group(1), 16)
        return chr(code_point)

    return _U_MATCH.sub(_to_char, text)


def get_parser():
    """Build the command line parser for this script."""
    # Each entry is (option strings, keyword settings) for add_argument().
    arguments = (
        (('--working-dir',),
         {'default': DEFAULT_WORKING_DIR,
          'help': 'Where to download files (default: %(default)s)'}),
        (('-v', '--version'),
         {'default': Cldr.CURR_VERSION,
          'help': 'Version of CLDR to use (default: %(default)s)'}),
        (('locales',),
         {'nargs': '*', 'help': 'Locales to generate'}),
    )

    parser = argparse.ArgumentParser(description=__doc__)
    for flags, settings in arguments:
        parser.add_argument(*flags, **settings)
    return parser


def logging_init(debug=False):
    """Configure the root logger to report to stdout.

    Messages at DEBUG and above are shown when |debug| is true, otherwise
    only INFO and above.  Timestamps end with the local timezone name.
    """
    # Timestamps look like: 'Sat, 05 Oct 2013 18:58:50 EST'
    local_zone = time.strftime('%Z', time.localtime())
    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    stdout_handler.setFormatter(logging.Formatter(
        '%(asctime)s: %(levelname)-7s: %(message)s',
        '%a, %d %b %Y %H:%M:%S ' + local_zone))

    root_logger = logging.getLogger()
    root_logger.addHandler(stdout_handler)
    root_logger.setLevel(logging.DEBUG if debug else logging.INFO)


class cached_property(object):  # pylint: disable=invalid-name
    """Descriptor that computes a value once and stores it on the instance.

    The result of the wrapped function is saved in the instance __dict__
    under the function's own name, so later attribute lookups find the stored
    value directly instead of calling the function again.
    """

    def __init__(self, func):
        self.func = func

    def __get__(self, instance, _owner):
        # Access through the class hands back the descriptor itself.
        if instance is not None:
            result = self.func(instance)
            instance.__dict__[self.func.__name__] = result
            return result
        return self


class Iso639(object):
    """Content for the ISO-639 database.

    The local iso-639.def file is parsed at construction time into |db|:
      * DEFINE_LANGUAGE_CODE ("name", a, b, c) is stored as
        db[a] = (name, b, c).
      * DEFINE_LANGUAGE_CODE3 ("name", a, b) is stored as
        db[a] = (name, b).
    get_term() and get_bib() only answer for the three element form.
    """

    # Link to upstream ISO-639-2 database.
    ISO639_2_URI = 'http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt'

    # Path to our local copy of the ISO-639 database.
    PATH = os.path.join(os.path.dirname(os.path.dirname(
        os.path.realpath(__file__))), 'locale', 'iso-639.def')

    # Regex to process our local ISO-639 def file.  The outer group must be
    # non-capturing so the field groups are numbered 1-4 (DEFINE_LANGUAGE_CODE)
    # and 5-7 (DEFINE_LANGUAGE_CODE3), which is what __init__ relies on.
    _LINE_MATCH = re.compile(
        r'^(?:DEFINE_LANGUAGE_CODE \("([^"]*)", ([^,]*), ([^,]*), ([^,]*)\)'
        r'|DEFINE_LANGUAGE_CODE3 \("([^"]*)", ([^,]*), ([^,]*)\))$')

    def __init__(self):
        """Parse the file at PATH; lines that do not match are ignored."""
        self.db = {}
        with open(self.PATH) as fp:
            for line in fp:
                m = self._LINE_MATCH.match(line)
                if m is None:
                    continue
                if m.group(1) is None:
                    # DEFINE_LANGUAGE_CODE3 form.
                    self.db[m.group(6)] = (m.group(5), m.group(7))
                else:
                    # DEFINE_LANGUAGE_CODE form.
                    self.db[m.group(2)] = (m.group(1), m.group(3),
                                           m.group(4))

    def get_term(self, lang):
        """Return the ISO 639-2/T (Terminology) code.

        Returns None when |lang| is unknown or was not defined with the
        DEFINE_LANGUAGE_CODE form.
        """
        entry = self.db.get(lang, ())
        if len(entry) == 3:
            return entry[1]
        return None

    def get_bib(self, lang):
        """Return the ISO 639-2/B (Bibliographic) code.

        Returns None when |lang| is unknown or was not defined with the
        DEFINE_LANGUAGE_CODE form.
        """
        entry = self.db.get(lang, ())
        if len(entry) == 3:
            return entry[2]
        return None

    def _download_uri(self, path):
        """Download the ISO-639-2 db into |path| (if missing) and load it.

        Returns:
          The mapping produced by _load_iso639().

        Raises:
          subprocess.CalledProcessError: If wget exits with an error.
        """
        iso639 = os.path.join(path, os.path.basename(self.ISO639_2_URI))
        if not os.path.exists(iso639):
            subprocess.check_call(['wget', '-O', iso639, self.ISO639_2_URI])
        return self._load_iso639(iso639)

    @staticmethod
    def _load_iso639(db):
        """Load ISO-639-2 database.

        http://www.loc.gov/standards/iso639-2/ascii_8bits.html

        An alpha-3 (bibliographic) code,
        an alpha-3 (terminologic) code (when given),
        an alpha-2 code (when given),
        an English name, and
        a French name of a language are all separated by pipe (|) characters.

        Args:
          db: Path to the downloaded database file.

        Returns:
          A dict mapping each alpha-2 code to its (bibliographic,
          terminologic) codes.  Entries without an alpha-2 code are skipped.

        Raises:
          ValueError: If a non-blank line does not have exactly five fields.
        """
        # Keep the result in its own name: rebinding |db| here would make the
        # open() below receive the empty dict instead of the path.
        codes = {}
        with open(db) as fp:
            for line in fp:
                line = line.rstrip()
                if not line:
                    continue
                bcode, tcode, code, _en, _fr = line.split('|')
                if code:
                    codes[code] = (bcode, tcode)
        return codes


class CldrLocale(object):
    """Content for a single locale in the cldr database.

    Values are read lazily from the XML trees handed out by |cldr| and are
    cached on the instance after their first use.  Most lookups try the
    locale's own database first and then the database of its language.
    Properties whose body is only a TODO are not implemented and yield None.
    """

    # CLDR day identifiers, in the order used to number the days (sun is 1).
    _DAY_KEYS = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')

    def __init__(self, cldr, locale, iso639):
        """Load the databases for |locale|.

        Args:
          cldr: Object providing load_lang() and load_supp().
          locale: Name of the locale database to load.
          iso639: Object providing get_term() and get_bib().
        """
        self.cldr = cldr
        self.locale = locale
        self.locale_root = cldr.load_lang(locale)
        self.lang_root = cldr.load_lang(self.lang)
        self.iso639 = iso639

    def _select_for_territory(self, elements, value_attr,
                              regions_attr='territories', use_world=True,
                              skip=None):
        """Pick the |value_attr| that applies to this locale's territory.

        Args:
          elements: The XML elements to walk, in document order.
          value_attr: Attribute holding the value to return.
          regions_attr: Attribute holding the space separated territory list.
          use_world: Whether the first entry listing the world ("001") acts
              as the default when nothing was selected before it.
          skip: Optional callable; elements it returns true for are ignored.

        Returns:
          The selected attribute value, or None when nothing matched.  An
          entry naming the territory always overrides an earlier selection.
        """
        selected = None
        for element in elements:
            if skip is not None and skip(element):
                continue
            regions = element.get(regions_attr).split()
            if use_world and '001' in regions and selected is None:
                selected = element.get(value_attr)
            if self.territory in regions:
                selected = element.get(value_attr)
        return selected

    def _territory_codes(self):
        """Return the supplemental territoryCodes element for the territory."""
        root = self.cldr.load_supp('supplementalData')
        return root.find('codeMappings/territoryCodes[@type="%s"]' %
                         self.territory)

    @cached_property
    def lang(self):
        """The locale's short language code."""
        root = self.locale_root.find('identity/language')
        return root.get('type')

    @cached_property
    def territory(self):
        """The locale's short territory code."""
        root = self.locale_root.find('identity/territory')
        return root.get('type')

    @cached_property
    def en_lang(self):
        """The name of the language in English."""
        root = self.cldr.load_lang('en')
        names = root.find('localeDisplayNames')
        # First see if the locale has a name before we fall back to the lang.
        langs_root = names.find('languages')
        lang_root = langs_root.find('language[@type="%s"]' % self.locale)
        if lang_root is None:
            lang_root = langs_root.find('language[@type="%s"]' % self.lang)
        return lang_root.findtext('.')

    @cached_property
    def en_territory(self):
        """The name of the territory in English."""
        root = self.cldr.load_lang('en')
        names = root.find('localeDisplayNames')
        return names.find('territories/territory[@type="%s"]' %
                          self.territory).findtext('.')

    @cached_property
    def country_ab2(self):
        """Two-letter ISO-3166 country code."""
        # TODO: Implement this.

    @cached_property
    def country_ab3(self):
        """Three-letter ISO-3166 country code."""
        # TODO: Implement this.

    @cached_property
    def lang_name(self):
        """The localized name for the language (None when not found)."""
        for root in (self.locale_root, self.lang_root):
            names = root.find('localeDisplayNames')
            if names is None:
                continue
            langs_root = names.find('languages')
            if langs_root is None:
                continue
            lang_root = langs_root.find('language[@type="%s"]' % self.lang)
            if lang_root is not None:
                return lang_root.findtext('.')
        return None

    @cached_property
    def lang_term(self):
        """Three-letter ISO 639-2/T (Terminology) code"""
        return self.iso639.get_term(self.lang)

    @cached_property
    def lang_lib(self):
        """Three-letter ISO 639-2/B (Bibliographic) code"""
        return self.iso639.get_bib(self.lang)

    @cached_property
    def country_name(self):
        """The localized name for the territory (None when not found)."""
        for root in (self.locale_root, self.lang_root):
            names = root.find('localeDisplayNames')
            if names is None:
                continue
            name = names.find('territories/territory[@type="%s"]' %
                              self.territory)
            if name is not None:
                return name.findtext('.')
        return None

    @cached_property
    def country_num(self):
        """ISO 3166-1 numeric code"""
        return self._territory_codes().get('numeric')

    @cached_property
    def country_term(self):
        """ISO 3166-1 alpha-3 code"""
        return self._territory_codes().get('alpha3').lower()

    @cached_property
    def tel_int_fmt(self):
        """Telephone format for international calling."""
        # TODO: Implement this.

    @cached_property
    def tel_dom_fmt(self):
        """Telephone format for domestic calling."""
        # TODO: Implement this.

    @cached_property
    def int_select(self):
        """Telephone prefix for calling international numbers."""
        # TODO: Implement this.

    @cached_property
    def int_prefix(self):
        """Telephone international country code prefix."""
        root = self.cldr.load_supp('telephoneCodeData')
        code = root.find('telephoneCodeData/codesByTerritory[@territory="%s"]'
                         '/telephoneCountryCode' % self.territory)
        return code.get('code')

    @cached_property
    def int_curr_symbol(self):
        """ISO 4217 code of the territory's currency that has no end date.

        Need to rectify w/locale/iso-4217.def.

        Raises:
          ValueError: If the territory has no currency without a @to date.
        """
        # The xmlpath support in python is not complete, so we need to search
        # for the currency w/missing @to attribute ourselves.
        root = self.cldr.load_supp('supplementalData')
        region = root.find('currencyData/region[@iso3166="%s"]' %
                           self.territory)
        if region is not None:
            # Iterating an element walks its children; getchildren() no
            # longer exists in Python 3.9+.
            for currency in region:
                if 'to' not in currency.keys():
                    return currency.get('iso4217')
        raise ValueError('Could not find a currency for %s' % self.territory)

    @cached_property
    def currency_symbol(self):
        """Symbol for the currency named by int_curr_symbol.

        Need to rectify w/locale/iso-4217.def.
        """
        # First search the locale, then the lang dbs.
        for root in (self.locale_root, self.lang_root):
            numbers_root = root.find('numbers')
            if numbers_root is None:
                continue
            symbol_ele = numbers_root.find('currencies/currency[@type="%s"]'
                                           '/symbol' % self.int_curr_symbol)
            if symbol_ele is not None:
                return symbol_ele.findtext('.')

        # Try the common currency database.
        chars_root = self.cldr.load_supp('characters')
        fallback_root = chars_root.find('characters/character-fallback')
        if fallback_root is not None:
            for symbol_ele in fallback_root:
                if symbol_ele.findtext('substitute') == self.int_curr_symbol:
                    return symbol_ele.get('value')

        # A few symbols have no translation.
        return self.int_curr_symbol

    @cached_property
    def number_system(self):
        """Get the <symbols> element of the active number system.

        Only the first database that has a <numbers> section is consulted.
        Returns None when no database has one or no matching <symbols>
        element exists in it.
        """
        for root in (self.locale_root, self.lang_root):
            numbers_root = root.find('numbers')
            if numbers_root is None:
                continue

            # If there's a default labeled, use it.  Otherwise just go with
            # the first one found.  It should be the only one.  The label is
            # a child of <numbers>, so it has to be looked up there and not
            # on the root of the file.
            num_sys_ele = numbers_root.find('defaultNumberingSystem')
            if num_sys_ele is None:
                return numbers_root.find('symbols')
            return numbers_root.find('symbols[@numberSystem="%s"]' %
                                     num_sys_ele.findtext('.'))
        return None

    @cached_property
    def decimal_point(self):
        """The symbol used to denote decimal points (None when unknown)."""
        num_symbols_root = self.number_system
        try:
            return num_symbols_root.find('decimal').findtext('.')
        except AttributeError:
            # Either there is no number system or it has no such symbol.
            return None

    @cached_property
    def thousands_sep(self):
        """The symbol used to group thousands digits (None when unknown)."""
        num_symbols_root = self.number_system
        try:
            return num_symbols_root.find('group').findtext('.')
        except AttributeError:
            # Either there is no number system or it has no such symbol.
            return None

    @cached_property
    def grouping(self):
        """Not implemented yet; always None."""
        # TODO: Implement this.

    def _lookup_day_mon(self, cal_field, cal_type, cal_idxs):
        """Look up various calendar fields.

        Args:
          cal_field: Element family to read, e.g. 'day' or 'month'.
          cal_type: Width type to read, e.g. 'wide' or 'abbreviated'.
          cal_idxs: The @type values of the entries wanted, in order.

        Returns:
          The texts for |cal_idxs| from the first context that has all of
          them, or None when no database provides a complete set.
        """
        for root in (self.locale_root, self.lang_root):
            dates_root = root.find('dates')
            if dates_root is None:
                continue
            calendars_root = dates_root.find('calendars')
            if calendars_root is None:
                continue
            # XXX: Look up type in calendarPreference ?
            calendar_root = calendars_root.find('calendar[@type="gregorian"]')
            if calendar_root is None:
                continue

            for key in ('stand-alone', 'format', 'narrow'):
                ctx_root = calendar_root.find('%ss/%sContext[@type="%s"]' %
                                              (cal_field, cal_field, key))
                if ctx_root is None:
                    continue
                dm_root = ctx_root.find('%sWidth[@type="%s"]' %
                                        (cal_field, cal_type))
                if dm_root is None:
                    continue

                ret = [dm_root.find('%s[@type="%s"]' % (cal_field, x))
                       for x in cal_idxs]
                # Only accept a context that defines every requested entry.
                if None not in ret:
                    return [x.findtext('.') for x in ret]
        return None

    def _lookup_day(self, width_type):
        """Internal helper for abday/day lookups."""
        return self._lookup_day_mon('day', width_type, self._DAY_KEYS)

    def _lookup_mon(self, width_type):
        """Internal helper for abmon/mon lookups."""
        return self._lookup_day_mon('month', width_type, range(1, 13))

    @cached_property
    def abday(self):
        """Abbreviated localized names for the days of the week."""
        return self._lookup_day('abbreviated')

    @cached_property
    def day(self):
        """Full localized names for the days of the week."""
        return self._lookup_day('wide')

    @cached_property
    def abmon(self):
        """Abbreviated localized names for the months."""
        return self._lookup_mon('abbreviated')

    @cached_property
    def mon(self):
        """Full localized names for the months."""
        return self._lookup_mon('wide')

    # http://www.unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns
    _CLDR_TO_POSIX_FMT = {
        # year
        'y':    '%%y',
        'yy':   '%%y',
        'yyyy': '%%Y',
        # month
        'M':    '%%m',
        'MM':   '%%m',
        'MMM':  '%%b',
        'MMMM': '%%B',
        # day
        'd':    '%%d',
        'dd':   '%%d',
        # period
        'a':    '%%p',
        # hour
        'h':    '%%I',
        'hh':   '%%I',
        'H':    '%%H',
        'HH':   '%%H',
        # minute
        'm':    '%%M',
        'mm':   '%%M',
        # second
        's':    '%%S',
        'ss':   '%%S',
    }

    @classmethod
    def _to_posix_fmt(cls, fmt):
        """Convert the CLDR notation to what POSIX uses.

        Every whole-word field listed in _CLDR_TO_POSIX_FMT is replaced by
        its mapped text; anything else is left as is.  None is passed
        through so callers can hand in a format that was not found.
        """
        if fmt is None:
            return None
        lookup = lambda m: cls._CLDR_TO_POSIX_FMT[m.group(1)]
        return re.sub(r'\b(' + '|'.join(cls._CLDR_TO_POSIX_FMT.keys()) + r')\b',
                      lookup, fmt)

    @cached_property
    def hours_format(self):
        """Return 24 or 12 depending on preferred %H or %h format

        Returns None when the territory is not listed.

        Raises:
          ValueError: If the preferred value is neither 'H' nor 'h'.
        """
        root = self.cldr.load_supp('supplementalData')
        datasets = root.find('timeData')
        # The world ("001") entry is deliberately not used as a default here:
        # the allowed field makes this tricky.
        pref = self._select_for_territory(
            datasets.findall('hours'), 'preferred', regions_attr='regions',
            use_world=False)

        if pref == 'H':
            return '24'
        elif pref == 'h':
            return '12'
        elif pref is None:
            return None
        else:
            raise ValueError('Unknown hour value: %s' % pref)

    @cached_property
    def am_pm(self):
        """Localized AM/PM time fields when 12 hour clocks are used."""
        if self.hours_format == '24':
            return ['', '']
        elif self.hours_format is None:
            return None

        return self._lookup_day_mon('dayPeriod', 'abbreviated', ('am', 'pm'))

    def _lookup_d_t_fmt(self, dt, dt_type='medium'):
        """Internal helper for various fmt lookups.

        Args:
          dt: Format family to read: 'date', 'time' or 'dateTime'.
          dt_type: The @type of the format length to read.

        Returns:
          The raw CLDR pattern, or None when no database provides one.
        """
        for root in (self.locale_root, self.lang_root):
            dates_root = root.find('dates')
            if dates_root is None:
                continue
            calendars_root = dates_root.find('calendars')
            if calendars_root is None:
                continue
            # XXX: Look up type in calendarPreference ?
            calendar_root = calendars_root.find('calendar[@type="gregorian"]')
            if calendar_root is None:
                continue

            fmts = calendar_root.find('%sFormats/%sFormatLength[@type="%s"]'
                                      '/%sFormat/pattern' %
                                      (dt, dt, dt_type, dt))
            if fmts is not None:
                return fmts.findtext('.')
        return None

    @cached_property
    def d_t_fmt(self):
        """Appropriate date and time representation (%c)

        Example:
        $ date +'%a %d %b %Y %r %Z'
        Tue 09 Feb 2016 06:39:48 PM EST

        Returns None when any of the needed patterns is missing.
        """
        pattern = self._lookup_d_t_fmt('dateTime')
        time_fmt = self._t_fmt
        date_fmt = self._d_fmt
        if pattern is None or time_fmt is None or date_fmt is None:
            return None
        # The pattern's {0} slot takes the time and its {1} slot the date.
        return self._to_posix_fmt(
            pattern.replace('{0}', time_fmt).replace('{1}', date_fmt))

    @cached_property
    def _d_fmt(self):
        """Internal helper for the raw d_fmt field."""
        return self._lookup_d_t_fmt('date')

    @cached_property
    def d_fmt(self):
        """Appropriate date representation (%x)

        Example:
        $ date +'%m/%d/%Y'
        02/09/2016
        """
        return self._to_posix_fmt(self._d_fmt)

    @cached_property
    def _t_fmt(self):
        """Internal helper for the raw t_fmt field."""
        return self._lookup_d_t_fmt('time')

    @cached_property
    def t_fmt(self):
        """Appropriate time representation (%X)

        Example:
        $ date +%r
        06:41:21 PM
        """
        return self._to_posix_fmt(self._t_fmt)

    @cached_property
    def t_fmt_ampm(self):
        """Appropriate AM/PM time representation (%r)

        Example:
        $ date +'%I:%M:%S %p'
        06:41:21 PM

        Only the 24 hour case is handled so far (empty string); everything
        else yields None.
        """
        if self.hours_format == '24':
            return ''
        # TODO: Implement this for 12 hour locales.
        return None

    @cached_property
    def date_fmt(self):
        """Appropriate date representation (date(1))

        $ date +'%a %b %e %H:%M:%S %Z %Y'
        Tue Feb  9 06:39:48 EST 2016
        """
        # TODO: Implement this.

    @cached_property
    def week(self):
        """DAYSINWEEK;WEEKSTARTDATE;MINWEEKLEN field"""
        # TODO: Implement this.

    @cached_property
    def first_weekday(self):
        """Number of day in the week for the first column in the calendar.

        Raises:
          ValueError: If no usable firstDay entry exists.
        """
        root = self.cldr.load_supp('supplementalData')
        data = root.find('weekData')
        # Throw out the alternate entries; we don't care about them.
        first = self._select_for_territory(
            data.findall('firstDay'), 'day',
            skip=lambda ele: ele.get('alt') is not None)
        return self._DAY_KEYS.index(first) + 1

    @cached_property
    def first_workday(self):
        """Number of day in the week for the first working day.

        Raises:
          ValueError: If no usable weekendEnd entry exists.
        """
        root = self.cldr.load_supp('supplementalData')
        data = root.find('weekData')
        weekend_end = self._select_for_territory(
            data.findall('weekendEnd'), 'day')
        # The data names the last day of the weekend, so the first working
        # day is the one after it, wrapping from the end of _DAY_KEYS back to
        # its start.  The result is 1-based like first_weekday.
        return (self._DAY_KEYS.index(weekend_end) + 1) % 7 + 1

    @cached_property
    def measurement(self):
        """Return 1 for metric and 2 for imperial

        Raises:
          ValueError: If the measurement system is not understood.
        """
        root = self.cldr.load_supp('supplementalData')
        # Throw out ones we don't care about.
        measurement = self._select_for_territory(
            root.findall('measurementData/measurementSystem'), 'type',
            skip=lambda ele: (ele.get('category') == 'temperature' or
                              ele.get('type') == 'UK'))

        # We don't use imperial settings for Myanmar even though CLDR does.
        # https://en.wikipedia.org/wiki/Myanmar_units_of_measurement
        if self.territory == 'MM':
            if measurement == 'US':
                measurement = 'metric'
            else:
                raise ValueError('CLDR is updated; drop this hack')

        if measurement == 'metric':
            return '1'
        elif measurement == 'US':
            return '2'
        else:
            raise ValueError('Do not understand type %s' % measurement)

    @cached_property
    def measurement_copy(self):
        """We copy other locales for most"""
        if self.locale in ('en_US', 'i18n'):
            return None
        elif self.measurement == '1':
            return 'i18n'
        elif self.measurement == '2':
            return 'en_US'
        else:
            raise ValueError('Unknown measurement %s' % self.measurement)

    @cached_property
    def paper(self):
        """Return the paper type"""
        root = self.cldr.load_supp('supplementalData')
        return self._select_for_territory(
            root.findall('measurementData/paperSize'), 'type')

    @cached_property
    def paper_height(self):
        """Return the height of paper (in mm)"""
        return {'A4': '297', 'US-Letter': '279'}.get(self.paper)

    @cached_property
    def paper_width(self):
        """Return the width of paper (in mm)"""
        return {'A4': '210', 'US-Letter': '216'}.get(self.paper)

    @cached_property
    def paper_copy(self):
        """We copy other locales for most"""
        if self.locale in ('en_US', 'i18n'):
            return None
        elif self.paper == 'A4':
            return 'i18n'
        elif self.paper == 'US-Letter':
            return 'en_US'
        else:
            raise ValueError('Unknown paper %s' % self.paper)

class Cldr(object):
    """Accessor for one downloaded release of the cldr database."""

    # Release that is used unless another one is asked for.
    CURR_VERSION = '28'

    # Template for the location of a release's core archive.
    URI = 'http://unicode.org/Public/cldr/%(version)s/core.zip'

    def __init__(self, path, version):
        """Set up the working dir for |version|.

        Both |path| and URI are %-format templates whose %(version)s field
        is filled in with |version|.
        """
        subst = {'version': version}
        self.dir = path % subst
        self.uri = self.URI % subst
        self.version = version
        self.date = None
        # Parsed XML roots, keyed by database name.
        self.main_dbs = {}
        self.supp_dbs = {}
        self.iso639 = Iso639()

        # Make sure the working dir is there.
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)

    def download(self):
        """Fetch and unpack the cldr database unless that was done before."""
        # Fetch the archive if we do not have it yet.
        zip_path = os.path.join(self.dir, 'core.zip')
        if not os.path.exists(zip_path):
            subprocess.check_call(['wget', '-O', zip_path, self.uri])
        mtime = os.path.getmtime(zip_path)
        self.date = datetime.datetime.fromtimestamp(mtime)

        # Unpack it if there is no unpacked tree yet.
        unpacked_dir = os.path.join(self.dir, 'common')
        if os.path.exists(unpacked_dir):
            return
        subprocess.check_call(['unzip', '-u', 'core.zip'], cwd=self.dir)

    def _load_db(self, db, subdir, cache):
        """Return the root of database |db| in |subdir|, parsing it once.

        Parsed roots are remembered in |cache| under the name |db|.
        """
        if db in cache:
            return cache[db]
        xml_path = os.path.join(self.dir, 'common', subdir, '%s.xml' % db)
        cache[db] = ElementTree.parse(xml_path).getroot()
        return cache[db]

    def _load_main(self, db):
        """Return the root of database |db| of the main repo."""
        return self._load_db(db, 'main', self.main_dbs)

    def load_lang(self, lang):
        """Return the root of the database for language |lang|."""
        return self._load_main(lang)

    def load_supp(self, db):
        """Return the root of database |db| of the supplemental repo."""
        return self._load_db(db, 'supplemental', self.supp_dbs)

    def locale(self, locale):
        """Create the CldrLocale object for a specific cldr |locale|."""
        return CldrLocale(self, locale, self.iso639)


class LocaleError(Exception):
    """Exception type for errors involving Locale objects."""


class LocaleCategory(object):
    """Content for a single locale category.

    Attributes:
      name: The category name, stored lowercased.
      content: Sequence of lines placed between the opening line and the
          END line of the category.
      header: Sequence of lines placed before the category.
    """

    def __init__(self, name='', content=(), header=()):
        self.name = name.lower()
        self.content = content
        self.header = header

    def __str__(self):
        """Render the header lines followed by the NAME ... END NAME block.

        When REWRITE_STYLE is set, both the header and the block are
        preceded by an extra newline.
        """
        padding = '\n' if REWRITE_STYLE else ''
        lc_name = self.name.upper()
        ret = ''
        if self.header:
            ret += padding + '\n'.join(self.header) + '\n'
        # |content| may be any sequence (the default is a tuple), so turn it
        # into a list first; a list cannot be concatenated with a tuple.
        body = [lc_name] + list(self.content) + ['END %s' % lc_name]
        ret += padding + '\n'.join(body) + '\n'
        return ret


class Locale(object):
    """Content for a locale file itself.

    A locale file is held as its two leading header lines (comment_char and
    escape_char) plus an ordered list of categories.  After readfp() each
    loaded category is available as an attribute named after it (for example
    self.lc_time) holding a LocaleCategory, and self.categories lists those
    attribute names in file order.
    """

    def __init__(self, name=None, path=None):
        """Create the locale |name|, loading it from |path| if given."""
        self.name = name
        # The comment_char/escape_char lines, in the order found in the file.
        self.header = []
        # Each lc_* attribute starts out as an empty list and is replaced by
        # a LocaleCategory when readfp() meets that category.
        self.lc_identification = []
        self.lc_ctype = []
        self.lc_collate = []
        self.lc_time = []
        self.lc_numeric = []
        self.lc_monetary = []
        self.lc_messages = []
        self.lc_paper = []
        self.lc_name = []
        self.lc_address = []
        self.lc_telephone = []
        self.lc_measurement = []
        # Lower-cased names of the categories that were loaded, in file order.
        self.categories = []
        # Nothing in this class assigns this beyond the initial None.
        self.cldr = None

        if path is not None:
            self.read(path)

    @staticmethod
    def _trim_extra_lines(lines, leading=True, trailing=True,
                          consecutive=True, comments=False):
        """Helper to clean up the style of the data files.

        This is a no-op unless the module-level REWRITE_STYLE is set.
        Otherwise |lines| is edited in place (and also returned):
          leading: Drop blank lines at the start.
          trailing: Drop blank lines at the end.
          consecutive: Collapse runs of blank lines down to a single one.
          comments: Drop bare '%' lines that open or close a comment run.
        """
        if not REWRITE_STYLE:
            return lines

        # Clear leading blank lines.
        if leading:
            while lines and not lines[0]:
                lines.pop(0)

        # Clear trailing blank lines.
        if trailing:
            while lines and not lines[-1]:
                lines.pop(-1)

        # Clear consecutive blank lines.
        if consecutive:
            i = 0
            while i < len(lines) - 1:
                if not lines[i] and not lines[i + 1]:
                    lines.pop(i)
                else:
                    i += 1

        # Trim blank comment lines that start/end a section.
        if comments:
            i = 0
            while i < len(lines):
                # A bare '%' whose previous line is not a comment opens a run.
                if (lines[i] == '%' and
                        (i == 0 or not lines[i - 1] or lines[i - 1][0] != '%')):
                    lines.pop(i)
                # A bare '%' whose next line is not a comment closes a run.
                elif (lines[i] == '%' and
                      (i == len(lines) - 1 or not lines[i + 1] or
                       lines[i + 1][0] != '%')):
                    lines.pop(i)
                else:
                    i += 1

        return lines

    def readfp(self, fp):
        """Load the locale content from |fp|

        The first two lines must be 'comment_char %' and 'escape_char /', in
        either order.  Everything after that is split into LC_xxx categories;
        comment/blank lines in front of a category become its header.

        Raises:
          LocaleError: The header lines are wrong, or a line that is neither
              blank, a comment, nor an LC_ marker shows up between categories.
          IndexError: The file has fewer than two lines, or a header line has
              no value after its keyword.
        """
        lines = [x.rstrip() for x in fp.readlines()]
        # Edits |lines| in place; the return value is not needed.
        self._trim_extra_lines(lines)

        # Process the leading few lines.
        comment_line = lines.pop(0)
        escape_line = lines.pop(0)
        # Keep the original order so writefp() can reproduce it verbatim.
        self.header = [comment_line, escape_line]
        if escape_line.startswith('comment_char'):
            escape_line, comment_line = comment_line, escape_line

        # Unless the two were swapped above, comment_line is the file's first
        # line and escape_line its second; the messages below say so.
        line = comment_line
        if line.startswith('comment_char'):
            if line.split()[1] != '%':
                raise LocaleError('Bad comment_char: %s' % line)
        else:
            raise LocaleError('First line should be comment_char, not %s' %
                              line)

        line = escape_line
        if line.startswith('escape_char'):
            if line.split()[1] != '/':
                raise LocaleError('Bad escape_char: %s' % line)
        else:
            raise LocaleError('Second line should be escape_char, not %s' %
                              line)

        # Now walk each locale category.
        while lines:
            # Extract any leading comments.
            header = []
            while lines:
                line = lines[0]
                if line.startswith('LC_'):
                    break
                elif not line or line[0] == '%':
                    header.append(line)
                    lines.pop(0)
                    continue
                else:
                    # Unexpected content; reported as 'Bad line state' below.
                    break
            self._trim_extra_lines(header)

            if not lines:
                if header:
                    print('Throwing away trailing lines: %r' % header,
                          file=sys.stderr)
                return

            line = lines.pop(0)
            if line[0:3] != 'LC_':
                raise LocaleError('Bad line state: %s' % line)

            # Collect everything up to the matching END marker.  If the
            # marker never shows up, the rest of the file lands in this
            # category.
            cat = line.lower()
            cat_lines = []
            while lines:
                line = lines.pop(0)
                if line == 'END %s' % cat.upper():
                    break
                cat_lines.append(line)
            self._trim_extra_lines(cat_lines)
            lc = LocaleCategory(name=cat, content=cat_lines, header=header)
            setattr(self, cat, lc)
            self.categories.append(cat)

    def read(self, path):
        """Load the locale file from |path|"""
        # Close the file as soon as parsing is done (or fails).
        with open(path) as fp:
            self.readfp(fp)

    def writefp(self, fp):
        """Write the locale content to |fp|

        The header lines are written as they were read, unless REWRITE_STYLE
        is set, in which case a fixed comment_char/escape_char pair is used.
        Categories follow in the order recorded in self.categories.
        """
        if REWRITE_STYLE:
            header = ['comment_char %', 'escape_char /']
        else:
            header = self.header
        fp.write('\n'.join(header) + '\n')

        for category in self.categories:
            lc = getattr(self, category)
            fp.write(str(lc))

    def write(self, path):
        """Write the locale content to |path|"""
        # The file must be flushed and closed before this returns: callers
        # may rename |path| right away.
        with open(path, 'w') as fp:
            self.writefp(fp)

    def update_cldr(self, cldr):
        """Merge CLDR updates in to this locale.

        For every loaded category, each "key value" line whose key has a CLDR
        backed replacement is compared (case-insensitively) against the new
        value and rewritten in place in the category content when it differs.
        Every change is logged via logging.info().

        If looking up the CLDR locale fails with ENOENT, nothing is changed.
        """
        try:
            cldr_locale = cldr.locale(self.name)
        # EnvironmentError covers both IOError and OSError on Python 2; on
        # Python 3 all three are the same class.
        except EnvironmentError as e:
            if e.errno == errno.ENOENT:
                return
            raise

        # Start updating the actual data.
        # These are the fields available to %(name)s templates below.
        cldr_values = {
            'generator': os.path.basename(__file__),
            'english_lang_name': cldr_locale.en_lang,
            'english_territory_name': cldr_locale.en_territory,
            'source_name': 'Unicode Common Locale Data Repository (CLDR)',
            'source_version': cldr.version,
            # '/' is the escape_char required by readfp(), so double it.
            'source_uri': cldr.uri.replace('/', '//'),
            'source_date': cldr.date.strftime('%Y-%m-%d'),
            'lang': cldr_locale.lang,
            'territory': cldr_locale.territory,
            'locale': cldr_locale.locale,
        }

        # Maps category name -> {key: new value}.  Values may be an int, a
        # string template, or a tuple/list/set of string templates.
        all_values = {}
        all_values['lc_identification'] = {
            #'title':     ('%(english_lang_name)s language locale for '
            #              '%(english_territory_name)s'),
            #'source':    '%(source_name)s',
            #'address':   '%(source_uri)s',
            #'contact':   'http:////cldr.unicode.org//index//process',
            #'email':     'bug-glibc-locales@gnu.org',
            'tel':       '',
            'fax':       '',
            'language':  '%(english_lang_name)s',
            'territory': '%(english_territory_name)s',
            #'revision':  '%(source_version)s',
            #'date':      '%(source_date)s',
        }
        # These are based on the charset, not the locale.
        all_values['lc_ctype'] = {}
        all_values['lc_collate'] = {}
        all_values['lc_time'] = {
            #'abday': cldr_locale.abday,
            #'day': cldr_locale.day,
            #'abmon': cldr_locale.abmon,
            #'mon': cldr_locale.mon,
            #'am_pm': cldr_locale.am_pm,
            #'d_t_fmt': cldr_locale.d_t_fmt,
            #'d_fmt': cldr_locale.d_fmt,
            #'t_fmt': cldr_locale.t_fmt,
            #'t_fmt_ampm': cldr_locale.t_fmt_ampm,
            #'date_fmt': cldr_locale.date_fmt,
            #'week': cldr_locale.week,
            #'first_weekday': int(cldr_locale.first_weekday),
            #'first_workday': int(cldr_locale.first_workday),
        }
        all_values['lc_numeric'] = {
            #'decimal_point': cldr_locale.decimal_point,
            #'thousands_sep': cldr_locale.thousands_sep,
            'grouping': cldr_locale.grouping,
        }
        all_values['lc_monetary'] = {
            #'int_curr_symbol': cldr_locale.int_curr_symbol + ' ',
            #'currency_symbol': cldr_locale.currency_symbol,
        }
        all_values['lc_messages'] = {
        }
        all_values['lc_paper'] = {
            'paper_height': int(cldr_locale.paper_height),
            'paper_width': int(cldr_locale.paper_width),
            #'copy': cldr_locale.paper_copy,
        }
        # XXX: Need a data source for this.
        all_values['lc_name'] = {
        }
        all_values['lc_address'] = {
            #'postal_fmt':
            'country_name': cldr_locale.country_name,
            #'country_post':
            'country_ab2': cldr_locale.country_ab2,
            'country_ab3': cldr_locale.country_ab3,
            'country_num': int(cldr_locale.country_num),
            #'country_car':
            #'country_isbn':
            'lang_name': cldr_locale.lang_name,
            'lang_ab': cldr_locale.lang,
            'lang_term': cldr_locale.lang_term,
            'lang_lib': cldr_locale.lang_lib,
        }
        all_values['lc_telephone'] = {
            'tel_int_fmt': cldr_locale.tel_int_fmt,
            'tel_dom_fmt': cldr_locale.tel_dom_fmt,
            'int_select': cldr_locale.int_select,
            'int_prefix': cldr_locale.int_prefix,
        }
        all_values['lc_measurement'] = {
            'measurement': int(cldr_locale.measurement),
            'copy': cldr_locale.measurement_copy,
        }

        # Walk all the categories.
        for category in self.categories:
            lc = getattr(self, category)
            values = all_values[category]
            if not values:
                continue

            # Walk each line in this locale category.
            # start_of_line: index of the first physical line of the wrapped
            # statement being assembled (None when the statement is one line).
            # full_line: the unwrapped text gathered so far.
            start_of_line = None
            full_line = ''
            i = 0
            while i < len(lc.content):
                line = lc.content[i]
                if not line:
                    i += 1
                    continue

                # If the line ends with / it is wrapped, so unwrap it before
                # we check for updates to the value.
                if line.endswith('/'):
                    if not full_line:
                        start_of_line = i
                    full_line += line[:-1].lstrip()
                    i += 1
                    continue
                elif full_line:
                    # Last physical line of a wrapped statement.
                    line = full_line + line.lstrip()
                    full_line = ''
                else:
                    start_of_line = None

                # Process this line.
                key = line.split()[0]
                new_value = values.get(key)
                if new_value is not None:
                    is_int = isinstance(new_value, int)
                    if is_int:
                        # Integers are written bare: "key 123".
                        new_value = str(new_value)
                        m = re.match(r'\s*(.*?)\s+([0-9]+)$', line)
                    else:
                        if isinstance(new_value, (tuple, list, set)):
                            # Multiple values are joined as "a";"b";"c"; the
                            # outermost quotes are added by |fmt| below.
                            new_value = '";"'.join(u_encode(x % cldr_values)
                                                   for x in new_value)
                        elif key != 'copy':
                            # 'copy' values are used as-is: no template
                            # expansion and no <Uxxxx> encoding.
                            new_value %= cldr_values
                            if category != 'lc_identification':
                                new_value = u_encode(new_value)
                        m = re.match(r'\s*([^"]*)"(.*)"$', line)

                    # m.group(1) is the text before the value, m.group(2) the
                    # current value.
                    if m:
                        # We should standardize case at some point.
                        if new_value.lower() != m.group(2).lower():
                            disp_key = ('%s:%s' % (category.upper(), key)
                                        if key == 'copy' else key)
                            logging.info('%s: %s: changing {%s} to {%s}',
                                         self.name, disp_key,
                                         u_decode(m.group(2)),
                                         u_decode(new_value))
                            leading_line = m.group(1)

                            # This is tricky as we have to delete most of the
                            # multiline, then update the one remaining.
                            if start_of_line is not None:
                                for _ in range(start_of_line, i):
                                    lc.content.pop(start_of_line)
                                i = start_of_line
                                if '";"' in new_value:
                                    # Re-wrap: one value per line, with the
                                    # continuation lines tab-indented past
                                    # the key.
                                    leading_line = leading_line.rstrip() + '\t'
                                    num_tabs = (len(leading_line) // 8) + 1
                                    new_value = new_value.replace(
                                        '";"',
                                        '";/\n' + ('\t' * num_tabs) + '"')

                            fmt = '%s %s' if is_int else '%s"%s"'
                            lc.content[i] = fmt % (leading_line, new_value)

                i += 1


def main(argv):
    """Parse |argv|, fetch the CLDR data, then refresh each locale file.

    Every path in opts.locales is loaded, merged with the CLDR data, written
    to "<path>.new" and renamed over the original.  Failures are logged per
    locale and processing moves on to the next one.
    """
    opts = get_parser().parse_args(argv)
    logging_init(opts)

    # Get a handle to the cldr database.
    cldr = Cldr(opts.working_dir, opts.version)
    cldr.download()

    # Files whose name starts with one of these are not regular locales.
    skipped_prefixes = ('iso14651', 'translit', 'C', 'POSIX')

    # Process all the locales the user told us to.
    for locale in opts.locales:
        name = os.path.basename(locale)
        prefix = name.split('_', 1)[0]
        if prefix in skipped_prefixes:
            continue

        logging.info('Updating %s', locale)
        new_path = locale + '.new'
        try:
            loc = Locale(name=name, path=locale)
            # A failed merge is only logged; the locale is still written out.
            try:
                loc.update_cldr(cldr)
            except Exception:
                logging.error('%s: updating failed', locale, exc_info=True)
            loc.write(new_path)
            os.rename(new_path, locale)
        except UnicodeDecodeError:
            logging.error('%s: bad encodings', locale, exc_info=True)
            # Have file(1) report what the file's contents look like.
            subprocess.check_call(['file', locale])
        except (IndexError, LocaleError):
            logging.error('%s: loading failed', locale, exc_info=True)


if __name__ == '__main__':
    # Use sys.exit() rather than the exit() helper: the latter is installed
    # by the site module for interactive use and is missing under python -S.
    sys.exit(main(sys.argv[1:]))

Attachment: signature.asc
Description: Digital signature

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]