@@ -0,0 +1,568 @@
+# -*- coding: utf-8 -*-
+# Written by Mike Frysinger <vapier@gentoo.org> for much great glory.
+#
+# Copyright (C) 2016 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+"""Helper library for working with locale datafiles."""
+
+from __future__ import print_function
+
+import os
+import re
+import sys
+
+
# When True, blank lines and comments are tidied up as files are loaded
# and written back out; when False the layout is left untouched.
REWRITE_STYLE = False

# Comment block that should be at the top of all files.
FILE_HEADER = (
    '% This file is part of the GNU C Library and contains locale data.\n'
    '% The Free Software Foundation does not claim any copyright interest\n'
    '% in the locale data contained in this file. The foregoing does not\n'
    '% affect the license of the GNU C Library as a whole. It does not\n'
    '% exempt you from the conditions of the license if your use would\n'
    '% otherwise be governed by that license.\n'
    '\n'
)

# The order of content in the data files.  This is also the complete set
# of category names that the parser accepts.
CATEGORY_ORDER = (
    'LC_IDENTIFICATION',
    'LC_CTYPE',
    'LC_COLLATE',
    'LC_MONETARY',
    'LC_NUMERIC',
    'LC_TIME',
    'LC_MESSAGES',
    'LC_PAPER',
    'LC_NAME',
    'LC_ADDRESS',
    'LC_TELEPHONE',
    'LC_MEASUREMENT',
)
+
+
def u_encode(text):
    """Convert unicode |text| to <U####> format.

    Code points up to U+FFFF are written with four hex digits.  Anything
    larger is written with eight (<U0001F600>) rather than the five or six
    digits that a plain %04X would produce, so every symbol has one of the
    two standard widths.

    Args:
      text: The string to encode.

    Returns:
      A string made only of <U....> symbols, one per character of |text|.
    """
    return ''.join(
        ('<U%04X>' if ord(x) <= 0xFFFF else '<U%08X>') % ord(x)
        for x in text)
+
+
# Matches one <U####> symbol; the hex digits are captured in group 1.
_U_MATCH = re.compile(r'<U([0-9A-Fa-f]+)>')


def u_decode(text):
    """Replace each <U####> symbol in |text| with the character it names."""
    def _to_char(match):
        return chr(int(match.group(1), 16))

    return _U_MATCH.sub(_to_char, text)
+
+
def dequote(text):
    """Remove leading/trailing quotes.

    Only a matching pair of double quotes is removed.  Empty input and
    input with a lone or unterminated quote are returned unchanged instead
    of raising or silently losing the last real character.

    Args:
      text: The (possibly quoted) string.

    Returns:
      |text| without its surrounding double quotes.
    """
    if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
        return text[1:-1]
    return text
+
+
class LocaleError(Exception):
    """Error w/Locale objects

    Within this file it is raised by Locale.readfp when a locale file has
    an unexpected comment_char or escape_char, names an unknown category,
    or contains a line where a category header was expected.
    """
+
+
class LocaleName(object):
    """Locale name object.

    Splits a POSIX style name (<lang>_<territory>@<alt>) into its parts and
    renders it back in either POSIX or CLDR form.

    Attributes:
      name: The string that was passed in, unmodified.
      lang: The language part (empty when |name| is empty or None).
      territory: The territory part, or None when absent.
      alt: The modifier that follows '@', or None when absent.
    """

    # We support the POSIX format: <lang>_<territory>@<alt>
    # The territory and the modifier are each optional on their own, so
    # "sr@latin" yields lang=sr/alt=latin just like "sr_RS@latin" does.
    _POSIX_RE = re.compile(
        r'(?P<lang>[^_@]*)'
        r'(_(?P<territory>[^@]*))?'
        r'(@(?P<alt>.*))?')

    # Script mappings between POSIX & CLDR.
    _SCRIPT_MAP_PC = {
        'cyrillic': 'Cyrl',
        'latin': 'Latn',
    }
    # Handle languages that default to a specific script.
    _SCRIPT_MAP_LANG_PC = {
        # XXX: What about the cmn language (e.g. cmn_TW) ?
        'zh': 'Hans',
    }
    # Territories that override the language's default script.  CLDR files
    # these Chinese locales under the traditional script (zh_Hant_TW etc.).
    _SCRIPT_MAP_LOCALE_PC = {
        ('zh', 'HK'): 'Hant',
        ('zh', 'MO'): 'Hant',
        ('zh', 'TW'): 'Hant',
    }
    #_SCRIPT_MAP_CP = dict((v, k) for k, v in _SCRIPT_MAP_PC)

    def __init__(self, name):
        """A new locale name in POSIX format."""
        self.name = name
        # Every part of the pattern is optional, so this always matches;
        # "or ''" keeps a missing (None) name from raising TypeError.
        m = self._POSIX_RE.match(name or '')
        self.lang = m.group('lang')
        self.territory = m.group('territory')
        self.alt = m.group('alt')

    def __str__(self):
        return self.posix

    def __repr__(self):
        return '%s(%r)' % (type(self).__name__, self.name)

    @property
    def posix(self):
        """Name of locale as POSIX uses it."""
        ret = self.lang
        if self.territory:
            ret += '_' + self.territory
        if self.alt:
            ret += '@' + self.alt
        return ret

    @property
    def cldr_lang(self):
        """Name of language as CLDR uses it.

        The script suffix comes from, in order of preference: the POSIX
        modifier, a (lang, territory) specific override, and the default
        script of the language.  No suffix is added when none applies.
        """
        ret = self.lang

        script = self._SCRIPT_MAP_PC.get(self.alt)
        if not script:
            script = self._SCRIPT_MAP_LOCALE_PC.get(
                (self.lang, self.territory))
        if not script:
            script = self._SCRIPT_MAP_LANG_PC.get(self.lang)
        if script:
            ret += '_' + script

        return ret

    @property
    def cldr(self):
        """Name of locale as CLDR uses it."""
        # First deal with ugly variants.
        if (self.lang, self.territory, self.alt) == ('ca', 'ES', 'valencia'):
            return 'ca_ES_VALENCIA'

        ret = self.cldr_lang
        if self.territory:
            ret += '_' + self.territory
        return ret
+
+
class LocaleCategory(object):
    """Content for a single locale category.

    Attributes:
      name: Lower-cased category name (e.g. 'lc_time').
      content: The lines between the category header and its END line.
      header: Comment/blank lines that preceded the category.
      fields: Maps each keyword in FIELDS to its decoded value, or to None
        when the keyword was not seen.
    """

    # Keywords whose values get captured into |fields|; subclasses override.
    FIELDS = ()

    def __init__(self, name='', content=(), header=(), comment_char=None,
                 copies=None):
        """Initialize the category.

        Args:
          name: Category name; stored lower-cased.
          content: Lines that make up the body of the category.
          header: Lines that preceded the category in the file.
          comment_char: When set, text after the last occurrence of this
            character is dropped from a value before it is decoded.
          copies: Maps locale names to already loaded locale objects; it
            is consulted to resolve "copy" directives.
        """
        self.name = name.lower()
        self.content = content
        self.header = header
        self.fields = dict((k, None) for k in self.FIELDS)

        self._merge_content(content, comment_char,
                            {} if copies is None else copies)

    @staticmethod
    def _clean_value(value, comment_char):
        """Strip the trailing comment and quotes from |value| and decode it."""
        if comment_char:
            # NOTE: this cuts at the last |comment_char| even when that
            # character sits inside the quoted value.
            value = value.rsplit(comment_char, 1)[0].rstrip()
        return u_decode(dequote(value))

    def _merge_content(self, content, comment_char, copies):
        """Fill |fields| from |content|, following "copy" directives.

        Lines are processed in order, so a keyword that appears after a
        copy directive overrides whatever the copied locale provided.
        """
        cat_name = self.name.upper()
        for line in content:
            parts = line.split(None, 1)
            if len(parts) != 2:
                continue
            k, v = parts
            if k in self.FIELDS:
                self.fields[k] = self._clean_value(v, comment_char)
            if k == 'copy':
                copy = copies.get(self._clean_value(v, comment_char))
                if copy is None:
                    # The locale was never loaded (e.g. the data was read
                    # from a stream without a path); nothing to merge.
                    continue
                # Entries that are not Locale objects are placeholders for
                # locales that are still being read.
                if (isinstance(copy, Locale) and
                        cat_name in copy.categories):
                    self._merge_content(getattr(copy, self.name).content,
                                        copy.comment_char, copies)

    def __str__(self):
        padding = '\n' if REWRITE_STYLE else ''
        lc_name = self.name.upper()
        ret = ''
        if self.header:
            ret += padding + '\n'.join(self.header) + '\n'
        # list() so that tuple content (the default) can be joined too.
        body = [lc_name] + list(self.content) + ['END %s' % lc_name]
        ret += padding + '\n'.join(body) + '\n'
        return ret
+
+
class LCIdentification(LocaleCategory):
    """LC_IDENTIFICATION object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_IDENTIFICATION'
    # Keywords whose values LocaleCategory captures into |fields|.
    FIELDS = (
        'title',
        'source',
        'address',
        'contact',
        'email',
        'tel',
        'fax',
        'language',
        'territory',
        'audience',
        'application',
        'abbreviation',
        'revision',
        'date',
        'category',
    )
+
+
class LCCtype(LocaleCategory):
    """LC_CTYPE object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_CTYPE'
    # No keywords are captured for this category.
    FIELDS = (
    )
+
+
class LCCollate(LocaleCategory):
    """LC_COLLATE object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_COLLATE'
    # No keywords are captured for this category.
    FIELDS = (
    )
+
+
class LCMonetary(LocaleCategory):
    """LC_MONETARY object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_MONETARY'
    # Keywords whose values LocaleCategory captures into |fields|.
    FIELDS = (
        'int_curr_symbol',
        'currency_symbol',
        'mon_decimal_point',
        'mon_thousands_sep',
        'mon_grouping',
        'positive_sign',
        'negative_sign',
        'int_frac_digits',
        'frac_digits',
        'p_cs_precedes',
        'p_sep_by_space',
        'n_cs_precedes',
        'n_sep_by_space',
        'p_sign_posn',
        'n_sign_posn',
        'int_p_cs_precedes',
        'int_n_cs_precedes',
        'int_p_sep_by_space',
        'int_n_sep_by_space',
        'int_p_sign_posn',
        'int_n_sign_posn',
    )
+
+
class LCNumeric(LocaleCategory):
    """LC_NUMERIC object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_NUMERIC'
    # Keywords whose values LocaleCategory captures into |fields|.
    FIELDS = (
        'decimal_point',
        'thousands_sep',
        'grouping',
    )
+
+
class LCTime(LocaleCategory):
    """LC_TIME object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_TIME'
    # Keywords whose values LocaleCategory captures into |fields|.
    FIELDS = (
        'abday',
        'day',
        'abmon',
        'mon',
        'am_pm',
        'd_t_fmt',
        'd_fmt',
        't_fmt',
        't_fmt_ampm',
        'era',
        'era_year',
        'era_d_fmt',
        'alt_digits',
        'era_d_t_fmt',
        'era_t_fmt',
        'week',
        'first_weekday',
        'first_workday',
        'cal_direction',
        'date_fmt',
    )
+
+
class LCMessages(LocaleCategory):
    """LC_MESSAGES object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_MESSAGES'
    # Keywords whose values LocaleCategory captures into |fields|.
    FIELDS = (
        'yesexpr',
        'noexpr',
        'yesstr',
        'nostr',
    )
+
+
class LCPaper(LocaleCategory):
    """LC_PAPER object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_PAPER'
    # Keywords whose values LocaleCategory captures into |fields|.
    FIELDS = (
        'height',
        'width',
    )
+
+
class LCName(LocaleCategory):
    """LC_NAME object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_NAME'
    # Keywords whose values LocaleCategory captures into |fields|.
    FIELDS = (
        'name_fmt',
        'name_gen',
        'name_mr',
        'name_mrs',
        'name_miss',
        'name_ms',
    )
+
+
class LCAddress(LocaleCategory):
    """LC_ADDRESS object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_ADDRESS'
    # Keywords whose values LocaleCategory captures into |fields|.
    FIELDS = (
        'postal_fmt',
        'country_name',
        'country_post',
        'country_ab2',
        'country_ab3',
        'country_car',
        'country_num',
        'country_isbn',
        'lang_name',
        'lang_ab',
        'lang_term',
        'lang_lib',
    )
+
+
class LCTelephone(LocaleCategory):
    """LC_TELEPHONE object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_TELEPHONE'
    # Keywords whose values LocaleCategory captures into |fields|.
    FIELDS = (
        'tel_int_fmt',
        'tel_dom_fmt',
        'int_select',
        'int_prefix',
    )
+
+
class LCMeasurement(LocaleCategory):
    """LC_MEASUREMENT object."""

    # Category name as it appears in the locale file.
    NAME = 'LC_MEASUREMENT'
    # Keywords whose values LocaleCategory captures into |fields|.
    FIELDS = (
        'measurement',
    )
+
+
class Locale(object):
    """Content for a locale file itself.

    Attributes:
      name: Name the locale was created with.
      path: File the locale was (or will be) read from, or None.
      locale: LocaleName parsed from |name|.
      header: Lines written before the categories when not rewriting style.
      categories: Names of the categories found, in file order.  The parsed
        object for each one is stored in the lower-cased attribute of the
        same name (e.g. |lc_time|), which is None until that category is
        read.
      escape_char: Escape character currently in effect.
      comment_char: Comment character currently in effect.
    """

    # Locales loaded so far, shared by every instance so "copy" directives
    # can be resolved.  _load_copy() keys entries by locale name and stores
    # the string 'loading' while a file is still being read.
    _COPY_CACHE = {}

    def __init__(self, name=None, path=None):
        """Create the locale and, when |path| is given, load it."""
        self.name = name
        self.path = path
        self.locale = LocaleName(name)
        self.header = []  # FILE_HEADER.splitlines()
        for cat in CATEGORY_ORDER:
            setattr(self, cat.lower(), None)
        self.categories = []
        self.cldr = None
        self.escape_char = '\\'
        self.comment_char = '#'

        if path is not None:
            self.read(path)

    @staticmethod
    def _trim_extra_lines(lines, leading=True, trailing=True,
                          consecutive=True, comments=False):
        """Helper to clean up the style of the data files.

        Does nothing unless REWRITE_STYLE is enabled.  |lines| is modified
        in place and also returned.

        Args:
          lines: The list of lines to clean up.
          leading: Drop blank lines at the start.
          trailing: Drop blank lines at the end.
          consecutive: Collapse runs of blank lines into a single one.
          comments: Drop bare '%' lines that open or close a comment block.
        """
        if not REWRITE_STYLE:
            return lines

        # Clear leading blank lines.
        if leading:
            while lines and not lines[0]:
                lines.pop(0)

        # Clear trailing blank lines.
        if trailing:
            while lines and not lines[-1]:
                lines.pop(-1)

        # Clear consecutive blank lines.
        if consecutive:
            i = 0
            while i < len(lines) - 1:
                if not lines[i] and not lines[i + 1]:
                    lines.pop(i)
                else:
                    i += 1

        # Trim blank comment lines that start/end a section.
        if comments:
            i = 0
            while i < len(lines):
                if (lines[i] == '%' and
                        (i == 0 or not lines[i - 1] or
                         lines[i - 1][0] != '%')):
                    lines.pop(i)
                elif (lines[i] == '%' and
                      (i == len(lines) - 1 or not lines[i + 1] or
                       lines[i + 1][0] != '%')):
                    lines.pop(i)
                else:
                    i += 1

        return lines

    def _check_leading_lines(self, lines):
        """Warn when the first two lines are not the expected directives."""
        expected = (
            ('first', 'comment_char %'),
            ('second', 'escape_char /'),
        )
        for idx, (ordinal, want) in enumerate(expected):
            # Short (even empty) files get the warning instead of a crash.
            got = lines[idx] if idx < len(lines) else ''
            if got != want:
                print('%s: warning: %s line should be: "%s", not "%s"' %
                      (self.locale, ordinal, want, got))

    def _read_directive(self, line, keyword, expected):
        """Return the value of a comment_char/escape_char directive line.

        Raises:
          LocaleError: The value is missing or is not |expected|.
        """
        parts = line.split()
        if len(parts) < 2 or parts[1] != expected:
            raise LocaleError('%s: bad %s: %s' % (self.locale, keyword, line))
        return parts[1]

    def _read_header(self, lines):
        """Pop and return the lines that precede the next category.

        Blank lines, comments and comment_char/escape_char directives are
        consumed.  Reading stops at the first line that starts a category
        or that is none of the above.
        """
        header = []
        while lines:
            line = lines[0]
            if line.startswith('LC_'):
                break
            elif not line or line[0] == self.comment_char:
                pass
            elif line.startswith('comment_char'):
                self.comment_char = self._read_directive(
                    line, 'comment_char', '%')
            elif line.startswith('escape_char'):
                self.escape_char = self._read_directive(
                    line, 'escape_char', '/')
            else:
                break
            header.append(lines.pop(0))
        self._trim_extra_lines(header)
        return header

    def _read_category_lines(self, lines, cat):
        """Pop and return the body of category |cat|, loading any copies.

        Lines ending in the escape character are joined with the line that
        follows them.  The "END <cat>" line is consumed but not returned.
        """
        cat_lines = []
        full_line = ''
        while lines:
            # Accumulate multilines.
            line = lines.pop(0)
            # A comment only covers its own line, so one that happens to
            # end in the escape character (e.g. a URL) must not pull the
            # next line into itself.
            is_comment = (not full_line and
                          line.lstrip().startswith(self.comment_char))
            if line.endswith(self.escape_char) and not is_comment:
                full_line += line[:-1]
                continue
            elif full_line:
                line = full_line + line.lstrip()
                full_line = ''

            # Halt when we get to the end of this category.
            if line.split()[0:2] == ['END', cat]:
                break
            cat_lines.append(line)

            # Deal with loading other locales.
            if line.startswith('copy '):
                copy = u_decode(dequote(line.split()[1]))
                self._load_copy(copy)

        self._trim_extra_lines(cat_lines)
        return cat_lines

    def readfp(self, fp):
        """Load the locale content from |fp|

        Raises:
          LocaleError: The content is not laid out as expected.
        """
        # NOTE(review): this entry is keyed by the LocaleName object while
        # every lookup in this file uses the name string, so it is never
        # found by them.  Left as is in case other code relies on it.
        Locale._COPY_CACHE[self.locale] = self

        lines = [x.rstrip() for x in fp.readlines()]
        self._trim_extra_lines(lines)

        # Process the leading few lines.
        self._check_leading_lines(lines)

        # Now walk each locale category.
        while lines:
            # Extract any leading comments.
            header = self._read_header(lines)

            if not lines:
                if header:
                    print('%s: throwing away trailing lines: %r' %
                          (self.name, header), file=sys.stderr)
                return

            line = lines.pop(0)
            if line[0:3] != 'LC_':
                raise LocaleError('%s: bad line state: %s' %
                                  (self.name, line))

            cat = line.split()[0]
            if cat not in CATEGORY_ORDER:
                raise LocaleError('%s: unknown category: %s' %
                                  (self.name, cat))

            cat_lines = self._read_category_lines(lines, cat)

            # LC_FOO_BAR is handled by the class named LCFoo_bar.
            lc_obj_name = 'LC%s%s' % (cat[3], cat[4:].lower())
            lc_obj = getattr(sys.modules[__name__], lc_obj_name)
            lc = lc_obj(name=cat, content=cat_lines, header=header,
                        comment_char=self.comment_char,
                        copies=self._COPY_CACHE)
            setattr(self, cat.lower(), lc)
            self.categories.append(cat)

    def read(self, path):
        """Load the locale file from |path|"""
        with open(path) as fp:
            self.readfp(fp)

    def _load_copy(self, copy):
        """Load the locale named by |copy|

        The file is looked up next to this locale's own file, so nothing is
        loaded when this locale has no path.  Names that are already cached
        (or currently being loaded) are skipped.
        """
        if not self.path:
            return
        if copy in Locale._COPY_CACHE:
            return
        # Flag it as in progress to avoid loops.
        path = os.path.join(os.path.dirname(self.path), copy)
        Locale._COPY_CACHE[copy] = 'loading'
        Locale._COPY_CACHE[copy] = Locale(name=copy, path=path)

    def writefp(self, fp):
        """Write the locale content to |fp|"""
        if REWRITE_STYLE:
            header = ['comment_char %', 'escape_char /']
        else:
            header = self.header
        if header:
            fp.write('\n'.join(header) + '\n')

        for category in self.categories:
            lc = getattr(self, category.lower())
            fp.write(str(lc))

    def write(self, path):
        """Write the locale content to |path|"""
        with open(path, 'w') as fp:
            self.writefp(fp)
@@ -0,0 +1,446 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# Written by Mike Frysinger <vapier@gentoo.org> for much great glory.
+#
+# Copyright (C) 2016 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+"""Linting tool for locale datafiles."""
+
+# TODO: Validate set of locale data files and SUPPORTED file.
+
+from __future__ import print_function
+
+import argparse
+import os
+import re
+import subprocess
+import sys
+
+import locales
+
+
def get_parser():
    """Build the command line parser for this tool.

    Returns:
      An argparse.ArgumentParser that collects zero or more positional
      arguments into |locales|.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument('locales', nargs='*', help='Locales to lint')
    return cli
+
+
class Check(object):
    """Check class for locale problems.

    Subclasses implement run() and report problems through the assert*
    helpers.  Each helper takes the |key| being examined so that the error
    message can name it.

    Attributes:
      locale: The locale object being checked; its |name| is shown in
        error messages.
      lc: The category object being checked; its |NAME| is shown in error
        messages.
      failed: True once any problem has been reported.
    """

    def __init__(self, locale, lc):
        self.locale = locale
        self.lc = lc
        self.failed = False

    def fail(self, key, msg):
        """Set state to failed and show |msg| for |key|."""
        self.failed = True
        print('ERROR: %s: %s.%s: %s' %
              (self.locale.name, self.lc.NAME, key, msg))

    def assertTrue(self, key, value, msg):
        """Verify |value| is a boolean True value."""
        if not value:
            self.fail(key, msg)

    def assertIn(self, key, value, exp_set, msg):
        """Verify |value| is in |exp_set|."""
        self.assertTrue(key, value in exp_set, msg)

    def assertEqual(self, key, value, exp_value, msg):
        """Verify |value| is equal to |exp_value|."""
        self.assertTrue(key, value == exp_value, msg)

    def assertNotEqual(self, key, value, exp_value, msg):
        """Verify |value| is not equal to |exp_value|."""
        self.assertTrue(key, value != exp_value, msg)

    def assertDefined(self, key, value):
        """Verify |value| is set to something (allows empty string)."""
        if value is None:
            self.fail(key, 'missing definition')

    def assertNonEmpty(self, key, value):
        """Verify |value| is set to a non-empty value."""
        if not value:
            self.fail(key, 'missing value')

    def assertEmpty(self, key, value):
        """Verify |value| is set to an empty value."""
        if value:
            self.fail(key, 'value should be left empty')

    def assertFormat(self, key, value, exp_formats, msg):
        """Verify every %-directive in |value| uses a char in |exp_formats|.

        The whole string is scanned, not just its start.  Directives are
        read left to right as '%' plus the character that follows it, so
        the second '%' of a '%%' pair never starts a directive of its own.
        A '%' that ends the string is not reported.

        Args:
          key: Name of the field being checked.
          value: The format string; nothing is checked when it is empty.
          exp_formats: The characters that may follow a '%'.
          msg: Message to report; at most one failure is reported.
        """
        if not value:
            return
        for m in re.finditer(r'%(.)', value, re.DOTALL):
            if m.group(1) not in exp_formats:
                self.fail(key, msg)
                return
+
+
class CheckLCIdentification(Check):
    """Check LC_IDENTIFICATION object for problems."""

    def run(self):
        """Require an email and reject tel/fax values."""
        fields = self.lc.fields

        self.assertNonEmpty('email', fields['email'])

        for key in ('tel', 'fax'):
            self.assertEmpty(key, fields[key])

        # TODO: Check language & territory.
        # TODO: Check category fields are one of:
        #   i18n:2002 posix:1993
+
+
class CheckLCCtype(Check):
    """Check LC_CTYPE object for problems."""

    def run(self):
        """Placeholder: no checks are implemented for this category."""
        lc = self.lc
+
+
class CheckLCCollate(Check):
    """Check LC_COLLATE object for problems."""

    def run(self):
        """Placeholder: no checks are implemented for this category."""
        lc = self.lc
+
+
class CheckLCMonetary(Check):
    """Check LC_MONETARY object for problems."""

    def run(self):
        """Validate the currency symbol and the numeric placement fields."""
        lc = self.lc

        k = 'int_curr_symbol'
        v = lc.fields[k]
        self.assertDefined(k, v)
        if v:
            self.assertEqual(k, len(v), 4,
                             'symbol should be 4 characters, not %s' % (v,))
            # TODO: We can validate the value against ISO 4217.
            # Look at the last character rather than index 3 so that a
            # symbol that is too short is reported instead of raising
            # IndexError.
            self.assertTrue(k, v.endswith(' '),
                            'symbol must end with a space, not %s' % (v[-1],))

        for k in ('currency_symbol', 'mon_decimal_point', 'mon_thousands_sep',
                  'positive_sign', 'negative_sign', 'mon_grouping',
                  'int_frac_digits', 'frac_digits'):
            self.assertDefined(k, lc.fields[k])

        # XXX: The value of -1 is permitted for the POSIX locale.

        # In each group below, None (field not set) is accepted as well.
        valid_values = (None, '0', '1')
        for k in ('p_cs_precedes', 'n_cs_precedes', 'int_p_cs_precedes',
                  'int_n_cs_precedes'):
            v = lc.fields[k]
            self.assertIn(k, v, valid_values,
                          'should be 0 or 1, not %s' % (v,))

        valid_values = (None, '0', '1', '2')
        for k in ('p_sep_by_space', 'n_sep_by_space', 'int_p_sep_by_space',
                  'int_n_sep_by_space'):
            v = lc.fields[k]
            self.assertIn(k, v, valid_values,
                          'should be between [0, 2], not %s' % (v,))

        valid_values = (None, '0', '1', '2', '3', '4')
        for k in ('p_sign_posn', 'n_sign_posn', 'int_p_sign_posn',
                  'int_n_sign_posn'):
            v = lc.fields[k]
            self.assertIn(k, v, valid_values,
                          'should be between [0, 4], not %s' % (v,))
+
+
class CheckLCNumeric(Check):
    """Check LC_NUMERIC object for problems."""

    def run(self):
        """Require grouping to be defined and decimal_point to be non-empty."""
        fields = self.lc.fields

        # TODO: grouping: Verify it's a list of positive ints (and -1).
        self.assertDefined('grouping', fields['grouping'])

        self.assertNonEmpty('decimal_point', fields['decimal_point'])
+
+
class CheckLCTime(Check):
    """Check LC_TIME object for problems."""

    def run(self):
        """Validate list lengths, format strings and the week settings."""
        lc = self.lc

        valid_len = 7
        for k in ('abday', 'day'):
            v = lc.fields[k]
            if v:
                v = v.split(';')
                self.assertEqual(k, len(v), valid_len,
                                 'need %s elements: %s' % (valid_len, v))

        valid_len = 12
        for k in ('abmon', 'mon'):
            v = lc.fields[k]
            if v:
                v = v.split(';')
                self.assertEqual(k, len(v), valid_len,
                                 'need %s elements: %s' % (valid_len, v))

        k = 'date_fmt'
        default_value = '%a %b %e %H:%M:%S %Z %Y'
        v = lc.fields[k]
        self.assertNotEqual(
            k, v, default_value,
            'value (%s) is same as the default; delete it' % (v,))

        # Should we filter out date/time fields rather than allow each one
        # full access to the strftime api?
        valid_values = '-aAbBcCdDeEFgGhHIjklmMnOpPrRsStTuUVwWxXyYzZ'
        for k in ('d_t_fmt', 'd_fmt', 't_fmt'):
            v = lc.fields[k]
            if v:
                self.assertFormat(
                    k, v, valid_values,
                    'only %s formats are accepted, not %s' % (valid_values, v))

        # TODO: am_pm: Verify it has 2 entries.

        k = 'week'
        v = lc.fields[k]
        if v:
            default_week = '7;19971130;4'
            if v == default_week:
                self.fail(k, 'value (%s) is same as the default; delete it' %
                          (default_week,))

                # With the default week in place, these two are redundant
                # as well when they hold their default.  Report each
                # field's own value rather than the week value.
                for dep_key, dep_default in (('first_weekday', '1'),
                                             ('first_workday', '2')):
                    dep_value = lc.fields[dep_key]
                    self.assertNotEqual(
                        dep_key, dep_value, dep_default,
                        'value (%s) is same as the default; delete it' %
                        (dep_value,))
            else:
                va = v.split(';')
                if len(va) != 3:
                    self.fail(k, 'value should have 3 fields, not %s' % (v,))
                else:
                    default_start = default_week.split(';')[1]
                    self.assertEqual(
                        k, va[1], default_start,
                        'should be %s, not %s (remember to adjust other '
                        'fields too)' % (default_start, va[1]))

        # None (field not set) is accepted for each of the fields below.
        k = 'first_weekday'
        v = lc.fields[k]
        valid_values = (None, '1', '2')
        self.assertIn(k, v, valid_values,
                      'should be 1 or 2, not %s' % (v,))

        k = 'first_workday'
        v = lc.fields[k]
        valid_values = (None, '1', '2')
        self.assertIn(k, v, valid_values,
                      'should be 1 or 2, not %s' % (v,))

        k = 'cal_direction'
        v = lc.fields[k]
        valid_values = (None, '1', '2', '3')
        self.assertIn(k, v, valid_values,
                      'should be between [1, 3], not %s' % (v,))
+
+
class CheckLCMessages(Check):
    """Check LC_MESSAGES object for problems."""

    def run(self):
        """Verify that yesexpr/noexpr, when set, compile with the re module."""
        for key in ('yesexpr', 'noexpr'):
            pattern = self.lc.fields[key]
            if not pattern:
                continue
            try:
                re.compile(pattern)
            except re.error:
                self.fail(key, 'invalid regular expression: %s' % (pattern,))
+
+
class CheckLCPaper(Check):
    """Check LC_PAPER object for problems."""

    def run(self):
        """Verify that (height, width) is one of the accepted pairs."""
        fields = self.lc.fields

        us_letter = ('279', '216')
        a4 = ('297', '210')
        # XXX: Drop this?  Need to implement copy directives.
        not_set = (None, None)
        valid_values = (us_letter, a4, not_set)

        paper = (fields['height'], fields['width'])
        self.assertIn('(height, width)', paper, valid_values,
                      '%r' % (paper,))
+
+
class CheckLCName(Check):
    """Check LC_NAME object for problems."""

    def run(self):
        """Validate name_fmt and require the salutation fields to be defined."""
        fields = self.lc.fields

        # Same value as ld-name.c.
        valid_values = 'dfFgGlomMpsSt'
        name_fmt = fields['name_fmt']
        self.assertNonEmpty('name_fmt', name_fmt)
        if name_fmt:
            self.assertFormat(
                'name_fmt', name_fmt, valid_values,
                'only %s formats are accepted, not %s' %
                (valid_values, name_fmt))

        for key in ('name_gen', 'name_mr', 'name_mrs', 'name_miss',
                    'name_ms'):
            self.assertDefined(key, fields[key])
+
+
class CheckLCAddress(Check):
    """Check LC_ADDRESS object for problems."""

    def run(self):
        """Validate postal_fmt and the country/language code fields."""
        fields = self.lc.fields

        # Same value as ld-address.c.
        valid_values = 'afdbshNtreCzTSc%'
        postal_fmt = fields['postal_fmt']
        self.assertNonEmpty('postal_fmt', postal_fmt)
        if postal_fmt:
            self.assertFormat(
                'postal_fmt', postal_fmt, valid_values,
                'only %s formats are accepted, not %s' %
                (valid_values, postal_fmt))

        country_ab2 = fields['country_ab2']
        self.assertDefined('country_ab2', country_ab2)
        if country_ab2:
            self.assertEqual('country_ab2', len(country_ab2), 2,
                             'must be 2 letters, not %s' % (country_ab2,))

        # XXX: We can validate lang_ab more.
        lang_ab = fields['lang_ab']
        # Only required when the locale's own language code has two letters.
        if len(self.locale.locale.lang) == 2:
            self.assertDefined('lang_ab', lang_ab)
        if lang_ab:
            self.assertEqual('lang_ab', len(lang_ab), 2,
                             'must be 2 letters, not %s' % (lang_ab,))
            self.assertEqual('lang_ab', lang_ab, lang_ab.lower(),
                             'must be lowercase, not %s' % (lang_ab,))

        for key in ('country_ab3', 'lang_term', 'lang_lib'):
            code = fields[key]
            self.assertDefined(key, code)
            if code:
                self.assertEqual(key, len(code), 3,
                                 'must be 3 letters, not %s' % (code,))

        # TODO: We can validate country_post, country_car, country_isbn.
        for key in ('country_name', 'country_post', 'country_car',
                    'country_isbn', 'lang_name'):
            self.assertDefined(key, fields[key])

        # TODO: We can validate this value more.
        country_num = fields['country_num']
        self.assertNonEmpty('country_num', country_num)
        if country_num:
            if isinstance(country_num, int):
                country_num = '%03i' % country_num
            # Whatever is left once the digits are removed is not a number.
            leftover = re.sub(r'[0-9]', '', country_num)
            self.assertEqual('country_num', '', leftover,
                             'must be 3 numbers, not %s' % (country_num,))
            self.assertEqual('country_num', len(country_num), 3,
                             'must be 3 numbers, not %s' % (country_num,))
+
+
class CheckLCTelephone(Check):
    """Check LC_TELEPHONE object for problems."""

    def run(self):
        """Validate the two format fields and require the int_* fields."""
        fields = self.lc.fields

        # XXX: ld-telephone.c is more restrictive.
        valid_values = 'aAcCelt'
        for key in ('tel_int_fmt', 'tel_dom_fmt'):
            fmt = fields[key]
            self.assertNonEmpty(key, fmt)
            if not fmt:
                continue
            self.assertFormat(
                key, fmt, valid_values,
                'only %s formats are accepted, not %s' % (valid_values, fmt))

        for key in ('int_select', 'int_prefix'):
            self.assertDefined(key, fields[key])
+
+
class CheckLCMeasurement(Check):
    """Check LC_MEASUREMENT object for problems."""

    def run(self):
        """Verify that measurement is 1, 2 or not set."""
        lc = self.lc

        k = 'measurement'
        v = lc.fields[k]
        valid_values = (
            '1',  # Metric units.
            '2',  # US customary units.
            # XXX: Drop this?  Need to implement copy directives.
            None,  # Not set.
        )
        self.assertIn(k, v, valid_values, 'should be 1 or 2, not %s' % (v,))
+
+
def check(loc):
    """Check locale |loc| object for problems.

    Every category present in |loc| is run through the CheckLC* class of
    the matching name from this module.

    Args:
      loc: The loaded locale object to check.

    Returns:
      True if no checker reported a problem, False otherwise.
    """
    ret = True
    module = sys.modules[__name__]
    for cat in locales.CATEGORY_ORDER:
        if cat not in loc.categories:
            # TODO: We should throw an error if |cat| is missing.
            continue

        lc = getattr(loc, cat.lower())
        # LC_FOO_BAR is handled by the class named CheckLCFoo_bar.
        checker_name = 'CheckLC%s%s' % (cat[3], cat[4:].lower())
        checker_cls = getattr(module, checker_name, None)
        if checker_cls is None:
            # Say what is missing rather than dying on "NoneType is not
            # callable".
            print('%s: warning: no checker for %s' % (loc.name, cat),
                  file=sys.stderr)
            continue

        checker = checker_cls(loc, lc)
        checker.run()
        if checker.failed:
            ret = False
    return ret
+
+
def main(argv):
    """The main entry point.

    Args:
      argv: The command line arguments, without the program name.

    Returns:
      0 when every locale could be loaded and passed its checks, 1 when
      any of them failed to load or had problems reported.
    """
    parser = get_parser()
    opts = parser.parse_args(argv)

    # These are not "real" locales, so skip them.
    SKIP_LOCALES = ()  #'i18n', 'iso14651', 'translit', 'C', 'POSIX')

    # Process all the locales the user told us to.
    ret = 0
    for locale in opts.locales:
        name = os.path.basename(locale)
        if name.split('_', 1)[0] in SKIP_LOCALES:
            continue

        try:
            loc = locales.Locale(name=name, path=locale)
        except UnicodeDecodeError:
            print('%s: bad encodings' % (locale,))
            # Extra diagnostics only; a missing or failing `file` tool
            # must not stop the remaining locales from being checked.
            try:
                subprocess.check_call(['file', locale])
            except (OSError, subprocess.CalledProcessError):
                pass
            # There is no locale object to check, so move on instead of
            # falling through with |loc| unset (or left over from the
            # previous iteration).
            ret = 1
            continue
        except locales.LocaleError as e:
            print('%s: %s' % (name, e))
            ret = 1
            continue
        if not check(loc):
            #print('%s: please correct issues' % name)
            ret = 1
    return ret
+
+
if __name__ == '__main__':
    # sys.exit rather than the bare exit(), which is only injected by the
    # site module and is missing when that module is not loaded.
    sys.exit(main(sys.argv[1:]))
@@ -0,0 +1,1204 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# Written by Mike Frysinger <vapier@gentoo.org> for much great glory.
+#
+# Copyright (C) 2016 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+"""Helper tool for importing current CLDR data.
+
+See http://cldr.unicode.org/ for more details."""
+
+# TODO: Need to handle copy directives better so we can see when a value
+# has changed for a specific locale, but it's copying the (wrong) values
+# from others.
+# TODO: Add missing fields.
+# TODO: Add support for updating locale/iso-3166.def via supplementalData.xml.
+# TODO: Add support for updating locale/iso-4217.def.
+# TODO: In cases where a locale & lang do not exist in the CLDR, we should
+# still be able to update English names in the description and aspects that
+# are territory specific (and lang independent).
+# TODO: To address the previous case, we should split CldrLocale up into a
+# base class and CldrLanguage and CldrTerritory children. Then the CldrLocale
+# object would take care of blending those into its own results.
+# TODO: Add ISBN support: https://www.isbn-international.org/range_file_generation
+
+from __future__ import print_function
+
+import argparse
+import datetime
+import errno
+import logging
+import os
+import re
+import subprocess
+import sys
+import time
+from xml.etree import ElementTree
+
+import locales
+u_encode = locales.u_encode
+u_decode = locales.u_decode
+
+
+# Where to store CLDR/etc... data files we fetch.
+# The %(version)s placeholder is substituted by Cldr.__init__ with the CLDR
+# version in use.
+DEFAULT_WORKING_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+ 'cldr-%(version)s')
+
+
+def get_parser():
+ """Return an argument parser for this module."""
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument('--working-dir', default=DEFAULT_WORKING_DIR,
+ help='Where to download files (default: %(default)s)')
+ parser.add_argument('-v', '--version', default=Cldr.CURR_VERSION,
+ help='Version of CLDR to use (default: %(default)s)')
+ parser.add_argument('locales', nargs='*', help='Locales to generate')
+ return parser
+
+
+def logging_init(debug=False):
+ """Set up the logging module."""
+ fmt = '%(asctime)s: %(levelname)-7s: '
+ fmt += '%(message)s'
+ # 'Sat, 05 Oct 2013 18:58:50 -0400 (EST)'
+ tzname = time.strftime('%Z', time.localtime())
+ datefmt = '%a, %d %b %Y %H:%M:%S ' + tzname
+ level = logging.DEBUG if debug else logging.INFO
+ handler = logging.StreamHandler(stream=sys.stdout)
+ formatter = logging.Formatter(fmt, datefmt)
+
+ handler.setFormatter(formatter)
+
+ logger = logging.getLogger()
+ logger.addHandler(handler)
+ logger.setLevel(level)
+
+
+class cached_property(object): # pylint: disable=invalid-name
+ """Like @property but cached"""
+
+ def __init__(self, func):
+ self.func = func
+
+ def __get__(self, instance, _owner):
+ if instance is None:
+ return self
+ value = instance.__dict__[self.func.__name__] = self.func(instance)
+ return value
+
+
+class Iso639(object):
+ """Content for the ISO-639 database."""
+
+ # Link to upstream ISO-639-2 database.
+ ISO639_2_URI = 'http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt'
+
+ # Path to our local copy of the ISO-639 database.
+ PATH = os.path.join(os.path.dirname(os.path.dirname(
+ os.path.realpath(__file__))), 'locale', 'iso-639.def')
+
+ # Regex to process our local ISO-639 def file.
+ _LINE_MATCH = re.compile(
+ r'^(DEFINE_LANGUAGE_CODE \("([^"]*)", ([^,]*), ([^,]*), ([^,]*)\)'
+ r'|DEFINE_LANGUAGE_CODE3 \("([^"]*)", ([^,]*), ([^,]*)\))$')
+
+ def __init__(self):
+ self.db = {}
+ with open(self.PATH) as fp:
+ for line in fp:
+ m = self._LINE_MATCH.match(line)
+ if m:
+ if m.group(1) is None:
+ # DEFINE_LANGUAGE_CODE3 form.
+ self.db[m.group(6)] = (m.group(5), m.group(7))
+ else:
+ # DEFINE_LANGUAGE_CODE form.
+ self.db[m.group(2)] = (m.group(1), m.group(3),
+ m.group(4))
+
+ def get_term(self, lang):
+ """Return the ISO 639-2/T (Terminology) code."""
+ entry = self.db.get(lang, ())
+ if len(entry) == 3:
+ return entry[1]
+
+ def get_bib(self, lang):
+ """Return the ISO 639-2/B (Bibliographic) code."""
+ entry = self.db.get(lang, ())
+ if len(entry) == 3:
+ return entry[2]
+
+ def _download_uri(self, path):
+ """Download the ISO-639-2 db."""
+ iso639 = os.path.join(path, os.path.basename(self.ISO639_2_URI))
+ if not os.path.exists(iso639):
+ subprocess.check_call(['wget', '-O', iso639, self.ISO639_2_URI])
+ self._load_iso639(iso639)
+
+ @staticmethod
+ def _load_iso639(db):
+ """Load ISO-639-2 database.
+
+ http://www.loc.gov/standards/iso639-2/ascii_8bits.html
+
+ An alpha-3 (bibliographic) code,
+ an alpha-3 (terminologic) code (when given),
+ an alpha-2 code (when given),
+ an English name, and
+ a French name of a language are all separated by pipe (|) characters.
+ """
+ db = {}
+ with open(db) as fp:
+ for line in fp:
+ bcode, tcode, code, _en, _fr = line.rstrip().split('|')
+ if code:
+ db[code] = (bcode, tcode)
+ return db
+
+
+class CarDatabase(object):
+ """Content for international licence plate country code."""
+
+ # Path to our local copy of the database.
+ PATH = os.path.join(os.path.dirname(os.path.dirname(
+ os.path.realpath(__file__))), 'locale', 'car.def')
+
+ def __init__(self):
+ lines = [x.strip() for x in open(self.PATH).readlines() if '|' in x]
+ self.db = dict(x.split('|') for x in lines)
+
+ def get(self, territory):
+ return self.db.get(territory)
+
+
+class CldrLocale(object):
+ """Content for a single locale in the cldr database."""
+
+ _DAY_KEYS = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
+
+ def __init__(self, cldr, locale, iso639, cardb):
+ self._lang = None
+ self._territory = None
+
+ self.cldr = cldr
+ self.locale = locale
+ self.iso639 = iso639
+ self.cardb = cardb
+
+ # Try a few variations to try and find a suitable data source.
+
+ # Try the original locale name.
+ try:
+ self.locale_root = cldr.load_lang(locale.cldr)
+ except OSError as e:
+ if e.errno != errno.ENOENT:
+ raise
+
+ # See if there is a "world" locale for this lang.
+ try:
+ self.locale_root = cldr.load_lang(locale.lang + '_001')
+ # Override the territory though so it isn't "world".
+ self._territory = locale.territory
+ except OSError as e:
+ if e.errno != errno.ENOENT:
+ raise
+
+ # Generate the locale ourselves.
+ self.locale_root = self.generate_locale(locale.lang, locale.territory)
+
+ # We might have languages that are not in CLDR.
+
+ # Try the language w/script name details.
+ try:
+ self.lang_root = cldr.load_lang(locale.cldr_lang)
+ except OSError as e:
+ if e.errno != errno.ENOENT:
+ raise
+
+ # Try the plain language then.
+ try:
+ self.lang_root = cldr.load_lang(locale.cldr_lang)
+ except OSError as e:
+ if e.errno != errno.ENOENT:
+ raise
+
+ # Stub out the lang.
+ #self._lang = locale.lang
+ self.lang_root = None
+
+ @staticmethod
+ def generate_locale(lang, territory):
+ """Generate a simple locale XML for this lang/territory.
+
+ Used when we have a locale that isn't in CLDR, but CLDR does have
+ the lang and we know the territory.
+ """
+ return ElementTree.fromstring(
+ '<ldml><identity>' +
+ ('<language type="%s"/>' % lang) +
+ ('<territory type="%s"/>' % territory) +
+ '</identity></ldml>'
+ )
+
+ @cached_property
+ def lang(self):
+ """The locale's short language code."""
+ root = self.locale_root.find('identity/language')
+ return root.get('type')
+
+ @cached_property
+ def territory(self):
+ """The locale's short territory code."""
+ if self._territory:
+ return self._territory
+ root = self.locale_root.find('identity/territory')
+ return root.get('type')
+
+ @cached_property
+ def en_lang(self):
+ """The name of the language in English."""
+ root = self.cldr.load_lang('en')
+ names = root.find('localeDisplayNames')
+ # First see if the locale has a name before we fall back to the lang.
+ langs_root = names.find('languages')
+ lang_root = langs_root.find('language[@type="%s"]' % self.locale)
+ if lang_root is None:
+ lang_root = langs_root.find('language[@type="%s"]' % self.lang)
+ # The CLDR is missing some languages.
+ if lang_root is None:
+ logging.warning('%s: en_lang: CLDR is missing english name for '
+ 'the language', self.locale)
+ return None
+ return lang_root.findtext('.')
+
+ @cached_property
+ def en_territory(self):
+ """The name of the territory in English."""
+ root = self.cldr.load_lang('en')
+ names = root.find('localeDisplayNames')
+ return names.find('territories/territory[@type="%s"]' %
+ self.territory).findtext('.')
+
+ @cached_property
+ def country_ab2(self):
+ """Two-letter ISO-3166 country code.
+
+ Not implemented yet: the body is empty, so the value is always None.
+ """
+ # TODO: Implement this.
+
+ @cached_property
+ def country_ab3(self):
+ """Three-letter ISO-3166 country code.
+
+ Not implemented yet: the body is empty, so the value is always None.
+ """
+ # TODO: Implement this.
+
+ @cached_property
+ def lang_name(self):
+ """The localized name for the language."""
+ for root in (self.locale_root, self.lang_root):
+ if root is None:
+ continue
+
+ names = root.find('localeDisplayNames')
+ if names is not None:
+ langs_root = names.find('languages')
+ if langs_root is not None:
+ lang_root = langs_root.find('language[@type="%s"]' %
+ self.lang)
+ if lang_root is not None:
+ return lang_root.findtext('.')
+
+ @cached_property
+ def unicode_language_subtag(self):
+ """Two-letter ISO 639-1 code"""
+ #root = self.cldr.load_supp('supplementalMetadata')
+ #alias = root.find('metadata/alias/languageAlias[@type="%s"]' % self.lang)
+ return self.lang if len(self.lang) == 2 else ''
+
+ @cached_property
+ def lang_term(self):
+ """Three-letter ISO 639-2/T (Terminology) code"""
+ return self.iso639.get_term(self.lang)
+
+ @cached_property
+ def lang_lib(self):
+ """Three-letter ISO 639-2/B (Bibliographic) code"""
+ return self.iso639.get_bib(self.lang)
+
+ @cached_property
+ def country_name(self):
+ """The localiezd name for the territory."""
+ for root in (self.locale_root, self.lang_root):
+ if root is None:
+ continue
+
+ names = root.find('localeDisplayNames')
+ if names is not None:
+ name = names.find('territories/territory[@type="%s"]' %
+ self.territory)
+ if name is not None:
+ return name.findtext('.')
+
+ @cached_property
+ def country_num(self):
+ """ISO 3166-1 numeric code."""
+ root = self.cldr.load_supp('supplementalData')
+ codes = root.find('codeMappings/territoryCodes[@type="%s"]' %
+ self.territory)
+ return int(codes.get('numeric'))
+
+ @cached_property
+ def country_car(self):
+ """International licence plate country code."""
+ return self.cardb.get(self.territory)
+
+ @cached_property
+ def country_term(self):
+ """ISO 3166-1 alpha-3 code"""
+ root = self.cldr.load_supp('supplementalData')
+ codes = root.find('codeMappings/territoryCodes[@type="%s"]' %
+ self.territory)
+ return codes.get('alpha3').lower()
+
+ @cached_property
+ def tel_int_fmt(self):
+ """Telephone format for international calling.
+
+ Not implemented yet: the body is empty, so the value is always None.
+ """
+ # TODO: Implement this.
+
+ @cached_property
+ def tel_dom_fmt(self):
+ """Telephone format for domestic calling.
+
+ Not implemented yet: the body is empty, so the value is always None.
+ """
+ # TODO: Implement this.
+
+ @cached_property
+ def int_select(self):
+ """Telephone prefix for calling international numbers.
+
+ Not implemented yet: the body is empty, so the value is always None.
+ """
+ # TODO: Implement this.
+
+ @cached_property
+ def int_prefix(self):
+ """Telephone international country code prefix."""
+ root = self.cldr.load_supp('telephoneCodeData')
+ code = root.find('telephoneCodeData/codesByTerritory[@territory="%s"]'
+ '/telephoneCountryCode' % self.territory)
+ # The CLDR is missing some territories.
+ if code is None:
+ logging.warning('%s: int_prefix: CLDR is missing country code; '
+ 'try https://countrycode.org/%s',
+ self.locale, self.territory)
+ return None
+ return code.get('code')
+
+ @cached_property
+ def int_curr_symbol(self):
+ """Need to rectify w/locale/iso-4217.def."""
+ # The xmlpath support in python is not complete, so we need to search
+ # for the currency w/missing @to attribute ourselves.
+ root = self.cldr.load_supp('supplementalData')
+ currencies = root.find('currencyData/region[@iso3166="%s"]' %
+ self.territory)
+ for currency in currencies.getchildren():
+ if 'to' not in currency.keys():
+ return currency.get('iso4217')
+
+ raise ValueError('Could not find a currency for %s' % (self.territory,))
+
+ @cached_property
+ def currency_symbol(self):
+ """Need to rectify w/locale/iso-4217.def."""
+ def filter_markers(sym):
+ """Strip out some content we don't care about like the RTL marker."""
+ return sym.replace(u'\u200f', '')
+
+ # First search the locale, then the lang dbs.
+ for root in (self.locale_root, self.lang_root):
+ if root is None:
+ continue
+
+ numbers_root = root.find('numbers')
+ if numbers_root is None:
+ continue
+ symbol_ele = numbers_root.find('currencies/currency[@type="%s"]'
+ '/symbol' % self.int_curr_symbol)
+ if symbol_ele is not None:
+ return filter_markers(symbol_ele.findtext('.'))
+
+ # Try the common currency database.
+ chars_root = self.cldr.load_supp('characters')
+ for symbol_ele in chars_root.find('characters'
+ '/character-fallback').getchildren():
+ if symbol_ele.findtext('substitute') == self.int_curr_symbol:
+ return filter_markers(symbol_ele.get('value'))
+
+ # A few symbols have no translation.
+ return None #self.int_curr_symbol
+
+ @cached_property
+ def number_system(self):
+ """Get the active number system for this locale."""
+ for root in (self.locale_root, self.lang_root):
+ if root is None:
+ continue
+
+ numbers_root = root.find('numbers')
+ if numbers_root is None:
+ continue
+
+ # If there's a default labeled, use it. Otherwise just go with
+ # the first one found. It should be the only one.
+ num_sys_ele = root.find('defaultNumberingSystem')
+ if num_sys_ele is None:
+ return numbers_root.find('symbols')
+ else:
+ return numbers_root.find('symbols[@numberSystem="%s"]' %
+ num_sys_ele.findtext('.'))
+
+ @cached_property
+ def decimal_point(self):
+ """The symbol used to denote decimal points."""
+ num_symbols_root = self.number_system
+ try:
+ return num_symbols_root.find('decimal').findtext('.')
+ except AttributeError:
+ return None
+
+ @cached_property
+ def thousands_sep(self):
+ """The symbol used to group thousands digits."""
+ num_symbols_root = self.number_system
+ try:
+ return num_symbols_root.find('group').findtext('.')
+ except AttributeError:
+ return None
+
+ @cached_property
+ def grouping(self):
+ """Digit grouping; not implemented yet, so the value is always None."""
+ # TODO: Implement this.
+ pass
+
+ def _lookup_day_mon(self, cal_field, cal_type, cal_idxs):
+ """Look up various calendar fields.
+
+ Searches the gregorian calendar in locale_root first, then lang_root.
+
+ Args:
+ cal_field: Element name stem used to build the search paths; callers
+ in this class pass 'day', 'month' and 'dayPeriod'.
+ cal_type: Value matched against the type of the <cal_field>Width
+ element.
+ cal_idxs: The type values of the entries to fetch, in order.
+
+ Returns:
+ A list with the text of every requested entry once all of them are
+ found in one place; otherwise the function falls off the end and
+ returns None.
+ """
+ for root in (self.locale_root, self.lang_root):
+ if root is None:
+ continue
+
+ dates_root = root.find('dates')
+ if dates_root is None:
+ continue
+ calendars_root = dates_root.find('calendars')
+ if calendars_root is None:
+ continue
+ # XXX: Look up type in calendarPreference ?
+ calendar_root = calendars_root.find('calendar[@type="gregorian"]')
+ if calendar_root is None:
+ continue
+
+ dm_root = None
+ # NOTE(review): 'narrow' is tried as a context type here; confirm that
+ # is intended, as it is the only value that is not a context name.
+ for key in ('stand-alone', 'format', 'narrow'):
+ ctx_root = calendar_root.find('%ss/%sContext[@type="%s"]' %
+ (cal_field, cal_field, key))
+ if ctx_root is None:
+ continue
+ dm_root = ctx_root.find('%sWidth[@type="%s"]' %
+ (cal_field, cal_type))
+ if dm_root is None:
+ continue
+
+ # Only accept a width that provides every requested entry.
+ ret = [dm_root.find('%s[@type="%s"]' % (cal_field, x))
+ for x in cal_idxs]
+ if None not in ret:
+ return [x.findtext('.') for x in ret]
+
+ def _lookup_day(self, width_type):
+ """Internal helper for abday/day lookups."""
+ return self._lookup_day_mon('day', width_type, self._DAY_KEYS)
+
+ def _lookup_mon(self, width_type):
+ """Internal helper for abmon/mon lookups."""
+ return self._lookup_day_mon('month', width_type, range(1, 13))
+
+ @cached_property
+ def abday(self):
+ """Abbreviated localized names for the days of the week."""
+ return self._lookup_day('abbreviated')
+
+ @cached_property
+ def day(self):
+ """Full localized names for the days of the week."""
+ return self._lookup_day('wide')
+
+ @cached_property
+ def abmon(self):
+ """Abbreviated localized names for the months."""
+ return self._lookup_mon('abbreviated')
+
+ @cached_property
+ def mon(self):
+ """Full localized names for the months."""
+ return self._lookup_mon('wide')
+
+ # http://www.unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns
+ # Maps CLDR pattern fields to the text _to_posix_fmt substitutes for them.
+ # NOTE(review): the percent signs are doubled, presumably because the
+ # converted string is later run through % formatting in update_cldr --
+ # confirm before changing.
+ _CLDR_TO_POSIX_FMT = {
+ # year
+ 'y': '%%-y',
+ 'yy': '%%y',
+ 'yyy': '%%-Y',
+ 'yyyy': '%%Y',
+ # month
+ 'M': '%%-m',
+ 'MM': '%%m',
+ 'MMM': '%%b',
+ 'MMMM': '%%B',
+ # day
+ 'd': '%%-d',
+ 'dd': '%%d',
+ # period
+ 'a': '%%p',
+ # hour
+ 'h': '%%-I',
+ 'hh': '%%I',
+ 'H': '%%-H',
+ 'HH': '%%H',
+ # minute
+ 'm': '%%-M',
+ 'mm': '%%M',
+ # second
+ 's': '%%-S',
+ 'ss': '%%S',
+ }
+
+ @classmethod
+ def _to_posix_fmt(cls, fmt):
+ """Convert the CLDR notation to what POSIX uses."""
+ lookup = lambda m: cls._CLDR_TO_POSIX_FMT[m.group(1)]
+ return re.sub(r'\b(' + '|'.join(cls._CLDR_TO_POSIX_FMT.keys()) + r')\b',
+ lookup, fmt)
+
+ @cached_property
+ def hours_format(self):
+ """Return 24 or 12 depending on preferred %H or %h format"""
+ root = self.cldr.load_supp('supplementalData')
+ datasets = root.find('timeData')
+ pref = None
+ for dataset in datasets.findall('hours'):
+ territories = dataset.get('regions')
+ value = dataset.get('preferred')
+
+ # TODO: Make this walk logic more robust/common.
+ territories = territories.split()
+ if '001' in territories:
+ if pref is None:
+ # The allowed field makes this tricky.
+ #pref = value
+ pass
+ if self.territory in territories:
+ pref = value
+
+ if pref == 'H':
+ return '24'
+ elif pref == 'h':
+ return '12'
+ elif pref is None:
+ return None
+ else:
+ raise ValueError('Unknown hour value: %s' % pref)
+
+ @cached_property
+ def am_pm(self):
+ """Localized AM/PM time fields when 12 hour clocks are used."""
+ if self.hours_format == '24':
+ return ['', '']
+ elif self.hours_format is None:
+ return None
+
+ return self._lookup_day_mon('dayPeriod', 'abbreviated', ('am', 'pm'))
+
+ def _lookup_d_t_fmt(self, dt, dt_type='medium'):
+ """Internal helper for various fmt lookups."""
+ for root in (self.locale_root, self.lang_root):
+ if root is None:
+ continue
+
+ dates_root = root.find('dates')
+ if dates_root is None:
+ continue
+ calendars_root = dates_root.find('calendars')
+ # XXX: Look up type in calendarPreference ?
+ calendar_root = calendars_root.find('calendar[@type="gregorian"]')
+
+ fmts = calendar_root.find('%sFormats/%sFormatLength[@type="%s"]'
+ '/%sFormat/pattern' %
+ (dt, dt, dt_type, dt))
+ if fmts is not None:
+ return fmts.findtext('.')
+
+ @cached_property
+ def d_t_fmt(self):
+ """Appropriate date and time representation (%c)
+
+ Example:
+ $ date +'%a %d %b %Y %r %Z'
+ Tue 09 Feb 2016 06:39:48 PM EST
+ """
+ return self._to_posix_fmt(
+ self._lookup_d_t_fmt('dateTime').replace(
+ '{0}', self._t_fmt).replace(
+ '{1}', self._d_fmt))
+
+ @cached_property
+ def _d_fmt(self):
+ """Internal helper for the raw d_fmt field."""
+ return self._lookup_d_t_fmt('date')
+
+ @cached_property
+ def d_fmt(self):
+ """Appropriate date representation (%x)
+
+ Example:
+ $ date +'%m/%d/%Y'
+ 02/09/2016
+ """
+ return self._to_posix_fmt(self._d_fmt)
+
+ @cached_property
+ def _t_fmt(self):
+ """Internal helper for the raw t_fmt field."""
+ return self._lookup_d_t_fmt('time')
+
+ @cached_property
+ def t_fmt(self):
+ """Appropriate time representation (%X)
+
+ Example:
+ $ date +%r
+ 06:41:21 PM
+ """
+ return self._to_posix_fmt(self._t_fmt)
+
+ @cached_property
+ def t_fmt_ampm(self):
+ """Appropriate AM/PM time representation (%r)
+
+ Example:
+ $ date +'%I:%M:%S %p'
+ 06:41:21 PM
+ """
+ if self.hours_format == '24':
+ return ''
+ elif self.hours_format is None:
+ return None
+
+ return None
+
+ @cached_property
+ def date_fmt(self):
+ """Appropriate date representation (date(1))
+
+ Not implemented yet: the value is always None.
+
+ $ date +'%a %b %e %H:%M:%S %Z %Y'
+ Tue Feb 9 06:39:48 EST 2016
+ """
+ pass
+
+ @cached_property
+ def week(self):
+ """DAYSINWEEK;WEEKSTARTDATE;MINWEEKLEN field"""
+ root = self.cldr.load_supp('supplementalData')
+ data = root.find('weekData')
+ ret = None
+ for start in data.findall('minDays'):
+ territories = start.get('territories')
+ value = start.get('count')
+
+ # TODO: Make this walk logic more robust/common.
+ territories = territories.split()
+ if '001' in territories:
+ if ret is None:
+ ret = value
+ if self.territory in territories:
+ ret = value
+
+ # Just hardcode this as no one changes it.
+ daysinweek = 7
+
+ # Hardcode this as well as there's no advantage to it otherwise.
+ # It's also what CLDR bases things on.
+ weekstartdate = 19971130
+
+ minweeklen = int(ret)
+
+ return (daysinweek, weekstartdate, minweeklen)
+
+ @cached_property
+ def first_weekday(self):
+ """Number of day in the week for the first column in the calendar.
+
+ Sunday = 1, Monday = 2, ...
+ """
+ root = self.cldr.load_supp('supplementalData')
+ data = root.find('weekData')
+ first = None
+ for start in data.findall('firstDay'):
+ territories = start.get('territories')
+ day = start.get('day')
+
+ # Throw out ones we don't care about.
+ if start.get('alt') is not None:
+ continue
+
+ # TODO: Make this walk logic more robust/common.
+ territories = territories.split()
+ if '001' in territories:
+ if first is None:
+ first = day
+ if self.territory in territories:
+ first = day
+
+ # We add +1 for index->day-of-week adjustment,
+ return self._DAY_KEYS.index(first) + 1
+
+ @cached_property
+ def first_workday(self):
+ """Number of day in the week for the first working day.
+
+ Sunday = 1, Monday = 2, ...
+ """
+ root = self.cldr.load_supp('supplementalData')
+ data = root.find('weekData')
+ first = None
+ for start in data.findall('weekendEnd'):
+ territories = start.get('territories')
+ day = start.get('day')
+
+ # TODO: Make this walk logic more robust/common.
+ territories = territories.split()
+ if '001' in territories:
+ if first is None:
+ first = day
+ if self.territory in territories:
+ first = day
+
+ # We add +1 for index->day-of-week adjustment,
+ # and we add +1 for weekendEnd->workdayStart.
+ # We do the % to handle sat->sun wrapping.
+ return ((self._DAY_KEYS.index(first) + 1) % 7) + 1
+
+ @cached_property
+ def measurement(self):
+ """Return 1 for metric and 2 for imperial"""
+ root = self.cldr.load_supp('supplementalData')
+ measurement = None
+ for system in root.findall('measurementData/measurementSystem'):
+ territories = system.get('territories')
+ stype = system.get('type')
+
+ # Throw out ones we don't care about.
+ if system.get('category') == 'temperature' or stype == 'UK':
+ continue
+
+ # TODO: Make this walk logic more robust/common.
+ territories = territories.split()
+ if '001' in territories:
+ if measurement is None:
+ measurement = stype
+ if self.territory in territories:
+ measurement = stype
+
+ # We don't use imperial settings for Myanmar even though CLDR does.
+ # https://en.wikipedia.org/wiki/Myanmar_units_of_measurement
+ if self.territory == 'MM':
+ if measurement == 'US':
+ measurement = 'metric'
+ else:
+ raise ValueError('CLDR is updated; drop this hack')
+
+ if measurement == 'metric':
+ return 1
+ elif measurement == 'US':
+ return 2
+ else:
+ raise ValueError('Do not understand type %s' % measurement)
+
+ @cached_property
+ def measurement_copy(self):
+ """We copy other locales for most"""
+ if self.locale in ('en_US', 'i18n'):
+ return None
+ elif self.measurement == 1:
+ return 'i18n'
+ elif self.measurement == 2:
+ return 'en_US'
+ else:
+ raise ValueError('Unknown measurement %s' % self.measurement)
+
+ @cached_property
+ def paper(self):
+ """Return the paper type"""
+ root = self.cldr.load_supp('supplementalData')
+ paper = None
+ for system in root.findall('measurementData/paperSize'):
+ territories = system.get('territories')
+ stype = system.get('type')
+
+ # TODO: Make this walk logic more robust/common.
+ territories = territories.split()
+ if '001' in territories:
+ if paper is None:
+ paper = stype
+ if self.territory in territories:
+ paper = stype
+
+ return paper
+
+ @cached_property
+ def paper_height(self):
+ """Return the height of paper (in mm)"""
+ return {'A4': 297, 'US-Letter': 279}.get(self.paper)
+
+ @cached_property
+ def paper_width(self):
+ """Return the width of paper (in mm)"""
+ return {'A4': 210, 'US-Letter': 216}.get(self.paper)
+
+ @cached_property
+ def paper_copy(self):
+ """We copy other locales for most"""
+ if self.locale in ('en_US', 'i18n'):
+ return None
+ elif self.paper == 'A4':
+ return 'i18n'
+ elif self.paper == 'US-Letter':
+ return 'en_US'
+ else:
+ raise ValueError('Unknown paper %s' % self.paper)
+
+
+class Cldr(object):
+ """Content for the cldr database."""
+
+ # The current release version that we use.
+ CURR_VERSION = '29'
+
+ # Where to find the CLDR data.
+ URI = 'http://unicode.org/Public/cldr/%(version)s/core.zip'
+
+ def __init__(self, path, version):
+ fields = {'version': version}
+ self.dir = path % fields
+ self.uri = self.URI % fields
+ self.version = version
+ self.date = None
+ self.main_dbs = {}
+ self.supp_dbs = {}
+ self.iso639 = Iso639()
+ self.cardb = CarDatabase()
+
+ # Set up the working dir.
+ if not os.path.exists(self.dir):
+ os.makedirs(self.dir)
+
+ def download(self):
+ """Download the current cldr database."""
+ # Download the CLDR data.
+ archive = os.path.join(self.dir, 'core.zip')
+ if not os.path.exists(archive):
+ subprocess.check_call(['wget', '-O', archive, self.uri])
+ self.date = datetime.datetime.fromtimestamp(os.path.getmtime(archive))
+
+ # Unpack the CLDR data.
+ common_dir = os.path.join(self.dir, 'common')
+ if not os.path.exists(common_dir):
+ subprocess.check_call(['unzip', '-u', 'core.zip'], cwd=self.dir)
+
+ def _load_db(self, db, subdir, cache):
+ """Load the database |db| out of |subdir| using |cache|."""
+ if db not in cache:
+ db_path = os.path.join(self.dir, 'common', subdir, '%s.xml' % db)
+ tree = ElementTree.parse(db_path)
+ cache[db] = tree.getroot()
+ return cache[db]
+
+ def _load_main(self, db):
+ """Load database |db| from the main repo."""
+ return self._load_db(db, 'main', self.main_dbs)
+
+ def load_lang(self, lang):
+ """Load the language |lang| database."""
+ return self._load_main(lang)
+
+ def load_supp(self, db):
+ """Load database |db| from the supplemental repo."""
+ return self._load_db(db, 'supplemental', self.supp_dbs)
+
+ def locale(self, locale):
+ """Get an object for a specific cldr |locale|."""
+ return CldrLocale(self, locale, self.iso639, self.cardb)
+
+
+class Locale(locales.Locale):
+ """An updated locale datafile."""
+
+ def update_cldr(self, cldr):
+ """Merge CLDR updates in to this locale."""
+ cldr_locale = cldr.locale(self.locale)
+ if cldr_locale is None:
+ logging.warning('%s: no CLDR entry found for %s',
+ self.name, self.locale)
+ return
+
+ # Start updating the actual data.
+ cldr_values = {
+ 'generator': os.path.basename(__file__),
+ 'english_territory_name': cldr_locale.en_territory,
+ 'source_name': 'Unicode Common Locale Data Repository (CLDR)',
+ 'source_version': cldr.version,
+ 'source_uri': cldr.uri.replace('/', '//'),
+ 'source_date': cldr.date.strftime('%Y-%m-%d'),
+ 'lang': cldr_locale.lang,
+ 'territory': cldr_locale.territory,
+ 'locale': cldr_locale.locale,
+ }
+ if cldr_locale.en_lang:
+ cldr_values.update({
+ 'english_lang_name': u_decode(cldr_locale.en_lang),
+ })
+
+ all_values = {}
+ all_values['LC_IDENTIFICATION'] = {
+ #'source': 'Based on %(source_name)s',
+ #'address': '%(source_uri)s',
+ #'contact': 'http:////cldr.unicode.org//index//process',
+ #'email': 'bug-glibc-locales@gnu.org',
+ 'tel': '',
+ 'fax': '',
+ 'territory': '%(english_territory_name)s',
+ #'revision': '%(source_version)s',
+ #'date': '%(source_date)s',
+ }
+ if cldr_locale.en_lang:
+ all_values['LC_IDENTIFICATION'].update({
+ 'title': ('%(english_lang_name)s language locale for '
+ '%(english_territory_name)s'),
+ 'language': '%(english_lang_name)s',
+ })
+
+ # These are based on the charset, not the locale.
+ all_values['LC_CTYPE'] = {}
+ all_values['LC_COLLATE'] = {}
+ all_values['LC_TIME'] = {
+ #'abday': cldr_locale.abday,
+ #'day': cldr_locale.day,
+ #'abmon': cldr_locale.abmon,
+ #'mon': cldr_locale.mon,
+ #'am_pm': cldr_locale.am_pm,
+ #'d_t_fmt': cldr_locale.d_t_fmt,
+ #'d_fmt': cldr_locale.d_fmt,
+ #'t_fmt': cldr_locale.t_fmt,
+ #'t_fmt_ampm': cldr_locale.t_fmt_ampm,
+ #'date_fmt': cldr_locale.date_fmt,
+ #'week': cldr_locale.week,
+ #'first_weekday': cldr_locale.first_weekday,
+ #'first_workday': cldr_locale.first_workday,
+ }
+ all_values['LC_NUMERIC'] = {
+ #'decimal_point': cldr_locale.decimal_point,
+ #'thousands_sep': cldr_locale.thousands_sep,
+ #'grouping': cldr_locale.grouping,
+ }
+ all_values['LC_MONETARY'] = {
+ 'int_curr_symbol': cldr_locale.int_curr_symbol + ' ',
+ 'currency_symbol': cldr_locale.currency_symbol,
+ }
+ # See lang/posix/messages/{yes,no}str.
+ all_values['LC_MESSAGES'] = {
+ #'yesexpr': cldr_locale.yesexpr,
+ #'noexpr': cldr_locale.noexpr,
+ }
+ all_values['LC_PAPER'] = {
+ 'height': cldr_locale.paper_height,
+ 'width': cldr_locale.paper_width,
+ #'copy': cldr_locale.paper_copy,
+ }
+ # XXX: Need a data source for this.
+ all_values['LC_NAME'] = {
+ }
+ all_values['LC_ADDRESS'] = {
+ #'postal_fmt':
+ 'country_name': cldr_locale.country_name,
+ #'country_post':
+ 'country_ab2': cldr_locale.country_ab2,
+ 'country_ab3': cldr_locale.country_ab3,
+ 'country_num': cldr_locale.country_num,
+ 'country_car': cldr_locale.country_car,
+ #'country_isbn':
+ 'lang_name': cldr_locale.lang_name,
+ 'lang_ab': cldr_locale.unicode_language_subtag,
+ 'lang_term': cldr_locale.lang_term,
+ 'lang_lib': cldr_locale.lang_lib,
+ }
+ all_values['LC_TELEPHONE'] = {
+ #'tel_int_fmt': cldr_locale.tel_int_fmt,
+ #'tel_dom_fmt': cldr_locale.tel_dom_fmt,
+ #'int_select': cldr_locale.int_select,
+ }
+ if cldr_locale.int_prefix:
+ all_values['LC_TELEPHONE'].update({
+ 'int_prefix': cldr_locale.int_prefix,
+ })
+ all_values['LC_MEASUREMENT'] = {
+ 'measurement': cldr_locale.measurement,
+ #'copy': cldr_locale.measurement_copy,
+ }
+
+ a = str(cldr_locale.lang)
+ b = self.lc_address.fields['lang_ab']
+ if b and a != b:
+ print('%s: mismatch: %s %s' % (self.name, a, b))
+
+ # Walk all the categories.
+ for category in self.categories:
+ lc = getattr(self, category.lower())
+ values = all_values[category]
+ if not values:
+ continue
+
+ # Walk each line in this locale category.
+ start_of_line = None
+ full_line = ''
+ i = 0
+ seen_keys = set()
+ while i < len(lc.content):
+ line = lc.content[i]
+ if not line:
+ i += 1
+ continue
+
+ # If the line ends with an escape it is wrapped, so unwrap it
+ # before we check for updates to the value.
+ if (not line.startswith(self.comment_char) and
+ line.endswith(self.escape_char)):
+ if not full_line:
+ start_of_line = i
+ full_line += line[:-1].lstrip()
+ i += 1
+ continue
+ elif full_line:
+ line = full_line + line.lstrip()
+ full_line = ''
+ else:
+ start_of_line = None
+
+ # Process this line.
+ key = line.split()[0]
+ new_value = values.get(key)
+ seen_keys.add(key)
+ if new_value is not None:
+ is_int = isinstance(new_value, int)
+ is_list = isinstance(new_value, (tuple, list, set))
+ if not is_int and is_list:
+ is_int = isinstance(new_value[0], int)
+ if is_int:
+ if is_list:
+ new_value = ';'.join(str(x) for x in new_value)
+ else:
+ new_value = str(new_value)
+ m = re.match(r'\s*(.*?)\s+([0-9;]+)$', line)
+ else:
+ if is_list:
+ new_value = '";"'.join(u_encode(x % cldr_values)
+ for x in new_value)
+ elif key != 'copy':
+ new_value %= cldr_values
+ if category != 'LC_IDENTIFICATION':
+ new_value = u_encode(new_value)
+ m = re.match(r'\s*([^"]*)"(.*)"$', line)
+
+ # We should standardize case at some point.
+ if m and new_value.lower() != m.group(2).lower():
+ disp_key = ('%s:%s' % (category.upper(), key)
+ if key == 'copy' else key)
+ logging.info('%s: %s: changing {%s} to {%s}',
+ self.name, disp_key,
+ u_decode(m.group(2)),
+ u_decode(new_value))
+ leading_line = m.group(1)
+
+ # This is tricky as we have to delete most of the
+ # multiline, then update the one remaining.
+ if start_of_line is not None:
+ #for _ in range(start_of_line, i):
+ # lc.content.pop(start_of_line)
+ del lc.content[start_of_line:i]
+ i = start_of_line
+ if '";"' in new_value:
+ leading_line = leading_line.rstrip() + '\t'
+ num_tabs = (len(leading_line) // 8) + 1
+ new_value = new_value.replace(
+ '";"',
+ '";/\n' + ('\t' * num_tabs) + '"')
+
+ # Finally deploy the updated line.
+ fmt = '%s %s' if is_int else '%s"%s"'
+ lc.content[i] = fmt % (leading_line, new_value)
+
+ i += 1
+
+ missing_keys = set(values.keys()) - seen_keys
+ for key in missing_keys:
+ # TODO: Merge with the logic above.
+ new_value = str(values[key])
+ old_value = str(lc.fields.get(key))
+ if new_value is not None and new_value != old_value:
+ logging.info('%s: %s: changing {%s} to {%s}',
+ self.name, key, old_value, new_value)
+ lc.content.append('%s "%s"' % (key, new_value))
+
+
+def main(argv):
+ """The main entry point."""
+ parser = get_parser()
+ opts = parser.parse_args(argv)
+ logging_init(opts)
+
+ # Get a handle to the cldr database.
+ cldr = Cldr(opts.working_dir, opts.version)
+ cldr.download()
+
+ # These are not "real" locales, so skip them.
+ SKIP_LOCALES = ('i18n', 'iso14651', 'translit', 'C', 'POSIX')
+
+ # Process all the locales the user told us to.
+ for locale in opts.locales:
+ name = os.path.basename(locale)
+ if name.split('_', 1)[0] in SKIP_LOCALES:
+ continue
+
+ logging.info('Updating %s', locale)
+ try:
+ loc = Locale(name=name, path=locale)
+ try:
+ loc.update_cldr(cldr)
+ except Exception:
+ logging.error('%s: updating failed', locale, exc_info=True)
+ loc.write(locale + '.new')
+ os.rename(locale + '.new', locale)
+ except UnicodeDecodeError:
+ logging.error('%s: bad encodings', locale, exc_info=True)
+ subprocess.check_call(['file', locale])
+ except (IndexError, locales.LocaleError):
+ logging.error('%s: loading failed', locale, exc_info=True)
+
+
+if __name__ == '__main__':
+ exit(main(sys.argv[1:]))
@@ -0,0 +1,69 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# Written by Mike Frysinger <vapier@gentoo.org> for much great glory.
+#
+# Copyright (C) 2016 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+"""Simple script to quickly make locale files readable.
+
+Example: %(prog)s locales/en_US | less
+"""
+
+from __future__ import print_function
+
+import argparse
+import sys
+
+import locales
+
+
+def process(_opts, fp):
+ for line in fp:
+ try:
+ line = locales.u_decode(line)
+ except ValueError as e:
+ # Python's chr() does not support the full UTF-8 codepoint
+ # range. Just use the line as-is if it fails.
+ print('FILTER ERROR: %s' % e)
+ print(line, end='')
+
+
+def process_path(opts, path):
+ return process(opts, open(path))
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ description=__doc__,
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+ parser.add_argument('files', nargs='*')
+ return parser
+
+
+def main(argv):
+ parser = get_parser()
+ opts = parser.parse_args(argv)
+ if not opts.files:
+ process(opts, sys.stdin)
+ else:
+ for f in opts.files:
+ process_path(opts, f)
+
+
+if __name__ == '__main__':
+ exit(main(sys.argv[1:]))