Back to index

python3.2  3.2.2
gettext.py
Go to the documentation of this file.
00001 """Internationalization and localization support.
00002 
00003 This module provides internationalization (I18N) and localization (L10N)
00004 support for your Python programs by providing an interface to the GNU gettext
00005 message catalog library.
00006 
00007 I18N refers to the operation by which a program is made aware of multiple
00008 languages.  L10N refers to the adaptation of your program, once
00009 internationalized, to the local language and cultural habits.
00010 
00011 """
00012 
00013 # This module represents the integration of work, contributions, feedback, and
00014 # suggestions from the following people:
00015 #
00016 # Martin von Loewis, who wrote the initial implementation of the underlying
00017 # C-based libintlmodule (later renamed _gettext), along with a skeletal
00018 # gettext.py implementation.
00019 #
00020 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
00021 # which also included a pure-Python implementation to read .mo files if
00022 # intlmodule wasn't available.
00023 #
00024 # James Henstridge, who also wrote a gettext.py module, which has some
00025 # interesting, but currently unsupported experimental features: the notion of
00026 # a Catalog class and instances, and the ability to add to a catalog file via
00027 # a Python API.
00028 #
00029 # Barry Warsaw integrated these modules, wrote the .install() API and code,
00030 # and conformed all C and Python code to Python's coding standards.
00031 #
00032 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
00033 # module.
00034 #
00035 # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
00036 #
00037 # TODO:
00038 # - Lazy loading of .mo files.  Currently the entire catalog is loaded into
00039 #   memory, but that's probably bad for large translated programs.  Instead,
00040 #   the lexical sort of original strings in GNU .mo files should be exploited
00041 #   to do binary searches and lazy initializations.  Or you might want to use
00042 #   the undocumented double-hash algorithm for .mo files with hash tables, but
00043 #   you'll need to study the GNU gettext code to do this.
00044 #
00045 # - Support Solaris .mo file formats.  Unfortunately, we've been unable to
00046 #   find this format documented anywhere.
00047 
00048 
00049 import locale, copy, io, os, re, struct, sys
00050 from errno import ENOENT
00051 
00052 
# Names exported by ``from gettext import *``.  The locale-encoded variants
# (lgettext, lngettext, ldgettext, ldngettext) and bind_textdomain_codeset
# are public functions defined in this module, so they are listed alongside
# their str-returning counterparts.
__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
           'bind_textdomain_codeset',
           'dgettext', 'dngettext', 'gettext', 'ngettext',
           'ldgettext', 'ldngettext', 'lgettext', 'lngettext',
           ]

# Directory searched for catalogs when no localedir is given or bound.
_default_localedir = os.path.join(sys.prefix, 'share', 'locale')
00059 
00060 
def c2py(plural):
    """Gets a C expression as used in PO files for plural forms and returns a
    Python lambda function that implements an equivalent expression.

    *plural* is the text of the ``plural=`` clause of a Plural-Forms header,
    e.g. ``"n != 1"``.  The returned callable takes the count ``n`` and
    returns the plural-form index as an int.

    Raises ValueError if the expression uses any identifier other than ``n``
    or cannot be tokenized (for instance because of unbalanced parentheses).
    """
    # Security check, allow only the "n" identifier
    import token, tokenize
    tokens = tokenize.generate_tokens(io.StringIO(plural).readline)
    try:
        danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
    except tokenize.TokenError:
        raise ValueError('plural forms expression error, maybe unbalanced parenthesis')
    else:
        if danger:
            raise ValueError('plural forms expression could be dangerous')

    # Replace some C operators by their Python equivalents
    plural = plural.replace('&&', ' and ')
    plural = plural.replace('||', ' or ')

    # "!x" becomes " not x"; "!=" is left alone.
    expr = re.compile(r'\!([^=])')
    plural = expr.sub(' not \\1', plural)

    # C "/" on integers is integer division, whereas Python 3 "/" is true
    # division (3/2 == 1.5).  Map every single "/" to "//" so comparisons
    # such as "n/10 == 1" behave as in C for the non-negative counts plural
    # expressions are evaluated with.
    plural = re.sub(r'(?<!/)/(?!/)', '//', plural)

    # Regular expression and replacement function used to transform
    # "a?b:c" to "b if a else c".
    expr = re.compile(r'(.*?)\?(.*?):(.*)')
    def repl(x):
        # The else-branch may itself contain a ternary, so recurse on it.
        return "(%s if %s else %s)" % (x.group(2), x.group(1),
                                       expr.sub(repl, x.group(3)))

    # Code to transform the plural expression, taking care of parentheses:
    # each "(" opens a new buffer, each ")" rewrites the finished buffer and
    # appends it (re-parenthesized) to the enclosing one.
    stack = ['']
    for c in plural:
        if c == '(':
            stack.append('')
        elif c == ')':
            if len(stack) == 1:
                # Actually, we never reach this code, because unbalanced
                # parentheses get caught in the security check at the
                # beginning.
                raise ValueError('unbalanced parenthesis in plural form')
            s = expr.sub(repl, stack.pop())
            stack[-1] += '(%s)' % s
        else:
            stack[-1] += c
    plural = expr.sub(repl, stack.pop())

    # NOTE(security): the expression text originates from the catalog file
    # and is evaluated with eval().  The token check above restricts
    # identifiers to "n" but does not bound the cost of the expression.
    return eval('lambda n: int(%s)' % plural)
00108 
00109 
00110 
00111 def _expand_lang(loc):
00112     loc = locale.normalize(loc)
00113     COMPONENT_CODESET   = 1 << 0
00114     COMPONENT_TERRITORY = 1 << 1
00115     COMPONENT_MODIFIER  = 1 << 2
00116     # split up the locale into its base components
00117     mask = 0
00118     pos = loc.find('@')
00119     if pos >= 0:
00120         modifier = loc[pos:]
00121         loc = loc[:pos]
00122         mask |= COMPONENT_MODIFIER
00123     else:
00124         modifier = ''
00125     pos = loc.find('.')
00126     if pos >= 0:
00127         codeset = loc[pos:]
00128         loc = loc[:pos]
00129         mask |= COMPONENT_CODESET
00130     else:
00131         codeset = ''
00132     pos = loc.find('_')
00133     if pos >= 0:
00134         territory = loc[pos:]
00135         loc = loc[:pos]
00136         mask |= COMPONENT_TERRITORY
00137     else:
00138         territory = ''
00139     language = loc
00140     ret = []
00141     for i in range(mask+1):
00142         if not (i & ~mask):  # if all components for this combo exist ...
00143             val = language
00144             if i & COMPONENT_TERRITORY: val += territory
00145             if i & COMPONENT_CODESET:   val += codeset
00146             if i & COMPONENT_MODIFIER:  val += modifier
00147             ret.append(val)
00148     ret.reverse()
00149     return ret
00150 
00151 
00152 
class NullTranslations:
    """Base translations class that returns messages untranslated.

    Lookups are delegated to a fallback translations object when one has
    been attached with add_fallback(); otherwise the original message (or
    the singular/plural msgid selected by ``n == 1``) is returned.
    """

    def __init__(self, fp=None):
        """Initialise empty state and parse *fp* if one is given."""
        self._info = {}
        self._charset = None
        self._output_charset = None
        self._fallback = None
        if fp is None:
            return
        self._parse(fp)

    def _parse(self, fp):
        """Hook for subclasses; the base class reads nothing from *fp*."""
        pass

    def add_fallback(self, fallback):
        """Append *fallback* to the end of the fallback chain."""
        if not self._fallback:
            self._fallback = fallback
        else:
            self._fallback.add_fallback(fallback)

    def gettext(self, message):
        """Return the fallback's translation, or *message* unchanged."""
        chained = self._fallback
        return chained.gettext(message) if chained else message

    def lgettext(self, message):
        """Like gettext(), but delegates to the fallback's lgettext()."""
        chained = self._fallback
        return chained.lgettext(message) if chained else message

    def ngettext(self, msgid1, msgid2, n):
        """Return the fallback's plural translation, else pick by ``n == 1``."""
        if self._fallback:
            return self._fallback.ngettext(msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def lngettext(self, msgid1, msgid2, n):
        """Like ngettext(), but delegates to the fallback's lngettext()."""
        if self._fallback:
            return self._fallback.lngettext(msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def info(self):
        """Return the catalog metadata dictionary."""
        return self._info

    def charset(self):
        """Return the catalog's character set, or None if unset."""
        return self._charset

    def output_charset(self):
        """Return the charset set by set_output_charset(), or None."""
        return self._output_charset

    def set_output_charset(self, charset):
        """Record *charset* as the output charset."""
        self._output_charset = charset

    def install(self, names=None):
        """Bind self.gettext to ``_`` in the builtins namespace.

        If *names* supports ``in``, each of "gettext", "ngettext",
        "lgettext" and "lngettext" found in it is installed as well.
        """
        import builtins
        namespace = builtins.__dict__
        namespace['_'] = self.gettext
        if not hasattr(names, "__contains__"):
            return
        if "gettext" in names:
            # Reuse the very object already installed as ``_``.
            namespace['gettext'] = namespace['_']
        for extra in ("ngettext", "lgettext", "lngettext"):
            if extra in names:
                namespace[extra] = getattr(self, extra)
00221 
00222 
class GNUTranslations(NullTranslations):
    """Translations read from a GNU gettext binary ``.mo`` catalog.

    The whole catalog is loaded into ``self._catalog``: singular entries are
    keyed by msgid, plural entries by ``(msgid1, plural_index)``.
    """

    # Magic number of .mo files
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    def _parse(self, fp):
        """Override this method to support alternative .mo formats.

        Reads all of *fp* (a binary file object), fills ``self._catalog``,
        and, from the metadata entry (the one with an empty msgid), fills
        ``self._info``, ``self._charset`` and ``self.plural``.

        Raises IOError if the magic number is unknown or an entry points
        outside the file.
        """
        unpack = struct.unpack
        filename = getattr(fp, 'name', '')
        # Parse the .mo file header, which consists of 5 32-bit words (the
        # magic number, whose byte order decides how the rest is read,
        # followed by version, message count and the two table offsets).
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1) # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # Are we big endian or little endian?
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
            ii = '<II'
        elif magic == self.BE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
            ii = '>II'
        else:
            raise IOError(0, 'Bad magic number', filename)
        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for _ in range(msgcount):
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise IOError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for b_item in tmsg.split(b'\n'):
                    item = b_item.decode().strip()
                    if not item:
                        continue
                    # Reset per line: only a line that itself defines a
                    # "key: value" pair may trigger the content-type /
                    # plural-forms handling below.  Without this, a
                    # continuation line would re-run that handling with the
                    # previous line's k and v (v is a list after
                    # plural-forms, so v.split() would raise).
                    k = v = None
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                    elif lastk:
                        # Continuation of the previous header field.
                        self._info[lastk] += '\n' + item
                    if k == 'content-type':
                        self._charset = v.split('charset=')[1]
                    elif k == 'plural-forms':
                        v = v.split(';')
                        plural = v[1].split('plural=')[1]
                        self.plural = c2py(plural)
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            charset = self._charset or 'ascii'
            if b'\x00' in msg:
                # Plural forms: msgid is "singular\0plural", msgstr holds
                # the NUL-separated translations for each plural index.
                msgid1, msgid2 = msg.split(b'\x00')
                tmsg = tmsg.split(b'\x00')
                msgid1 = str(msgid1, charset)
                for index, x in enumerate(tmsg):
                    catalog[(msgid1, index)] = str(x, charset)
            else:
                catalog[str(msg, charset)] = str(tmsg, charset)
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def lgettext(self, message):
        """Return the translation of *message* encoded as bytes.

        The output charset is used if set, otherwise
        locale.getpreferredencoding().  If the message is not in the
        catalog, the fallback's lgettext() result is returned, or *message*
        itself (unencoded) when there is no fallback.
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.lgettext(message)
            return message
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        return tmsg.encode(locale.getpreferredencoding())

    def lngettext(self, msgid1, msgid2, n):
        """Plural counterpart of lgettext().

        On a catalog miss the fallback's lngettext() result is returned, or
        *msgid1* / *msgid2* (unencoded) selected by ``n == 1``.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.lngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        return tmsg.encode(locale.getpreferredencoding())

    def gettext(self, message):
        """Return the catalog translation of *message* as a str.

        Falls back to the fallback's gettext(), then to *message* itself.
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.gettext(message)
            return message
        return tmsg

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural translation selected by ``self.plural(n)``.

        Falls back to the fallback's ngettext(), then to *msgid1* when
        ``n == 1`` and *msgid2* otherwise.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg
00350 
00351 
00352 # Locate a .mo file using the gettext strategy
def find(domain, localedir=None, languages=None, all=False):
    """Locate the .mo file(s) for *domain* using the gettext strategy.

    *localedir* defaults to the module's default locale directory.  When
    *languages* is None it is taken from the first non-empty variable among
    LANGUAGE, LC_ALL, LC_MESSAGES and LANG (colon separated), with 'C'
    appended if absent.  Each language is expanded with _expand_lang() and
    the candidates are tried in order, stopping at 'C'.

    Returns the first existing path, or None; if *all* is true, returns the
    list of every existing path instead.
    """
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        env_values = (os.environ.get(name)
                      for name in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'))
        chosen = next((value for value in env_values if value), None)
        languages = chosen.split(':') if chosen else []
        if 'C' not in languages:
            languages.append('C')

    # Normalize and expand the languages, dropping duplicates but keeping
    # the order of first appearance.
    candidates = []
    for language in languages:
        for variant in _expand_lang(language):
            if variant not in candidates:
                candidates.append(variant)

    found = []
    for candidate in candidates:
        if candidate == 'C':
            break
        mofile = os.path.join(localedir, candidate, 'LC_MESSAGES',
                              '%s.mo' % domain)
        if not os.path.exists(mofile):
            continue
        if not all:
            return mofile
        found.append(mofile)
    return found if all else None
00387 
00388 
00389 
# Cache of parsed catalogs, keyed by (translations class, absolute .mo path).
_translations = {}

def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False, codeset=None):
    """Return a translations object for *domain*.

    All .mo files located by find() are loaded with *class_* (default
    GNUTranslations); the first becomes the result and the others are
    chained onto it as fallbacks.  Parsed catalogs are cached, and each
    call works on shallow copies so that fallbacks and the output charset
    can be set without touching the cached objects.

    If no file is found, returns NullTranslations() when *fallback* is
    true, otherwise raises IOError with errno ENOENT.
    """
    if class_ is None:
        class_ = GNUTranslations
    paths = find(domain, localedir, languages, all=True)
    if not paths:
        if not fallback:
            raise IOError(ENOENT, 'No translation file found for domain', domain)
        return NullTranslations()

    head = None
    for path in paths:
        cache_key = (class_, os.path.abspath(path))
        cached = _translations.get(cache_key)
        if cached is None:
            # Open, read and parse the .mo file only the first time.
            with open(path, 'rb') as fp:
                cached = _translations.setdefault(cache_key, class_(fp))
        # Shallow copy: instance data stays shared with the cached object.
        clone = copy.copy(cached)
        if codeset:
            clone.set_output_charset(codeset)
        if head is None:
            head = clone
        else:
            head.add_fallback(clone)
    return head
00422 
00423 
def install(domain, localedir=None, codeset=None, names=None):
    """Install the translations for *domain* into the builtins namespace.

    Obtains the translations object with translation() (with fallback
    enabled) and calls its install() method with *names*.
    """
    catalog = translation(domain, localedir, fallback=True, codeset=codeset)
    catalog.install(names)
00427 
00428 
00429 
00430 # a mapping between domains and locale directories
00431 _localedirs = {}
00432 # a mapping between domains and codesets
00433 _localecodesets = {}
00434 # current global domain; 'messages' is used for compatibility with GNU gettext
00435 _current_domain = 'messages'
00436 
00437 
def textdomain(domain=None):
    """Return the current global domain, first setting it if *domain* is given."""
    global _current_domain
    if domain is None:
        return _current_domain
    _current_domain = domain
    return _current_domain
00443 
00444 
def bindtextdomain(domain, localedir=None):
    """Bind *domain* to *localedir* if given, and return the bound directory.

    Returns the default locale directory when *domain* has no binding.
    """
    global _localedirs
    if localedir is None:
        return _localedirs.get(domain, _default_localedir)
    _localedirs[domain] = localedir
    return _localedirs[domain]
00450 
00451 
def bind_textdomain_codeset(domain, codeset=None):
    """Bind *domain* to *codeset* if given, and return the bound codeset.

    Returns None when *domain* has no codeset binding.
    """
    global _localecodesets
    if codeset is None:
        return _localecodesets.get(domain)
    _localecodesets[domain] = codeset
    return _localecodesets[domain]
00457 
00458 
def dgettext(domain, message):
    """Translate *message* in *domain*; return it unchanged if no catalog loads."""
    try:
        catalog = translation(domain, _localedirs.get(domain, None),
                              codeset=_localecodesets.get(domain))
    except IOError:
        # translation() could not provide a catalog for this domain
        return message
    else:
        return catalog.gettext(message)
00466 
def ldgettext(domain, message):
    """Like dgettext(), but uses the catalog's lgettext() method."""
    try:
        catalog = translation(domain, _localedirs.get(domain, None),
                              codeset=_localecodesets.get(domain))
    except IOError:
        # translation() could not provide a catalog for this domain
        return message
    else:
        return catalog.lgettext(message)
00474 
def dngettext(domain, msgid1, msgid2, n):
    """Plural-aware translation in *domain*.

    If no catalog can be loaded, returns *msgid1* when ``n == 1`` and
    *msgid2* otherwise.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain, None),
                              codeset=_localecodesets.get(domain))
    except IOError:
        return msgid1 if n == 1 else msgid2
    else:
        return catalog.ngettext(msgid1, msgid2, n)
00485 
def ldngettext(domain, msgid1, msgid2, n):
    """Like dngettext(), but uses the catalog's lngettext() method.

    If no catalog can be loaded, returns *msgid1* when ``n == 1`` and
    *msgid2* otherwise.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain, None),
                              codeset=_localecodesets.get(domain))
    except IOError:
        return msgid1 if n == 1 else msgid2
    else:
        return catalog.lngettext(msgid1, msgid2, n)
00496 
def gettext(message):
    """Translate *message* in the current global domain (see dgettext())."""
    domain = _current_domain
    return dgettext(domain, message)
00499 
def lgettext(message):
    """Translate *message* in the current global domain (see ldgettext())."""
    domain = _current_domain
    return ldgettext(domain, message)
00502 
def ngettext(msgid1, msgid2, n):
    """Plural-aware translation in the current global domain (see dngettext())."""
    domain = _current_domain
    return dngettext(domain, msgid1, msgid2, n)
00505 
def lngettext(msgid1, msgid2, n):
    """Plural-aware translation in the current global domain (see ldngettext())."""
    domain = _current_domain
    return ldngettext(domain, msgid1, msgid2, n)
00508 
00509 # dcgettext() has been deemed unnecessary and is not implemented.
00510 
00511 # James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
00512 # was:
00513 #
00514 #    import gettext
00515 #    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
00516 #    _ = cat.gettext
00517 #    print _('Hello World')
00518 
00519 # The resulting catalog object currently doesn't support access through a
00520 # dictionary API, which was supported (but apparently unused) in GNOME
00521 # gettext.
00522 
00523 Catalog = translation