Back to index

plone3  3.1.7
nationalizer.py
Go to the documentation of this file.
00001 #!/usr/bin/python2.3
00002 
00003 """Return the Kupu .html file with i18n applied"""
00004 
00005 from xml.dom.minidom import parseString
00006 import os
00007 
# Field markers for .po parsing: ID marks a msgid line; STR is presumably
# its msgstr counterpart.
ID, STR = 0, 1

# XML namespace under which the i18n attributes (translate, attributes)
# are looked up on elements.
I18NNS = 'http://xml.zope.org/namespaces/i18n'
00012 
def ustr(i):
    """Return *i* as a unicode (text) string.

    Text strings, including instances of text-string subclasses, are
    returned unchanged.  Byte strings are decoded as UTF-8.  Any other
    object is first converted with str() and, if that yields a byte
    string, decoded as UTF-8.

    Raises UnicodeDecodeError when a byte string is not valid UTF-8.
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, so the
    # function does not depend on the Python-2-only `unicode` builtin.
    text_type = type(u'')
    # isinstance (rather than an exact type comparison) keeps subclasses of
    # the text type away from str(), which would fail on non-ASCII content.
    if isinstance(i, text_type):
        return i
    if not hasattr(i, 'decode'):
        # Not a string at all: stringify first.
        i = str(i)
        if isinstance(i, text_type):
            return i
    return i.decode('UTF-8')
00018 
def get_locale():
    """Return the language tags listed in HTTP_ACCEPT_LANGUAGE.

    The header is read from the process environment.  Each comma-separated
    entry contributes its language tag; parameters after a ';' (such as
    'q=0.8') are discarded.  Tags are returned in header order; they are
    not sorted by q-value.

    Returns a list of strings, or None when the variable is not set.
    """
    header = os.environ.get('HTTP_ACCEPT_LANGUAGE')
    if header is None:
        return None
    languages = []
    # Split on ',' first: splitting on ';' first would throw away every
    # language that follows the first q-value.
    for entry in header.split(','):
        tag = entry.split(';')[0].strip()
        if tag:
            languages.append(tag)
    return languages
00024 
class Nationalizer:
    """Translates string in an HTML or XML file using i18n: directives

    An instance is bound to one markup file and a list of preferred
    language tags.  translate() looks for a matching .po catalog, replaces
    the content of elements carrying i18n:translate and the values of
    attributes listed in i18n:attributes, removes those i18n attributes
    and returns the serialized document element.
    """

    # Elements that are always written with an explicit closing tag, even
    # when they have no children (see serialize()).
    not_single = ['a', 'abbr', 'acronym', 'address', 'applet',
                    'b', 'bdo', 'big', 'blink', 'blockquote',
                    'button', 'caption', 'center', 'cite',
                    'comment', 'del', 'dfn', 'dir', 'div',
                    'dl', 'dt', 'em', 'embed', 'fieldset',
                    'font', 'form', 'frameset', 'h1', 'h2',
                    'h3', 'h4', 'h5', 'h6', 'i', 'iframe',
                    'ins', 'kbd', 'label', 'legend', 'li',
                    'listing', 'map', 'marquee', 'menu',
                    'multicol', 'nobr', 'noembed', 'noframes',
                    'noscript', 'object', 'ol', 'optgroup',
                    'option', 'p', 'pre', 'q', 's', 'script',
                    'select', 'small', 'span', 'strike',
                    'strong', 'style', 'sub', 'sup', 'table',
                    'tbody', 'td', 'textarea', 'tfoot',
                    'th', 'thead', 'title', 'tr', 'tt', 'u',
                    'ul', 'xmp']

    def __init__(self, htmlfile, locale):
        """Remember the markup file path and the preferred language tags.

        htmlfile -- path of the HTML/XML file to translate
        locale   -- sequence of language tags such as ['nl-BE', 'en']
        """
        self.htmlfile = htmlfile
        self.locale = locale

    def translate(self):
        """load and translate everything

        Returns the serialized document element as a unicode string.
        """
        # Without a .po file the document is still parsed and processed
        # with an empty catalog, so the i18n: attributes get stripped.
        msgcat = {}
        popath = self.get_po_file_path(self.locale)
        if popath is not None:
            # Binary mode: lines are decoded as UTF-8 by parse_po_file,
            # independent of the platform's default encoding.
            pofp = open(popath, 'rb')
            try:
                msgcat = self.parse_po_file(pofp)
            finally:
                pofp.close()
        # Binary mode lets the XML parser honour the file's own encoding
        # declaration.
        xmlfp = open(self.htmlfile, 'rb')
        try:
            xml = xmlfp.read()
        finally:
            xmlfp.close()
        dom = parseString(xml)
        self.apply_i18n(dom, msgcat)
        return self.serialize(dom.documentElement)

    def parse_po_file(self, pofp):
        """parse the .po file, create a mapping msgid->msgstr

        pofp is an open file-like object providing readlines().  Byte
        lines are decoded as UTF-8.  Continuation lines (a bare quoted
        string following a msgid or msgstr line) are appended to the
        preceding field, and the escapes \\n, \\t, \\r, \\" and \\\\ are
        decoded.  Entries with an empty msgid (the catalog header) or an
        empty msgstr are left out.  msgctxt, msgid_plural and msgstr[n]
        lines are not supported and are skipped.

        Returns a dict mapping unicode msgid to unicode msgstr.
        """
        cat = {}
        msgid = None
        msgstr = None
        field = None    # field that continuation lines currently extend
        for line in pofp.readlines():
            line = self._decode(line).strip()
            if not line or line.startswith(u'#'):
                continue
            keyword, value = self._split_po_line(line)
            if keyword == u'msgid':
                if msgid and msgstr:
                    cat[msgid] = msgstr
                msgid = value
                # forget the previous translation, otherwise an entry
                # without a msgstr line would inherit it
                msgstr = None
                field = u'msgid'
            elif keyword == u'msgstr':
                msgstr = value
                field = u'msgstr'
            elif keyword is None and value is not None:
                # continuation of a multi-line msgid or msgstr
                if field == u'msgid' and msgid is not None:
                    msgid = msgid + value
                elif field == u'msgstr' and msgstr is not None:
                    msgstr = msgstr + value
            else:
                # unsupported keyword or malformed line; make sure its
                # continuation lines are not glued onto another field
                field = None
        if msgid and msgstr:
            cat[msgid] = msgstr
        return cat

    def _split_po_line(self, line):
        """Split a stripped .po line into (keyword, unquoted string).

        keyword is None for a continuation line (one starting with a
        double quote).  The string is None when the line has no properly
        double-quoted string part.
        """
        if line.startswith(u'"'):
            keyword = None
            quoted = line
        else:
            parts = line.split(None, 1)
            if len(parts) != 2:
                return parts[0], None
            keyword = parts[0]
            quoted = parts[1].strip()
        if (len(quoted) < 2 or not quoted.startswith(u'"')
                or not quoted.endswith(u'"')):
            return keyword, None
        return keyword, self._unescape(quoted[1:-1])

    def _unescape(self, text):
        """Decode the backslash escapes of a .po string body."""
        if u'\\' not in text:
            return text
        escapes = {u'n': u'\n', u't': u'\t', u'r': u'\r',
                   u'"': u'"', u'\\': u'\\'}
        out = []
        pos = 0
        end = len(text)
        while pos < end:
            char = text[pos]
            if char == u'\\' and pos + 1 < end:
                following = text[pos + 1]
                # unknown escapes are kept verbatim
                out.append(escapes.get(following, u'\\' + following))
                pos = pos + 2
            else:
                out.append(char)
                pos = pos + 1
        return u''.join(out)

    def _decode(self, value):
        """Return value as unicode, decoding byte strings as UTF-8."""
        if isinstance(value, type(u'')):
            return value
        return value.decode('UTF-8')

    def apply_i18n(self, dom, msgcat):
        """apply nationalization of the full dom

        Visits every element returned by getElementsByTagName('*') on the
        document element and processes its i18n:translate and
        i18n:attributes attributes.
        """
        nodes = dom.documentElement.getElementsByTagName('*')
        for node in nodes:
            if node.hasAttributeNS(I18NNS, 'translate'):
                self.apply_translate(node, msgcat)
            if node.hasAttributeNS(I18NNS, 'attributes'):
                self.apply_attributes(node, msgcat)

    def apply_translate(self, node, msgcat):
        """handle Zope-style i18n:translate

        The msgid is the attribute value or, when that is empty, the
        whitespace-reduced text content of the node.  The node's children
        are replaced by a single text node holding the translation, or
        the msgid itself when the catalog has no entry for it.

        Raises TypeError when the msgid has to be taken from the content
        and the node has a child that is not a text node.
        """
        msgid = node.getAttributeNS(I18NNS, 'translate').strip()
        if not msgid:
            # no msgid in the attribute, use the node value
            buf = []
            for child in node.childNodes:
                if child.nodeType != child.TEXT_NODE:
                    raise TypeError(
                        'illegal element %s in i18n:translate element' %
                        child.nodeName)
                buf.append(child.nodeValue)
            msgid = self.reduce_whitespace(u''.join(buf).strip())
        # catalogs handed in by callers may still hold byte strings
        msgstr = self._decode(msgcat.get(msgid, msgid))
        # now replace the contents of the node with the new contents
        while node.hasChildNodes():
            node.removeChild(node.firstChild)
        node.removeAttributeNS(I18NNS, 'translate')
        node.appendChild(node.ownerDocument.createTextNode(msgstr))

    def apply_attributes(self, node, msgcat):
        """handle Zope-style i18n:attributes

        The attribute value is read as a whitespace-separated list of
        attribute names.  Each named attribute whose current value is a
        msgid in the catalog gets the translation as its new value.
        """
        attrnames = node.getAttributeNS(I18NNS, 'attributes').split()
        for attr in attrnames:
            value = node.getAttribute(attr)
            if value and value in msgcat:
                node.setAttribute(attr, self._decode(msgcat[value]))
        node.removeAttributeNS(I18NNS, 'attributes')

    def reduce_whitespace(self, string):
        """Turn newlines, tabs and carriage returns into spaces and
        collapse every run of spaces into a single space."""
        for char in ['\n', '\t', '\r']:
            string = string.replace(char, ' ')
        while '  ' in string:
            string = string.replace('  ', ' ')
        return string

    def get_po_file_path(self, locale):
        """Return the path of the first existing .po file for locale.

        locale is a sequence of language tags in order of preference;
        None is treated as an empty sequence.  For a tag 'xx-YY' the
        candidates are kupu-xx-YY.po, kupu-xx-default.po and kupu-xx.po
        below '../i18n', which is resolved against the current working
        directory.

        Returns the path as a string, or None when nothing matches.
        """
        startdir = '../i18n'
        for language in locale or []:
            # tags may come from a request header; never let one reach
            # outside the catalog directory
            if '/' in language or '\\' in language:
                continue
            parts = language.split('-')
            pathstart = '%s/kupu-%s' % (startdir, parts[0])
            paths = []
            if len(parts) == 2:
                paths.append('%s-%s.po' % (pathstart, parts[1]))
            paths.append('%s-default.po' % pathstart)
            paths.append('%s.po' % pathstart)
            for path in paths:
                if os.path.isfile(path):
                    return path
        return None

    def serialize(self, el):
        """Return the markup for el and its descendants as unicode.

        Only element and text nodes are written; every other node type is
        skipped.  Childless elements are written as '<name />' unless
        their name is listed in not_single.  Attribute values are passed
        through entitize().
        """
        buf = []
        if el.nodeType == el.ELEMENT_NODE:
            buf.append('<%s' % el.nodeName)
            for attr, value in el.attributes.items():
                if value is not None:
                    buf.append(' %s="%s"' % (attr, self.entitize(value)))
            if el.hasChildNodes() or el.nodeName in self.not_single:
                buf.append('>')
                for child in el.childNodes:
                    # append the child's markup as one piece
                    buf.append(self.serialize(child))
                buf.append('</%s>' % el.nodeName)
            else:
                buf.append(' />')
        elif el.nodeType == el.TEXT_NODE:
            # NOTE(review): text is written without entity escaping, so
            # '&' and '<' in text or in translations reach the output
            # raw.  Left as is because existing catalogs may rely on it.
            buf.append(el.nodeValue)
        return u''.join([self._decode(piece) for piece in buf])

    def entitize(self, string):
        """Escape &, <, > and the double quote as XML entities."""
        string = string.replace('&', '&amp;')
        string = string.replace('<', '&lt;')
        string = string.replace('>', '&gt;')
        string = string.replace('"', '&quot;')
        return string
00189         
if __name__ == '__main__':
    # test code: translate the stock Kupu page to Dutch and write the
    # UTF-8 encoded result to stdout
    import sys
    # the paths below are relative to this file's directory
    os.chdir(os.path.abspath(os.path.dirname(__file__)))
    i = Nationalizer('../common/kupu.html', ['nl'])
    # Write the encoded bytes directly instead of using the print
    # statement, which is a syntax error on Python 3.  sys.stdout.buffer
    # is the byte stream on Python 3; Python 2's sys.stdout takes byte
    # strings itself.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    out.write(i.translate().encode('UTF-8') + '\n'.encode('ascii'))