Back to index

plone3  3.1.7
base.py
Go to the documentation of this file.
00001 import string
00002 from unicodedata import normalize, decomposition
00003 
# Transliteration table for Latin characters with accents etc., keyed by
# character ordinal.  Capitalised source characters keep a capitalised
# replacement (cf. 197 'Aa' vs. 229 'aa', 208 'Th' vs. 240 'th').
# NOTE(review): the 138-159 keys look like cp1252 positions (S/O/Z with
# caron, Y with diaeresis) -- confirm against the callers' encoding.
mapping = {
    138: 's', 140: 'O', 142: 'z', 154: 's', 156: 'o', 158: 'z', 159: 'Y',
    # 195 is a capital A with tilde: map it to 'A' like its 192-196
    # neighbours (previously lowercase 'a')
    192: 'A', 193: 'A', 194: 'A', 195: 'A', 196: 'A', 197: 'Aa',
    # 198 is the capital AE ligature: transliterate consistently with its
    # lowercase counterpart 230 ('ae'), not as a bare 'E'
    198: 'Ae',
    199: 'C', 200: 'E', 201: 'E', 202: 'E', 203: 'E', 204: 'I', 205: 'I',
    206: 'I', 207: 'I', 208: 'Th', 209: 'N', 210: 'O', 211: 'O', 212: 'O',
    213: 'O', 214: 'O', 215: 'x', 216: 'O', 217: 'U', 218: 'U', 219: 'U',
    # 222/254 are capital/small thorn; the previous values were swapped
    # (capital mapped to 'th' and small to 'Th')
    220: 'U', 221: 'Y', 222: 'Th', 223: 's',
    224: 'a', 225: 'a', 226: 'a', 227: 'a', 228: 'ae', 229: 'aa',
    230: 'ae', 231: 'c', 232: 'e', 233: 'e', 234: 'e', 235: 'e',
    236: 'i', 237: 'i', 238: 'i', 239: 'i', 240: 'th', 241: 'n',
    242: 'o', 243: 'o', 244: 'o', 245: 'o', 246: 'oe', 248: 'oe',
    249: 'u', 250: 'u', 251: 'u', 252: 'u', 253: 'y', 254: 'th',
    255: 'y'}
00016 
# string.whitespace is not plain ASCII on OpenBSD, so keep only the
# 7-bit characters.
# See http://dev.plone.org/plone/ticket/4704 for details
whitespace = ''.join(ch for ch in string.whitespace if ord(ch) < 128)
# Characters that baseNormalize passes through unchanged.
allowed = string.ascii_letters + string.digits + string.punctuation + whitespace
00021 
def mapUnicode(text, mapping={}):
    """
    Replace characters of ``text`` via the given ordinal->string
    ``mapping`` before baseNormalize is applied.

    Characters without an entry in ``mapping`` are left untouched.
    The empty-dict default is never mutated, so the usual mutable-default
    pitfall does not apply; it deliberately shadows the module-level
    ``mapping`` inside this function (no custom replacements by default).
    """
    # Single dict lookup with the character itself as the fallback,
    # replacing the removed-in-Python-3 has_key()/get() double lookup;
    # join once instead of quadratic unicode +=.
    res = u''.join(mapping.get(ord(ch), ch) for ch in text)
    # always apply base normalization
    return baseNormalize(res)
00038 
def baseNormalize(text):
    """
    Normalize unicode characters down to the base ASCII letters.

    Output is an ASCII encoded string (or char) with only ASCII letters,
    digits, punctuation and whitespace characters.  Case is preserved.

      >>> baseNormalize(123)
      '123'

      >>> baseNormalize(u'\u0fff')
      'fff'

      >>> baseNormalize(u"foo\N{LATIN CAPITAL LETTER I WITH CARON}")
      'fooI'
    """
    if not isinstance(text, basestring):
        # This most surely ends up in something the user does not expect
        # to see.  But at least it does not break.
        return repr(text)

    text = text.strip()

    # Collect fragments in a list and join once at the end: repeated
    # unicode += in a per-character loop is quadratic.
    res = []
    for ch in text:
        if ch in allowed:
            # ASCII chars, digits etc. stay untouched
            res.append(ch)
            continue
        ordinal = ord(ch)
        # single lookup instead of the has_key() + get() double lookup
        # (has_key was removed in Python 3)
        mapped = mapping.get(ordinal)
        if mapped is not None:
            # custom transliteration from the module-level table
            res.append(mapped)
        elif decomposition(ch):
            # decomposable char: NFKD splits off the combining marks;
            # keep only the allowed (ASCII) part, which may be more
            # than one char
            normalized = normalize('NFKD', ch).strip()
            res.append(''.join([c for c in normalized if c in allowed]))
        else:
            # hex string instead of unknown char
            res.append("%x" % ordinal)

    return u''.join(res).encode('ascii')