Back to index

plone3  3.1.7
normalize.py
Go to the documentation of this file.
00001 #!/usr/bin/python
00002 ##########################################################
00003 #
00004 # Licensed under the terms of the GNU Public License
00005 # (see docs/LICENSE.GPL)
00006 #
00007 # Copyright (c) 2005:
00008 #   - The Plone Foundation (http://plone.org/foundation/)
00009 #
00010 ##########################################################
00011 
00012 __authors__ = 'Anders Pearson <anders@columbia.edu>'
00013 __docformat__ = 'restructuredtext'
00014 
00015 import re
00016 
# Transliteration table used by normalizeISO(): maps the ordinal of a
# character below 256 to its ASCII replacement.  Ordinals that are absent
# (e.g. 215 and 247) are left unchanged by normalizeISO().
#
# NOTE(review): 138, 142, 197, 209 and 221 map to lower-case letters although
# their neighbours map to upper case.  Kept as-is because changing them could
# alter ids generated by existing callers -- confirm whether it is intended.
mapping = {
    # 128-159: presumably the Windows-1252 positions of S/OE/Z/Y variants
    # -- TODO confirm the intended source encoding.
    138: 's', 140: 'OE', 142: 'z', 154: 's', 156: 'oe', 158: 'z', 159: 'Y',
    # 192-221
    192: 'A', 193: 'A', 194: 'A', 195: 'A', 196: 'A', 197: 'a', 198: 'E',
    199: 'C', 200: 'E', 201: 'E', 202: 'E', 203: 'E', 204: 'I', 205: 'I',
    206: 'I', 207: 'I', 208: 'D', 209: 'n',
    # 210 and 213 were missing from the accented-O run and are added here.
    210: 'O', 211: 'O', 212: 'O', 213: 'O', 214: 'O', 216: 'O',
    217: 'U', 218: 'U', 219: 'U', 220: 'U', 221: 'y',
    # 223-255
    223: 'ss', 224: 'a', 225: 'a', 226: 'a', 227: 'a', 228: 'a', 229: 'a',
    230: 'e', 231: 'c', 232: 'e', 233: 'e', 234: 'e', 235: 'e', 236: 'i',
    237: 'i', 238: 'i', 239: 'i', 240: 'd', 241: 'n',
    # 242 and 245 were missing from the accented-o run and are added here.
    242: 'o', 243: 'o', 244: 'o', 245: 'o', 246: 'o', 248: 'o',
    249: 'u', 250: 'u', 251: 'u', 252: 'u', 253: 'y', 255: 'y',
}
00025 
00026 
def normalizeISO(text=""):
    """Return ``text`` with each character transliterated individually.

    A character whose ordinal is below 256 is replaced by its entry in
    ``mapping`` when one exists and is kept unchanged otherwise.  Any other
    character is replaced by the lower-case hexadecimal form of its ordinal
    (no prefix, no padding).  The pieces are joined without a separator.
    """
    def _convert(char):
        code = ord(char)
        if code >= 256:
            return "%x" % code
        return mapping.get(code, char)

    return "".join([_convert(char) for char in text])
00036 
00037 
# Matches a title made of a dot-free base, one dot, and an "extension" of at
# most four word characters (possibly empty).  Group 1 is the base, group 2
# the extension.
pattern1 = re.compile(r"^([^\.]+)\.(\w{,4})$")
# Matches each run of non-word characters and/or hyphens; every such run is
# collapsed into a single hyphen.  (The previous pattern text wrongly
# contained the literal characters r" and ", and its result was never used.)
pattern2 = re.compile(r"([\W\-]+)")
def titleToNormalizedId(title=""):
    """Turn ``title`` into a lower-case, hyphen-separated id.

    The title is lower-cased, stripped of surrounding whitespace and passed
    through ``normalizeISO``.  If the result matches ``pattern1`` the part
    after the dot is treated as an extension and kept apart.  In the base,
    every run matched by ``pattern2`` becomes one hyphen, and leading and
    trailing hyphens are removed.  A non-empty extension is appended again
    as ``"." + ext``.

    :param title: the text to normalize.
    :returns: the normalized id.
    """
    title = normalizeISO(title.lower().strip())

    base, ext = title, ""
    m = pattern1.match(title)
    if m:
        base, ext = m.groups()

    # strip("-") removes the same leading/trailing hyphen runs that the
    # former anchored substitutions removed.
    slug = pattern2.sub("-", base).strip("-")
    if ext:
        slug = slug + "." + ext
    return slug
00058 
00059 
# (title, expected id) pairs for the self-test below.  Characters with an
# ordinal of 256 or more are expected to appear as the concatenated
# lower-case hex of their ordinals, which is what normalizeISO() emits.
tests = [
    (u"This is a normal title.", "this-is-a-normal-title"),
    (u"Short sentence. Big thoughts.", "short-sentence-big-thoughts"),
    (u"Some298374NUMBER", "some298374number"),
    (u'Eksempel \xe6\xf8\xe5 norsk \xc6\xd8\xc5', u'eksempel-eoa-norsk-eoa'),
    (u'\u9ad8\u8054\u5408 Chinese', u'9ad880545408-chinese'),
    (u'\u30a2\u30ec\u30af\u30b5\u30f3\u30c0\u30fc\u3000\u30ea\u30df Japanese',
     u'30a230ec30af30b530f330c030fc300030ea30df-japanese'),
    (u'\uc774\ubbf8\uc9f1 Korean', u'c774bbf8c9f1-korean'),
    (u'\u0e2d\u0e40\u0e25\u0e47\u0e01\u0e0b\u0e32\u0e19\u0e40\u0e14\u0e2d\u0e23\u0e4c \u0e25\u0e35\u0e21 Thai',
     u'e2de40e25e47e01e0be32e19e40e14e2de23e4c-e25e35e21-thai'),
    (u'About folder.gif', u'about-folder.gif'),
]

if __name__ == "__main__":
    import sys

    # Print every generated id, flag each one that differs from the expected
    # value, and exit non-zero if any differed so the run can actually fail.
    failures = 0
    for original, correct in tests:
        sanitized = titleToNormalizedId(original)
        print(sanitized)
        if sanitized != correct:
            failures += 1
            print("  MISMATCH: expected %r" % (correct,))
    if failures:
        sys.exit(1)
00077 
00078