Back to index

plone3  3.1.7
__init__.py
Go to the documentation of this file.
00001 import re
00002 
00003 from plone.i18n.normalizer.base import baseNormalize
00004 from plone.i18n.normalizer.interfaces import IIDNormalizer
00005 from plone.i18n.normalizer.interfaces import IFileNameNormalizer
00006 from plone.i18n.normalizer.interfaces import IURLNormalizer
00007 
00008 from zope.component import queryUtility
00009 from zope.interface import implements
00010 
# Define and compile static regexes

# Splits "name.ext" at the last dot into a base name and an extension of
# zero to four word characters.
FILENAME_REGEX = re.compile(r"^(.+)\.(\w{,4})$")
# Single and double quotes; the normalizers below delete these outright.
IGNORE_REGEX = re.compile(r"['\"]")
# Runs of non-word characters and dashes; IDNormalizer turns each run into
# a single dash.
NON_WORD_REGEX = re.compile(r"[\W\-]+")
# Runs of punctuation that FileNameNormalizer replaces with a dash.
DANGEROUS_CHARS_REGEX = re.compile(r"[!$%&()*+,/:;<=>?@\\^{|}\[\]~`]+")
# Same character set as above plus '#'; used by URLNormalizer.
URL_DANGEROUS_CHARS_REGEX = re.compile(r"[!#$%&()*+,/:;<=>?@\\^{|}\[\]~`]+")
# One or more consecutive dashes, collapsed to one dash by the normalizers.
MULTIPLE_DASHES_REGEX = re.compile(r"\-+")
# Dashes at the very start or very end of the string, which get stripped.
EXTRA_DASHES_REGEX = re.compile(r"(^\-+)|(\-+$)")
# Define static constraints: default maximum lengths used by the ID,
# file name and URL normalizers respectively.
MAX_LENGTH = 50
MAX_FILENAME_LENGTH = 1023
MAX_URL_LENGTH = 255


def cropName(base, maxLength=MAX_LENGTH):
    """
    Crop base so that it is at most maxLength characters long.

    A string that already fits is returned unchanged. Otherwise the string
    is cut at the last dash found at or before maxLength, so that no
    dash-separated word is split in the middle. If there is no such dash
    (or the only candidate is a leading dash, which would leave nothing),
    the string is hard-truncated to maxLength characters.

    A maxLength below zero is treated as zero, so the result is then the
    empty string.
    """
    # Clamp the limit; a negative value would otherwise be interpreted by
    # rfind and slicing as an offset from the end of the string.
    limit = max(maxLength, 0)
    if len(base) <= limit:
        return base

    # Last dash at an index <= limit: cutting there yields a result of at
    # most `limit` characters that ends on a word boundary.
    index = base.rfind('-', 0, limit + 1)
    if index > 0:
        return base[:index]

    # No usable dash (none at all, or only at index 0): plain truncation.
    return base[:limit]
00039 
00040 
class IDNormalizer(object):
    """
    This normalizer can normalize any unicode string and returns a
    version that only consists of ASCII characters allowed in a typical
    scripting or programming language id, such as CSS class names or Python
    variable names for example.

    Let's make sure that this implementation actually fulfills the API.

      >>> from zope.interface.verify import verifyClass
      >>> verifyClass(IIDNormalizer, IDNormalizer)
      True
    """
    implements(IIDNormalizer)

    def normalize(self, text, locale=None, max_length=MAX_LENGTH):
        """
        Returns a normalized text. text has to be a unicode string and locale
        should be a normal locale, for example: 'pt_BR', 'sr@Latn' or 'de'

        If a locale is given, a locale-specific IIDNormalizer utility is
        applied first when one is registered. The text is then passed
        through baseNormalize and lowercased, quotes are removed, runs of
        non-word characters become single dashes, and the base name is
        cropped to max_length. A trailing extension of up to four word
        characters is kept and re-appended after cropping, so it does not
        count towards max_length.
        """
        if locale is not None:
            # Try to get a normalizer for the locale
            util = queryUtility(IIDNormalizer, name=locale)
            parts = locale.split('_')
            if util is None and len(parts) > 1:
                # Try to get a normalizer for the base language if we asked
                # for one for a language/country combination and found none
                util = queryUtility(IIDNormalizer, name=parts[0])
            if util is not None:
                text = util.normalize(text, locale=locale)

        text = baseNormalize(text)

        # lowercase text
        base = text.lower()
        ext = ''

        # Preserve filename extensions. The match must run against the
        # lowercased string: matching the original text would discard the
        # lowercasing for every input that carries an extension.
        m = FILENAME_REGEX.match(base)
        if m is not None:
            base, ext = m.groups()

        # replace whitespace and punctuation in the base name only
        base = IGNORE_REGEX.sub('', base)
        base = NON_WORD_REGEX.sub('-', base)
        base = MULTIPLE_DASHES_REGEX.sub('-', base)
        base = EXTRA_DASHES_REGEX.sub('', base)

        base = cropName(base, maxLength=max_length)

        if ext != '':
            base = base + '.' + ext

        return base
00095 
00096 
class FileNameNormalizer(object):
    """
    Normalizer turning any unicode string into a version that only
    consists of ASCII characters allowed in a file name.

    Let's make sure that this implementation actually fulfills the API.

      >>> from zope.interface.verify import verifyClass
      >>> verifyClass(IFileNameNormalizer, FileNameNormalizer)
      True
    """
    implements(IFileNameNormalizer)

    def normalize(self, text, locale=None, max_length=MAX_FILENAME_LENGTH):
        """
        Return the normalized form of text.

        text has to be a unicode string and locale should be a normal
        locale, for example: 'pt_BR', 'sr@Latn' or 'de'. The case of the
        text is left as it is; a trailing extension is kept and does not
        count towards max_length.
        """
        if locale is not None:
            # Look for a normalizer registered under the full locale name
            # first, then fall back to the part before the underscore.
            localized = queryUtility(IFileNameNormalizer, name=locale)
            pieces = locale.split('_')
            if localized is None and len(pieces) > 1:
                localized = queryUtility(IFileNameNormalizer, name=pieces[0])
            if localized is not None:
                text = localized.normalize(text, locale=locale)

        stem = baseNormalize(text)
        suffix = ''

        # Split off the extension so that it survives the clean-up below.
        match = FILENAME_REGEX.match(stem)
        if match is not None:
            stem, suffix = match.groups()

        # Drop quotes, turn dangerous characters into dashes, strip dashes
        # from both ends and finally collapse repeated dashes.
        substitutions = (
            (IGNORE_REGEX, ''),
            (DANGEROUS_CHARS_REGEX, '-'),
            (EXTRA_DASHES_REGEX, ''),
            (MULTIPLE_DASHES_REGEX, '-'),
        )
        for pattern, replacement in substitutions:
            stem = pattern.sub(replacement, stem)

        stem = cropName(stem, maxLength=max_length)

        if suffix == '':
            return stem
        return stem + '.' + suffix
00146 
00147 
class URLNormalizer(object):
    """
    Normalizer turning any unicode string into a URL-safe version that
    only consists of ASCII characters allowed in a URL.

    Let's make sure that this implementation actually fulfills the API.

      >>> from zope.interface.verify import verifyClass
      >>> verifyClass(IURLNormalizer, URLNormalizer)
      True
    """
    implements(IURLNormalizer)

    def normalize(self, text, locale=None, max_length=MAX_URL_LENGTH):
        """
        Return the normalized form of text.

        text has to be a unicode string and locale should be a normal
        locale, for example: 'pt_BR', 'sr@Latn' or 'de'. The result is
        lowercased; a trailing extension is kept and does not count
        towards max_length.
        """
        if locale is not None:
            # Look for a normalizer registered under the full locale name
            # first, then fall back to the part before the underscore.
            localized = queryUtility(IURLNormalizer, name=locale)
            pieces = locale.split('_')
            if localized is None and len(pieces) > 1:
                localized = queryUtility(IURLNormalizer, name=pieces[0])
            if localized is not None:
                text = localized.normalize(text, locale=locale)

        # Work on the lowercased text from here on.
        stem = baseNormalize(text).lower()
        suffix = ''

        # Split off the extension so that it survives the clean-up below.
        match = FILENAME_REGEX.match(stem)
        if match is not None:
            stem, suffix = match.groups()

        # Spaces become dashes before any other substitution takes place.
        stem = stem.replace(' ', '-')

        # Drop quotes, turn dangerous characters into dashes, strip dashes
        # from both ends and finally collapse repeated dashes.
        substitutions = (
            (IGNORE_REGEX, ''),
            (URL_DANGEROUS_CHARS_REGEX, '-'),
            (EXTRA_DASHES_REGEX, ''),
            (MULTIPLE_DASHES_REGEX, '-'),
        )
        for pattern, replacement in substitutions:
            stem = pattern.sub(replacement, stem)

        stem = cropName(stem, maxLength=max_length)

        if suffix == '':
            return stem
        return stem + '.' + suffix
00200 
# Module-level instances of the three normalizers defined above, created
# once at import time.
idnormalizer = IDNormalizer()
filenamenormalizer = FileNameNormalizer()
urlnormalizer = URLNormalizer()