Back to index

plone3  3.1.7
parser.py
Go to the documentation of this file.
00001 # -*- coding: latin-1 -*-
00002 
00003 """
00004 This module parses and generates contentlines as defined in RFC 2445
00005 (iCalendar), but will probably work for other MIME types with similar syntax.
00006 Eg. RFC 2426 (vCard)
00007 
00008 It is stupid in the sense that it treats the content purely as strings. No type
00009 conversion is attempted.
00010 
00011 Copyright, 2005: Max M <maxm@mxm.dk>
00012 License: GPL (Just contact me if and why you would like it changed)
00013 """
00014 
00015 # from python
00016 from types import TupleType, ListType
00017 SequenceTypes = [TupleType, ListType]
00018 import re
00019 # from this package
00020 from icalendar.caselessdict import CaselessDict
00021 
00022 
00023 #################################################################
00024 # Property parameter stuff
00025 
def paramVal(val):
    """
    Returns a parameter value in its textual form.

    Tuples and lists (including subclasses of either) are handed to q_join();
    every other value is handed to dQuote().
    """
    # isinstance() rather than an exact type() comparison, so that list and
    # tuple subclasses are treated as sequences too.
    if isinstance(val, (tuple, list)):
        return q_join(val)
    return dQuote(val)
00031 
# Could be improved
# Raw strings hand the escapes (\w, \x00, \r, ...) to the regex engine
# untouched; the compiled patterns match exactly what they matched before.
NAME = re.compile(r'[\w-]+')
UNSAFE_CHAR = re.compile(r'[\x00-\x08\x0a-\x1f\x7F",:;]')
QUNSAFE_CHAR = re.compile(r'[\x00-\x08\x0a-\x1f\x7F"]')
FOLD = re.compile(r'([\r]?\n)+[ \t]{1}')

def validate_token(name):
    """
    Checks that name consists of exactly one run of NAME characters.

    Returns None when the name is valid and raises ValueError, carrying the
    offending name, when it is not (the empty string included).
    """
    match = NAME.findall(name)
    if len(match) == 1 and name == match[0]:
        return
    raise ValueError(name)

def validate_param_value(value, quoted=True):
    """
    Checks a parameter value for characters it may not contain.

    value  -- the parameter value, without any surrounding double quotes
    quoted -- True if the value was written inside double quotes; it is then
              checked against QUNSAFE_CHAR, which unlike UNSAFE_CHAR lets
              ',', ':' and ';' through

    Returns None when the value is clean, raises ValueError (carrying the
    value) otherwise.
    """
    validator = UNSAFE_CHAR
    if quoted:
        validator = QUNSAFE_CHAR
    if validator.search(value):
        raise ValueError(value)
00050 
# A value needs quoting as soon as it contains one of the separator
# characters, wherever that character sits -- including at the very end.
QUOTABLE = re.compile('[,;:]')
def dQuote(val):
    """
    Parameter values containing [,;:] must be double quoted
    >>> dQuote('Max')
    'Max'
    >>> dQuote('Rasmussen, Max')
    '"Rasmussen, Max"'
    >>> dQuote('name:value')
    '"name:value"'

    A separator in last position counts as well
    >>> dQuote('name:')
    '"name:"'
    """
    if QUOTABLE.search(val):
        return '"%s"' % val
    return val
00065 
# parsing helper
def q_split(st, sep=','):
    """
    Splits a string on sep, taking double (q)uotes into consideration
    >>> q_split('Max,Moller,"Rasmussen, Max"')
    ['Max', 'Moller', '"Rasmussen, Max"']

    The quotes themselves are kept in the returned pieces. A trailing
    separator produces a final empty piece, but an empty string gives an
    empty list
    >>> q_split('param=', '=')
    ['param', '']
    >>> q_split('')
    []
    """
    result = []
    if not st:
        # Parameters.from_string('') depends on getting [] (not ['']) here.
        return result
    cursor = 0
    inquote = False
    for i, ch in enumerate(st):
        if ch == '"':
            inquote = not inquote
        if not inquote and ch == sep:
            result.append(st[cursor:i])
            cursor = i + 1
    # Whatever follows the last separator (possibly '') is the final piece.
    result.append(st[cursor:])
    return result
00087 
def q_join(lst, sep=','):
    """
    Joins the items of lst with sep, passing every item through dQuote first
    >>> s = ['Max', 'Moller', 'Rasmussen, Max']
    >>> q_join(s)
    'Max,Moller,"Rasmussen, Max"'
    """
    quoted_items = map(dQuote, lst)
    return sep.join(quoted_items)
00096 
class Parameters(CaselessDict):
    """
    Parser and generator of Property parameter strings. It knows nothing of
    datatypes. Its main concern is textual structure.


    Simple parameter:value pair
    >>> p = Parameters(parameter1='Value1')
    >>> str(p)
    'PARAMETER1=Value1'


    keys are converted to upper
    >>> p.keys()
    ['PARAMETER1']


    Parameters are case insensitive
    >>> p['parameter1']
    'Value1'
    >>> p['PARAMETER1']
    'Value1'


    Parameter with list of values must be separated by comma
    >>> p = Parameters({'parameter1':['Value1', 'Value2']})
    >>> str(p)
    'PARAMETER1=Value1,Value2'


    Multiple parameters must be separated by a semicolon
    >>> p = Parameters({'RSVP':'TRUE', 'ROLE':'REQ-PARTICIPANT'})
    >>> str(p)
    'ROLE=REQ-PARTICIPANT;RSVP=TRUE'


    Parameter values containing ',;:' must be double quoted
    >>> p = Parameters({'ALTREP':'http://www.wiz.org'})
    >>> str(p)
    'ALTREP="http://www.wiz.org"'


    list items must be quoted separately
    >>> p = Parameters({'MEMBER':['MAILTO:projectA@host.com', 'MAILTO:projectB@host.com', ]})
    >>> str(p)
    'MEMBER="MAILTO:projectA@host.com","MAILTO:projectB@host.com"'

    Now the whole shebang
    >>> p = Parameters({'parameter1':'Value1', 'parameter2':['Value2', 'Value3'],\
                          'ALTREP':['http://www.wiz.org', 'value4']})
    >>> str(p)
    'ALTREP="http://www.wiz.org",value4;PARAMETER1=Value1;PARAMETER2=Value2,Value3'

    We can also parse parameter strings
    >>> Parameters.from_string('PARAMETER1=Value 1;param2=Value 2')
    Parameters({'PARAMETER1': 'Value 1', 'PARAM2': 'Value 2'})

    Including empty strings
    >>> Parameters.from_string('param=')
    Parameters({'PARAM': ''})

    We can also parse parameter strings
    >>> Parameters.from_string('MEMBER="MAILTO:projectA@host.com","MAILTO:projectB@host.com"')
    Parameters({'MEMBER': ['MAILTO:projectA@host.com', 'MAILTO:projectB@host.com']})

    We can also parse parameter strings
    >>> Parameters.from_string('ALTREP="http://www.wiz.org",value4;PARAMETER1=Value1;PARAMETER2=Value2,Value3')
    Parameters({'PARAMETER1': 'Value1', 'ALTREP': ['http://www.wiz.org', 'value4'], 'PARAMETER2': ['Value2', 'Value3']})
    """


    def params(self):
        """
        in rfc2445 keys are called parameters, so this is to be consistent with
        the naming conventions
        """
        return self.keys()

### Later, when I get more time... need to finish this off now. The last major thing missing.
###    def _encode(self, name, value, cond=1):
###        # internal, for conditional convertion of values.
###        if cond:
###            klass = types_factory.for_property(name)
###            return klass(value)
###        return value
###
###    def add(self, name, value, encode=0):
###        "Add a parameter value and optionally encode it."
###        if encode:
###            value = self._encode(name, value, encode)
###        self[name] = value
###
###    def decoded(self, name):
###        "returns a decoded value, or list of same"

    def __repr__(self):
        return 'Parameters(' + dict.__repr__(self) + ')'


    def __str__(self):
        """
        Renders the parameters as 'KEY=value;KEY=value', keys upper-cased and
        values formatted by paramVal().
        """
        result = []
        # sorted() gives a stable order (the doctests rely on it) and, unlike
        # items().sort(), does not require items() to return a list.
        for key, value in sorted(self.items()):
            result.append('%s=%s' % (key.upper(), paramVal(value)))
        return ';'.join(result)


    @staticmethod
    def from_string(st, strict=False):
        """
        Parses the parameter format from ical text format.

        st     -- the text between the property name and the value, e.g.
                  'CN=Max Rasmussen;ROLE=REQ-PARTICIPANT'
        strict -- if true, values that are not double quoted are upper-cased

        Returns a Parameters instance. A parameter with a single value maps
        to that string, a parameter with several comma separated values maps
        to a list of strings. Raises ValueError for anything that cannot be
        parsed or validated.
        """
        try:
            # parse into strings
            result = Parameters()
            for param in q_split(st, ';'):
                key, val = q_split(param, '=')
                validate_token(key)
                # Property parameter values that are not in quoted
                # strings are case insensitive.
                vals = []
                for v in q_split(val, ','):
                    if v.startswith('"') and v.endswith('"'):
                        v = v.strip('"')
                        validate_param_value(v, quoted=True)
                    else:
                        validate_param_value(v, quoted=False)
                        if strict:
                            v = v.upper()
                    vals.append(v)
                if not vals:
                    # q_split() returns [] for an empty value ('param=')
                    result[key] = val
                elif len(vals) == 1:
                    result[key] = vals[0]
                else:
                    result[key] = vals
            return result
        except Exception:
            raise ValueError('Not a valid parameter string')
00240 
00241 
00242 #########################################
00243 # parsing and generation of content lines
00244 
class Contentline(str):
    """
    A content line is basically a string that can be folded and parsed into
    parts.

    >>> c = Contentline('Si meliora dies, ut vina, poemata reddit')
    >>> str(c)
    'Si meliora dies, ut vina, poemata reddit'

    A long line gets folded
    >>> c = Contentline(''.join(['123456789 ']*10))
    >>> str(c)
    '123456789 123456789 123456789 123456789 123456789 123456789 123456789 1234\\r\\n 56789 123456789 123456789 '

    A folded line gets unfolded
    >>> c = Contentline.from_string(str(c))
    >>> c
    '123456789 123456789 123456789 123456789 123456789 123456789 123456789 123456789 123456789 123456789 '

    It can parse itself into parts. Which is a tuple of (name, params, vals)

    >>> c = Contentline('dtstart:20050101T120000')
    >>> c.parts()
    ('dtstart', Parameters({}), '20050101T120000')

    >>> c = Contentline('dtstart;value=datetime:20050101T120000')
    >>> c.parts()
    ('dtstart', Parameters({'VALUE': 'datetime'}), '20050101T120000')

    >>> c = Contentline('ATTENDEE;CN=Max Rasmussen;ROLE=REQ-PARTICIPANT:MAILTO:maxm@example.com')
    >>> c.parts()
    ('ATTENDEE', Parameters({'ROLE': 'REQ-PARTICIPANT', 'CN': 'Max Rasmussen'}), 'MAILTO:maxm@example.com')
    >>> str(c)
    'ATTENDEE;CN=Max Rasmussen;ROLE=REQ-PARTICIPANT:MAILTO:maxm@example.com'

    and back again
    >>> parts = ('ATTENDEE', Parameters({'ROLE': 'REQ-PARTICIPANT', 'CN': 'Max Rasmussen'}), 'MAILTO:maxm@example.com')
    >>> Contentline.from_parts(parts)
    'ATTENDEE;CN=Max Rasmussen;ROLE=REQ-PARTICIPANT:MAILTO:maxm@example.com'

    and again
    >>> parts = ('ATTENDEE', Parameters(), 'MAILTO:maxm@example.com')
    >>> Contentline.from_parts(parts)
    'ATTENDEE:MAILTO:maxm@example.com'

    A value can also be any of the types defined in PropertyValues
    >>> from icalendar.prop import vText
    >>> parts = ('ATTENDEE', Parameters(), vText('MAILTO:test@example.com'))
    >>> Contentline.from_parts(parts)
    'ATTENDEE:MAILTO:test@example.com'

    A value can also be unicode
    >>> from icalendar.prop import vText
    >>> parts = ('SUMMARY', Parameters(), vText(u'INternational char \\xe6 \\xf8 \\xe5'))
    >>> Contentline.from_parts(parts)
    'SUMMARY:INternational char \\xc3\\xa6 \\xc3\\xb8 \\xc3\\xa5'

    Traversing could look like this.
    >>> name, params, vals = c.parts()
    >>> name
    'ATTENDEE'
    >>> vals
    'MAILTO:maxm@example.com'
    >>> for key, val in params.items():
    ...     (key, val)
    ('ROLE', 'REQ-PARTICIPANT')
    ('CN', 'Max Rasmussen')

    And the traditional failure
    >>> c = Contentline('ATTENDEE;maxm@example.com')
    >>> c.parts()
    Traceback (most recent call last):
        ...
    ValueError: Content line could not be parsed into parts

    Another failure:
    >>> c = Contentline(':maxm@example.com')
    >>> c.parts()
    Traceback (most recent call last):
        ...
    ValueError: Content line could not be parsed into parts

    >>> c = Contentline('key;param=:value')
    >>> c.parts()
    ('key', Parameters({'PARAM': ''}), 'value')

    >>> c = Contentline('key;param="pvalue":value')
    >>> c.parts()
    ('key', Parameters({'PARAM': 'pvalue'}), 'value')

    Should bomb on missing param:
    >>> c = Contentline.from_string("k;:no param")
    >>> c.parts()
    Traceback (most recent call last):
        ...
    ValueError: Content line could not be parsed into parts

    >>> c = Contentline('key;param=pvalue:value', strict=False)
    >>> c.parts()
    ('key', Parameters({'PARAM': 'pvalue'}), 'value')

    If strict is set to True, uppercase param values that are not
    double-quoted, this is because the spec says non-quoted params are
    case-insensitive.

    >>> c = Contentline('key;param=pvalue:value', strict=True)
    >>> c.parts()
    ('key', Parameters({'PARAM': 'PVALUE'}), 'value')

    >>> c = Contentline('key;param="pValue":value', strict=True)
    >>> c.parts()
    ('key', Parameters({'PARAM': 'pValue'}), 'value')
    """

    def __new__(cls, st, strict=False):
        "Creates the string and remembers the strict flag for parts()"
        self = str.__new__(cls, st)
        self.strict = strict
        return self

    @staticmethod
    def from_parts(parts):
        """
        Turns a tuple of parts into a content line.

        parts is a (name, params, values) triple; each element is converted
        with str(). When the params element converts to an empty string the
        ';params' section is left out of the line.
        """
        (name, params, values) = [str(p) for p in parts]
        try:
            if params:
                return Contentline('%s;%s:%s' % (name, params, values))
            return Contentline('%s:%s' %  (name, values))
        except Exception:
            raise ValueError(
                'Property: %s Wrong values "%s" or "%s"' % (repr(name),
                                                            repr(params),
                                                            repr(values)))

    def parts(self):
        """ Splits the content line up into (name, parameters, values) parts

        The name ends at the first ':' or ';' outside double quotes, the
        value starts after the first ':' outside double quotes, and whatever
        lies between the two is parsed by Parameters.from_string() using this
        line's strict flag. Every failure is reported as ValueError.
        """
        try:
            name_split = None
            value_split = None
            inquotes = False
            for i, ch in enumerate(self):
                if not inquotes:
                    # 'is None' rather than truthiness: index 0 is a real
                    # position and must not be mistaken for "not found yet".
                    if ch in ':;' and name_split is None:
                        name_split = i
                    if ch == ':' and value_split is None:
                        value_split = i
                if ch == '"':
                    inquotes = not inquotes
            if name_split is None or value_split is None:
                raise ValueError('Missing name or value separator')
            name = self[:name_split]
            if not name:
                raise ValueError('Key name is required')
            validate_token(name)
            if name_split+1 == value_split:
                # 'name;:value' -- a ';' announcing parameters that never come
                raise ValueError('Invalid content line')
            params = Parameters.from_string(self[name_split+1:value_split],
                                            strict=self.strict)
            values = self[value_split+1:]
            return (name, params, values)
        except Exception:
            raise ValueError('Content line could not be parsed into parts')

    @staticmethod
    def from_string(st, strict=False):
        "Unfolds the content lines in an iCalendar into long content lines"
        try:
            # a fold is a line break followed by either a space or a tab
            return Contentline(FOLD.sub('', st), strict=strict)
        except Exception:
            raise ValueError('Expected StringType with content line')

    def __str__(self):
        "Long content lines are folded so they are less than 75 characters wide"
        # Chunks of 74 characters, so that a continuation line (one leading
        # space plus the chunk) is at most 75 characters long.
        new_lines = [self[i:i+74] for i in range(0, len(self), 74)]
        return '\r\n '.join(new_lines)
00423 
00424 
00425 
class Contentlines(list):
    """
    I assume that iCalendar files generally are a few kilobytes in size. Then
    this should be efficient. For huge files, an iterator should probably be
    used instead.

    >>> c = Contentlines([Contentline('BEGIN:VEVENT\\r\\n')])
    >>> str(c)
    'BEGIN:VEVENT\\r\\n'

    Lets try appending it with a 100 character wide string
    >>> c.append(Contentline(''.join(['123456789 ']*10)+'\\r\\n'))
    >>> str(c)
    'BEGIN:VEVENT\\r\\n\\r\\n123456789 123456789 123456789 123456789 123456789 123456789 123456789 1234\\r\\n 56789 123456789 123456789 \\r\\n'

    Notice that there is an extra empty string in the end of the content lines.
    That is so they can be easily joined with: '\\r\\n'.join(contentlines).
    >>> Contentlines.from_string('A short line\\r\\n')
    ['A short line', '']
    >>> Contentlines.from_string('A faked\\r\\n  long line\\r\\n')
    ['A faked long line', '']
    >>> Contentlines.from_string('A faked\\r\\n  long line\\r\\nAnd another lin\\r\\n\\te that is folded\\r\\n')
    ['A faked long line', 'And another line that is folded', '']
    """

    def __str__(self):
        "Simply join self."
        return '\r\n'.join([str(line) for line in self])

    @staticmethod
    def from_string(st):
        """
        Parses a string into content lines.

        The text is unfolded first, then split into lines; empty lines are
        dropped and one empty string is appended at the end. Raises
        ValueError if st cannot be processed.
        """
        try:
            # a fold is a line break followed by either a space or a tab
            unfolded = FOLD.sub('', st)
            lines = [Contentline(line) for line in unfolded.splitlines() if line]
            lines.append('') # we need a '\r\n' in the end of every content line
            return Contentlines(lines)
        except Exception:
            raise ValueError('Expected StringType with content lines')
00466 
00467 
00468 # ran this:
00469 #    sample = open('./samples/test.ics', 'rb').read() # binary file in windows!
00470 #    lines = Contentlines.from_string(sample)
00471 #    for line in lines[:-1]:
00472 #        print line.parts()
00473 
00474 # got this:
00475 #('BEGIN', Parameters({}), 'VCALENDAR')
00476 #('METHOD', Parameters({}), 'Request')
00477 #('PRODID', Parameters({}), '-//My product//mxm.dk/')
00478 #('VERSION', Parameters({}), '2.0')
00479 #('BEGIN', Parameters({}), 'VEVENT')
00480 #('DESCRIPTION', Parameters({}), 'This is a very long description that ...')
00481 #('PARTICIPANT', Parameters({'CN': 'Max M'}), 'MAILTO:maxm@mxm.dk')
00482 #('DTEND', Parameters({}), '20050107T160000')
00483 #('DTSTART', Parameters({}), '20050107T120000')
00484 #('SUMMARY', Parameters({}), 'A second event')
00485 #('END', Parameters({}), 'VEVENT')
00486 #('BEGIN', Parameters({}), 'VEVENT')
00487 #('DTEND', Parameters({}), '20050108T235900')
00488 #('DTSTART', Parameters({}), '20050108T230000')
00489 #('SUMMARY', Parameters({}), 'A single event')
00490 #('UID', Parameters({}), '42')
00491 #('END', Parameters({}), 'VEVENT')
00492 #('END', Parameters({}), 'VCALENDAR')