Back to index

python3.2  3.2.2
_markupbase.py
Go to the documentation of this file.
00001 """Shared support for scanning document type declarations in HTML and XHTML.
00002 
00003 This module is used as a foundation for the html.parser module.  It has no
00004 documented public API and should not be used directly.
00005 
00006 """
00007 
00008 import re
00009 
# Matches a declaration name token (a letter followed by letters, digits,
# '-', '_' or '.') together with any trailing whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Matches a single- or double-quoted string literal together with any
# trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Matches the end of a comment: "--", optional whitespace, then ">".
_commentclose = re.compile(r'--\s*>')
# Matches the end of a standard marked section: "]", "]" and ">" with
# optional whitespace between them.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# Matches the end of an MS Office style marked section: "]", optional
# whitespace, then ">".
_msmarkedsectionclose = re.compile(r']\s*>')

# Only the compiled patterns above are needed from here on; remove the
# module name from this namespace.
del re
00021 
00022 
00023 class ParserBase:
00024     """Parser base class which provides some common support methods used
00025     by the SGML/HTML and XHTML parsers."""
00026 
00027     def __init__(self):
00028         if self.__class__ is ParserBase:
00029             raise RuntimeError(
00030                 "_markupbase.ParserBase must be subclassed")
00031 
00032     def error(self, message):
00033         raise NotImplementedError(
00034             "subclasses of ParserBase must override error()")
00035 
00036     def reset(self):
00037         self.lineno = 1
00038         self.offset = 0
00039 
00040     def getpos(self):
00041         """Return current line number and offset."""
00042         return self.lineno, self.offset
00043 
00044     # Internal -- update line number and offset.  This should be
00045     # called for each piece of data exactly once, in order -- in other
00046     # words the concatenation of all the input strings to this
00047     # function should be exactly the entire input.
00048     def updatepos(self, i, j):
00049         if i >= j:
00050             return j
00051         rawdata = self.rawdata
00052         nlines = rawdata.count("\n", i, j)
00053         if nlines:
00054             self.lineno = self.lineno + nlines
00055             pos = rawdata.rindex("\n", i, j) # Should not fail
00056             self.offset = j-(pos+1)
00057         else:
00058             self.offset = self.offset + j-i
00059         return j
00060 
    # Extra characters that parse_declaration() steps over one at a time
    # inside a declaration; empty by default.
    _decl_otherchars = ''
00062 
00063     # Internal -- parse declaration (for use by subclasses).
00064     def parse_declaration(self, i):
00065         # This is some sort of declaration; in "HTML as
00066         # deployed," this should only be the document type
00067         # declaration ("<!DOCTYPE html...>").
00068         # ISO 8879:1986, however, has more complex
00069         # declaration syntax for elements in <!...>, including:
00070         # --comment--
00071         # [marked section]
00072         # name in the following list: ENTITY, DOCTYPE, ELEMENT,
00073         # ATTLIST, NOTATION, SHORTREF, USEMAP,
00074         # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
00075         rawdata = self.rawdata
00076         j = i + 2
00077         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
00078         if rawdata[j:j+1] == ">":
00079             # the empty comment <!>
00080             return j + 1
00081         if rawdata[j:j+1] in ("-", ""):
00082             # Start of comment followed by buffer boundary,
00083             # or just a buffer boundary.
00084             return -1
00085         # A simple, practical version could look like: ((name|stringlit) S*) + '>'
00086         n = len(rawdata)
00087         if rawdata[j:j+2] == '--': #comment
00088             # Locate --.*-- as the body of the comment
00089             return self.parse_comment(i)
00090         elif rawdata[j] == '[': #marked section
00091             # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
00092             # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
00093             # Note that this is extended by Microsoft Office "Save as Web" function
00094             # to include [if...] and [endif].
00095             return self.parse_marked_section(i)
00096         else: #all other declaration elements
00097             decltype, j = self._scan_name(j, i)
00098         if j < 0:
00099             return j
00100         if decltype == "doctype":
00101             self._decl_otherchars = ''
00102         while j < n:
00103             c = rawdata[j]
00104             if c == ">":
00105                 # end of declaration syntax
00106                 data = rawdata[i+2:j]
00107                 if decltype == "doctype":
00108                     self.handle_decl(data)
00109                 else:
00110                     self.unknown_decl(data)
00111                 return j + 1
00112             if c in "\"'":
00113                 m = _declstringlit_match(rawdata, j)
00114                 if not m:
00115                     return -1 # incomplete
00116                 j = m.end()
00117             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
00118                 name, j = self._scan_name(j, i)
00119             elif c in self._decl_otherchars:
00120                 j = j + 1
00121             elif c == "[":
00122                 # this could be handled in a separate doctype parser
00123                 if decltype == "doctype":
00124                     j = self._parse_doctype_subset(j + 1, i)
00125                 elif decltype in {"attlist", "linktype", "link", "element"}:
00126                     # must tolerate []'d groups in a content model in an element declaration
00127                     # also in data attribute specifications of attlist declaration
00128                     # also link type declaration subsets in linktype declarations
00129                     # also link attribute specification lists in link declarations
00130                     self.error("unsupported '[' char in %s declaration" % decltype)
00131                 else:
00132                     self.error("unexpected '[' char in declaration")
00133             else:
00134                 self.error(
00135                     "unexpected %r char in declaration" % rawdata[j])
00136             if j < 0:
00137                 return j
00138         return -1 # incomplete
00139 
00140     # Internal -- parse a marked section
00141     # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
00142     def parse_marked_section(self, i, report=1):
00143         rawdata= self.rawdata
00144         assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
00145         sectName, j = self._scan_name( i+3, i )
00146         if j < 0:
00147             return j
00148         if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:
00149             # look for standard ]]> ending
00150             match= _markedsectionclose.search(rawdata, i+3)
00151         elif sectName in {"if", "else", "endif"}:
00152             # look for MS Office ]> ending
00153             match= _msmarkedsectionclose.search(rawdata, i+3)
00154         else:
00155             self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
00156         if not match:
00157             return -1
00158         if report:
00159             j = match.start(0)
00160             self.unknown_decl(rawdata[i+3: j])
00161         return match.end(0)
00162 
    # Internal -- parse comment, return the index just past it, or -1 if not terminated
00164     def parse_comment(self, i, report=1):
00165         rawdata = self.rawdata
00166         if rawdata[i:i+4] != '<!--':
00167             self.error('unexpected call to parse_comment()')
00168         match = _commentclose.search(rawdata, i+4)
00169         if not match:
00170             return -1
00171         if report:
00172             j = match.start(0)
00173             self.handle_comment(rawdata[i+4: j])
00174         return match.end(0)
00175 
00176     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
00177     # returning the index just past any whitespace following the trailing ']'.
00178     def _parse_doctype_subset(self, i, declstartpos):
00179         rawdata = self.rawdata
00180         n = len(rawdata)
00181         j = i
00182         while j < n:
00183             c = rawdata[j]
00184             if c == "<":
00185                 s = rawdata[j:j+2]
00186                 if s == "<":
00187                     # end of buffer; incomplete
00188                     return -1
00189                 if s != "<!":
00190                     self.updatepos(declstartpos, j + 1)
00191                     self.error("unexpected char in internal subset (in %r)" % s)
00192                 if (j + 2) == n:
00193                     # end of buffer; incomplete
00194                     return -1
00195                 if (j + 4) > n:
00196                     # end of buffer; incomplete
00197                     return -1
00198                 if rawdata[j:j+4] == "<!--":
00199                     j = self.parse_comment(j, report=0)
00200                     if j < 0:
00201                         return j
00202                     continue
00203                 name, j = self._scan_name(j + 2, declstartpos)
00204                 if j == -1:
00205                     return -1
00206                 if name not in {"attlist", "element", "entity", "notation"}:
00207                     self.updatepos(declstartpos, j + 2)
00208                     self.error(
00209                         "unknown declaration %r in internal subset" % name)
00210                 # handle the individual names
00211                 meth = getattr(self, "_parse_doctype_" + name)
00212                 j = meth(j, declstartpos)
00213                 if j < 0:
00214                     return j
00215             elif c == "%":
00216                 # parameter entity reference
00217                 if (j + 1) == n:
00218                     # end of buffer; incomplete
00219                     return -1
00220                 s, j = self._scan_name(j + 1, declstartpos)
00221                 if j < 0:
00222                     return j
00223                 if rawdata[j] == ";":
00224                     j = j + 1
00225             elif c == "]":
00226                 j = j + 1
00227                 while j < n and rawdata[j].isspace():
00228                     j = j + 1
00229                 if j < n:
00230                     if rawdata[j] == ">":
00231                         return j
00232                     self.updatepos(declstartpos, j)
00233                     self.error("unexpected char after internal subset")
00234                 else:
00235                     return -1
00236             elif c.isspace():
00237                 j = j + 1
00238             else:
00239                 self.updatepos(declstartpos, j)
00240                 self.error("unexpected char %r in internal subset" % c)
00241         # end of buffer reached
00242         return -1
00243 
00244     # Internal -- scan past <!ELEMENT declarations
00245     def _parse_doctype_element(self, i, declstartpos):
00246         name, j = self._scan_name(i, declstartpos)
00247         if j == -1:
00248             return -1
00249         # style content model; just skip until '>'
00250         rawdata = self.rawdata
00251         if '>' in rawdata[j:]:
00252             return rawdata.find(">", j) + 1
00253         return -1
00254 
00255     # Internal -- scan past <!ATTLIST declarations
00256     def _parse_doctype_attlist(self, i, declstartpos):
00257         rawdata = self.rawdata
00258         name, j = self._scan_name(i, declstartpos)
00259         c = rawdata[j:j+1]
00260         if c == "":
00261             return -1
00262         if c == ">":
00263             return j + 1
00264         while 1:
00265             # scan a series of attribute descriptions; simplified:
00266             #   name type [value] [#constraint]
00267             name, j = self._scan_name(j, declstartpos)
00268             if j < 0:
00269                 return j
00270             c = rawdata[j:j+1]
00271             if c == "":
00272                 return -1
00273             if c == "(":
00274                 # an enumerated type; look for ')'
00275                 if ")" in rawdata[j:]:
00276                     j = rawdata.find(")", j) + 1
00277                 else:
00278                     return -1
00279                 while rawdata[j:j+1].isspace():
00280                     j = j + 1
00281                 if not rawdata[j:]:
00282                     # end of buffer, incomplete
00283                     return -1
00284             else:
00285                 name, j = self._scan_name(j, declstartpos)
00286             c = rawdata[j:j+1]
00287             if not c:
00288                 return -1
00289             if c in "'\"":
00290                 m = _declstringlit_match(rawdata, j)
00291                 if m:
00292                     j = m.end()
00293                 else:
00294                     return -1
00295                 c = rawdata[j:j+1]
00296                 if not c:
00297                     return -1
00298             if c == "#":
00299                 if rawdata[j:] == "#":
00300                     # end of buffer
00301                     return -1
00302                 name, j = self._scan_name(j + 1, declstartpos)
00303                 if j < 0:
00304                     return j
00305                 c = rawdata[j:j+1]
00306                 if not c:
00307                     return -1
00308             if c == '>':
00309                 # all done
00310                 return j + 1
00311 
00312     # Internal -- scan past <!NOTATION declarations
00313     def _parse_doctype_notation(self, i, declstartpos):
00314         name, j = self._scan_name(i, declstartpos)
00315         if j < 0:
00316             return j
00317         rawdata = self.rawdata
00318         while 1:
00319             c = rawdata[j:j+1]
00320             if not c:
00321                 # end of buffer; incomplete
00322                 return -1
00323             if c == '>':
00324                 return j + 1
00325             if c in "'\"":
00326                 m = _declstringlit_match(rawdata, j)
00327                 if not m:
00328                     return -1
00329                 j = m.end()
00330             else:
00331                 name, j = self._scan_name(j, declstartpos)
00332                 if j < 0:
00333                     return j
00334 
00335     # Internal -- scan past <!ENTITY declarations
00336     def _parse_doctype_entity(self, i, declstartpos):
00337         rawdata = self.rawdata
00338         if rawdata[i:i+1] == "%":
00339             j = i + 1
00340             while 1:
00341                 c = rawdata[j:j+1]
00342                 if not c:
00343                     return -1
00344                 if c.isspace():
00345                     j = j + 1
00346                 else:
00347                     break
00348         else:
00349             j = i
00350         name, j = self._scan_name(j, declstartpos)
00351         if j < 0:
00352             return j
00353         while 1:
00354             c = self.rawdata[j:j+1]
00355             if not c:
00356                 return -1
00357             if c in "'\"":
00358                 m = _declstringlit_match(rawdata, j)
00359                 if m:
00360                     j = m.end()
00361                 else:
00362                     return -1    # incomplete
00363             elif c == ">":
00364                 return j + 1
00365             else:
00366                 name, j = self._scan_name(j, declstartpos)
00367                 if j < 0:
00368                     return j
00369 
    # Internal -- scan a name token; return the lowercased token and the new
    # position, or (None, -1) if we've reached the end of the buffer.
00372     def _scan_name(self, i, declstartpos):
00373         rawdata = self.rawdata
00374         n = len(rawdata)
00375         if i == n:
00376             return None, -1
00377         m = _declname_match(rawdata, i)
00378         if m:
00379             s = m.group()
00380             name = s.strip()
00381             if (i + len(s)) == n:
00382                 return None, -1  # end of buffer
00383             return name.lower(), m.end()
00384         else:
00385             self.updatepos(declstartpos, i)
00386             self.error("expected name token at %r"
00387                        % rawdata[declstartpos:declstartpos+20])
00388 
00389     # To be overridden -- handlers for unknown objects
00390     def unknown_decl(self, data):
00391         pass