Back to index

moin  1.9.0~rc2
text_html_text_moin_wiki.py
Go to the documentation of this file.
00001 """
00002     MoinMoin - convert from html to wiki markup
00003 
00004     @copyright: 2005-2006 Bastian Blank, Florian Festi, Reimar Bauer,
00005                 2005-2007 MoinMoin:ThomasWaldmann
00006     @license: GNU GPL, see COPYING for details.
00007 """
00008 
00009 import re, os
00010 import xml.dom.minidom # HINT: the nodes in parse result tree need .has_key(), "x in ..." does not work
00011 import urlparse
00012 from xml.dom import Node
00013 
00014 from MoinMoin import config, wikiutil
00015 from MoinMoin.error import ConvertError
00016 from werkzeug.utils import url_decode
00017 from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
00018 interwiki_re = re.compile(WikiParser.interwiki_rule, re.VERBOSE|re.UNICODE)
00019 
00020 
00021 # Portions (C) International Organization for Standardization 1986
00022 # Permission to copy in any form is granted for use with
00023 # conforming SGML systems and applications as defined in
00024 # ISO 8879, provided this notice is included in all copies.
00025 dtd = ur'''
00026 <!DOCTYPE html [
00027 <!ENTITY nbsp   "&#32;">  <!-- no-break space = non-breaking space, U+00A0, convert to U+0020 -->
00028 <!ENTITY iexcl  "&#161;"> <!-- inverted exclamation mark, U+00A1 ISOnum -->
00029 <!ENTITY cent   "&#162;"> <!-- cent sign, U+00A2 ISOnum -->
00030 <!ENTITY pound  "&#163;"> <!-- pound sign, U+00A3 ISOnum -->
00031 <!ENTITY curren "&#164;"> <!-- currency sign, U+00A4 ISOnum -->
00032 <!ENTITY yen    "&#165;"> <!-- yen sign = yuan sign, U+00A5 ISOnum -->
00033 <!ENTITY brvbar "&#166;"> <!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
00034 <!ENTITY sect   "&#167;"> <!-- section sign, U+00A7 ISOnum -->
00035 <!ENTITY uml    "&#168;"> <!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
00036 <!ENTITY copy   "&#169;"> <!-- copyright sign, U+00A9 ISOnum -->
00037 <!ENTITY ordf   "&#170;"> <!-- feminine ordinal indicator, U+00AA ISOnum -->
00038 <!ENTITY laquo  "&#171;"> <!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
00039 <!ENTITY not    "&#172;"> <!-- not sign = angled dash, U+00AC ISOnum -->
00040 <!ENTITY shy    "&#173;"> <!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
00041 <!ENTITY reg    "&#174;"> <!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
00042 <!ENTITY macr   "&#175;"> <!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
00043 <!ENTITY deg    "&#176;"> <!-- degree sign, U+00B0 ISOnum -->
00044 <!ENTITY plusmn "&#177;"> <!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
00045 <!ENTITY sup2   "&#178;"> <!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
00046 <!ENTITY sup3   "&#179;"> <!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
00047 <!ENTITY acute  "&#180;"> <!-- acute accent = spacing acute, U+00B4 ISOdia -->
00048 <!ENTITY micro  "&#181;"> <!-- micro sign, U+00B5 ISOnum -->
00049 <!ENTITY para   "&#182;"> <!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
00050 <!ENTITY middot "&#183;"> <!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
00051 <!ENTITY cedil  "&#184;"> <!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
00052 <!ENTITY sup1   "&#185;"> <!-- superscript one = superscript digit one, U+00B9 ISOnum -->
00053 <!ENTITY ordm   "&#186;"> <!-- masculine ordinal indicator, U+00BA ISOnum -->
00054 <!ENTITY raquo  "&#187;"> <!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
00055 <!ENTITY frac14 "&#188;"> <!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
00056 <!ENTITY frac12 "&#189;"> <!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
00057 <!ENTITY frac34 "&#190;"> <!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
00058 <!ENTITY iquest "&#191;"> <!-- inverted question mark = turned question mark, U+00BF ISOnum -->
00059 <!ENTITY Agrave "&#192;"> <!-- latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 -->
00060 <!ENTITY Aacute "&#193;"> <!-- latin capital letter A with acute, U+00C1 ISOlat1 -->
00061 <!ENTITY Acirc  "&#194;"> <!-- latin capital letter A with circumflex, U+00C2 ISOlat1 -->
00062 <!ENTITY Atilde "&#195;"> <!-- latin capital letter A with tilde, U+00C3 ISOlat1 -->
00063 <!ENTITY Auml   "&#196;"> <!-- latin capital letter A with diaeresis, U+00C4 ISOlat1 -->
00064 <!ENTITY Aring  "&#197;"> <!-- latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 -->
00065 <!ENTITY AElig  "&#198;"> <!-- latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 -->
00066 <!ENTITY Ccedil "&#199;"> <!-- latin capital letter C with cedilla, U+00C7 ISOlat1 -->
00067 <!ENTITY Egrave "&#200;"> <!-- latin capital letter E with grave, U+00C8 ISOlat1 -->
00068 <!ENTITY Eacute "&#201;"> <!-- latin capital letter E with acute, U+00C9 ISOlat1 -->
00069 <!ENTITY Ecirc  "&#202;"> <!-- latin capital letter E with circumflex, U+00CA ISOlat1 -->
00070 <!ENTITY Euml   "&#203;"> <!-- latin capital letter E with diaeresis, U+00CB ISOlat1 -->
00071 <!ENTITY Igrave "&#204;"> <!-- latin capital letter I with grave, U+00CC ISOlat1 -->
00072 <!ENTITY Iacute "&#205;"> <!-- latin capital letter I with acute, U+00CD ISOlat1 -->
00073 <!ENTITY Icirc  "&#206;"> <!-- latin capital letter I with circumflex, U+00CE ISOlat1 -->
00074 <!ENTITY Iuml   "&#207;"> <!-- latin capital letter I with diaeresis, U+00CF ISOlat1 -->
00075 <!ENTITY ETH    "&#208;"> <!-- latin capital letter ETH, U+00D0 ISOlat1 -->
00076 <!ENTITY Ntilde "&#209;"> <!-- latin capital letter N with tilde, U+00D1 ISOlat1 -->
00077 <!ENTITY Ograve "&#210;"> <!-- latin capital letter O with grave, U+00D2 ISOlat1 -->
00078 <!ENTITY Oacute "&#211;"> <!-- latin capital letter O with acute, U+00D3 ISOlat1 -->
00079 <!ENTITY Ocirc  "&#212;"> <!-- latin capital letter O with circumflex, U+00D4 ISOlat1 -->
00080 <!ENTITY Otilde "&#213;"> <!-- latin capital letter O with tilde, U+00D5 ISOlat1 -->
00081 <!ENTITY Ouml   "&#214;"> <!-- latin capital letter O with diaeresis, U+00D6 ISOlat1 -->
00082 <!ENTITY times  "&#215;"> <!-- multiplication sign, U+00D7 ISOnum -->
00083 <!ENTITY Oslash "&#216;"> <!-- latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 -->
00084 <!ENTITY Ugrave "&#217;"> <!-- latin capital letter U with grave, U+00D9 ISOlat1 -->
00085 <!ENTITY Uacute "&#218;"> <!-- latin capital letter U with acute, U+00DA ISOlat1 -->
00086 <!ENTITY Ucirc  "&#219;"> <!-- latin capital letter U with circumflex, U+00DB ISOlat1 -->
00087 <!ENTITY Uuml   "&#220;"> <!-- latin capital letter U with diaeresis, U+00DC ISOlat1 -->
00088 <!ENTITY Yacute "&#221;"> <!-- latin capital letter Y with acute, U+00DD ISOlat1 -->
00089 <!ENTITY THORN  "&#222;"> <!-- latin capital letter THORN, U+00DE ISOlat1 -->
00090 <!ENTITY szlig  "&#223;"> <!-- latin small letter sharp s = ess-zed, U+00DF ISOlat1 -->
00091 <!ENTITY agrave "&#224;"> <!-- latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 -->
00092 <!ENTITY aacute "&#225;"> <!-- latin small letter a with acute, U+00E1 ISOlat1 -->
00093 <!ENTITY acirc  "&#226;"> <!-- latin small letter a with circumflex, U+00E2 ISOlat1 -->
00094 <!ENTITY atilde "&#227;"> <!-- latin small letter a with tilde, U+00E3 ISOlat1 -->
00095 <!ENTITY auml   "&#228;"> <!-- latin small letter a with diaeresis, U+00E4 ISOlat1 -->
00096 <!ENTITY aring  "&#229;"> <!-- latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 -->
00097 <!ENTITY aelig  "&#230;"> <!-- latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 -->
00098 <!ENTITY ccedil "&#231;"> <!-- latin small letter c with cedilla, U+00E7 ISOlat1 -->
00099 <!ENTITY egrave "&#232;"> <!-- latin small letter e with grave, U+00E8 ISOlat1 -->
00100 <!ENTITY eacute "&#233;"> <!-- latin small letter e with acute, U+00E9 ISOlat1 -->
00101 <!ENTITY ecirc  "&#234;"> <!-- latin small letter e with circumflex, U+00EA ISOlat1 -->
00102 <!ENTITY euml   "&#235;"> <!-- latin small letter e with diaeresis, U+00EB ISOlat1 -->
00103 <!ENTITY igrave "&#236;"> <!-- latin small letter i with grave, U+00EC ISOlat1 -->
00104 <!ENTITY iacute "&#237;"> <!-- latin small letter i with acute, U+00ED ISOlat1 -->
00105 <!ENTITY icirc  "&#238;"> <!-- latin small letter i with circumflex, U+00EE ISOlat1 -->
00106 <!ENTITY iuml   "&#239;"> <!-- latin small letter i with diaeresis, U+00EF ISOlat1 -->
00107 <!ENTITY eth    "&#240;"> <!-- latin small letter eth, U+00F0 ISOlat1 -->
00108 <!ENTITY ntilde "&#241;"> <!-- latin small letter n with tilde, U+00F1 ISOlat1 -->
00109 <!ENTITY ograve "&#242;"> <!-- latin small letter o with grave, U+00F2 ISOlat1 -->
00110 <!ENTITY oacute "&#243;"> <!-- latin small letter o with acute, U+00F3 ISOlat1 -->
00111 <!ENTITY ocirc  "&#244;"> <!-- latin small letter o with circumflex, U+00F4 ISOlat1 -->
00112 <!ENTITY otilde "&#245;"> <!-- latin small letter o with tilde, U+00F5 ISOlat1 -->
00113 <!ENTITY ouml   "&#246;"> <!-- latin small letter o with diaeresis, U+00F6 ISOlat1 -->
00114 <!ENTITY divide "&#247;"> <!-- division sign, U+00F7 ISOnum -->
00115 <!ENTITY oslash "&#248;"> <!-- latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 -->
00116 <!ENTITY ugrave "&#249;"> <!-- latin small letter u with grave, U+00F9 ISOlat1 -->
00117 <!ENTITY uacute "&#250;"> <!-- latin small letter u with acute, U+00FA ISOlat1 -->
00118 <!ENTITY ucirc  "&#251;"> <!-- latin small letter u with circumflex, U+00FB ISOlat1 -->
00119 <!ENTITY uuml   "&#252;"> <!-- latin small letter u with diaeresis, U+00FC ISOlat1 -->
00120 <!ENTITY yacute "&#253;"> <!-- latin small letter y with acute, U+00FD ISOlat1 -->
00121 <!ENTITY thorn  "&#254;"> <!-- latin small letter thorn, U+00FE ISOlat1 -->
00122 <!ENTITY yuml   "&#255;"> <!-- latin small letter y with diaeresis, U+00FF ISOlat1 -->
00123 
00124 <!-- Latin Extended-B -->
00125 <!ENTITY fnof     "&#402;"> <!-- latin small f with hook = function                                    = florin, U+0192 ISOtech -->
00126 
00127 <!-- Greek -->
00128 <!ENTITY Alpha    "&#913;"> <!-- greek capital letter alpha, U+0391 -->
00129 <!ENTITY Beta     "&#914;"> <!-- greek capital letter beta, U+0392 -->
00130 <!ENTITY Gamma    "&#915;"> <!-- greek capital letter gamma,
00131                                     U+0393 ISOgrk3 -->
00132 <!ENTITY Delta    "&#916;"> <!-- greek capital letter delta,
00133                                     U+0394 ISOgrk3 -->
00134 <!ENTITY Epsilon  "&#917;"> <!-- greek capital letter epsilon, U+0395 -->
00135 <!ENTITY Zeta     "&#918;"> <!-- greek capital letter zeta, U+0396 -->
00136 <!ENTITY Eta      "&#919;"> <!-- greek capital letter eta, U+0397 -->
00137 <!ENTITY Theta    "&#920;"> <!-- greek capital letter theta,
00138                                     U+0398 ISOgrk3 -->
00139 <!ENTITY Iota     "&#921;"> <!-- greek capital letter iota, U+0399 -->
00140 <!ENTITY Kappa    "&#922;"> <!-- greek capital letter kappa, U+039A -->
00141 <!ENTITY Lambda   "&#923;"> <!-- greek capital letter lambda,
00142                                     U+039B ISOgrk3 -->
00143 <!ENTITY Mu       "&#924;"> <!-- greek capital letter mu, U+039C -->
00144 <!ENTITY Nu       "&#925;"> <!-- greek capital letter nu, U+039D -->
00145 <!ENTITY Xi       "&#926;"> <!-- greek capital letter xi, U+039E ISOgrk3 -->
00146 <!ENTITY Omicron  "&#927;"> <!-- greek capital letter omicron, U+039F -->
00147 <!ENTITY Pi       "&#928;"> <!-- greek capital letter pi, U+03A0 ISOgrk3 -->
00148 <!ENTITY Rho      "&#929;"> <!-- greek capital letter rho, U+03A1 -->
00149 <!-- there is no Sigmaf, and no U+03A2 character either -->
00150 <!ENTITY Sigma    "&#931;"> <!-- greek capital letter sigma,
00151                                     U+03A3 ISOgrk3 -->
00152 <!ENTITY Tau      "&#932;"> <!-- greek capital letter tau, U+03A4 -->
00153 <!ENTITY Upsilon  "&#933;"> <!-- greek capital letter upsilon,
00154                                     U+03A5 ISOgrk3 -->
00155 <!ENTITY Phi      "&#934;"> <!-- greek capital letter phi,
00156                                     U+03A6 ISOgrk3 -->
00157 <!ENTITY Chi      "&#935;"> <!-- greek capital letter chi, U+03A7 -->
00158 <!ENTITY Psi      "&#936;"> <!-- greek capital letter psi,
00159                                     U+03A8 ISOgrk3 -->
00160 <!ENTITY Omega    "&#937;"> <!-- greek capital letter omega,
00161                                     U+03A9 ISOgrk3 -->
00162 
00163 <!ENTITY alpha    "&#945;"> <!-- greek small letter alpha,
00164                                     U+03B1 ISOgrk3 -->
00165 <!ENTITY beta     "&#946;"> <!-- greek small letter beta, U+03B2 ISOgrk3 -->
00166 <!ENTITY gamma    "&#947;"> <!-- greek small letter gamma,
00167                                     U+03B3 ISOgrk3 -->
00168 <!ENTITY delta    "&#948;"> <!-- greek small letter delta,
00169                                     U+03B4 ISOgrk3 -->
00170 <!ENTITY epsilon  "&#949;"> <!-- greek small letter epsilon,
00171                                     U+03B5 ISOgrk3 -->
00172 <!ENTITY zeta     "&#950;"> <!-- greek small letter zeta, U+03B6 ISOgrk3 -->
00173 <!ENTITY eta      "&#951;"> <!-- greek small letter eta, U+03B7 ISOgrk3 -->
00174 <!ENTITY theta    "&#952;"> <!-- greek small letter theta,
00175                                     U+03B8 ISOgrk3 -->
00176 <!ENTITY iota     "&#953;"> <!-- greek small letter iota, U+03B9 ISOgrk3 -->
00177 <!ENTITY kappa    "&#954;"> <!-- greek small letter kappa,
00178                                     U+03BA ISOgrk3 -->
00179 <!ENTITY lambda   "&#955;"> <!-- greek small letter lambda,
00180                                     U+03BB ISOgrk3 -->
00181 <!ENTITY mu       "&#956;"> <!-- greek small letter mu, U+03BC ISOgrk3 -->
00182 <!ENTITY nu       "&#957;"> <!-- greek small letter nu, U+03BD ISOgrk3 -->
00183 <!ENTITY xi       "&#958;"> <!-- greek small letter xi, U+03BE ISOgrk3 -->
00184 <!ENTITY omicron  "&#959;"> <!-- greek small letter omicron, U+03BF NEW -->
00185 <!ENTITY pi       "&#960;"> <!-- greek small letter pi, U+03C0 ISOgrk3 -->
00186 <!ENTITY rho      "&#961;"> <!-- greek small letter rho, U+03C1 ISOgrk3 -->
00187 <!ENTITY sigmaf   "&#962;"> <!-- greek small letter final sigma,
00188                                     U+03C2 ISOgrk3 -->
00189 <!ENTITY sigma    "&#963;"> <!-- greek small letter sigma,
00190                                     U+03C3 ISOgrk3 -->
00191 <!ENTITY tau      "&#964;"> <!-- greek small letter tau, U+03C4 ISOgrk3 -->
00192 <!ENTITY upsilon  "&#965;"> <!-- greek small letter upsilon,
00193                                     U+03C5 ISOgrk3 -->
00194 <!ENTITY phi      "&#966;"> <!-- greek small letter phi, U+03C6 ISOgrk3 -->
00195 <!ENTITY chi      "&#967;"> <!-- greek small letter chi, U+03C7 ISOgrk3 -->
00196 <!ENTITY psi      "&#968;"> <!-- greek small letter psi, U+03C8 ISOgrk3 -->
00197 <!ENTITY omega    "&#969;"> <!-- greek small letter omega,
00198                                     U+03C9 ISOgrk3 -->
00199 <!ENTITY thetasym "&#977;"> <!-- greek small letter theta symbol,
00200                                     U+03D1 NEW -->
00201 <!ENTITY upsih    "&#978;"> <!-- greek upsilon with hook symbol,
00202                                     U+03D2 NEW -->
00203 <!ENTITY piv      "&#982;"> <!-- greek pi symbol, U+03D6 ISOgrk3 -->
00204 
00205 <!-- General Punctuation -->
00206 <!ENTITY bull     "&#8226;"> <!-- bullet = black small circle,
00207                                      U+2022 ISOpub  -->
00208 <!-- bullet is NOT the same as bullet operator, U+2219 -->
00209 <!ENTITY hellip   "&#8230;"> <!-- horizontal ellipsis = three dot leader,
00210                                      U+2026 ISOpub  -->
00211 <!ENTITY prime    "&#8242;"> <!-- prime = minutes = feet, U+2032 ISOtech -->
00212 <!ENTITY Prime    "&#8243;"> <!-- double prime = seconds = inches,
00213                                      U+2033 ISOtech -->
00214 <!ENTITY oline    "&#8254;"> <!-- overline = spacing overscore,
00215                                      U+203E NEW -->
00216 <!ENTITY frasl    "&#8260;"> <!-- fraction slash, U+2044 NEW -->
00217 
00218 <!-- Letterlike Symbols -->
00219 <!ENTITY weierp   "&#8472;"> <!-- script capital P = power set
00220                                      = Weierstrass p, U+2118 ISOamso -->
00221 <!ENTITY image    "&#8465;"> <!-- blackletter capital I = imaginary part,
00222                                      U+2111 ISOamso -->
00223 <!ENTITY real     "&#8476;"> <!-- blackletter capital R = real part symbol,
00224                                      U+211C ISOamso -->
00225 <!ENTITY trade    "&#8482;"> <!-- trade mark sign, U+2122 ISOnum -->
00226 <!ENTITY alefsym  "&#8501;"> <!-- alef symbol = first transfinite cardinal,
00227                                      U+2135 NEW -->
00228 <!-- alef symbol is NOT the same as hebrew letter alef,
00229      U+05D0 although the same glyph could be used to depict both characters -->
00230 
00231 <!-- Arrows -->
00232 <!ENTITY larr     "&#8592;"> <!-- leftwards arrow, U+2190 ISOnum -->
00233 <!ENTITY uarr     "&#8593;"> <!-- upwards arrow, U+2191 ISOnum-->
00234 <!ENTITY rarr     "&#8594;"> <!-- rightwards arrow, U+2192 ISOnum -->
00235 <!ENTITY darr     "&#8595;"> <!-- downwards arrow, U+2193 ISOnum -->
00236 <!ENTITY harr     "&#8596;"> <!-- left right arrow, U+2194 ISOamsa -->
00237 <!ENTITY crarr    "&#8629;"> <!-- downwards arrow with corner leftwards
00238                                      = carriage return, U+21B5 NEW -->
00239 <!ENTITY lArr     "&#8656;"> <!-- leftwards double arrow, U+21D0 ISOtech -->
00240 <!-- ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
00241     but also does not have any other character for that function. So ? lArr can
00242     be used for 'is implied by' as ISOtech suggests -->
00243 <!ENTITY uArr     "&#8657;"> <!-- upwards double arrow, U+21D1 ISOamsa -->
00244 <!ENTITY rArr     "&#8658;"> <!-- rightwards double arrow,
00245                                      U+21D2 ISOtech -->
00246 <!-- ISO 10646 does not say this is the 'implies' character but does not have
00247      another character with this function so ?
00248      rArr can be used for 'implies' as ISOtech suggests -->
00249 <!ENTITY dArr     "&#8659;"> <!-- downwards double arrow, U+21D3 ISOamsa -->
00250 <!ENTITY hArr     "&#8660;"> <!-- left right double arrow,
00251                                      U+21D4 ISOamsa -->
00252 
00253 <!-- Mathematical Operators -->
00254 <!ENTITY forall   "&#8704;"> <!-- for all, U+2200 ISOtech -->
00255 <!ENTITY part     "&#8706;"> <!-- partial differential, U+2202 ISOtech  -->
00256 <!ENTITY exist    "&#8707;"> <!-- there exists, U+2203 ISOtech -->
00257 <!ENTITY empty    "&#8709;"> <!-- empty set = null set = diameter,
00258                                      U+2205 ISOamso -->
00259 <!ENTITY nabla    "&#8711;"> <!-- nabla = backward difference,
00260                                      U+2207 ISOtech -->
00261 <!ENTITY isin     "&#8712;"> <!-- element of, U+2208 ISOtech -->
00262 <!ENTITY notin    "&#8713;"> <!-- not an element of, U+2209 ISOtech -->
00263 <!ENTITY ni       "&#8715;"> <!-- contains as member, U+220B ISOtech -->
00264 <!-- should there be a more memorable name than 'ni'? -->
00265 <!ENTITY prod     "&#8719;"> <!-- n-ary product = product sign,
00266                                      U+220F ISOamsb -->
00267 <!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
00268      the same glyph might be used for both -->
00269 <!ENTITY sum      "&#8721;"> <!-- n-ary sumation, U+2211 ISOamsb -->
00270 <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
00271      though the same glyph might be used for both -->
00272 <!ENTITY minus    "&#8722;"> <!-- minus sign, U+2212 ISOtech -->
00273 <!ENTITY lowast   "&#8727;"> <!-- asterisk operator, U+2217 ISOtech -->
00274 <!ENTITY radic    "&#8730;"> <!-- square root = radical sign,
00275                                      U+221A ISOtech -->
00276 <!ENTITY prop     "&#8733;"> <!-- proportional to, U+221D ISOtech -->
00277 <!ENTITY infin    "&#8734;"> <!-- infinity, U+221E ISOtech -->
00278 <!ENTITY ang      "&#8736;"> <!-- angle, U+2220 ISOamso -->
00279 <!ENTITY and      "&#8743;"> <!-- logical and = wedge, U+2227 ISOtech -->
00280 <!ENTITY or       "&#8744;"> <!-- logical or = vee, U+2228 ISOtech -->
00281 <!ENTITY cap      "&#8745;"> <!-- intersection = cap, U+2229 ISOtech -->
00282 <!ENTITY cup      "&#8746;"> <!-- union = cup, U+222A ISOtech -->
00283 <!ENTITY int      "&#8747;"> <!-- integral, U+222B ISOtech -->
00284 <!ENTITY there4   "&#8756;"> <!-- therefore, U+2234 ISOtech -->
00285 <!ENTITY sim      "&#8764;"> <!-- tilde operator = varies with = similar to,
00286                                      U+223C ISOtech -->
00287 <!-- tilde operator is NOT the same character as the tilde, U+007E,
00288      although the same glyph might be used to represent both  -->
00289 <!ENTITY cong     "&#8773;"> <!-- approximately equal to, U+2245 ISOtech -->
00290 <!ENTITY asymp    "&#8776;"> <!-- almost equal to = asymptotic to,
00291                                      U+2248 ISOamsr -->
00292 <!ENTITY ne       "&#8800;"> <!-- not equal to, U+2260 ISOtech -->
00293 <!ENTITY equiv    "&#8801;"> <!-- identical to, U+2261 ISOtech -->
00294 <!ENTITY le       "&#8804;"> <!-- less-than or equal to, U+2264 ISOtech -->
00295 <!ENTITY ge       "&#8805;"> <!-- greater-than or equal to,
00296                                      U+2265 ISOtech -->
00297 <!ENTITY sub      "&#8834;"> <!-- subset of, U+2282 ISOtech -->
00298 <!ENTITY sup      "&#8835;"> <!-- superset of, U+2283 ISOtech -->
00299 <!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
00300      font encoding and is not included. Should it be, for symmetry?
00301      It is in ISOamsn  -->
00302 <!ENTITY nsub     "&#8836;"> <!-- not a subset of, U+2284 ISOamsn -->
00303 <!ENTITY sube     "&#8838;"> <!-- subset of or equal to, U+2286 ISOtech -->
00304 <!ENTITY supe     "&#8839;"> <!-- superset of or equal to,
00305                                      U+2287 ISOtech -->
00306 <!ENTITY oplus    "&#8853;"> <!-- circled plus = direct sum,
00307                                      U+2295 ISOamsb -->
00308 <!ENTITY otimes   "&#8855;"> <!-- circled times = vector product,
00309                                      U+2297 ISOamsb -->
00310 <!ENTITY perp     "&#8869;"> <!-- up tack = orthogonal to = perpendicular,
00311                                      U+22A5 ISOtech -->
00312 <!ENTITY sdot     "&#8901;"> <!-- dot operator, U+22C5 ISOamsb -->
00313 <!-- dot operator is NOT the same character as U+00B7 middle dot -->
00314 
00315 <!-- Miscellaneous Technical -->
00316 <!ENTITY lceil    "&#8968;"> <!-- left ceiling = apl upstile,
00317                                      U+2308 ISOamsc  -->
00318 <!ENTITY rceil    "&#8969;"> <!-- right ceiling, U+2309 ISOamsc  -->
00319 <!ENTITY lfloor   "&#8970;"> <!-- left floor = apl downstile,
00320                                      U+230A ISOamsc  -->
00321 <!ENTITY rfloor   "&#8971;"> <!-- right floor, U+230B ISOamsc  -->
00322 <!ENTITY lang     "&#9001;"> <!-- left-pointing angle bracket = bra,
00323                                      U+2329 ISOtech -->
00324 <!-- lang is NOT the same character as U+003C 'less than'
00325      or U+2039 'single left-pointing angle quotation mark' -->
00326 <!ENTITY rang     "&#9002;"> <!-- right-pointing angle bracket = ket,
00327                                      U+232A ISOtech -->
00328 <!-- rang is NOT the same character as U+003E 'greater than'
00329      or U+203A 'single right-pointing angle quotation mark' -->
00330 
00331 <!-- Geometric Shapes -->
00332 <!ENTITY loz      "&#9674;"> <!-- lozenge, U+25CA ISOpub -->
00333 
00334 <!-- Miscellaneous Symbols -->
00335 <!ENTITY spades   "&#9824;"> <!-- black spade suit, U+2660 ISOpub -->
00336 <!-- black here seems to mean filled as opposed to hollow -->
00337 <!ENTITY clubs    "&#9827;"> <!-- black club suit = shamrock,
00338                                      U+2663 ISOpub -->
00339 <!ENTITY hearts   "&#9829;"> <!-- black heart suit = valentine,
00340                                      U+2665 ISOpub -->
00341 <!ENTITY diams    "&#9830;"> <!-- black diamond suit, U+2666 ISOpub -->
00342 
00343 <!-- C0 Controls and Basic Latin -->
00344 <!ENTITY quot    "&#34;"> <!-- quotation mark = APL quote,
00345                                     U+0022 ISOnum -->
00346 <!ENTITY amp     "&#38;"> <!-- ampersand, U+0026 ISOnum -->
00347 <!ENTITY lt      "&#60;"> <!-- less-than sign, U+003C ISOnum -->
00348 <!ENTITY gt      "&#62;"> <!-- greater-than sign, U+003E ISOnum -->
00349 
00350 <!-- Latin Extended-A -->
00351 <!ENTITY OElig   "&#338;"> <!-- latin capital ligature OE,
00352                                     U+0152 ISOlat2 -->
00353 <!ENTITY oelig   "&#339;"> <!-- latin small ligature oe, U+0153 ISOlat2 -->
00354 <!-- ligature is a misnomer, this is a separate character in some languages -->
00355 <!ENTITY Scaron  "&#352;"> <!-- latin capital letter S with caron,
00356                                     U+0160 ISOlat2 -->
00357 <!ENTITY scaron  "&#353;"> <!-- latin small letter s with caron,
00358                                     U+0161 ISOlat2 -->
00359 <!ENTITY Yuml    "&#376;"> <!-- latin capital letter Y with diaeresis,
00360                                     U+0178 ISOlat2 -->
00361 
00362 <!-- Spacing Modifier Letters -->
00363 <!ENTITY circ    "&#710;"> <!-- modifier letter circumflex accent,
00364                                     U+02C6 ISOpub -->
00365 <!ENTITY tilde   "&#732;"> <!-- small tilde, U+02DC ISOdia -->
00366 
00367 <!-- General Punctuation -->
00368 <!ENTITY ensp    "&#8194;"> <!-- en space, U+2002 ISOpub -->
00369 <!ENTITY emsp    "&#8195;"> <!-- em space, U+2003 ISOpub -->
00370 <!ENTITY thinsp  "&#8201;"> <!-- thin space, U+2009 ISOpub -->
00371 <!ENTITY zwnj    "&#8204;"> <!-- zero width non-joiner,
00372                                     U+200C NEW RFC 2070 -->
00373 <!ENTITY zwj     "&#8205;"> <!-- zero width joiner, U+200D NEW RFC 2070 -->
00374 <!ENTITY lrm     "&#8206;"> <!-- left-to-right mark, U+200E NEW RFC 2070 -->
00375 <!ENTITY rlm     "&#8207;"> <!-- right-to-left mark, U+200F NEW RFC 2070 -->
00376 <!ENTITY ndash   "&#8211;"> <!-- en dash, U+2013 ISOpub -->
00377 <!ENTITY mdash   "&#8212;"> <!-- em dash, U+2014 ISOpub -->
00378 <!ENTITY lsquo   "&#8216;"> <!-- left single quotation mark,
00379                                     U+2018 ISOnum -->
00380 <!ENTITY rsquo   "&#8217;"> <!-- right single quotation mark,
00381                                     U+2019 ISOnum -->
00382 <!ENTITY sbquo   "&#8218;"> <!-- single low-9 quotation mark, U+201A NEW -->
00383 <!ENTITY ldquo   "&#8220;"> <!-- left double quotation mark,
00384                                     U+201C ISOnum -->
00385 <!ENTITY rdquo   "&#8221;"> <!-- right double quotation mark,
00386                                     U+201D ISOnum -->
00387 <!ENTITY bdquo   "&#8222;"> <!-- double low-9 quotation mark, U+201E NEW -->
00388 <!ENTITY dagger  "&#8224;"> <!-- dagger, U+2020 ISOpub -->
00389 <!ENTITY Dagger  "&#8225;"> <!-- double dagger, U+2021 ISOpub -->
00390 <!ENTITY permil  "&#8240;"> <!-- per mille sign, U+2030 ISOtech -->
00391 <!ENTITY lsaquo  "&#8249;"> <!-- single left-pointing angle quotation mark,
00392                                     U+2039 ISO proposed -->
00393 <!-- lsaquo is proposed but not yet ISO standardized -->
00394 <!ENTITY rsaquo  "&#8250;"> <!-- single right-pointing angle quotation mark,
00395                                     U+203A ISO proposed -->
00396 <!-- rsaquo is proposed but not yet ISO standardized -->
00397 <!ENTITY euro   "&#8364;"> <!-- euro sign, U+20AC NEW -->
00398 
00399 ]>
00400 '''
00401 
class visitor(object):
    """ Minimal depth-first walker over an xml.dom node tree.

        Subclasses override the visit_* hooks they are interested in. The
        default hooks do nothing, except visit_element, which descends into
        the element's children.
    """

    def do(self, tree):
        """ Walk all direct children of `tree` (e.g. a DOM document). """
        self.visit_node_list(tree.childNodes)

    def visit_node_list(self, nodelist):
        """ Visit every node of `nodelist`, in order.

            The loop runs over a snapshot of the list: minidom's childNodes
            is a live list, so a hook that removes the node it is currently
            visiting from its parent would otherwise shift the remaining
            siblings and make the loop skip the next one.
        """
        for node in list(nodelist):
            self.visit(node)

    def visit(self, node):
        """ Dispatch `node` to the hook matching its nodeType.

            Returns the hook's return value. Node types without a hook
            (comments, processing instructions, ...) are ignored and give None.
        """
        node_type = node.nodeType
        if node_type == Node.ELEMENT_NODE:
            return self.visit_element(node)
        elif node_type == Node.ATTRIBUTE_NODE:
            return self.visit_attribute(node)
        elif node_type == Node.TEXT_NODE:
            return self.visit_text(node)
        elif node_type == Node.CDATA_SECTION_NODE:
            return self.visit_cdata_section(node)
        return None

    def visit_element(self, node):
        """ Default element hook: recurse into the children, if any. """
        if len(node.childNodes):
            self.visit_node_list(node.childNodes)

    def visit_attribute(self, node):
        """ Default attribute hook: do nothing. """
        pass

    def visit_text(self, node):
        """ Default text hook: do nothing. """
        pass

    def visit_cdata_section(self, node):
        """ Default CDATA hook: do nothing. """
        pass
00433 
00434 
class strip_whitespace(visitor):
    """ Tree pass that tidies <p> elements in place.

        For every element whose localName is 'p':
         * trailing blanks / newlines of a final text child are stripped
           (the text child is removed completely if nothing is left),
         * the paragraph itself is removed from its parent if it ends up
           without any child nodes.
        All other elements are just descended into.
    """

    def visit_element(self, node):
        """ Clean up `node` if it is a paragraph, then process its children. """
        if node.localName == 'p':
            # XXX: our formatter adds a whitespace at the end of each paragraph
            if node.hasChildNodes():
                last = node.childNodes[-1]
                if last.nodeType == Node.TEXT_NODE:
                    data = last.data.rstrip('\n ')
                    if data == '':
                        # nothing but whitespace - drop the text node
                        node.removeChild(last)
                    else:
                        last.data = data
            # Remove empty paragraphs
            if not node.hasChildNodes():
                parent = node.parentNode
                if parent is not None: # a detached node has nothing to be removed from
                    parent.removeChild(node)

        if node.hasChildNodes():
            # Hand over a copy of the child list: a child paragraph that
            # removes itself from this node modifies node.childNodes, and
            # looping over the live list would then skip the sibling that
            # follows the removed paragraph.
            self.visit_node_list(list(node.childNodes))
00453 
00454 
00455 class convert_tree(visitor):
00456     white_space = object()
00457     new_line = object()
00458     new_line_dont_remove = object()
00459 
00460     def __init__(self, request, pagename):
00461         self.request = request
00462         self.pagename = pagename
00463 
00464     def do(self, tree):
00465         self.depth = 0
00466         self.text = []
00467         self.visit(tree.documentElement)
00468         self.check_whitespace()
00469         return ''.join(self.text)
00470 
00471     def check_whitespace(self):
00472         i = 0
00473         text = self.text
00474         while i < len(text):
00475             if text[i] is self.white_space:
00476                 if i == 0 or i == len(text)-1:
00477                     del text[i]
00478                 elif text[i-1].endswith(" ") or text[i-1].endswith("\n"):
00479                     # last char of previous element is whitespace
00480                     del text[i]
00481                 elif (text[i+1] is self.white_space or
00482                       # next element is white_space
00483                       text[i+1] is self.new_line):
00484                       # or new_line
00485                     del text[i]
00486                 elif text[i+1].startswith(" ") or text[i+1].startswith("\n"):
00487                     # first char of next element is whitespace
00488                     del text[i]
00489                 else:
00490                     text[i] = " "
00491                     i += 1
00492             elif text[i] is self.new_line:
00493                 if i == 0:
00494                     del text[i]
00495                 elif i == len(text) - 1:
00496                     text[i] = "\n"
00497                     i += 1
00498                 elif text[i-1].endswith("\n") or (
00499                       isinstance(text[i+1], str) and text[i+1].startswith("\n")):
00500                     del text[i]
00501                 else:
00502                     text[i] = "\n"
00503                     i += 1
00504             elif text[i] is self.new_line_dont_remove:
00505                 text[i] = "\n"
00506                 i += 1
00507             else:
00508                 i += 1
00509 
00510     def visit_text(self, node):
00511         self.text.append(node.data)
00512 
00513     def visit_element(self, node):
00514         name = node.localName
00515         if name is None: # not sure this can happen here (DOM comment node), but just for the case
00516             return
00517         func = getattr(self, "process_%s" % name, None)
00518         if func:
00519             func(node)
00520         else:
00521             self.process_inline(node)
00522 
00523     def visit_node_list_element_only(self, nodelist):
00524         for node in nodelist:
00525             if node.nodeType == Node.ELEMENT_NODE:
00526                 self.visit_element(node)
00527 
00528     def node_list_text_only(self, nodelist):
00529         result = []
00530         for node in nodelist:
00531             if node.nodeType == Node.TEXT_NODE:
00532                 result.append(node.data)
00533             else:
00534                 result.extend(self.node_list_text_only(node.childNodes))
00535         return "".join(result)
00536 
00537     def get_desc(self, nodelist):
00538         """ links can have either text or an image as description - we extract
00539             this from the child nodelist and return wiki markup.
00540         """
00541         markup = ''
00542         text = self.node_list_text_only(nodelist).replace("\n", " ").strip()
00543         if text:
00544             # found some text
00545             markup = text
00546         else:
00547             # search for an img / object
00548             for node in nodelist:
00549                 if node.nodeType == Node.ELEMENT_NODE:
00550                     name = node.localName
00551                     if name == 'img':
00552                         markup = self._process_img(node) # XXX problem: markup containts auto-generated alt text with link target
00553                         break
00554                     elif name == 'object':
00555                         markup = self._process_object(node)
00556                         break
00557         return markup
00558 
00559     def process_page(self, node):
00560         for i in node.childNodes:
00561             if i.nodeType == Node.ELEMENT_NODE:
00562                 self.visit_element(i)
00563             elif i.nodeType == Node.TEXT_NODE: # if this is missing, all std text under a headline is dropped!
00564                 txt = i.data.strip() # IMPORTANT: don't leave this unstripped or there will be wrong blanks
00565                 if txt:
00566                     self.text.append(txt)
00567             #we use <pre class="comment"> now, so this is currently unused:
00568             #elif i.nodeType == Node.COMMENT_NODE:
00569             #    self.text.append(i.data)
00570             #    self.text.append("\n")
00571 
00572     def process_br(self, node):
00573         self.text.append(self.new_line) # without this, std multi-line text below some heading misses a whitespace
00574                                         # when it gets merged to float text, like word word wordword word word
00575 
00576     def process_heading(self, node):
00577         text = self.node_list_text_only(node.childNodes).strip()
00578         if text:
00579             depth = int(node.localName[1])
00580             hstr = "=" * depth
00581             self.text.append(self.new_line)
00582             self.text.append("%s %s %s" % (hstr, text.replace("\n", " "), hstr))
00583             self.text.append(self.new_line)
00584 
    # all heading levels share one implementation, process_heading reads
    # the level from the digit in the tag name
    process_h1 = process_heading
    process_h2 = process_heading
    process_h3 = process_heading
    process_h4 = process_heading
    process_h5 = process_heading
    process_h6 = process_heading
00591 
00592     def _get_list_item_markup(self, list, listitem):
00593         before = ""
00594         #indent = str(self.depth) * self.depth # nice for debugging :)
00595         indent = " " * self.depth
00596         markup = ""
00597         name = list.localName
00598         if name == 'ol':
00599             class_ = listitem.getAttribute("class")
00600             if class_ == "gap":
00601                 before = self.new_line_dont_remove
00602             if list.hasAttribute("type"):
00603                 type = list.getAttribute("type")
00604             else:
00605                 type = "1"
00606             markup = "%s. " % type
00607         elif name == 'ul':
00608             class_ = listitem.getAttribute("class")
00609             if class_ == "gap":
00610                 before = self.new_line_dont_remove
00611             style = listitem.getAttribute("style")
00612             if re.match(ur"list-style-type:\s*none", style, re.I):
00613                 markup = ". "
00614                 # set markup with white space when list element containes table
00615                 for i in listitem.childNodes:
00616                     if i.nodeType == Node.ELEMENT_NODE:
00617                         if i.localName == 'table':
00618                             markup = ""
00619             else:
00620                 markup = "* "
00621         elif name == 'dl':
00622             markup = ":: "
00623         else:
00624             raise ConvertError("Illegal list type %s" % name)
00625         return before, indent, markup
00626 
00627     def process_dl(self, node):
00628         self.depth += 1
00629         markup = ":: " # can there be a dl dd without dt?
00630         for i in node.childNodes:
00631             if i.nodeType == Node.ELEMENT_NODE:
00632                 name = i.localName
00633                 if name == 'dt':
00634                     before, indent, markup = self._get_list_item_markup(node, i)
00635                     self.text.extend([before, indent])
00636                     text = self.node_list_text_only(i.childNodes)
00637                     self.text.append(text.replace("\n", " "))
00638                 elif name == 'dd':
00639                     self.text.append(markup)
00640                     self.process_list_item(i, indent) # XXX no dt -> indent is undefined!!!
00641                 else:
00642                     raise ConvertError("Illegal list element %s" % i.localName)
00643         self.depth -= 1
00644         if self.depth == 0:
00645             self.text.append(self.new_line_dont_remove)
00646 
00647     def process_list(self, node):
00648         self.depth += 1
00649         for i in node.childNodes:
00650             if i.nodeType == Node.ELEMENT_NODE:
00651                 name = i.localName
00652                 if name == 'li':
00653                     before, indent, markup = self._get_list_item_markup(node, i)
00654                     self.text.extend([before, indent, markup])
00655                     self.process_list_item(i, indent)
00656                 elif name in ('ol', 'ul', ):
00657                     self.process_list(i)
00658                 elif name == 'dl':
00659                     self.process_dl(i)
00660                 else:
00661                     raise ConvertError("Illegal list element %s" % i.localName)
00662         self.depth -= 1
00663         if self.depth == 0:
00664             self.text.append(self.new_line_dont_remove)
00665 
    # ordered and unordered lists share one implementation, the item
    # markup is chosen by _get_list_item_markup
    process_ul = process_list
    process_ol = process_list
00668 
00669     def empty_paragraph_queue(self, nodelist, indent, need_indent):
00670         if need_indent:
00671             self.text.append(indent)
00672         for i in nodelist:
00673             if i.nodeType == Node.ELEMENT_NODE:
00674                 if i.localName == 'br':
00675                     self.text.append('<<BR>>')
00676                 else:
00677                     self.process_inline(i)
00678             elif i.nodeType == Node.TEXT_NODE:
00679                 self.text.append(i.data.strip('\n').replace('\n', ' '))
00680         self.text.append(self.new_line)
00681         del nodelist[:]
00682 
00683     def process_list_item(self, node, indent):
00684         found = False
00685         need_indent = False
00686         pending = []
00687 
00688         # If this is a empty list item, we just terminate the line
00689         if node.childNodes.length == 0:
00690             self.text.append(self.new_line)
00691             return
00692 
00693         for i in node.childNodes:
00694             name = i.localName
00695 
00696             if name in ('p', 'pre', 'ol', 'ul', 'dl', 'table', ) and pending:
00697                 self.empty_paragraph_queue(pending, indent, need_indent)
00698                 need_indent = True
00699 
00700             if name == 'p':
00701                 if need_indent:
00702                     self.text.append(indent)
00703                 self.process_paragraph_item(i)
00704                 self.text.append(self.new_line)
00705                 found = True
00706             elif name == 'pre':
00707                 if need_indent:
00708                     self.text.append(indent)
00709                 self.process_preformatted_item(i)
00710                 found = True
00711             elif name in ('ol', 'ul', ):
00712                 self.process_list(i)
00713                 found = True
00714             elif name == 'dl':
00715                 self.process_dl(i)
00716                 found = True
00717             elif name == 'table':
00718                 if need_indent:
00719                     self.text.append(indent)
00720                 self.process_table(i)
00721                 found = True
00722             elif name == 'br':
00723                 pending.append(i)
00724             else:
00725                 pending.append(i)
00726 
00727             if found:
00728                 need_indent = True
00729 
00730         if pending:
00731             self.empty_paragraph_queue(pending, indent, need_indent)
00732 
00733     def process_blockquote(self, node):
00734         # XXX this does not really work. e.g.:
00735         # <bq>aaaaaa
00736         # <hr---------->
00737         # <bq>bbbbbb
00738         self.depth += 1
00739         for i in node.childNodes:
00740             if i.nodeType == Node.ELEMENT_NODE:
00741                 name = i.localName
00742                 if name == 'p':
00743                     self.text.append(self.new_line)
00744                     self.text.append(" " * self.depth)
00745                     self.process_p(i)
00746                 elif name == 'pre':
00747                     self.text.append(self.new_line)
00748                     self.text.append(" " * self.depth)
00749                     self.process_pre(i)
00750                 elif name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', ):
00751                     self.process_heading(i)
00752                 elif name in ('ol', 'ul', ):
00753                     self.process_list(i)
00754                 elif name == 'dl':
00755                     self.process_dl(i)
00756                 elif name == 'a':
00757                     self.process_a(i)
00758                 elif name == 'img':
00759                     self.process_img(i)
00760                 elif name == 'div':
00761                     self.visit_node_list_element_only(i.childNodes)
00762                 elif name == 'blockquote':
00763                     self.process_blockquote(i)
00764                 elif name == 'hr':
00765                     self.process_hr(i)
00766                 elif name == 'br':
00767                     self.process_br(i)
00768                 else:
00769                     raise ConvertError("process_blockquote: Don't support %s element" % name)
00770         self.depth -= 1
00771 
00772     def process_inline(self, node):
00773         if node.nodeType == Node.TEXT_NODE:
00774             self.text.append(node.data.strip('\n').replace('\n', ' '))
00775             return
00776 
00777         # do we need to check for Node.ELEMENT_NODE and return (do nothing)?
00778         name = node.localName # can be None for DOM Comment nodes
00779         if name is None:
00780             return
00781 
00782         # unsupported tags
00783         if name in (u'title', u'meta', u'style'):
00784             return
00785 
00786         if name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', ): # headers are not allowed here (e.g. inside a ul li),
00787             text = self.node_list_text_only(node.childNodes).strip() # but can be inserted via the editor
00788             self.text.append(text)                          # so we just drop the header markup and keep the text
00789             return
00790 
00791         func = getattr(self, "process_%s" % name, None)
00792         if func:
00793             func(node)
00794             return
00795 
00796         command_close = None
00797         if name in ('em', 'i', ):
00798             command = "''"
00799         elif name in ('strong', 'b', ):
00800             command = "'''"
00801         elif name == 'u':
00802             command = "__"
00803         elif name == 'big':
00804             command = "~+"
00805             command_close = "+~"
00806         elif name == 'small':
00807             command = "~-"
00808             command_close = "-~"
00809         elif name == 'strike':
00810             command = "--("
00811             command_close = ")--"
00812         elif name == 'sub':
00813             command = ",,"
00814         elif name == 'sup':
00815             command = "^"
00816         elif name in ('area', 'center', 'code', 'embed', 'fieldset', 'font', 'form', 'iframe', 'input', 'label', 'link', 'map',
00817                       'meta', 'noscript', 'option', 'script', 'select', 'textarea', 'wbr'):
00818             command = "" # just throw away unsupported elements
00819         else:
00820             raise ConvertError("process_inline: Don't support %s element" % name)
00821 
00822         self.text.append(command)
00823         for i in node.childNodes:
00824             # lonly childnodes checked if they are only 'br'
00825             if command and len(node.childNodes) == 1:
00826                 # formatted br alone is not wanted (who wants a bold br?)
00827                 if i.localName != 'br':
00828                     self.process_inline(i)
00829             else:
00830                 if i.localName == 'br':
00831                     # dont make a real \n because that breaks tables
00832                     self.text.append('<<BR>>')
00833                 else:
00834                     self.process_inline(i)
00835         if command_close:
00836             command = command_close
00837         self.text.append(command)
00838 
00839     def process_span(self, node):
00840         # process span tag for firefox3
00841         node_style = node.getAttribute("style")
00842 
00843         is_strike = node.getAttribute("class") == "strike"
00844         is_strike = is_strike or "line-through" in node_style
00845         is_strong = "bold" in node_style
00846         is_italic = "italic" in node_style
00847         is_underline = "underline" in node_style
00848         is_comment = node.getAttribute("class") == "comment"
00849 
00850         # start tag
00851         if is_comment:
00852             self.text.append("/* ")
00853         if is_strike:
00854             self.text.append("--(")
00855         if is_strong:
00856             self.text.append("'''")
00857         if is_italic:
00858             self.text.append("''")
00859         if is_underline:
00860             self.text.append("__")
00861 
00862         # body
00863         for i in node.childNodes:
00864             self.process_inline(i)
00865 
00866         # end tag
00867         if is_underline:
00868             self.text.append("__")
00869         if is_italic:
00870             self.text.append("''")
00871         if is_strong:
00872             self.text.append("'''")
00873         if is_strike:
00874             self.text.append(")--")
00875         if is_comment:
00876             self.text.append(" */")
00877 
00878     def process_div(self, node):
00879         # process indent
00880         self._process_indent(node)
00881 
00882         # ignore div tags - just descend
00883         for i in node.childNodes:
00884             self.visit(i)
00885 
00886     def process_tt(self, node):
00887         text = self.node_list_text_only(node.childNodes).replace("\n", " ")
00888         if node.getAttribute("class") == "backtick":
00889             self.text.append("`%s`" % text)
00890         else:
00891             self.text.append("{{{%s}}}" % text)
00892 
00893     def process_hr(self, node):
00894         if node.hasAttribute("class"):
00895             class_ = node.getAttribute("class")
00896         else:
00897             class_ = "hr0"
00898         if class_.startswith("hr") and class_[2] in "123456":
00899             length = int(class_[2]) + 4
00900         else:
00901             length = 4
00902         self.text.extend([self.new_line, "-" * length, self.new_line])
00903 
00904     def process_p(self, node):
00905         # process indent
00906         self._process_indent(node)
00907         self.process_paragraph_item(node)
00908         self.text.append("\n\n") # do not use self.new_line here!
00909 
00910     def _process_indent(self, node):
00911         # process indent
00912         node_style = node.getAttribute("style")
00913         match = re.match(r"margin-left:\s*(\d+)px", node_style)
00914         if match:
00915             left_margin = int(match.group(1))
00916             indent_depth = int(left_margin / 40)
00917             if indent_depth > 0:
00918                 self.text.append(' . ')
00919 
00920     def process_paragraph_item(self, node):
00921         for i in node.childNodes:
00922             if i.nodeType == Node.ELEMENT_NODE:
00923                 self.process_inline(i)
00924             elif i.nodeType == Node.TEXT_NODE:
00925                 self.text.append(i.data.strip('\n').replace('\n', ' '))
00926 
00927     def process_pre(self, node):
00928         self.process_preformatted_item(node)
00929         self.text.append(self.new_line)
00930 
00931     def process_preformatted_item(self, node):
00932         if node.hasAttribute("class"):
00933             class_ = node.getAttribute("class")
00934         else:
00935             class_ = None
00936         if class_ == "comment": # we currently use this for stuff like ## or #acl
00937             for i in node.childNodes:
00938                 if i.nodeType == Node.TEXT_NODE:
00939                     self.text.append(i.data.replace('\n', ''))
00940                 elif i.localName == 'br':
00941                     self.text.append(self.new_line)
00942                 else:
00943                     pass
00944         else:
00945             content_buffer = []
00946             longest_inner_formater = ''
00947             bang_args = ''
00948             delimiters = []
00949 
00950             """
00951             below code fixed for MoinMoinBugs/GuiEditorCantNest bug
00952             this has problem when outer delimiter has two more { than inside one
00953             e.g. {{{{{{ {{{ foo }}} }}}}}}  --> {{{{ {{{ foo }}} }}}}
00954                    {{{foo {{{ }}} foo}}} --> {{{{ {{{ }}} }}}}
00955             """
00956 
00957             for i in node.childNodes:
00958                 if i.nodeType == Node.TEXT_NODE:
00959                     # get longest pre tag({{{ or }}}) from content
00960                     delimiters.extend(re.compile("((?u){+)").findall(i.data))
00961                     delimiters.extend(re.compile("((?u)}+)").findall(i.data))
00962                     # when first line is empty, start iteration second line of i.data
00963                     data_lines = i.data.rstrip().split('\n')
00964                     if data_lines[0].strip() == '':
00965                         data_lines = data_lines[1:]
00966                     for line in data_lines:
00967                         if line.strip().startswith('#!'):
00968                             if bang_args == '':
00969                                 bang_args = line.strip()
00970                             else:
00971                                 content_buffer.extend([line, self.new_line])
00972                         else:
00973                             content_buffer.extend([line, self.new_line])
00974                 elif i.localName == 'br':
00975                     content_buffer.append(self.new_line_dont_remove)
00976                 else:
00977                     pass
00978 
00979             if delimiters:
00980                 longest_inner_formater = max(delimiters)
00981 
00982             if (len(longest_inner_formater) >= 3):
00983                 self.text.extend([("{" * (len(longest_inner_formater) + 1)) + bang_args, \
00984                                       self.new_line])
00985                 self.text.extend(content_buffer)
00986                 self.text.extend(["}" * (len(longest_inner_formater) + 1), \
00987                                       self.new_line])
00988             else:
00989                 self.text.extend(["{{{"+bang_args, self.new_line])
00990                 self.text.extend(content_buffer)
00991                 self.text.extend(["}}}", self.new_line])
00992 
    # value of the align / valign attribute of a table cell -> marker
    # that _cell_style puts in front of the cell style
    _alignment = {"left": "(",
                  "center": ":",
                  "right": ")",
                  "top": "^",
                  "bottom": "v"}
00998 
00999     def _check_length(self, value):
01000         try:
01001             int(value)
01002             return value + 'px'
01003         except ValueError:
01004             return value
01005 
01006     def _get_color(self, node, prefix):
01007         if node.hasAttribute("bgcolor"):
01008             value = node.getAttribute("bgcolor")
01009             match = re.match(r"rgb\((\d+),\s*(\d+),\s*(\d+)\)", value)
01010             if match:
01011                 value = '#%X%X%X' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
01012             else:
01013                 match = re.match(r"#[0-9A-Fa-f]{6}", value)
01014             if not prefix and match:
01015                 result = value
01016             else:
01017                 result = '%sbgcolor="%s"' % (prefix, value)
01018         else:
01019             result = ''
01020         return result
01021 
01022     def _table_style(self, node):
01023         # TODO: attrs = get_attrs(node)
01024         result = []
01025         result.append(self._get_color(node, 'table'))
01026         if node.hasAttribute("width"):
01027             value = node.getAttribute("width")
01028             result.append('tablewidth="%s"' % self._check_length(value))
01029         if node.hasAttribute("height"):
01030             value = node.getAttribute("height")
01031             result.append('tableheight="%s"' % self._check_length(value))
01032         if node.hasAttribute("align"):
01033             value = node.getAttribute("align")
01034             result.append('tablealign="%s"' % value)
01035         if node.hasAttribute("style"):
01036             result.append('tablestyle="%s"' % node.getAttribute("style"))
01037         if node.hasAttribute("class"):
01038             result.append('tableclass="%s"' % node.getAttribute("class"))
01039         return " ".join(result).strip()
01040 
01041     def _row_style(self, node):
01042         # TODO: attrs = get_attrs(node)
01043         result = []
01044         result.append(self._get_color(node, 'row'))
01045         if node.hasAttribute("style"):
01046             result.append('rowstyle="%s"' % node.getAttribute("style"))
01047         if node.hasAttribute("class"):
01048             result.append('rowclass="%s"' % node.getAttribute("class"))
01049         return " ".join(result).strip()
01050 
01051     def _cell_style(self, node):
01052         # TODO: attrs = get_attrs(node)
01053         if node.hasAttribute("rowspan"):
01054             rowspan = ("|%s" % node.getAttribute("rowspan"))
01055         else:
01056             rowspan = ""
01057 
01058         if node.hasAttribute("colspan"):
01059             colspan = int(node.getAttribute("colspan"))
01060         else:
01061             colspan = 1
01062 
01063         spanning = rowspan or colspan > 1
01064 
01065         align = ""
01066         result = []
01067         result.append(self._get_color(node, ''))
01068         if node.hasAttribute("align"):
01069             value = node.getAttribute("align")
01070             if not spanning or value != "center":
01071                 # ignore "center" in spanning cells
01072                 align += self._alignment.get(value, "")
01073         if node.hasAttribute("valign"):
01074             value = node.getAttribute("valign")
01075             if not spanning or value != "center":
01076                 # ignore "center" in spanning cells
01077                 align += self._alignment.get(value, "")
01078         if node.hasAttribute("width"):
01079             value = node.getAttribute("width")
01080             if value[-1] == "%":
01081                 align += value
01082             else:
01083                 result.append('width="%s"' % self._check_length(value))
01084         if node.hasAttribute("height"):
01085             value = node.getAttribute("height")
01086             result.append('height="%s"' % self._check_length(value))
01087         if node.hasAttribute("class"):
01088             result.append('class="%s"' % node.getAttribute("class"))
01089         if node.hasAttribute("id"):
01090             result.append('id="%s"' % node.getAttribute("id"))
01091         if node.hasAttribute("style"):
01092             result.append('style="%s"' % node.getAttribute("style"))
01093 
01094         if align:
01095             result.insert(0, "%s" % align)
01096         result.append(rowspan)
01097         return " ".join(result).strip()
01098 
01099     def process_table(self, node, style=""):
01100         if self.depth == 0:
01101             self.text.append(self.new_line)
01102         self.new_table = True
01103         style += self._table_style(node)
01104         for i in node.childNodes:
01105             if i.nodeType == Node.ELEMENT_NODE:
01106                 name = i.localName
01107                 if name == 'tr':
01108                     self.process_table_record(i, style)
01109                     style = ""
01110                 elif name in ('thead', 'tbody', 'tfoot'):
01111                     self.process_table(i, style)
01112                 elif name == 'caption':
01113                     self.process_caption(node, i, style)
01114                     style = ''
01115                 elif name in ('col', 'colgroup', 'strong', ):
01116                     pass # we don't support these, but we just ignore them
01117                 else:
01118                     raise ConvertError("process_table: Don't support %s element" % name)
01119             #else:
01120             #    raise ConvertError("Unexpected node: %r" % i)
01121         self.text.append(self.new_line_dont_remove)
01122 
01123     def process_caption(self, table, node, style=""):
01124         # get first row
01125         for i in table.childNodes:
01126             if i.localName in ('thead', 'tbody', 'tfoot'): # XXX is this correct?
01127             #if i.localName == 'tbody': (old version)
01128                 for i in i.childNodes:
01129                     if i.localName == 'tr':
01130                         break
01131                 break
01132             elif i.localName == 'tr':
01133                 break
01134         # count columns
01135         if i.localName == 'tr':
01136             colspan = 0
01137             for td in i.childNodes:
01138                 if not td.nodeType == Node.ELEMENT_NODE:
01139                     continue
01140                 span = td.getAttribute('colspan')
01141                 try:
01142                     colspan += int(span)
01143                 except ValueError:
01144                     colspan += 1
01145         else:
01146             colspan = 1
01147         text = self.node_list_text_only(node.childNodes).replace('\n', ' ').strip()
01148         if text:
01149             if style:
01150                 style = '<%s>' % style
01151             self.text.extend(["%s%s'''%s'''||" % ('||' * colspan, style, text), self.new_line_dont_remove])
01152 
01153     def process_table_data(self, node, style=""):
01154         if node.hasAttribute("colspan"):
01155             colspan = int(node.getAttribute("colspan"))
01156         else:
01157             colspan = 1
01158         self.text.append("||" * colspan)
01159 
01160         style += self._cell_style(node)
01161         if style:
01162             self.text.append("<%s>" % style)
01163 
01164         found = False
01165         for i in node.childNodes:
01166             name = i.localName
01167             if name == 'p':
01168                 self.process_paragraph_item(i)
01169                 self.text.append(self.white_space)
01170                 found = True
01171         if not found:
01172             for i in node.childNodes:
01173                 name = i.localName
01174                 if i.nodeType == Node.ELEMENT_NODE:
01175                     if name == 'br':
01176                         # if we get a br for a cell from e.g. cut and paste from OOo
01177                         # or if someone simulates a list by enter in a cell
01178                         # it should be appended as macro BR.
01179                         self.text.append('<<BR>>')
01180                         found = True
01181                         continue
01182                     else:
01183                         self.process_inline(i)
01184                         found = True
01185                 elif i.nodeType == Node.TEXT_NODE:
01186                     data = i.data.strip('\n').replace('\n', ' ')
01187                     if data:
01188                         found = True
01189                         self.text.append(data)
01190         if not found:
01191             self.text.append(" ")
01192 
01193     def process_table_record(self, node, style=""):
01194         if not self.new_table:
01195             self.text.append(" " * self.depth)
01196         else:
01197             self.new_table = False
01198         style += self._row_style(node)
01199         for i in node.childNodes:
01200             if i.nodeType == Node.ELEMENT_NODE:
01201                 name = i.localName
01202                 if name in ('td', 'th', ):
01203                     self.process_table_data(i, style=style)
01204                     style = ""
01205                 else:
01206                     raise ConvertError("process_table_record: Don't support %s element" % name)
01207         self.text.extend(["||", self.new_line_dont_remove])
01208 
    def process_a(self, node):
        """ Append the wiki markup for an <a> element to self.text.

        Only links (elements with a non-empty href) are handled; an element
        without href produces no output.  Depending on the class and title
        attributes, the href and the link description, the link is written
        as an interwiki link, an attachment link, a wiki page link, a mailto
        link or a plain / bracketed URL.

        Raises ConvertError if the element has class "interwiki" but
        resolve_interwiki reports an error or the href does not start with
        the resolved wiki URL.
        """
        attrs = get_attrs(node)

        title = attrs.pop('title', '')
        href = attrs.pop('href', None)
        css_class = attrs.get('class')

        # an empty script root is treated as "/" for the prefix checks below
        scriptname = self.request.script_root
        if scriptname == "":
            scriptname = "/"

        # can either be a link (with href) or an anchor (with e.g. id)
        # we don't need to support anchors here as we currently handle them as <<Anchor(id)>> macro
        if href:
            href = wikiutil.url_unquote(href)

            interwikiname = None
            # the link text (or markup of embedded content) as produced by get_desc
            desc = self.get_desc(node.childNodes)

            # interwiki link
            if css_class == "interwiki":
                wikitag, wikiurl, wikitail, err = wikiutil.resolve_interwiki(
                    self.request, title, "") # the title has the wiki name, page = ""
                if not err and href.startswith(wikiurl):
                    # the page name is whatever follows the wiki's base url
                    pagename = wikiutil.url_unquote(href[len(wikiurl):].lstrip('/'))
                    interwikiname = "%s:%s" % (wikitag, pagename)
                else:
                    raise ConvertError("Invalid InterWiki link: '%s'" % href)
            elif css_class == "badinterwiki" and title:
                if href == "/": # we used this as replacement for empty href
                    href = ""
                pagename = wikiutil.url_unquote(href)
                interwikiname = "%s:%s" % (title, pagename)
            # pagename is only bound when interwikiname was set above, so the
            # short-circuiting "and" keeps the comparison below safe
            if interwikiname and pagename == desc:
                if interwiki_re.match(interwikiname+' '): # the blank is needed by interwiki_re to match
                    # this is valid as a free interwiki link
                    self.text.append("%s" % interwikiname)
                else:
                    self.text.append("[[%s]]" % interwikiname)
                return
            elif title == 'Self':
                # a title of 'Self' wins over a labeled interwiki link: href is emitted as target
                self.text.append('[[%s|%s]]' % (href, desc))
                return
            elif interwikiname:
                # interwiki link whose description differs from the page name
                self.text.append("[[%s|%s]]" % (interwikiname, desc))
                return

            # fix links generated by a broken copy & paste of gecko based browsers
            brokenness = '../../../..'
            if href.startswith(brokenness):
                href = href[len(brokenness):] # just strip it away!
            # TODO: IE pastes complete http://server/Page/SubPage as href and as text, too

            # Attachments
            if title.startswith("attachment:"):
                attname = wikiutil.url_unquote(title[len("attachment:"):])
                if 'do=get' in href: # quick&dirty fix for not dropping &do=get param
                    parms = '|&do=get'
                else:
                    parms = ''
                # the description is only written when it differs from the
                # attachment name; an empty '|' slot is kept when parms follow
                if attname != desc:
                    desc = '|%s' % desc
                elif parms:
                    desc = '|'
                else:
                    desc = ''
                self.text.append('[[attachment:%s%s%s]]' % (attname, desc, parms))
            # wiki link
            elif href.startswith(scriptname):
                pagename = href[len(scriptname):]
                pagename = pagename.lstrip('/')    # XXX temp fix for generated pagenames starting with /
                if desc == pagename:
                    self.text.append(wikiutil.pagelinkmarkup(pagename))
                # relative link /SubPage
                elif desc.startswith('/') and href.endswith(desc):
                    if pagename.startswith(self.pagename): # is this a subpage of us?
                        # emit only the part after our own page name
                        self.text.append(wikiutil.pagelinkmarkup(pagename[len(self.pagename):]))
                    else:
                        self.text.append(wikiutil.pagelinkmarkup(pagename))
                # relative link ../
                elif desc.startswith('../') and href.endswith(desc[3:]):
                    self.text.append(wikiutil.pagelinkmarkup(desc))
                # internal link #internal
                elif '#' in href and pagename.startswith(self.pagename):
                    # link target is the part of href from the first '#' on
                    self.text.append(wikiutil.pagelinkmarkup(href[href.index('#'):], desc))
                # labeled link
                else:
                    self.text.append(wikiutil.pagelinkmarkup(pagename, desc))
            # mailto link
            elif href.startswith("mailto:"):
                # description equal to the address (with or without the scheme):
                # emit just the description, surrounded by white space
                if href == desc or href[len("mailto:"):] == desc:
                    self.text.extend([self.white_space, desc, self.white_space])
                else:
                    self.text.append("[[%s|%s]]" % (href, desc)) # XXX use a (renamed) pagelinkmarkup
            # link
            else:
                # blanks in the target are written as %20 in both branches
                if href == desc:
                    href = href.replace(" ", "%20")
                    self.text.append(href)
                else:
                    href = href.replace(" ", "%20")
                    if desc:
                        desc = '|' + desc
                    self.text.append("[[%s%s]]" % (href, desc))
01313 
01314     def process_img(self, node):
01315         markup = self._process_img(node)
01316         self.text.extend([self.white_space, markup, self.white_space])
01317 
01318     def _process_img(self, node):
01319         attrs = get_attrs(node)
01320 
01321         title = attrs.pop('title', '')
01322         if title.startswith("smiley:"):
01323             markup = title[len("smiley:"):]
01324             return markup
01325 
01326         alt = attrs.pop('alt', None)
01327         src = attrs.pop('src', None)
01328         css_class = attrs.get('class')
01329 
01330         target = src
01331         if title.startswith("attachment:"):
01332             target = wikiutil.url_unquote(title)
01333             if alt == title[len("attachment:"):]:
01334                 # kill auto-generated alt
01335                 alt = None
01336         elif title.startswith("drawing:"):
01337             target = wikiutil.url_unquote(title)
01338             if alt == title[len("drawing:"):]:
01339                 # kill auto-generated alt
01340                 alt = None
01341         else:
01342             if css_class == 'external_image':
01343                 # kill auto-generated alt and class
01344                 if src == alt:
01345                     alt = None
01346                 del attrs['class']
01347 
01348         if alt:
01349             desc = '|' + alt
01350         else:
01351             desc = ''
01352 
01353         params = ','.join(['%s="%s"' % (k, v) for k, v in attrs.items()])
01354                            # if k in ('width', 'height', )])
01355         if params:
01356             params = '|' + params
01357             if not desc:
01358                 desc = '|'
01359 
01360         markup = "{{%s%s%s}}" % (target, desc, params)
01361         return markup
01362 
01363     def process_object(self, node):
01364         markup = self._process_object(node)
01365         self.text.append(markup)
01366 
01367     def _process_object(self, node):
01368         attrs = get_attrs(node)
01369         markup = ''
01370         data = attrs.pop('data', None)
01371         if data:
01372             scheme, netloc, path, params, query, fragment = urlparse.urlparse(data)
01373             args = url_decode(query)
01374             action = args.get("action")
01375             attachname = args.get("target")
01376 
01377             if (not scheme and not netloc # same server (local attachment!)
01378                 and path and action == 'AttachFile' and attachname):
01379                 scriptname = self.request.script_root or "/"
01380                 pagename = path[len(scriptname):].lstrip("/")
01381                 pagename = wikiutil.url_unquote(pagename)
01382 
01383                 if pagename != self.request.page.page_name:
01384                     attachname = "%s/%s" % (pagename, attachname)
01385                 data = "attachment:%s" % attachname
01386 
01387             desc = self.get_desc(node.childNodes)
01388             if desc:
01389                 desc = '|' + desc
01390 
01391             # Exlude 'type' attribute cause it generates a 'key already present' error.
01392             params = ','.join(['%s="%s"' % (k, v) for k, v in attrs.items() if not k in ('type', )])
01393             if params:
01394                 params = '|' + params
01395                 if not desc:
01396                     desc = '|'
01397             markup = "{{%s%s%s}}" % (data, desc, params)
01398         return markup
01399         # TODO: for target PAGES, use some code from process_a to get the pagename from URL
01400         # TODO: roundtrip attachment: correctly
01401         # TODO: handle object's content better?
01402 
def get_attrs(node):
    """ Collect the attributes of <node> in a plain dict.

    Ordinary attributes are stored as name -> value.  The style attribute
    is not stored itself: it is split at ';' and every "property: value"
    declaration becomes an entry of its own (both parts stripped).
    Declarations that do not consist of exactly two ':'-separated parts
    are ignored.
    """
    result = {}
    attributes = node.attributes
    for key in attributes.keys():
        value = attributes.get(key).nodeValue
        if key != "style":
            # attributes other than style are taken over unchanged
            result[key] = value
            continue
        # unfold the style attribute into its single declarations
        for declaration in value.split(';'):
            if not declaration.strip():
                continue
            parts = declaration.split(':')
            if len(parts) == 2:
                prop, setting = parts
                result[prop.strip()] = setting.strip()
    return result
01418 
01419 
01420 def parse(request, text):
01421     text = u'<?xml version="1.0"?>%s%s' % (dtd, text)
01422     text = text.encode(config.charset)
01423     try:
01424         return xml.dom.minidom.parseString(text)
01425     except xml.parsers.expat.ExpatError, msg:
01426         # this sometimes crashes when it should not, so save the stuff to analyze it:
01427         logname = os.path.join(request.cfg.data_dir, "expaterror.log")
01428         f = file(logname, "w")
01429         f.write(text)
01430         f.write("\n" + "-"*80 + "\n" + str(msg))
01431         f.close()
01432         raise ConvertError('ExpatError: %s (see dump in %s)' % (msg, logname))
01433 
def convert(request, pagename, text):
    """ Convert the HTML fragment <text> to wiki markup and return it.

    The fragment is wrapped into a <page> element that declares the
    namespaces listed below, parsed, run through strip_whitespace and
    convert_tree, and finally every output line gets its trailing blanks
    removed.
    """
    # Due to expat needing explicitly set namespaces, we set these here to allow pasting
    # from Word / Excel without issues.
    # If you encounter 'ExpatError: unbound prefix', try adding the namespace to the list.
    namespace = (
        u'xmlns:o="urn:schemas-microsoft-com:office:office"',
        u'xmlns:x="urn:schemas-microsoft-com:office:excel"',
        u'xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"',
        u'xmlns:c="urn:schemas-microsoft-com:office:component:spreadsheet"',
        u'xmlns:s="uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882"',
        u'xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882"',
        u'xmlns:rs="urn:schemas-microsoft-com:rowset"',
        u'xmlns:z="#RowsetSchema"',
        u'xmlns:x2="http://schemas.microsoft.com/office/excel/2003/xml"',
        u'xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core"',
        u'xmlns:aml="http://schemas.microsoft.com/aml/2001/core"',
        u'xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml"',
        u'xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint"',
        u'xmlns:w10="urn:schemas-microsoft-com:office:word"',
        u'xmlns:v="urn:schemas-microsoft-com:office:vml"',
    )
    wrapped = u'<page %s>%s</page>' % (' '.join(namespace), text)
    tree = parse(request, wrapped)
    strip_whitespace().do(tree)
    markup = convert_tree(request, pagename).do(tree)

    # remove trailing blanks; the empty last item gives non-empty output a final newline
    lines = [line.rstrip() for line in markup.splitlines()]
    lines.append('')
    return '\n'.join(lines)
01459