Back to index

moin  1.9.0~rc2
text_docbook.py
Go to the documentation of this file.
00001 # -*- coding: iso-8859-1 -*-
00002 """
00003     MoinMoin - DocBook-XML Parser
00004 
00005     This code was tested with 4Suite 1.0a4 and 1.0b1
00006 
00007     @copyright: 2005 Henry Ho <henryho167 AT hotmail DOT com>,
00008                 2005 MoinMoin:AlexanderSchremmer
00009     @license: GNU GPL, see COPYING for details.
00010 
00011     DOCBOOK Parser:
00012 
00013     Features:
00014     - image support through Attachment
00015     - internal Wikilinks if a word is a strict wikiname
00016     - image alt is preserved
00017     - works with compiled xslt stylesheet for optimized performance
00018 
00019     Configuration:
00020     - make sure you have installed the DocBook XSLT files
00021     - set the path to the html directory of the DocBook XSLT files in your
00022       wiki or farm configuration:
00023       docbook_html_dir = r"/usr/share/xml/docbook/stylesheet/nwalsh/html/"
00024       Note that this directory needs to be writable because a cache file will
00025       be created there.
00026 
00027     >How can I use Ft API for DTD validation?
00028     If you have PyXML installed, you can use ValidatingReader rather than
00029     NonvalidatingReader.  See:
00030     http://uche.ogbuji.net/tech/akara/nodes/2003-01-01/domlettes
00031 """
00032 
00033 import os.path
00034 import cPickle
00035 import re
00036 
00037 from MoinMoin import  Page
00038 from MoinMoin.parser.text_xslt import Parser as XsltParser
00039 from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
00040 
# Cache-dependency list for this parser (MoinMoin parser convention —
# re-exported below as Parser.Dependencies); empty presumably means the
# rendered output depends on nothing external — TODO confirm against
# MoinMoin caching docs.
Dependencies = []
00042 
class Parser(XsltParser):
    """
        Send XML file formatted via XSLT.

        Wraps the generic XSLT parser with the DocBook HTML stylesheet
        (compiled and pickled on first use for speed) and post-processes
        the XSLT output so it fits into a wiki page: images become
        attachment links, strict WikiNames become hyperlinks, and markup
        that fails HTML 4.01 Strict is stripped.
    """

    caching = 1
    Dependencies = Dependencies

    def __init__(self, raw, request, **kw):
        """
        @param raw: raw DocBook XML page text
        @param request: current request; must supply cfg.docbook_html_dir
                        (directory holding the DocBook XSLT files, must be
                        writable so the compiled stylesheet can be cached)
        """
        XsltParser.__init__(self, raw, request)

        # relative paths to docbook.xsl and the pickled compiled stylesheet;
        # both live in the configured DocBook HTML directory
        docbook_html_directory = request.cfg.docbook_html_dir
        self.db_xsl = os.path.join(docbook_html_directory, 'docbook.xsl')
        self.db_compiled_xsl = os.path.join(docbook_html_directory, 'db_compiled.dat')

        # wiki sub-parser used for attachment URLs and WikiName link markup
        self.wikiParser = WikiParser(raw=self.raw, request=self.request, pretty_url=1)
        self.key = 'docbook'

    def format(self, formatter):
        """Render the page; the wiki sub-parser must share our formatter."""
        self.wikiParser.formatter = formatter
        XsltParser.format(self, formatter)

    def append_stylesheet(self):
        """
            virtual function, for docbook parser

            Compiles and pickles the DocBook stylesheet on first use, then
            feeds the unpickled stylesheet instance to the XSLT processor.
        """
        abs_db_xsl = os.path.abspath(self.db_xsl)
        abs_db_compiled_xsl = os.path.abspath(self.db_compiled_xsl)

        # same as path.exists, but also test if it is a file
        if not os.path.isfile(abs_db_compiled_xsl):
            _compile_xsl(abs_db_xsl, abs_db_compiled_xsl)

        assert os.path.isfile(abs_db_compiled_xsl)

        self.processor.appendStylesheetInstance(cPickle.load(file(abs_db_compiled_xsl, 'rb')))

    def parse_result(self, result):
        """
        additional parsing to the resulting XSLT'ed result (resultString) before saving

        will do:
            BASIC CLEAN UP   : remove unnecessary HTML tags
            RESOLVE IMG SRC  : fix src to find attachment
            RESOLVE WikiNames: if a word is a valid wikiname & a valid wikipage,
                               replace word with hyperlink

        @param result: HTML string produced by the XSLT transformation
        @return: cleaned-up HTML string
        """

        # BASIC CLEAN UP
        # remove from beginning until end of body tag
        found = re.search('<body.*?>', result)
        if found:
            result = result[found.end():]

        # remove everything after & including </body>
        found = result.rfind('</body>')
        if found != -1:
            result = result[:found]

        # RESOLVE IMG SRC
        # rewrite each <img> via the wiki parser's attachment handling so the
        # image is served from the page's attachments; the alt text is kept
        found = re.finditer('<img.*?>', result)
        splitResult = _splitResult(found, result)
        for index in range(len(splitResult)):
            if splitResult[index].startswith('<img'):
                found = re.search('src="(?P<source>.*?)"', splitResult[index])
                if not found:
                    # malformed <img> without src attribute: leave untouched
                    # (the original code would have crashed here)
                    continue
                imageSrc = found.group('source')
                imageAlt = None # save alt
                found = re.search('alt="(?P<alt>.*?)"', splitResult[index])
                if found:
                    imageAlt = found.group('alt')
                splitResult[index] = self.wikiParser.attachment(('attachment:' + imageSrc, ""))
                if imageAlt: # restore alt
                    splitResult[index] = re.sub('alt=".*?"', 'alt="%s"' % imageAlt, splitResult[index])
        result = ''.join(splitResult)

        # RESOLVE WikiNames
        #    if a word is a valid wikiname & a valid wikipage,
        #    replace word with hyperlink
        found = re.finditer(self.wikiParser.word_rule, result, re.UNICODE|re.VERBOSE)
        splitResult = _splitResult(found, result)
        for index in range(len(splitResult)):
            if (re.match(self.wikiParser.word_rule, splitResult[index], re.UNICODE|re.VERBOSE)
                and Page.Page(self.request, splitResult[index]).exists()):
                splitResult[index] = self.wikiParser._word_repl(splitResult[index])
        result = ''.join(splitResult)

        # remove stuff that fail HTML 4.01 Strict verification

        # remove unsupported attributes
        result = re.sub(' target=".*?"| type=".*?"', '', result)
        result = re.sub('<hr .*?>', '<hr>', result)

        # remove <p>...</p> inside <a>...</a> or <caption>...</caption>
        found = re.finditer('<a href=".*?</a>|<caption>.*?</caption>', result) # XXX re.DOTALL)
        splitResult = _splitResult(found, result)
        for index in range(len(splitResult)):
            if (splitResult[index].startswith('<a href="')
                or splitResult[index].startswith('<caption>')):
                splitResult[index] = splitResult[index].replace('<p>', '').replace('</p>', '')
        result = ''.join(splitResult)

        return result
00153 
00154 
00155 
def _compile_xsl(XSLT_FILE, XSLT_COMPILED_FILE):
    """
        compiling docbook stylesheet

        Parses the DocBook stylesheet with 4Suite, then pickles the
        compiled stylesheet root to XSLT_COMPILED_FILE so later requests
        can skip the (slow) stylesheet compilation step.

        reference: http://155.210.85.193:8010/ccia/nodes/2005-03-18/compileXslt?xslt=/akara/akara.xslt
    """
    from Ft.Xml.Xslt.Processor import Processor
    from Ft.Xml.Xslt import Stylesheet  # imported for its side effects on (un)pickling
    from Ft.Xml import InputSource
    from Ft.Lib import Uri

    # fresh processor just for compiling the stylesheet
    stylesheet_processor = Processor()

    # turn the stylesheet path into an input source the processor accepts
    stylesheet_uri = Uri.OsPathToUri(XSLT_FILE, 1)
    stylesheet_source = InputSource.DefaultFactory.fromUri(stylesheet_uri)

    # compile the stylesheet by appending it to the processor
    stylesheet_processor.appendStylesheet(stylesheet_source)

    # persist the compiled stylesheet root; it becomes the cached
    # abs_db_compiled_xsl file loaded by Parser.append_stylesheet
    output_file = file(XSLT_COMPILED_FILE, 'wb')
    cPickle.dump(stylesheet_processor.stylesheet.root, output_file) # , protocol=2)
    output_file.close()
00182 
00183 
00184 def _splitResult(iterator, result):
00185     startpos = 0
00186     splitResult = []
00187 
00188     for f in iterator:
00189         start, end = f.span()
00190         splitResult.append(result[startpos:start])
00191         splitResult.append(result[start:end])
00192         startpos = end
00193     splitResult.append(result[startpos:])
00194 
00195     return splitResult
00196