Back to index

plone3  3.1.7
pdf_to_html.py
Go to the documentation of this file.
00001 """
00002 Uses the http://sf.net/projects/pdftohtml binary to do its handiwork
00003 
00004 """
00005 from Products.PortalTransforms.interfaces import itransform
00006 from Products.PortalTransforms.libtransforms.utils import bin_search, sansext
00007 from Products.PortalTransforms.libtransforms.commandtransform import commandtransform
00008 from Products.PortalTransforms.libtransforms.commandtransform import popentransform
00009 from Products.CMFDefault.utils import bodyfinder
00010 import os
00011 
# Transform from application/pdf to text/html that drives the external
# "pdftohtml" binary through the popentransform base class.
#
# NOTE(review): documented with comments rather than docstrings so that
# __doc__ stays exactly as before -- confirm before converting to docstrings.
class popen_pdf_to_html(popentransform):
    __implements__ = itransform

    __version__ = '2004-07-02.01'

    # Transform identity and the MIME types it maps between.
    __name__ = "pdf_to_html"
    inputs = ('application/pdf',)
    output = 'text/html'
    output_encoding = 'utf-8'

    # External command configuration.  The %(infile)s placeholder is
    # presumably substituted by popentransform with the input file path, and
    # useStdin = False presumably tells it not to feed the data on stdin --
    # TODO confirm against popentransform.
    binaryName = "pdftohtml"
    binaryArgs = "%(infile)s -noframes -stdout -enc UTF-8"
    useStdin = False

    def getData(self, couterr):
        # Read everything available from `couterr` (presumably the child
        # process output stream -- verify against popentransform) and return
        # it after passing it through bodyfinder.
        raw_output = couterr.read()
        return bodyfinder(raw_output)
00028 
# Transform from application/pdf to text/html that runs the external
# "pdftohtml" binary inside a temporary working directory, reads back the
# HTML file it produces and stores the bodyfinder() result in the cache.
#
# NOTE(review): documented with comments rather than docstrings so that
# __doc__ stays exactly as before -- confirm before converting to docstrings.
class pdf_to_html(commandtransform):
    __implements__ = itransform

    # Transform identity and the MIME types it maps between.
    __name__ = "pdf_to_html"
    inputs   = ('application/pdf',)
    output  = 'text/html'
    output_encoding = 'utf-8'

    # Name of the external binary and the fixed options passed to it.
    binaryName = "pdftohtml"
    binaryArgs = "-noframes -enc UTF-8"

    def __init__(self):
        # Hand the binary name to the base class; invokeCommand() later reads
        # self.binary, which is presumably set there -- TODO confirm.
        commandtransform.__init__(self, binary=self.binaryName)

    def convert(self, data, cache, **kwargs):
        # Convert `data` and store the result on `cache`.
        #
        # data   -- the input handed to initialize_tmpdir() (presumably the
        #           raw PDF content -- verify against commandtransform).
        # cache  -- object receiving the result through setData() and
        #           setSubObjects(); it is also the return value.
        # kwargs -- forwarded to initialize_tmpdir(); any 'filename' entry is
        #           overwritten below.
        #
        # Exceptions raised by the helper methods propagate to the caller;
        # the temporary directory is cleaned up in that case as well.

        # The working file name is always forced to a fixed value, whatever
        # the caller supplied.
        kwargs['filename'] = 'unknown.pdf'

        tmpdir, fullname = self.initialize_tmpdir(data, **kwargs)
        try:
            html = self.invokeCommand(tmpdir, fullname)
            path, images = self.subObjects(tmpdir)
            objects = {}
            if images:
                self.fixImages(path, images, objects)
        finally:
            # Always release the temporary directory, even when one of the
            # steps above raised; previously a failure leaked it.
            self.cleanDir(tmpdir)
        cache.setData(bodyfinder(html))
        cache.setSubObjects(objects)
        return cache

    def invokeCommand(self, tmpdir, fullname):
        # Run the binary on `fullname` from within `tmpdir` and return text.
        #
        # Returns, in order of preference:
        #   1. the contents of "<sansext(fullname)>.html" inside tmpdir;
        #   2. the contents of "error_log" inside tmpdir (on POSIX the
        #      command's stderr is redirected there);
        #   3. a generic failure message that includes the command line.
        # Failures while reading are reported through the return value, not
        # by raising.
        #
        # NOTE(review): the command is a shell string with tmpdir and
        # fullname interpolated inside double quotes and no escaping.  This
        # relies on both values being free of shell metacharacters --
        # confirm against initialize_tmpdir().
        cmd = 'cd "%s" && %s %s "%s"' % (
              tmpdir, self.binary, self.binaryArgs, fullname)
        if os.name == 'posix':
            # Keep stderr for the fallback below and discard stdout.
            cmd += ' 2>error_log 1>/dev/null'
        os.system(cmd)

        # try/finally is used instead of `with` so that no syntax newer than
        # what the rest of this file uses is introduced.
        try:
            htmlfilename = os.path.join(tmpdir, sansext(fullname) + '.html')
            htmlfile = open(htmlfilename, 'r')
            try:
                html = htmlfile.read()
            finally:
                htmlfile.close()
        except Exception:
            # Best effort: no readable HTML output, so fall back to whatever
            # the command wrote to error_log.  `except Exception` (rather
            # than a bare except) keeps the fallback from intercepting
            # interpreter-exit signals on Pythons where those do not derive
            # from Exception.
            try:
                errfile = open("%s/error_log" % tmpdir, 'r')
                try:
                    return errfile.read()
                finally:
                    errfile.close()
            except Exception:
                return "transform failed while running %s (maybe this pdf file doesn't support transform)" % cmd
        return html
00076 
def register():
    # Build and return a new instance of the commandtransform-based
    # pdf_to_html class defined in this module (not popen_pdf_to_html).
    # Presumably called by the transform registration machinery -- confirm.
    transform = pdf_to_html()
    return transform