Back to index

plone3  3.1.7
pdf_to_text.py
Go to the documentation of this file.
"""
Transforms PDF documents to plain text using the pdftotext binary from
xpdf (www.foolabs.com/xpdf).
"""
00004 
00005 from Products.PortalTransforms.interfaces import itransform
00006 from Products.PortalTransforms.libtransforms.utils import bin_search, sansext
00007 from Products.PortalTransforms.libtransforms.commandtransform import commandtransform
00008 from Products.PortalTransforms.libtransforms.commandtransform import popentransform
00009 import os
00010 
class pdf_to_text(popentransform):
    # PDF -> plain-text transform driven by the external ``pdftotext``
    # binary.  Only declarative settings are defined here; no method is
    # overridden, so all behaviour comes from ``popentransform``.
    __implements__ = itransform

    # Transform identity and revision stamp.
    __name__ = 'pdf_to_text'
    __version__ = '2004-07-02.01'

    # MIME types accepted / produced, and the encoding of the produced text.
    inputs = ('application/pdf',)
    output = 'text/plain'
    output_encoding = 'utf-8'

    # Executable name and its argument template.  ``%(infile)s`` is a
    # placeholder for the input file path; ``-enc UTF-8`` matches
    # ``output_encoding`` above.
    # NOTE(review): the trailing ``-`` is presumably pdftotext's "write to
    # stdout" marker -- confirm against the pdftotext manual.
    binaryName = 'pdftotext'
    binaryArgs = '%(infile)s -enc UTF-8 -'

    # NOTE(review): presumably makes popentransform pass the PDF as a file
    # (via ``%(infile)s``) instead of piping it to stdin -- confirm against
    # the base class.
    useStdin = False
00024 
class old_pdf_to_text(commandtransform):
    # Older PDF -> plain-text transform: writes the PDF into a temporary
    # directory, runs ``pdftotext`` on it through the shell and reads the
    # generated ``.txt`` file back.  ``register()`` in this module returns
    # ``pdf_to_text`` instead, so this class is not what gets registered.
    __implements__ = itransform

    __name__ = "pdf_to_text"
    inputs   = ('application/pdf',)
    output  = 'text/plain'
    output_encoding = 'utf-8'

    binaryName = "pdftotext"

    def __init__(self):
        # The base class receives the binary name and is expected to make it
        # available as ``self.binary`` (read in invokeCommand).
        commandtransform.__init__(self, binary=self.binaryName)

    def convert(self, data, cache, **kwargs):
        # Convert the PDF bytes in ``data`` and store the result in ``cache``.
        #
        # ``cache`` receives the extracted text via ``setData`` and a dict of
        # sub-objects via ``setSubObjects``; the same ``cache`` is returned.
        # Any caller-supplied ``filename`` is overridden with a fixed name.
        kwargs['filename'] = 'unkown.pdf'

        tmpdir, fullname = self.initialize_tmpdir(data, **kwargs)
        # try/finally so the temporary directory is removed even when the
        # command or the sub-object handling raises.
        try:
            text = self.invokeCommand(tmpdir, fullname)
            path, images = self.subObjects(tmpdir)
            objects = {}
            if images:
                self.fixImages(path, images, objects)
        finally:
            self.cleanDir(tmpdir)
        cache.setData(text)
        cache.setSubObjects(objects)
        return cache

    def invokeCommand(self, tmpdir, fullname):
        # Run pdftotext on ``fullname`` inside ``tmpdir`` and return the text.
        #
        # Returns the content of the generated text file.  If that file
        # cannot be read, returns the content of ``error_log`` (the command's
        # captured stderr) instead, or '' when that is unreadable too.
        #
        # FIXME: windows users...
        textfile = "%s/%s.txt" % (tmpdir, sansext(fullname))
        cmd = 'cd "%s" && %s -enc UTF-8 "%s" "%s" 2>error_log 1>/dev/null' % (
            tmpdir, self.binary, fullname, textfile)
        os.system(cmd)

        text = self._readTextFile(textfile)
        if text is None:
            # No usable output: fall back to whatever the command printed
            # on stderr.
            text = self._readTextFile("%s/error_log" % tmpdir)
        if text is None:
            return ''
        return text

    def _readTextFile(self, path):
        # Return the content of ``path``, or None if it cannot be opened or
        # read.  The file handle is always closed.  Only I/O errors are
        # swallowed, so KeyboardInterrupt/SystemExit still propagate.
        try:
            handle = open(path, 'r')
        except (IOError, OSError):
            return None
        # Nested try blocks (rather than try/except/finally or ``with``)
        # keep this valid on old Python 2 interpreters.
        try:
            try:
                return handle.read()
            except (IOError, OSError):
                return None
        finally:
            handle.close()
00066 
def register():
    """Return a new ``pdf_to_text`` transform instance.

    NOTE(review): presumably the hook PortalTransforms calls to obtain this
    module's transform -- confirm against the caller.
    """
    transform = pdf_to_text()
    return transform