Back to index

moin  1.9.0~rc2
mailimport.py
Go to the documentation of this file.
00001 """
00002     MoinMoin - E-Mail Import into wiki
00003 
00004     Just call this script with the URL of the wiki as a single argument
00005     and feed the mail into stdin.
00006 
00007     @copyright: 2006 MoinMoin:AlexanderSchremmer,
00008                 2006 MoinMoin:ThomasWaldmann
00009     @license: GNU GPL, see COPYING for details.
00010 """
00011 
00012 import sys, re, time
00013 import email
00014 from email.Utils import getaddresses, parsedate_tz, mktime_tz
00015 
00016 from MoinMoin import user
00017 from MoinMoin.action.AttachFile import add_attachment, AttachmentAlreadyExists
00018 from MoinMoin.Page import Page
00019 from MoinMoin.PageEditor import PageEditor
00020 # python, at least up to 2.4, ships a broken parser for headers
00021 from MoinMoin.support.HeaderFixed import decode_header
00022 
# file object the mail is read from when this module is run as a script
infile = sys.stdin

# when True, log() writes its diagnostic text to stderr
debug = False

# matches the signature separator line ("-- ") and everything following it;
# used to strip signatures from plain-text mail bodies
re_sigstrip = re.compile("\r?\n-- \r?\n.*$", re.S)
00028 
class attachment(object):
    """ Represents an attachment of a mail.

        @ivar filename: name of the attached file
        @ivar mimetype: content type of the mail part
        @ivar data: decoded payload of the mail part; may be None (this
                    happens for forwarded message content)
    """
    def __init__(self, filename, mimetype, data):
        self.filename = filename
        self.mimetype = mimetype
        self.data = data

    def __repr__(self):
        # data can legitimately be None; repr() must not raise for it
        if self.data is None:
            size = 0
        else:
            size = len(self.data)
        return "<attachment filename=%r mimetype=%r size=%i bytes>" % (
            self.filename, self.mimetype, size)
00039 
class ProcessingError(Exception):
    """ Raised when a mail cannot be imported into the wiki. """
00042 
def log(text):
    """ Print text to stderr, but only if the module-level debug flag is set. """
    if not debug:
        return
    print >> sys.stderr, text
00046 
def decode_2044(header):
    """ Decodes a header field that may contain MIME encoded-words.

        Despite the function name, the encoded-word syntax is specified in
        RFC 2047 (RFC 2044 describes UTF-8).

        @param header: raw header value; None (header not present) is accepted
        @rtype: unicode
        @return: the decoded header with surrounding whitespace stripped,
                 u'' if header is None
    """
    if header is None:
        return u''
    chunks_decoded = []
    for chunk in decode_header(header):
        try:
            text = chunk[0].decode(chunk[1] or 'ascii')
        except (LookupError, UnicodeError):
            # unknown charset name, or bytes that do not fit the declared
            # charset: keep the readable part instead of aborting the import
            text = chunk[0].decode('ascii', 'replace')
        chunks_decoded.append(text)
    return u''.join(chunks_decoded).strip()
00054 
def email_to_markup(request, email):
    """ Turn a (realname, mailaddr) tuple into wiki markup for that person.

        If a wiki user is found for the mail address, the result of that
        user's wikiHomeLink() is returned; otherwise the realname is used,
        or the mail address when the realname is empty.
    """
    realname, mailaddr = email
    wiki_user = user.get_by_email_address(request, mailaddr)
    if not wiki_user:
        return realname or mailaddr
    return wiki_user.wikiHomeLink()
00066 
def get_addrs(message, header):
    """ Collect the addresses of every `header` field of message and return
        them as a list of (realname, mailaddr) tuples. """
    decoded_fields = []
    for raw_field in message.get_all(header, []):
        decoded_fields.append(decode_2044(raw_field))
    return getaddresses(decoded_fields)
00071 
def _decode_payload(payload, charset):
    """ Decode the raw payload of a text part to unicode.

        An unknown charset name falls back to latin1 (the default that
        process_message uses when no charset is declared); bytes that do not
        fit the declared charset are replaced, so that a single broken part
        does not abort the whole import.
    """
    try:
        return payload.decode(charset)
    except LookupError:
        return payload.decode("latin1")
    except UnicodeError:
        return payload.decode(charset, "replace")

def process_message(message):
    """ Processes the read message and decodes attachments.

        @param message: message object generated by the email package
        @rtype: dict
        @return: dict with the keys 'text' and 'html' (joined bodies of the
                 text/plain and text/html parts), 'attachments' (list of
                 attachment objects), 'from_addr' (first From address as a
                 (realname, mailaddr) tuple), 'to_addrs', 'cc_addrs',
                 'bcc_addrs', 'envelope_to_addrs', 'target_addrs' (all of
                 the four lists concatenated), 'subject' and 'date'
                 (UTC, formatted as YYYY-MM-DDTHH:MM:SS)
        @raise ProcessingError: if the mail has no From address
    """
    attachments = []
    html_data = []
    text_data = []

    from_addrs = get_addrs(message, 'From')
    if not from_addrs:
        # report this like the other per-mail failures, not as an IndexError
        raise ProcessingError("Mail has no From address")
    from_addr = from_addrs[0]
    to_addrs = get_addrs(message, 'To')
    cc_addrs = get_addrs(message, 'Cc')
    bcc_addrs = get_addrs(message, 'Bcc') # depending on sending MTA, this can be present or not
    envelope_to_addrs = get_addrs(message, 'X-Original-To') + get_addrs(message, 'X-Envelope-To') # Postfix / Sendmail does this
    target_addrs = to_addrs + cc_addrs + bcc_addrs + envelope_to_addrs

    # a missing Subject header yields None, treat it as an empty subject
    subject = decode_2044(message['Subject'] or '')

    # parsedate_tz returns None for dates it cannot parse; for a missing or
    # unparsable Date header use the time of the import instead
    date_tuple = None
    if message['Date']:
        date_tuple = parsedate_tz(message['Date'])
    if date_tuple is None:
        timestamp = time.time()
    else:
        timestamp = mktime_tz(date_tuple)
    date = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(timestamp))

    # the To header may be absent (e.g. only envelope headers are present),
    # so do not index into an empty list just for the log message
    first_to = None
    if to_addrs:
        first_to = to_addrs[0]
    log("Processing mail:\n To: %r\n From: %r\n Subject: %r" % (first_to, from_addr, subject))

    for part in message.walk():
        log(" Part " + repr((part.get_charsets(), part.get_content_charset(), part.get_content_type(), part.is_multipart(), )))
        ct = part.get_content_type()
        cs = part.get_content_charset() or "latin1"
        payload = part.get_payload(None, True)

        fn = part.get_filename()
        if fn is not None and fn.startswith("=?"): # heuristics ...
            fn = decode_2044(fn)

        disposition = part["Content-Disposition"]
        if fn is None and disposition is not None and "attachment" in disposition:
            # this doesn't catch the case where there is no content-disposition but there is a file to offer to the user
            # i hope that this can be only found in mails that are older than 10 years,
            # so I won't care about it here
            fn = part["Content-Description"] or "NoName"
        if fn:
            attachments.append(attachment(fn, ct, payload))
        elif ct == 'text/plain':
            text = _decode_payload(payload, cs)
            text_data.append(text)
            log(repr(text))
        elif ct == 'text/html':
            html_data.append(_decode_payload(payload, cs))
        elif not part.is_multipart():
            log("Unknown mail part " + repr((part.get_charsets(), part.get_content_charset(), part.get_content_type(), part.is_multipart(), )))

    return {'text': u"".join(text_data), 'html': u"".join(html_data),
            'attachments': attachments,
            'target_addrs': target_addrs,
            'to_addrs': to_addrs, 'cc_addrs': cc_addrs, 'bcc_addrs': bcc_addrs, 'envelope_to_addrs': envelope_to_addrs,
            'from_addr': from_addr,
            'subject': subject, 'date': date}
00123 
def get_pagename_content(request, msg):
    """ Generates pagename and content according to the specification
        that can be found on MoinMoin:FeatureRequests/WikiEmailintegration

        @param request: request object; request.cfg supplies the
                        mail_import_* settings read below
        @param msg: dict as returned by process_message(); msg['subject'] is
                    updated in place (the pagename part found by the subject
                    regex is removed, an empty subject becomes '(...)')
        @rtype: dict
        @return: dict with the keys 'pagename', 'content' and
                 'generate_summary'
    """
    # imported locally: the module-level imports of this file do not provide
    # wikiutil, which is needed for normalize_pagename() below
    from MoinMoin import wikiutil

    generate_summary = False
    choose_html = True

    cfg = request.cfg
    email_subpage_template = cfg.mail_import_subpage_template
    email_pagename_envelope = cfg.mail_import_pagename_envelope
    wiki_addrs = cfg.mail_import_wiki_addrs
    search_list = cfg.mail_import_pagename_search
    re_subject = re.compile(cfg.mail_import_pagename_regex)

    subj = msg['subject'].strip()
    pagename_tpl = ""
    for method in search_list:
        if method == 'to':
            for addr in msg['target_addrs']:
                if addr[1].strip().lower() in wiki_addrs:
                    pagename_tpl = addr[0]
                    # special fix for outlook users :-)
                    if pagename_tpl and pagename_tpl[-1] == pagename_tpl[0] == "'":
                        pagename_tpl = pagename_tpl[1:-1]
                    if pagename_tpl:
                        break
        elif method == 'subject':
            m = re_subject.search(subj)
            if m:
                # group(1) is None if the group did not take part in the
                # match; keep pagename_tpl a string so strip() below works
                pagename_tpl = m.group(1) or ""
                # remove the pagename template from the subject:
                subj = re_subject.sub('', subj, 1).strip()
        if pagename_tpl:
            break

    pagename_tpl = pagename_tpl.strip()
    # last resort
    if not pagename_tpl:
        pagename_tpl = email_subpage_template

    if not subj:
        subj = '(...)' # we need non-empty subject
    msg['subject'] = subj

    # for normal use, email_pagename_envelope is just u"%s" - so nothing changes.
    # for special use, you can use u"+ %s/" - so you don't need to enter "+"
    # and "/" in every email, but you get the result as if you did.
    pagename_tpl = email_pagename_envelope % pagename_tpl

    if pagename_tpl.endswith("/"):
        pagename_tpl += email_subpage_template

    subject = msg['subject'].replace('/', '\\') # we can't use / in pagenames

    # rewrite using string.formatter when python 2.4 is mandatory
    pagename = (pagename_tpl.replace("$from", msg['from_addr'][0]).
                replace("$date", msg['date']).
                replace("$subject", subject))

    # a leading "+ " requests a summary entry on the parent page
    if pagename.startswith("+ ") and "/" in pagename:
        generate_summary = True
        pagename = pagename[1:].lstrip()

    pagename = wikiutil.normalize_pagename(pagename, request.cfg)

    if choose_html and msg['html']:
        # break up "}}}" so the mail cannot close the enclosing {{{ block early
        content = "{{{#!html\n%s\n}}}" % msg['html'].replace("}}}", "} } }")
    else:
        # strip signatures ...
        content = re_sigstrip.sub("", msg['text'])

    return {'pagename': pagename, 'content': content, 'generate_summary': generate_summary}
00195 
def import_mail_from_string(request, string):
    """ Parse `string` as an RFC 822 compliant message and import it
        to the wiki. """
    message = email.message_from_string(string)
    return import_mail_from_message(request, message)
00200 
def import_mail_from_file(request, infile):
    """ Parse an RFC 822 compliant message read from the file `infile` and
        import it to the wiki. """
    message = email.message_from_file(infile)
    return import_mail_from_message(request, message)
00205 
def _numbered_filename(filename, counter):
    """ Return the attachment name to try for attempt number `counter`:
        the unchanged filename for 0, otherwise the filename with
        "-<counter>" inserted before the last extension (or appended, if
        the filename has no extension). """
    if counter == 0:
        return filename
    new_suffix = "-" + str(counter)
    components = filename.split(".")
    if len(components) > 1:
        return u"%s%s.%s" % (u".".join(components[:-1]), new_suffix, components[-1])
    return filename + new_suffix

def _find_overview_table(lines):
    """ Locate the mail overview table in the lines of a page.

        Returns (found_table, table_ends): the index of the
        "## mail_overview" marker line and the index just behind the last
        table row, or (None, None) if there is no such table. """
    found_table = None
    table_ends = None
    for lineno, line in enumerate(lines):
        # the marker only counts if a table row follows it; the bounds check
        # covers a marker that is the very last line of the page
        if (line.startswith("## mail_overview") and lineno + 1 < len(lines)
                and lines[lineno + 1].startswith("||")):
            found_table = lineno
        elif found_table is not None and line.startswith("||"):
            table_ends = lineno + 1
        elif table_ends is not None and not line.startswith("||"):
            break
    return found_table, table_ends

def import_mail_from_message(request, message):
    """ Reads a message generated by the email package and imports it
        to the wiki.

        The mail body is appended to the page chosen by
        get_pagename_content(), the mail's attachments are attached to that
        page and, if a summary was requested, a row for the mail is added to
        the overview table on the parent page.

        @param request: request object; request.user is replaced by the
                        result of looking up the sender's mail address
        @param message: message object generated by the email package
        @raise ProcessingError: if no user is found for the sender address
                                or saving the page is denied
    """
    _ = request.getText
    msg = process_message(message)

    wiki_addrs = request.cfg.mail_import_wiki_addrs

    request.user = user.get_by_email_address(request, msg['from_addr'][1])

    if not request.user:
        raise ProcessingError("No suitable user found for mail address %r" % (msg['from_addr'][1], ))

    d = get_pagename_content(request, msg)
    pagename = d['pagename']
    generate_summary = d['generate_summary']

    comment = u"Mail: '%s'" % (msg['subject'], )

    page = PageEditor(request, pagename, do_editor_backup=0)

    if not request.user.may.save(page, "", 0):
        raise ProcessingError("Access denied for page %r" % pagename)

    attachments = []

    for att in msg['attachments']:
        # att.data can be None for forwarded message content - we can
        # just ignore it, the forwarded message's text will be present
        # nevertheless
        if att.data is None:
            continue
        i = 0
        while i < 1000: # do not create a gazillion attachments if something
                        # strange happens, give up after 1000.
            fname = _numbered_filename(att.filename, i)
            try:
                # get the fname again, it might have changed
                fname, fsize = add_attachment(request, pagename, fname, att.data)
                attachments.append(fname)
                break
            except AttachmentAlreadyExists:
                i += 1

    # build an attachment link table for the page with the e-mail
    attachment_links = [""] + [u'''[[attachment:%s|%s]]''' % ("%s/%s" % (pagename, name), name) for name in attachments]

    # assemble old page content and new mail body together
    old_content = Page(request, pagename).get_raw_body()
    if old_content:
        new_content = u"%s\n-----\n" % old_content
    else:
        new_content = ''

    #if not (generate_summary and "/" in pagename):
    #generate header in any case:
    new_content += u"'''Mail: %s (%s, <<DateTime(%s)>>)'''\n\n" % (msg['subject'], email_to_markup(request, msg['from_addr']), msg['date'])

    new_content += d['content']
    new_content += "\n" + u"\n * ".join(attachment_links)

    try:
        page.saveText(new_content, 0, comment=comment)
    except page.AccessDenied:
        raise ProcessingError("Access denied for page %r" % pagename)

    if generate_summary and "/" in pagename:
        parent_page = u"/".join(pagename.split("/")[:-1])
        old_content = Page(request, parent_page).get_raw_body().splitlines()

        found_table, table_ends = _find_overview_table(old_content)

        # in order to let the gettext system recognise the <<GetText>> calls used below,
        # we must repeat them here:
        [_("Date"), _("From"), _("To"), _("Content"), _("Attachments")]

        table_header = (u"\n\n## mail_overview (don't delete this line)\n" +
                        u"|| '''<<GetText(Date)>> ''' || '''<<GetText(From)>> ''' || '''<<GetText(To)>> ''' || '''<<GetText(Content)>> ''' || '''<<GetText(Attachments)>> ''' ||\n"
                       )

        from_col = email_to_markup(request, msg['from_addr'])
        # leave out the wiki's own addresses; compare them the same way
        # get_pagename_content() does (stripped, lower case)
        to_col = ' '.join([email_to_markup(request, (realname, mailaddr))
                           for realname, mailaddr in msg['target_addrs']
                           if mailaddr.strip().lower() not in wiki_addrs])
        subj_col = '[[%s|%s]]' % (pagename, msg['subject'])
        date_col = msg['date']
        attach_col = " ".join(attachment_links)
        new_line = u'|| <<DateTime(%s)>> || %s || %s || %s || %s ||' % (date_col, from_col, to_col, subj_col, attach_col)
        if found_table is not None and table_ends is not None:
            # insert the new row directly behind the last existing table row
            content = "\n".join(old_content[:table_ends] + [new_line] + old_content[table_ends:])
        else:
            content = "\n".join(old_content) + table_header + new_line

        page = PageEditor(request, parent_page, do_editor_backup=0)
        page.saveText(content, 0, comment=comment)
00316 
00317 if __name__ == "__main__":
00318     if len(sys.argv) > 1:
00319         request_url = sys.argv[1]
00320     else:
00321         request_url = None
00322 
00323     from MoinMoin.web.contexts import ScriptContext
00324     request = ScriptContext(url=request_url)
00325 
00326     try:
00327         import_mail_from_file(request, infile)
00328     except ProcessingError, e:
00329         print >> sys.stderr, "An error occured while processing the message:", e.args
00330