Back to index

plone3  3.1.7
encoding.py
Go to the documentation of this file.
00001 import re
00002 import encodings
00003 from Products.MimetypesRegistry.common import log
00004 
# Patterns used by _guess_encoding to spot an encoding declaration.
# They are raw strings so that regex escapes such as \s, \* and \? are never
# interpreted (or warned about) as string escapes.

# emacs style declaration, e.g. "# -*- coding: utf-8 -*-"
EMACS_ENCODING_RGX = re.compile(
    r'[^#]*[#\s]*-\*-\s*coding: ([^\s]*)\s*-\*-\s*')
# vim style declaration, e.g. "# vim:fileencoding=utf-8"
VIM_ENCODING_RGX = re.compile(r'[^#]*[#\s]*vim:fileencoding=\s*([^\s]*)\s*')
# xml declaration; group 1 is the encoding value *including* its quotes
# (_guess_encoding strips them).  An optional standalone pseudo-attribute is
# accepted after the encoding, e.g.
# <?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
XML_ENCODING_RGX = re.compile(
    r'<\?xml version=[^\s]*\s*encoding=([^\s]*)'
    r'(?:\s+standalone=[^\s]*)?\s*\?>')
# charset parameter as found in html meta tags / content types; an opening
# quote is skipped and the name stops at whitespace, a quote, ';' or '>' so
# that both content="text/html; charset=utf-8" and <meta charset="utf-8">
# yield a bare encoding name
CHARSET_RGX = re.compile(r'charset=["\']?([^\s"\'>;]*)')
00009 
def guess_encoding(buffer):
    """Guess the encoding of a buffer and check that python supports it.

    The candidate name is obtained from _guess_encoding and then validated
    by looking up its lower-cased form with encodings.search_function.

    Returns the detected name exactly as it was found (its case is kept),
    or None when nothing was detected or python has no codec for that name.
    """
    encoding = _guess_encoding(buffer)
    # nothing detected (None or empty string), or not a string at all
    if not encoding or not isinstance(encoding, basestring):
        return None
    # try to find an encoding function for the encoding; the lookup is done
    # on the lower-cased name because python is using lower case names for
    # encodings.  If None is returned or an exception is raised the
    # encoding is invalid.
    try:
        codec = encodings.search_function(encoding.lower())
    except Exception:
        # XXX log
        # still best effort, but unlike a bare "except:" this no longer
        # traps SystemExit / KeyboardInterrupt (python >= 2.5)
        codec = None

    if not codec:
        return None
    # got a valid encoding
    return encoding
00036     
00037 
00038 def _guess_encoding(buffer):
00039     """try to guess encoding from a buffer
00040 
00041     FIXME: it could be mime type driven but it seems less painful like that
00042     """
00043     assert type(buffer) is type(''), type(buffer)
00044     # default to ascii on empty buffer
00045     if not buffer:
00046         return 'ascii'
00047 
00048     # check for UTF-8 byte-order mark
00049     if buffer.startswith('\xef\xbb\xbf'):
00050         return 'UTF-8'
00051 
00052     first_lines = buffer.split('\n')[:2]
00053     for line in first_lines:
00054         # check for emacs encoding declaration
00055         m = EMACS_ENCODING_RGX.match(line)
00056         if m is not None:
00057             return m.group(1)
00058         # check for vim encoding declaration
00059         m = VIM_ENCODING_RGX.match(line)
00060         if m is not None:
00061             return m.group(1)
00062 
00063     # check for xml encoding declaration
00064     if first_lines[0].startswith('<?xml'):
00065         m = XML_ENCODING_RGX.match(first_lines[0])
00066         if m is not None:
00067             return m.group(1)[1:-1]
00068         # xml files with no encoding declaration default to UTF-8
00069         return 'UTF-8'
00070 
00071     # try to get charset declaration
00072     # FIXME: should we check it's html before ?
00073     m = CHARSET_RGX.search(buffer)
00074     if m is not None:
00075         return m.group(1)
00076     return None