Back to index

python3.2  3.2.2
utf_8_sig.py
Go to the documentation of this file.
00001 """ Python 'utf-8-sig' Codec
00002 This works similarly to UTF-8, with the following changes:
00003 
00004 * On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
00005   first three bytes.
00006 
00007 * On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
00008   bytes will be skipped.
00009 """
00010 import codecs
00011 
00012 ### Codec APIs
00013 
def encode(input, errors='strict'):
    """Encode *input* as UTF-8, prefixed with the UTF-8 BOM.

    Returns a ``(data, length)`` tuple where *data* is the BOM followed by
    the UTF-8 encoding of *input* and *length* is ``len(input)``.
    """
    payload = codecs.utf_8_encode(input, errors)[0]
    return (codecs.BOM_UTF8 + payload, len(input))
00017 
def decode(input, errors='strict'):
    """Decode UTF-8 bytes, skipping a leading UTF-8 BOM if one is present.

    Returns a ``(text, consumed)`` tuple; *consumed* counts the three BOM
    bytes when they were skipped.  The input is always treated as complete
    (the underlying decoder is called with ``final=True``).
    """
    has_bom = input[:3] == codecs.BOM_UTF8
    skipped = 3 if has_bom else 0
    payload = input[3:] if has_bom else input
    text, used = codecs.utf_8_decode(payload, errors, True)
    return (text, used + skipped)
00025 
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental 'utf-8-sig' encoder.

    The first chunk encoded after construction or ``reset()`` is prefixed
    with the UTF-8 BOM; later chunks are plain UTF-8.  ``self.first`` is
    the flag (1 = BOM still pending, 0 = BOM already written) and doubles
    as the encoder state.
    """

    def __init__(self, errors='strict'):
        super().__init__(errors)
        self.first = 1

    def encode(self, input, final=False):
        """Return the UTF-8 bytes for *input*, with the BOM on first use."""
        prefix = b""
        if self.first:
            # Clear the flag before encoding, so it stays cleared even if
            # the encode call below raises.
            self.first = 0
            prefix = codecs.BOM_UTF8
        return prefix + codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Reset the encoder so the next chunk gets a BOM again."""
        super().reset()
        self.first = 1

    def getstate(self):
        """Return the BOM-pending flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore a state previously returned by ``getstate()``."""
        self.first = state
00048 
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental 'utf-8-sig' decoder.

    Decodes UTF-8 supplied in arbitrary chunks and skips a UTF-8 BOM when
    it forms the first three bytes of the stream.  ``self.first`` is 1
    until the start of the stream has been classified (BOM or no BOM) and
    0 afterwards.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.first = 1

    def _buffer_decode(self, input, errors, final):
        """Decode *input*, dropping a leading BOM at the start of the stream.

        Returns a ``(text, consumed)`` tuple.  While fewer than three bytes
        are available and they could still turn out to be a BOM, nothing
        is consumed so the decision is postponed to the next call.

        If *final* is true no further data will arrive, so a non-empty
        partial BOM is handed to the UTF-8 decoder instead of being
        postponed.  It is then reported according to *errors* (for
        example ``UnicodeDecodeError`` under ``'strict'``), matching the
        stateless ``decode()``, rather than being silently dropped.
        """
        if self.first:
            if len(input) < 3:
                may_be_bom = codecs.BOM_UTF8.startswith(input)
                if may_be_bom and not (final and input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return ("", 0)
                # Either these bytes cannot begin a BOM, or the stream
                # ended in the middle of one; decode them as plain UTF-8.
                self.first = 0
            else:
                self.first = 0
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = \
                       codecs.utf_8_decode(input[3:], errors, final)
                    # Count the skipped BOM bytes as consumed.
                    return (output, consumed+3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Reset the decoder so a BOM is looked for again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return ``(buffered_bytes, first_flag)`` as the decoder state."""
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # state[1] must be 0 here, as it isn't passed along to the caller
        return (state[0], self.first)

    def setstate(self, state):
        """Restore a state previously returned by ``getstate()``."""
        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = state[1]
00084 
class StreamWriter(codecs.StreamWriter):
    """Stream writer that emits the UTF-8 BOM before the first write.

    The class-level ``encode`` handles the first chunk and then shadows
    itself on the instance with the plain UTF-8 encoder; ``reset()``
    removes that shadow so the next chunk gets a BOM again.
    """

    def reset(self):
        """Reset the writer and re-arm BOM emission."""
        super().reset()
        # Drop the instance-level override if one was installed; when
        # there is none, the class-level ``encode`` is already in effect.
        try:
            del self.encode
        except AttributeError:
            pass

    def encode(self, input, errors='strict'):
        """Encode the first chunk with a BOM and switch to plain UTF-8."""
        # Install the plain encoder for every subsequent call ...
        self.encode = codecs.utf_8_encode
        # ... and let the module-level ``encode`` produce BOM + data now.
        return encode(input, errors)
00096 
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a UTF-8 BOM at the start of the stream.

    The class-level ``decode`` inspects the first bytes; once it knows
    whether a BOM is present it shadows itself on the instance with the
    plain UTF-8 decoder.  ``reset()`` removes that shadow again.
    """

    def reset(self):
        """Reset the reader and re-arm BOM detection."""
        super().reset()
        # Drop the instance-level override if one was installed.
        try:
            del self.decode
        except AttributeError:
            pass

    def decode(self, input, errors='strict'):
        """Decode the start of the stream, dropping a leading BOM."""
        bom = codecs.BOM_UTF8
        if len(input) < 3 and bom.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return ("", 0)
        # The BOM question is settled from here on: later calls go
        # straight to the plain UTF-8 decoder.
        self.decode = codecs.utf_8_decode
        if input[:3] == bom:
            text, used = codecs.utf_8_decode(input[3:], errors)
            return (text, used + 3)
        # no BOM present
        return codecs.utf_8_decode(input, errors)
00118 
00119 ### encodings module API
00120 
def getregentry():
    """Return the ``codecs.CodecInfo`` entry describing 'utf-8-sig'.

    The entry bundles this module's stateless functions with its
    incremental and stream classes.
    """
    registration = dict(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
    return codecs.CodecInfo(**registration)