Back to index

python3.2  3.2.2
codecs.py
Go to the documentation of this file.
00001 """ codecs -- Python Codec Registry, API and helpers.
00002 
00003 
00004 Written by Marc-Andre Lemburg (mal@lemburg.com).
00005 
00006 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
00007 
00008 """#"
00009 
00010 import builtins, sys
00011 
00012 ### Registry and builtin stateless codec functions
00013 
00014 try:
00015     from _codecs import *
00016 except ImportError as why:
00017     raise SystemError('Failed to load the builtin codecs: %s' % why)
00018 
# Public names of this module, i.e. what a star-import provides.
__all__ = [
    # registry access and file helpers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8",
    "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # error handling callbacks and their registry
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors",
    "register_error", "lookup_error",
]
00026 
### Constants

#
# The Byte Order Mark (BOM) is the character U+FEFF (ZERO WIDTH
# NO-BREAK SPACE).  The names below hold the byte strings it turns
# into for UTF-8, UTF-16 and UTF-32 output, in both byte orders.
#

# Marks with a fixed byte order
BOM_UTF8 = b'\xef\xbb\xbf'
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Short aliases for the two UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Marks in the native byte order of the running interpreter
if sys.byteorder == 'little':
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE
else:
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names carry the
# UTF-16 marks and the "64" names carry the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
00071 
00072 
00073 ### Codec base classes (defining the API)
00074 
class CodecInfo(tuple):

    """ Describes one codec as a tuple that also carries attributes.

        Seen as a tuple the object holds the four entries (encode,
        decode, streamreader, streamwriter).  Every constructor
        argument is additionally stored as an attribute of the same
        name.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None):
        # only the four classic entries take part in the tuple value
        info = tuple.__new__(
            cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
00093 
class Codec:

    """ Interface of a stateless encoder/decoder pair.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input; the result is a tuple (output
            object, length consumed).

            errors selects the error handling and defaults to
            'strict'.

            No state may be kept in the Codec instance.  Codecs that
            need state to encode efficiently should build on
            StreamWriter instead.

            Zero length input must be accepted; in that case an empty
            object of the output object type is returned.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input; the result is a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot; buffer objects and memory mapped files are
            examples of such objects.

            errors selects the error handling and defaults to
            'strict'.

            No state may be kept in the Codec instance.  Codecs that
            need state to decode efficiently should build on
            StreamReader instead.

            Zero length input must be accepted; in that case an empty
            object of the output object type is returned.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
00157 
class IncrementalEncoder(object):
    """
    Base class for encoders that work on their input piece by piece.

    Successive pieces are handed to encode(); whatever state is needed
    between those calls lives in the instance.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalEncoder.

        errors selects the error handling scheme; the predefined
        values are the ones listed in the docstring of the Codec class.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be provided by subclasses; this version only raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the encoder back into its initial state (nothing to do in
        this base class).
        """
        return None

    def getstate(self):
        """
        Return the current state of the encoder, which is always 0 in
        this base class.
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate() (ignored
        by this base class).
        """
        return None
00197 
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder variant that holds back input which could not be
    encoded yet and retries it together with the next piece passed to
    encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input handed to encode() that has not been consumed so far
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and hand back a tuple
        # (output, length consumed)
        raise NotImplementedError

    def encode(self, input, final=False):
        # leftovers from the previous call go in front of the new input
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        # any false state (such as 0) stands for an empty buffer
        self.buffer = state if state else ""
00231 
class IncrementalDecoder(object):
    """
    Base class for decoders that work on their input piece by piece.

    Successive pieces are handed to decode(); whatever state is needed
    between those calls lives in the instance.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalDecoder.

        errors selects the error handling scheme; the predefined
        values are the ones listed in the docstring of the Codec class.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be provided by subclasses; this version only raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the decoder back into its initial state (nothing to do in
        this base class).
        """
        return None

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a tuple (buffered_input, additional_state_info).
        buffered_input must be a bytes object holding the bytes that
        were given to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer that
        describes the decoder state WITHOUT the contents of
        buffered_input having been processed.  Initially and after
        reset() the result must be (b"", 0), which is what this base
        class always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate() (ignored
        by this base class).

        setstate((b"", 0)) must have the same effect as reset().
        """
        return None
00280 
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder variant for decoders that have to cope with
    incomplete byte sequences: bytes that could not be decoded yet are
    held back and retried together with the next piece.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # bytes handed to decode() that have not been consumed so far
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and hand back a tuple
        # (output, length consumed)
        raise NotImplementedError

    def decode(self, input, final=False):
        # leftovers from the previous call go in front of the new input
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # the second entry (additional state info) is always 0 here
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input is restored; the second entry of the
        # state tuple is not looked at
        self.buffer = state[0]
00316 
00317 #
00318 # The StreamWriter and StreamReader class provide generic working
00319 # interfaces which can be used to implement new encoding submodules
00320 # very easily. See encodings/utf_8.py for an example on how this is
00321 # done.
00322 #
00323 
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme.  These values
            are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the encoded
            data to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and pass the result to .write()
            in one go.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the data on the output should be in a
            clean state, so that fresh data can be appended without
            rescanning the whole stream to recover state.

            Nothing is done in this base class.

        """
        return None

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream; the writer is reset as
            well when the target is the very start of the stream.
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every other attribute on the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the underlying stream
        self.stream.close()
00394 
00395 ###
00396 
00397 class StreamReader(Codec):
00398 
00399     charbuffertype = str
00400 
00401     def __init__(self, stream, errors='strict'):
00402 
00403         """ Creates a StreamReader instance.
00404 
00405             stream must be a file-like object open for reading
00406             (binary) data.
00407 
00408             The StreamReader may use different error handling
00409             schemes by providing the errors keyword argument. These
00410             parameters are predefined:
00411 
00412              'strict' - raise a ValueError (or a subclass)
00413              'ignore' - ignore the character and continue with the next
00414              'replace'- replace with a suitable replacement character;
00415 
00416             The set of allowed parameter values can be extended via
00417             register_error.
00418         """
00419         self.stream = stream
00420         self.errors = errors
00421         self.bytebuffer = b""
00422         self._empty_charbuffer = self.charbuffertype()
00423         self.charbuffer = self._empty_charbuffer
00424         self.linebuffer = None
00425 
00426     def decode(self, input, errors='strict'):
00427         raise NotImplementedError
00428 
00429     def read(self, size=-1, chars=-1, firstline=False):
00430 
00431         """ Decodes data from the stream self.stream and returns the
00432             resulting object.
00433 
00434             chars indicates the number of characters to read from the
00435             stream. read() will never return more than chars
00436             characters, but it might return less, if there are not enough
00437             characters available.
00438 
00439             size indicates the approximate maximum number of bytes to
00440             read from the stream for decoding purposes. The decoder
00441             can modify this setting as appropriate. The default value
00442             -1 indicates to read and decode as much as possible.  size
00443             is intended to prevent having to decode huge files in one
00444             step.
00445 
00446             If firstline is true, and a UnicodeDecodeError happens
00447             after the first line terminator in the input only the first line
00448             will be returned, the rest of the input will be kept until the
00449             next call to read().
00450 
00451             The method should use a greedy read strategy meaning that
00452             it should read as much data as is allowed within the
00453             definition of the encoding and the given size, e.g.  if
00454             optional encoding endings or state markers are available
00455             on the stream, these should be read too.
00456         """
00457         # If we have lines cached, first merge them back into characters
00458         if self.linebuffer:
00459             self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
00460             self.linebuffer = None
00461 
00462         # read until we get the required number of characters (if available)
00463         while True:
00464             # can the request can be satisfied from the character buffer?
00465             if chars < 0:
00466                 if size < 0:
00467                     if self.charbuffer:
00468                         break
00469                 elif len(self.charbuffer) >= size:
00470                     break
00471             else:
00472                 if len(self.charbuffer) >= chars:
00473                     break
00474             # we need more data
00475             if size < 0:
00476                 newdata = self.stream.read()
00477             else:
00478                 newdata = self.stream.read(size)
00479             # decode bytes (those remaining from the last call included)
00480             data = self.bytebuffer + newdata
00481             try:
00482                 newchars, decodedbytes = self.decode(data, self.errors)
00483             except UnicodeDecodeError as exc:
00484                 if firstline:
00485                     newchars, decodedbytes = \
00486                         self.decode(data[:exc.start], self.errors)
00487                     lines = newchars.splitlines(True)
00488                     if len(lines)<=1:
00489                         raise
00490                 else:
00491                     raise
00492             # keep undecoded bytes until the next call
00493             self.bytebuffer = data[decodedbytes:]
00494             # put new characters in the character buffer
00495             self.charbuffer += newchars
00496             # there was no data available
00497             if not newdata:
00498                 break
00499         if chars < 0:
00500             # Return everything we've got
00501             result = self.charbuffer
00502             self.charbuffer = self._empty_charbuffer
00503         else:
00504             # Return the first chars characters
00505             result = self.charbuffer[:chars]
00506             self.charbuffer = self.charbuffer[chars:]
00507         return result
00508 
00509     def readline(self, size=None, keepends=True):
00510 
00511         """ Read one line from the input stream and return the
00512             decoded data.
00513 
00514             size, if given, is passed as size argument to the
00515             read() method.
00516 
00517         """
00518         # If we have lines cached from an earlier read, return
00519         # them unconditionally
00520         if self.linebuffer:
00521             line = self.linebuffer[0]
00522             del self.linebuffer[0]
00523             if len(self.linebuffer) == 1:
00524                 # revert to charbuffer mode; we might need more data
00525                 # next time
00526                 self.charbuffer = self.linebuffer[0]
00527                 self.linebuffer = None
00528             if not keepends:
00529                 line = line.splitlines(False)[0]
00530             return line
00531 
00532         readsize = size or 72
00533         line = self._empty_charbuffer
00534         # If size is given, we call read() only once
00535         while True:
00536             data = self.read(readsize, firstline=True)
00537             if data:
00538                 # If we're at a "\r" read one extra character (which might
00539                 # be a "\n") to get a proper line ending. If the stream is
00540                 # temporarily exhausted we return the wrong line ending.
00541                 if (isinstance(data, str) and data.endswith("\r")) or \
00542                    (isinstance(data, bytes) and data.endswith(b"\r")):
00543                     data += self.read(size=1, chars=1)
00544 
00545             line += data
00546             lines = line.splitlines(True)
00547             if lines:
00548                 if len(lines) > 1:
00549                     # More than one line result; the first line is a full line
00550                     # to return
00551                     line = lines[0]
00552                     del lines[0]
00553                     if len(lines) > 1:
00554                         # cache the remaining lines
00555                         lines[-1] += self.charbuffer
00556                         self.linebuffer = lines
00557                         self.charbuffer = None
00558                     else:
00559                         # only one remaining line, put it back into charbuffer
00560                         self.charbuffer = lines[0] + self.charbuffer
00561                     if not keepends:
00562                         line = line.splitlines(False)[0]
00563                     break
00564                 line0withend = lines[0]
00565                 line0withoutend = lines[0].splitlines(False)[0]
00566                 if line0withend != line0withoutend: # We really have a line end
00567                     # Put the rest back together and keep it until the next call
00568                     self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
00569                                       self.charbuffer
00570                     if keepends:
00571                         line = line0withend
00572                     else:
00573                         line = line0withoutend
00574                     break
00575             # we didn't get anything or this was our only try
00576             if not data or size is not None:
00577                 if line and not keepends:
00578                     line = line.splitlines(False)[0]
00579                 break
00580             if readsize < 8000:
00581                 readsize *= 2
00582         return line
00583 
00584     def readlines(self, sizehint=None, keepends=True):
00585 
00586         """ Read all lines available on the input stream
00587             and return them as list of lines.
00588 
00589             Line breaks are implemented using the codec's decoder
00590             method and are included in the list entries.
00591 
00592             sizehint, if given, is ignored since there is no efficient
00593             way to finding the true end-of-line.
00594 
00595         """
00596         data = self.read()
00597         return data.splitlines(keepends)
00598 
00599     def reset(self):
00600 
00601         """ Resets the codec buffers used for keeping state.
00602 
00603             Note that no stream repositioning should take place.
00604             This method is primarily intended to be able to recover
00605             from decoding errors.
00606 
00607         """
00608         self.bytebuffer = b""
00609         self.charbuffer = self._empty_charbuffer
00610         self.linebuffer = None
00611 
00612     def seek(self, offset, whence=0):
00613         """ Set the input stream's current position.
00614 
00615             Resets the codec buffers used for keeping state.
00616         """
00617         self.stream.seek(offset, whence)
00618         self.reset()
00619 
00620     def __next__(self):
00621 
00622         """ Return the next decoded line from the input stream."""
00623         line = self.readline()
00624         if line:
00625             return line
00626         raise StopIteration
00627 
00628     def __iter__(self):
00629         return self
00630 
00631     def __getattr__(self, name,
00632                     getattr=getattr):
00633 
00634         """ Inherit all other methods from the underlying stream.
00635         """
00636         return getattr(self.stream, name)
00637 
00638     def __enter__(self):
00639         return self
00640 
00641     def __exit__(self, type, value, tb):
00642         self.stream.close()
00643 
00644 ###
00645 
class StreamReaderWriter:

    """ Wrapper for streams that are used for both reading and
        writing.

        The constructor arguments are chosen such that the factory
        functions returned by the codec.lookup() function can be used
        to build an instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter on top of stream.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each of them is called with stream and errors.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    # -- reading: everything is handed to the reader

    def read(self, size=-1):

        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return next(reader)

    def __iter__(self):
        return self

    # -- writing: everything is handed to the writer

    def write(self, data):

        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        # reader first, then writer
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # The reader is reset on every seek; the writer only when the
        # target is the very start of the stream.
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every other attribute on the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)

    # the two methods below make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
00730 
00731 ###
00732 
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it encoded
            with the frontend encode function.

            size is only passed on to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the backend reader, encode it with the
            frontend encode function and return the result split into
            lines (line ends are kept).  sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def __next__(self):

        """ Return the next line from the input stream, encoded with
            the frontend encode function."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result via the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the chunks in list, decode the result with the
            frontend decode function and write it via the backend
            writer.
        """
        chunks = tuple(list)
        # Frontend data is what the decode function consumes, which is
        # bytes for the usual text encodings; joining with '' raised a
        # TypeError for such chunks.  Chunks that are str are still
        # joined as str, so callers relying on that keep working.
        if chunks and isinstance(chunks[0], str):
            data = ''.join(chunks)
        else:
            data = b''.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Set the position of the underlying stream and return what
            the stream's seek() returned.

            Without this method a seek would reach the stream through
            __getattr__ only, and data buffered by the reader before
            the seek would still be returned afterwards.  As in
            StreamReaderWriter.seek(), the reader is always reset and
            the writer only when seeking to the very start.
        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the underlying stream
        self.stream.close()
00846 
00847 ### Shortcuts
00848 
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the object returned by the builtin open()
        is handed back unwrapped and mode is used as given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering is passed unchanged to the builtin open() API and
        defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises a LookupError if the encoding cannot be found; the file
        that was already opened is closed before the error propagates.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
    except BaseException:
        # The file is already open at this point; close it so that a
        # failing codec lookup or wrapper construction does not leak it.
        file.close()
        raise
    return srw
00892 
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Data written to the wrapper is decoded using data_encoding and
        then written to the original file through the stream writer of
        file_encoding. Data read from the wrapper is obtained through
        the stream reader of file_encoding and handed back to the
        caller encoded with data_encoding.

        file_encoding defaults to data_encoding when it is None.

        errors is passed on to the StreamRecoder to define the error
        handling. It defaults to 'strict'.

        The returned object carries two extra attributes,
        .data_encoding and .file_encoding, holding the encoding names
        that were actually used (file_encoding after the default has
        been applied).

        Raises a LookupError if either encoding cannot be found.

    """
    target_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(target_encoding)
    recoder = StreamRecoder(file,
                            frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Expose the encodings on the wrapper for introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = target_encoding
    return recoder
00928 
00929 ### Helpers for codec lookup
00930 
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
00940 
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
00950 
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder
    raise LookupError(encoding)
00964 
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder
    raise LookupError(encoding)
00978 
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
00988 
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
00998 
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds each item of iterator to an IncrementalEncoder for the given
    encoding and yields every non-empty result. After the input is
    exhausted the encoder is called once more with an empty string and
    final=True, and that result is yielded too if it is non-empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            # Nothing produced for this chunk; do not yield empties
            continue
        yield encoded
    # Final call lets the encoder emit whatever it still holds
    tail = encoder.encode("", True)
    if tail:
        yield tail
01016 
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds each item of iterator to an IncrementalDecoder for the given
    encoding and yields every non-empty result. After the input is
    exhausted the decoder is called once more with empty bytes and
    final=True, and that result is yielded too if it is non-empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            # Input ended inside a multi-byte sequence, or was empty
            continue
        yield decoded
    # Final call lets the decoder deal with any pending input
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
01034 
01035 ### Helpers for charmap-based codecs
01036 
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element produced by
        iterating over rng is both a key and its own value.

    """
    return {element: element for element in rng}
01049 
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        The result maps every value of decoding_map back to its key.
        A value that appears more than once in decoding_map is mapped
        to None (undefined mapping) instead, causing an exception
        when encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous, so mark it undefined
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
01070 
01071 ### error handlers
01072 
try:
    # Fetch the registered error handling callbacks in one go and bind
    # them to module-level names.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = [
        lookup_error(handler_name)
        for handler_name in ("strict", "ignore", "replace",
                             "xmlcharrefreplace", "backslashreplace")
    ]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # if any lookup fails, every name is bound to None.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
01086 
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below is never executed because _false is 0; it
# is present only so that the import statement appears in this module.
# NOTE(review): presumably a variable is tested instead of a literal 0
# so that the statement is not dropped at compile time -- confirm
# before simplifying.
_false = 0
if _false:
    import encodings
01092 
01093 ### Tests
01094 
if __name__ == '__main__':

    # Runs only when this file is executed as a script: both standard
    # streams of the process are rebound to EncodedFile wrappers.

    # Make stdout translate Latin-1 output into UTF-8 output
    # (data_encoding='latin-1', file_encoding='utf-8')
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    # (data_encoding='utf-8', file_encoding='latin-1')
    # NOTE(review): the wrappers hand encoded data to the wrapped
    # streams; if sys.stdout/sys.stdin are text streams here, their
    # binary .buffer counterparts may be what is intended -- confirm.
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')