# Source code for encodings.utf_8_sig

""" Python 'utf-8-sig' Codec
This works similarly to UTF-8 with the following changes:

* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
import codecs

### Codec APIs

def encode(input, errors='strict'):
    """Stateless encoder: UTF-8 encode *input* and prepend the UTF-8 BOM.

    Returns a ``(data, length)`` tuple where *data* is ``codecs.BOM_UTF8``
    followed by the UTF-8 encoding of *input*, and *length* is
    ``len(input)``.
    """
    encoded, _ = codecs.utf_8_encode(input, errors)
    return (codecs.BOM_UTF8 + encoded, len(input))
def decode(input, errors='strict'):
    """Stateless decoder: decode UTF-8 *input*, skipping one leading BOM.

    If the first three bytes equal ``codecs.BOM_UTF8`` they are dropped
    before decoding.  The input is always decoded as final.

    Returns a ``(text, consumed)`` tuple; *consumed* includes the three
    BOM bytes when a BOM was skipped.
    """
    if input[:3] == codecs.BOM_UTF8:
        payload, skipped = input[3:], 3
    else:
        payload, skipped = input, 0
    text, used = codecs.utf_8_decode(payload, errors, True)
    return (text, used + skipped)
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental UTF-8 encoder that emits the BOM with the first chunk.

    ``self.first`` is 1 while the BOM still has to be written and 0 once
    it has been emitted.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        self.first = 1

    def encode(self, input, final=False):
        """Encode *input*; the first call after construction or ``reset()``
        returns the data prefixed with ``codecs.BOM_UTF8``.

        *final* is accepted for interface compatibility and is not used.
        """
        bom_pending = self.first
        if bom_pending:
            # The flag is cleared before encoding, so it is already 0 even
            # if encoding this chunk raises.
            self.first = 0
        chunk = codecs.utf_8_encode(input, self.errors)[0]
        if bom_pending:
            return codecs.BOM_UTF8 + chunk
        return chunk

    def reset(self):
        """Reset the encoder so the next chunk is BOM-prefixed again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the BOM-pending flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore the BOM-pending flag from a value given by getstate()."""
        self.first = state
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-8 decoder that skips one leading UTF-8 BOM.

    ``self.first`` is ``True`` until the decoder has seen enough input to
    decide whether the stream starts with a BOM; after that it is ``None``
    and all input is decoded as plain UTF-8.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode *input*, dropping a BOM at the very start of the stream.

        Returns ``(text, consumed)``.  While fewer than three bytes are
        available and they could still be the beginning of a BOM, nothing
        is consumed so the caller keeps them buffered for the next call.
        """
        if self.first:
            if len(input) < 3:
                # A short BOM prefix only defers the decision while more
                # data may still arrive.  With final=True, non-empty
                # leftover bytes can never become a BOM, so they are handed
                # to the UTF-8 decoder (which reports them according to
                # `errors`) instead of being silently dropped.
                if codecs.BOM_UTF8.startswith(input) and not (final and input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return (u"", 0)
                self.first = None
            else:
                self.first = None
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Reset the decoder so that a leading BOM is looked for again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True

    def getstate(self):
        """Return ``(buffered_input, flag)``; *flag* is 1 while the BOM
        decision is still pending and 0 afterwards.
        """
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # Only the buffered bytes are taken from the base-class state; the
        # integer slot carries our own BOM-pending flag.
        return (state[0], 1 if self.first else 0)

    def setstate(self, state):
        """Restore a state previously returned by getstate()."""
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        # Keep the True/None convention used by __init__ and _buffer_decode.
        self.first = True if state[1] else None
class StreamWriter(codecs.StreamWriter):
    """Stream writer whose first encoded chunk is prefixed with the BOM.

    The class-level ``encode`` handles the first chunk and then shadows
    itself on the instance with ``codecs.utf_8_encode``; ``reset`` removes
    that instance attribute again.
    """

    def reset(self):
        """Reset the writer and drop the instance-level ``encode`` override."""
        codecs.StreamWriter.reset(self)
        try:
            del self.encode
        except AttributeError:
            # No instance-level override to remove.
            pass

    def encode(self, input, errors='strict'):
        """Encode the first chunk with a BOM and switch to plain UTF-8."""
        # The override is installed before encoding, so later calls on this
        # instance go straight to codecs.utf_8_encode.
        self.encode = codecs.utf_8_encode
        # `encode` below is the module-level function, not this method.
        result = encode(input, errors)
        return result
class StreamReader(codecs.StreamReader):
    """Stream reader that skips one UTF-8 BOM at the start of the stream.

    Once it is known whether a BOM is present, ``decode`` shadows itself
    on the instance with ``codecs.utf_8_decode``; ``reset`` removes that
    instance attribute again.
    """

    def reset(self):
        """Reset the reader and drop the instance-level ``decode`` override."""
        codecs.StreamReader.reset(self)
        try:
            del self.decode
        except AttributeError:
            # No instance-level override to remove.
            pass

    def decode(self, input, errors='strict'):
        """Decode *input*, dropping a leading BOM if there is one.

        Returns ``(text, consumed)``.  Nothing is consumed while *input* is
        shorter than three bytes and could still be the start of a BOM.
        """
        bom = codecs.BOM_UTF8
        if len(input) < 3 and bom.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return (u"", 0)

        # The decision is made on this call: install the plain UTF-8
        # decoder on the instance before decoding anything.
        self.decode = codecs.utf_8_decode
        if input[:3] == bom:
            text, used = codecs.utf_8_decode(input[3:], errors)
            return (text, used + 3)
        # no BOM present
        return codecs.utf_8_decode(input, errors)
### encodings module API
def getregentry():
    """Return the ``codecs.CodecInfo`` record for the 'utf-8-sig' codec.

    The record bundles this module's stateless functions with its
    incremental and stream classes.
    """
    info = codecs.CodecInfo(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
    return info