Viewing file: __init__.py (10.33 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
# coding: utf-8 """
webencodings ~~~~~~~~~~~~
This is a Python implementation of the `WHATWG Encoding standard <http://encoding.spec.whatwg.org/>`. See README for details.
:copyright: Copyright 2012 by Simon Sapin :license: BSD, see LICENSE for details.
"""
from __future__ import unicode_literals
import codecs
from .labels import LABELS
VERSION = '0.5.1'
# Some names in Encoding are not valid Python aliases. Remap these. PYTHON_NAMES = { 'iso-8859-8-i': 'iso-8859-8', 'x-mac-cyrillic': 'mac-cyrillic', 'macintosh': 'mac-roman', 'windows-874': 'cp874'}
CACHE = {}
def ascii_lower(string): r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
:param string: An Unicode string. :returns: A new Unicode string.
This is used for `ASCII case-insensitive <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_ matching of encoding labels. The same matching is also used, among other things, for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.
This is different from the :meth:`~py:str.lower` method of Unicode strings which also affect non-ASCII characters, sometimes mapping them into the ASCII range:
>>> keyword = u'Bac\N{KELVIN SIGN}ground' >>> assert keyword.lower() == u'background' >>> assert ascii_lower(keyword) != keyword.lower() >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
""" # This turns out to be faster than unicode.translate() return string.encode('utf8').lower().decode('utf8')
def lookup(label): """ Look for an encoding by its label. This is the spec’s `get an encoding <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm. Supported labels are listed there.
:param label: A string. :returns: An :class:`Encoding` object, or :obj:`None` for an unknown label.
""" # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020. label = ascii_lower(label.strip('\t\n\f\r ')) name = LABELS.get(label) if name is None: return None encoding = CACHE.get(name) if encoding is None: if name == 'x-user-defined': from .x_user_defined import codec_info else: python_name = PYTHON_NAMES.get(name, name) # Any python_name value that gets to here should be valid. codec_info = codecs.lookup(python_name) encoding = Encoding(name, codec_info) CACHE[name] = encoding return encoding
def _get_encoding(encoding_or_label): """ Accept either an encoding object or label.
:param encoding: An :class:`Encoding` object or a label string. :returns: An :class:`Encoding` object. :raises: :exc:`~exceptions.LookupError` for an unknown label.
""" if hasattr(encoding_or_label, 'codec_info'): return encoding_or_label
encoding = lookup(encoding_or_label) if encoding is None: raise LookupError('Unknown encoding label: %r' % encoding_or_label) return encoding
class Encoding(object): """Reresents a character encoding such as UTF-8, that can be used for decoding or encoding.
.. attribute:: name
Canonical name of the encoding
.. attribute:: codec_info
The actual implementation of the encoding, a stdlib :class:`~codecs.CodecInfo` object. See :func:`codecs.register`.
""" def __init__(self, name, codec_info): self.name = name self.codec_info = codec_info
def __repr__(self): return '<Encoding %s>' % self.name
#: The UTF-8 encoding. Should be used for new content and formats. UTF8 = lookup('utf-8')
_UTF16LE = lookup('utf-16le') _UTF16BE = lookup('utf-16be')
def decode(input, fallback_encoding, errors='replace'): """ Decode a single string.
:param input: A byte string :param fallback_encoding: An :class:`Encoding` object or a label string. The encoding to use if :obj:`input` does note have a BOM. :param errors: Type of error handling. See :func:`codecs.register`. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. :return: A ``(output, encoding)`` tuple of an Unicode string and an :obj:`Encoding`.
""" # Fail early if `encoding` is an invalid label. fallback_encoding = _get_encoding(fallback_encoding) bom_encoding, input = _detect_bom(input) encoding = bom_encoding or fallback_encoding return encoding.codec_info.decode(input, errors)[0], encoding
def _detect_bom(input): """Return (bom_encoding, input), with any BOM removed from the input.""" if input.startswith(b'\xFF\xFE'): return _UTF16LE, input[2:] if input.startswith(b'\xFE\xFF'): return _UTF16BE, input[2:] if input.startswith(b'\xEF\xBB\xBF'): return UTF8, input[3:] return None, input
def encode(input, encoding=UTF8, errors='strict'): """ Encode a single string.
:param input: An Unicode string. :param encoding: An :class:`Encoding` object or a label string. :param errors: Type of error handling. See :func:`codecs.register`. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. :return: A byte string.
""" return _get_encoding(encoding).codec_info.encode(input, errors)[0]
def iter_decode(input, fallback_encoding, errors='replace'): """ "Pull"-based decoder.
:param input: An iterable of byte strings.
The input is first consumed just enough to determine the encoding based on the precense of a BOM, then consumed on demand when the return value is. :param fallback_encoding: An :class:`Encoding` object or a label string. The encoding to use if :obj:`input` does note have a BOM. :param errors: Type of error handling. See :func:`codecs.register`. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. :returns: An ``(output, encoding)`` tuple. :obj:`output` is an iterable of Unicode strings, :obj:`encoding` is the :obj:`Encoding` that is being used.
"""
decoder = IncrementalDecoder(fallback_encoding, errors) generator = _iter_decode_generator(input, decoder) encoding = next(generator) return generator, encoding
def _iter_decode_generator(input, decoder): """Return a generator that first yields the :obj:`Encoding`, then yields output chukns as Unicode strings.
""" decode = decoder.decode input = iter(input) for chunck in input: output = decode(chunck) if output: assert decoder.encoding is not None yield decoder.encoding yield output break else: # Input exhausted without determining the encoding output = decode(b'', final=True) assert decoder.encoding is not None yield decoder.encoding if output: yield output return
for chunck in input: output = decode(chunck) if output: yield output output = decode(b'', final=True) if output: yield output
def iter_encode(input, encoding=UTF8, errors='strict'): """ “Pull”-based encoder.
:param input: An iterable of Unicode strings. :param encoding: An :class:`Encoding` object or a label string. :param errors: Type of error handling. See :func:`codecs.register`. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. :returns: An iterable of byte strings.
""" # Fail early if `encoding` is an invalid label. encode = IncrementalEncoder(encoding, errors).encode return _iter_encode_generator(input, encode)
def _iter_encode_generator(input, encode): for chunck in input: output = encode(chunck) if output: yield output output = encode('', final=True) if output: yield output
class IncrementalDecoder(object): """ “Push”-based decoder.
:param fallback_encoding: An :class:`Encoding` object or a label string. The encoding to use if :obj:`input` does note have a BOM. :param errors: Type of error handling. See :func:`codecs.register`. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
""" def __init__(self, fallback_encoding, errors='replace'): # Fail early if `encoding` is an invalid label. self._fallback_encoding = _get_encoding(fallback_encoding) self._errors = errors self._buffer = b'' self._decoder = None #: The actual :class:`Encoding` that is being used, #: or :obj:`None` if that is not determined yet. #: (Ie. if there is not enough input yet to determine #: if there is a BOM.) self.encoding = None # Not known yet.
def decode(self, input, final=False): """Decode one chunk of the input.
:param input: A byte string. :param final: Indicate that no more input is available. Must be :obj:`True` if this is the last call. :returns: An Unicode string.
""" decoder = self._decoder if decoder is not None: return decoder(input, final)
input = self._buffer + input encoding, input = _detect_bom(input) if encoding is None: if len(input) < 3 and not final: # Not enough data yet. self._buffer = input return '' else: # No BOM encoding = self._fallback_encoding decoder = encoding.codec_info.incrementaldecoder(self._errors).decode self._decoder = decoder self.encoding = encoding return decoder(input, final)
class IncrementalEncoder(object): """ “Push”-based encoder.
:param encoding: An :class:`Encoding` object or a label string. :param errors: Type of error handling. See :func:`codecs.register`. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
.. method:: encode(input, final=False)
:param input: An Unicode string. :param final: Indicate that no more input is available. Must be :obj:`True` if this is the last call. :returns: A byte string.
""" def __init__(self, encoding=UTF8, errors='strict'): encoding = _get_encoding(encoding) self.encode = encoding.codec_info.incrementalencoder(errors).encode
|