aio-libs · Dreamsorcerer · Sep 7, 2023 · Aug 29, 2023
@@ -35,9 +35,6 @@ ignore_missing_imports = True
 [mypy-brotli]
 ignore_missing_imports = True
 
-[mypy-cchardet]
-ignore_missing_imports = True
-
 [mypy-gunicorn.*]
 ignore_missing_imports = True
 

@@ -0,0 +1,2 @@
+Replace automatic character set detection with a `fallback_charset_resolver` parameter
+in `ClientSession` to allow user-supplied character set detection functions.
@@ -169,6 +169,7 @@ Jesus Cea
 Jian Zeng
 Jinkyu Yi
 Joel Watts
+John Parton
 Jon Nabozny
 Jonas Krüger Svensson
 Jonas Obrist

@@ -159,22 +159,18 @@ Requirements
 
 - async-timeout_
 - attrs_
-- charset-normalizer_
 - multidict_
 - yarl_
 - frozenlist_
 
-Optionally you may install the cChardet_ and aiodns_ libraries (highly
-recommended for sake of speed).
+Optionally you may install the aiodns_ library (highly recommended for sake of speed).
 
-.. _charset-normalizer: https://pypi.org/project/charset-normalizer
 .. _aiodns: https://pypi.python.org/pypi/aiodns
 .. _attrs: https://github.com/python-attrs/attrs
 .. _multidict: https://pypi.python.org/pypi/multidict
 .. _frozenlist: https://pypi.org/project/frozenlist/
 .. _yarl: https://pypi.python.org/pypi/yarl
 .. _async-timeout: https://pypi.python.org/pypi/async_timeout
-.. _cChardet: https://pypi.python.org/pypi/cchardet
 
 License
 =======

@@ -163,6 +163,7 @@ class ClientTimeout:
 DEFAULT_TIMEOUT: Final[ClientTimeout] = ClientTimeout(total=5 * 60)
 
 _RetType = TypeVar("_RetType")
+_CharsetResolver = Callable[[ClientResponse, bytes], str]
 
 
 class ClientSession:
@@ -194,6 +195,7 @@ class ClientSession:
             "_read_bufsize",
             "_max_line_size",
             "_max_field_size",
+            "_resolve_charset",
         ]
     )
 
@@ -230,6 +232,7 @@ def __init__(
         read_bufsize: int = 2**16,
         max_line_size: int = 8190,
         max_field_size: int = 8190,
+        fallback_charset_resolver: _CharsetResolver = lambda r, b: "utf-8",
     ) -> None:
         if loop is None:
             if connector is not None:
@@ -325,6 +328,8 @@ def __init__(
         for trace_config in self._trace_configs:
             trace_config.freeze()
 
+        self._resolve_charset = fallback_charset_resolver
+
     def __init_subclass__(cls: Type["ClientSession"]) -> None:
         warnings.warn(
             "Inheritance class {} from ClientSession "

@@ -13,6 +13,7 @@
 from typing import (
     TYPE_CHECKING,
     Any,
+    Callable,
     Dict,
     Iterable,
     List,
@@ -70,11 +71,6 @@
     ssl = None  # type: ignore[assignment]
     SSLContext = object  # type: ignore[misc,assignment]
 
-try:
-    import cchardet as chardet
-except ImportError:  # pragma: no cover
-    import charset_normalizer as chardet
-
 
 __all__ = ("ClientRequest", "ClientResponse", "RequestInfo", "Fingerprint")
 
@@ -742,8 +738,8 @@ class ClientResponse(HeadersMixin):
     _raw_headers: RawHeaders = None  # type: ignore[assignment]
 
     _connection = None  # current connection
-    _source_traceback = None
-    # setted up by ClientRequest after ClientResponse object creation
+    _source_traceback: Optional[traceback.StackSummary] = None
+    # set up by ClientRequest after ClientResponse object creation
     # post-init stage allows to not change ctor signature
     _closed = True  # to allow __del__ for non-initialized properly response
     _released = False
@@ -780,6 +776,15 @@ def __init__(
         self._loop = loop
         # store a reference to session #1985
         self._session: Optional[ClientSession] = session
+        # Save reference to _resolve_charset, so that get_encoding() will still
+        # work after the response has finished reading the body.
+        if session is None:
+            # TODO: Fix session=None in tests (see ClientRequest.__init__).
+            self._resolve_charset: Callable[
+                ["ClientResponse", bytes], str
+            ] = lambda *_: "utf-8"
+        else:
+            self._resolve_charset = session._resolve_charset
         if loop.get_debug():
             self._source_traceback = traceback.extract_stack(sys._getframe(1))
 
@@ -1070,27 +1075,22 @@ def get_encoding(self) -> str:
 
         encoding = mimetype.parameters.get("charset")
         if encoding:
-            try:
-                codecs.lookup(encoding)
-            except LookupError:
-                encoding = None
-        if not encoding:
-            if mimetype.type == "application" and (
-                mimetype.subtype == "json" or mimetype.subtype == "rdap"
-            ):
-                # RFC 7159 states that the default encoding is UTF-8.
-                # RFC 7483 defines application/rdap+json
-                encoding = "utf-8"
-            elif self._body is None:
-                raise RuntimeError(
-                    "Cannot guess the encoding of " "a not yet read body"
-                )
-            else:
-                encoding = chardet.detect(self._body)["encoding"]
-        if not encoding:
-            encoding = "utf-8"
+            with contextlib.suppress(LookupError):
+                return codecs.lookup(encoding).name
+
+        if mimetype.type == "application" and (
+            mimetype.subtype == "json" or mimetype.subtype == "rdap"
+        ):
+            # RFC 7159 states that the default encoding is UTF-8.
+            # RFC 7483 defines application/rdap+json
+            return "utf-8"
+
+        if self._body is None:
+            raise RuntimeError(
+                "Cannot compute fallback encoding of a not yet read body"
+            )
 
-        return encoding
+        return self._resolve_charset(self, self._body)
 
     async def text(self, encoding: Optional[str] = None, errors: str = "strict") -> str:
         """Read response payload and decode."""

@@ -653,3 +653,33 @@ are changed so that aiohttp itself can wait on the underlying
 connection to close. Please follow issue `#1925
 <https://github.com/aio-libs/aiohttp/issues/1925>`_ for the progress
 on this.
+
+
+Character Set Detection
+-----------------------
+
+If you encounter a :exc:`UnicodeDecodeError` when using :meth:`ClientResponse.text()`
+this may be because the response does not include the charset needed
+to decode the body.
+
+If you know the correct encoding for a request, you can simply specify
+the encoding as a parameter (e.g. ``resp.text("windows-1252")``).
+
+Alternatively, :class:`ClientSession` accepts a ``fallback_charset_resolver`` parameter which
+can be used to introduce charset guessing functionality. When a charset is not found
+in the Content-Type header, this function will be called to get the charset encoding. For
+example, this can be used with the ``chardetng_py`` library::
+
+    from chardetng_py import detect
+
+    def charset_resolver(resp: ClientResponse, body: bytes) -> str:
+        tld = resp.url.host.rsplit(".", maxsplit=1)[-1]
+        return detect(body, allow_utf8=True, tld=tld)
+
+    ClientSession(fallback_charset_resolver=charset_resolver)
+
+Or, if ``chardetng_py`` doesn't work for you, then ``charset-normalizer`` is another option::
+
+    from charset_normalizer import detect
+
+    ClientSession(fallback_charset_resolver=lambda r, b: detect(b)["encoding"] or "utf-8")
@@ -51,7 +51,8 @@ The client session supports the context manager protocol for self closing.
                          read_bufsize=2**16, \
                          requote_redirect_url=True, \
                          trust_env=False, \
-                         trace_configs=None)
+                         trace_configs=None, \
+                         fallback_charset_resolver=lambda r, b: "utf-8")
 
    The class for creating client sessions and making requests.
 
@@ -226,6 +227,16 @@ The client session supports the context manager protocol for self closing.
                          disabling.  See :ref:`aiohttp-client-tracing-reference` for
                          more information.
 
+   :param Callable[[ClientResponse,bytes],str] fallback_charset_resolver:
+      A :term:`callable` that accepts a :class:`ClientResponse` and the
+      :class:`bytes` contents, and returns a :class:`str` which will be used as
+      the encoding parameter to :meth:`bytes.decode()`.
+
+      This function will be called when the charset is not known (e.g. not specified in the
+      Content-Type header). The default function simply returns ``utf-8``.
+
+      .. versionadded:: 3.8.6
+
    .. attribute:: closed
 
       ``True`` if the session has been closed, ``False`` otherwise.
@@ -1424,12 +1435,8 @@ Response object
       Read response's body and return decoded :class:`str` using
       specified *encoding* parameter.
 
-      If *encoding* is ``None`` content encoding is autocalculated
-      using ``Content-Type`` HTTP header and *charset-normalizer* tool if the
-      header is not provided by server.
-
-      :term:`cchardet` is used with fallback to :term:`charset-normalizer` if
-      *cchardet* is not available.
+      If *encoding* is ``None`` content encoding is determined from the
+      Content-Type header, or using the ``fallback_charset_resolver`` function.
 
       Close underlying connection if data reading gets an error,
       release connection otherwise.
@@ -1438,35 +1445,21 @@ Response object
                            ``None`` for encoding autodetection
                            (default).
 
-      :return str: decoded *BODY*
-
-      :raise LookupError: if the encoding detected by cchardet is
-                          unknown by Python (e.g. VISCII).
 
-      .. note::
+      :raises: :exc:`UnicodeDecodeError` if decoding fails. See also
+               :meth:`get_encoding`.
 
-         If response has no ``charset`` info in ``Content-Type`` HTTP
-         header :term:`cchardet` / :term:`charset-normalizer` is used for
-         content encoding autodetection.
-
-         It may hurt performance. If page encoding is known passing
-         explicit *encoding* parameter might help::
-
-            await resp.text('ISO-8859-1')
+      :return str: decoded *BODY*
 
    .. method:: json(*, encoding=None, loads=json.loads, \
                       content_type='application/json')
       :async:
 
       Read response's body as *JSON*, return :class:`dict` using
       specified *encoding* and *loader*. If data is not yet available
-      a ``read`` call will be done,
+      a ``read`` call will be done.
 
-      If *encoding* is ``None`` content encoding is autocalculated
-      using :term:`cchardet` or :term:`charset-normalizer` as fallback if
-      *cchardet* is not available.
-
-      if response's `content-type` does not match `content_type` parameter
+      If response's `content-type` does not match `content_type` parameter
       :exc:`aiohttp.ContentTypeError` is raised.
       To disable content type check pass ``None`` value.
 
@@ -1498,17 +1491,9 @@ Response object
 
    .. method:: get_encoding()
 
-      Automatically detect content encoding using ``charset`` info in
-      ``Content-Type`` HTTP header. If this info is not exists or there
-      are no appropriate codecs for encoding then :term:`cchardet` /
-      :term:`charset-normalizer` is used.
-
-      Beware that it is not always safe to use the result of this function to
-      decode a response. Some encodings detected by cchardet are not known by
-      Python (e.g. VISCII). *charset-normalizer* is not concerned by that issue.
-
-      :raise RuntimeError: if called before the body has been read,
-                           for :term:`cchardet` usage
+      Retrieve content encoding using ``charset`` info in ``Content-Type`` HTTP header.
+      If no charset is present or the charset is not understood by Python, the
+      ``fallback_charset_resolver`` function associated with the ``ClientSession`` is called.
 
       .. versionadded:: 3.0
 

@@ -45,22 +45,6 @@
       Any object that can be called. Use :func:`callable` to check
       that.
 
-   charset-normalizer
-
-       The Real First Universal Charset Detector.
-       Open, modern and actively maintained alternative to Chardet.
-
-       https://pypi.org/project/charset-normalizer/
-
-   cchardet
-
-       cChardet is high speed universal character encoding detector -
-       binding to charsetdetect.
-
-       https://pypi.python.org/pypi/cchardet/
-
-       .. include:: _snippets/cchardet-unmaintained-admonition.rst
-
    gunicorn
 
        Gunicorn 'Green Unicorn' is a Python WSGI HTTP Server for

@@ -33,15 +33,6 @@ Library Installation
 
    $ pip install aiohttp
 
-You may want to install *optional* :term:`cchardet` library as faster
-replacement for :term:`charset-normalizer`:
-
-.. code-block:: bash
-
-   $ pip install cchardet
-
-.. include:: _snippets/cchardet-unmaintained-admonition.rst
-
 For speeding up DNS resolving by client API you may install
 :term:`aiodns` as well.
 This option is highly recommended:
@@ -53,9 +44,9 @@ This option is highly recommended:
 Installing all speedups in one command
 --------------------------------------
 
-The following will get you ``aiohttp`` along with :term:`cchardet`,
-:term:`aiodns` and ``Brotli`` in one bundle. No need to type
-separate commands anymore!
+The following will get you ``aiohttp`` along with :term:`aiodns` and ``Brotli`` in one
+bundle.
+No need to type separate commands anymore!
 
 .. code-block:: bash
 
@@ -158,17 +149,8 @@ Dependencies
 
 - *async_timeout*
 - *attrs*
-- *charset-normalizer*
 - *multidict*
 - *yarl*
-- *Optional* :term:`cchardet` as faster replacement for
-  :term:`charset-normalizer`.
-
-  Install it explicitly via:
-
-  .. code-block:: bash
-
-     $ pip install cchardet
 
 - *Optional* :term:`aiodns` for fast DNS resolving. The
   library is highly recommended.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Replace automatic character set detection with a `fallback_charset_resolver` parameter
		in `ClientSession` to allow user-supplied character set detection functions.