From ab4142215d836b0298fc47fa1e4b75408b9c37a0 Mon Sep 17 00:00:00 2001 From: David Lord Date: Tue, 10 Apr 2018 09:29:48 -0700 Subject: [PATCH] detect UTF encodings when loading json --- CHANGES.rst | 4 ++++ flask/json/__init__.py | 50 ++++++++++++++++++++++++++++++++++++++++-- flask/wrappers.py | 24 ++++++++++---------- tests/test_helpers.py | 30 +++++++++++++++---------- 4 files changed, 82 insertions(+), 26 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 93e720e723..3f5a003e69 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -147,6 +147,9 @@ unreleased when it is registered with the app. (`#2629`_) - :meth:`Request.get_json() <flask.Request.get_json>` doesn't cache the result if parsing fails when ``silent`` is true. (`#2651`_) +- :func:`request.get_json <flask.Request.get_json>` no longer accepts + arbitrary encodings. Incoming JSON should be encoded using UTF-8 per + :rfc:`8259`, but Flask will autodetect UTF-8, -16, or -32. (`#2691`_) .. _pallets/meta#24: https://github.com/pallets/meta/issues/24 .. _#1421: https://github.com/pallets/flask/issues/1421 @@ -192,6 +195,7 @@ unreleased .. _#2635: https://github.com/pallets/flask/pull/2635 .. _#2629: https://github.com/pallets/flask/pull/2629 .. _#2651: https://github.com/pallets/flask/issues/2651 +.. _#2691: https://github.com/pallets/flask/pull/2691 Version 0.12.2 diff --git a/flask/json/__init__.py b/flask/json/__init__.py index f482c72c3e..fbe6b92f0a 100644 --- a/flask/json/__init__.py +++ b/flask/json/__init__.py @@ -6,7 +6,7 @@ :copyright: © 2010 by the Pallets team. :license: BSD, see LICENSE for more details. """ - +import codecs import io import uuid from datetime import date, datetime @@ -121,6 +121,49 @@ def _load_arg_defaults(kwargs): kwargs.setdefault('cls', JSONDecoder) +def detect_encoding(data): + """Detect which UTF codec was used to encode the given bytes. + + The latest JSON standard (:rfc:`8259`) suggests that only UTF-8 is + accepted. Older documents allowed 8, 16, or 32. 16 and 32 can be big + or little endian. 
Some editors or libraries may prepend a BOM. + + :param data: Bytes in unknown UTF encoding. + :return: UTF encoding name + """ + head = data[:4] + + if head[:3] == codecs.BOM_UTF8: + return 'utf-8-sig' + + if b'\x00' not in head: + return 'utf-8' + + if head in (codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE): + return 'utf-32' + + if head[:2] in (codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE): + return 'utf-16' + + if len(head) == 4: + if head[:3] == b'\x00\x00\x00': + return 'utf-32-be' + + if head[::2] == b'\x00\x00': + return 'utf-16-be' + + if head[1:] == b'\x00\x00\x00': + return 'utf-32-le' + + if head[1::2] == b'\x00\x00': + return 'utf-16-le' + + if len(head) == 2: + return 'utf-16-be' if head.startswith(b'\x00') else 'utf-16-le' + + return 'utf-8' + + def dumps(obj, **kwargs): """Serialize ``obj`` to a JSON formatted ``str`` by using the application's configured encoder (:attr:`~flask.Flask.json_encoder`) if there is an @@ -155,7 +198,10 @@ def loads(s, **kwargs): """ _load_arg_defaults(kwargs) if isinstance(s, bytes): - s = s.decode(kwargs.pop('encoding', None) or 'utf-8') + encoding = kwargs.pop('encoding', None) + if encoding is None: + encoding = detect_encoding(s) + s = s.decode(encoding) return _json.loads(s, **kwargs) diff --git a/flask/wrappers.py b/flask/wrappers.py index 896652faf8..0a2bd2e42a 100644 --- a/flask/wrappers.py +++ b/flask/wrappers.py @@ -50,15 +50,17 @@ def _get_data_for_json(self, cache): return self.get_data(cache=cache) def get_json(self, force=False, silent=False, cache=True): - """Parse and return the data as JSON. If the mimetype does not indicate - JSON (:mimetype:`application/json`, see :meth:`is_json`), this returns - ``None`` unless ``force`` is true. If parsing fails, - :meth:`on_json_loading_failed` is called and its return value is used - as the return value. + """Parse and return the data as JSON. 
If the mimetype does not + indicate JSON (:mimetype:`application/json`, see + :meth:`is_json`), this returns ``None`` unless ``force`` is + true. If parsing fails, :meth:`on_json_loading_failed` is called + and its return value is used as the return value. :param force: Ignore the mimetype and always try to parse JSON. - :param silent: Silence parsing errors and return ``None`` instead. - :param cache: Store the parsed JSON to return for subsequent calls. + :param silent: Silence parsing errors and return ``None`` + instead. + :param cache: Store the parsed JSON to return for subsequent + calls. """ if cache and self._cached_json[silent] is not Ellipsis: return self._cached_json[silent] @@ -66,14 +68,10 @@ def get_json(self, force=False, silent=False, cache=True): if not (force or self.is_json): return None - # We accept MIME charset against the specification as certain clients - # have used this in the past. For responses, we assume that if the - # charset is set then the data has been encoded correctly as well. 
- charset = self.mimetype_params.get('charset') + data = self._get_data_for_json(cache=cache) try: - data = self._get_data_for_json(cache=cache) - rv = json.loads(data, encoding=charset) + rv = json.loads(data) except ValueError as e: if silent: rv = None diff --git a/tests/test_helpers.py b/tests/test_helpers.py index a3031878b4..b3535b2831 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -16,10 +16,13 @@ import pytest from werkzeug.datastructures import Range from werkzeug.exceptions import BadRequest, NotFound -from werkzeug.http import http_date, parse_cache_control_header, \ +from werkzeug.http import ( + http_date, parse_cache_control_header, parse_options_header +) import flask +from flask import json from flask._compat import StringIO, text_type from flask.helpers import get_debug_flag, get_env @@ -55,6 +58,21 @@ def dst(self, dt): class TestJSON(object): + @pytest.mark.parametrize('value', ( + 1, 't', True, False, None, + [], [1, 2, 3], + {}, {'foo': u'🐍'}, + )) + @pytest.mark.parametrize('encoding', ( + 'utf-8', 'utf-8-sig', + 'utf-16-le', 'utf-16-be', 'utf-16', + 'utf-32-le', 'utf-32-be', 'utf-32', + )) + def test_detect_encoding(self, value, encoding): + data = json.dumps(value).encode(encoding) + assert json.detect_encoding(data) == encoding + assert json.loads(data) == value + def test_ignore_cached_json(self, app): with app.test_request_context('/', method='POST', data='malformed', content_type='application/json'): @@ -121,16 +139,6 @@ def return_json(): rv = client.post('/json', data='"foo"', content_type='application/x+json') assert rv.data == b'foo' - def test_json_body_encoding(self, app, client): - - @app.route('/') - def index(): - return flask.request.get_json() - - resp = client.get('/', data=u'"Hällo Wörld"'.encode('iso-8859-15'), - content_type='application/json; charset=iso-8859-15') - assert resp.data == u'Hällo Wörld'.encode('utf-8') - @pytest.mark.parametrize('test_value,expected', [(True, '"\\u2603"'), (False, 
u'"\u2603"')]) def test_json_as_unicode(self, test_value, expected, app, app_ctx):