Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

detect UTF encodings when loading json #2691

Merged
merged 1 commit into from
Apr 10, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ unreleased
when it is registered with the app. (`#2629`_)
- :meth:`Request.get_json() <flask.Request.get_json>` doesn't cache the
result if parsing fails when ``silent`` is true. (`#2651`_)
- :func:`request.get_json <flask.Request.get_json>` no longer accepts
arbitrary encodings. Incoming JSON should be encoded using UTF-8 per
:rfc:`8259`, but Flask will autodetect UTF-8, -16, or -32. (`#2691`_)

.. _pallets/meta#24: https://github.com/pallets/meta/issues/24
.. _#1421: https://github.com/pallets/flask/issues/1421
Expand Down Expand Up @@ -192,6 +195,7 @@ unreleased
.. _#2635: https://github.com/pallets/flask/pull/2635
.. _#2629: https://github.com/pallets/flask/pull/2629
.. _#2651: https://github.com/pallets/flask/issues/2651
.. _#2691: https://github.com/pallets/flask/pull/2691


Version 0.12.2
Expand Down
50 changes: 48 additions & 2 deletions flask/json/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
:copyright: © 2010 by the Pallets team.
:license: BSD, see LICENSE for more details.
"""

import codecs
import io
import uuid
from datetime import date, datetime
Expand Down Expand Up @@ -121,6 +121,49 @@ def _load_arg_defaults(kwargs):
kwargs.setdefault('cls', JSONDecoder)


def detect_encoding(data):
    """Guess which UTF codec produced the given JSON bytes.

    Only the first four bytes are examined. A byte order mark decides
    the codec when one is present. Otherwise the position of null
    bytes within the prefix is used to tell UTF-16 and UTF-32 (big or
    little endian) apart. Anything that matches none of the patterns
    is reported as UTF-8.

    :param data: Bytes in unknown UTF encoding.
    :return: UTF encoding name
    """
    prefix = data[:4]

    if prefix[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'

    # No null byte in the prefix: neither UTF-16 nor UTF-32 is
    # considered, so report UTF-8.
    if b'\x00' not in prefix:
        return 'utf-8'

    # The UTF-32 BOMs must be tested before the UTF-16 ones because the
    # little endian UTF-32 BOM starts with the little endian UTF-16 BOM.
    if prefix == codecs.BOM_UTF32_BE or prefix == codecs.BOM_UTF32_LE:
        return 'utf-32'

    bom16 = prefix[:2]

    if bom16 == codecs.BOM_UTF16_BE or bom16 == codecs.BOM_UTF16_LE:
        return 'utf-16'

    size = len(prefix)

    if size == 2:
        # A single UTF-16 code unit; the null byte marks the high half.
        return 'utf-16-le' if prefix[:1] != b'\x00' else 'utf-16-be'

    if size == 4:
        # Order matters: the first matching null pattern wins.
        null_patterns = (
            (prefix[:3], b'\x00\x00\x00', 'utf-32-be'),
            (prefix[::2], b'\x00\x00', 'utf-16-be'),
            (prefix[1:], b'\x00\x00\x00', 'utf-32-le'),
            (prefix[1::2], b'\x00\x00', 'utf-16-le'),
        )

        for observed, expected, codec in null_patterns:
            if observed == expected:
                return codec

    return 'utf-8'


def dumps(obj, **kwargs):
"""Serialize ``obj`` to a JSON formatted ``str`` by using the application's
configured encoder (:attr:`~flask.Flask.json_encoder`) if there is an
Expand Down Expand Up @@ -155,7 +198,10 @@ def loads(s, **kwargs):
"""
_load_arg_defaults(kwargs)
if isinstance(s, bytes):
s = s.decode(kwargs.pop('encoding', None) or 'utf-8')
encoding = kwargs.pop('encoding', None)
if encoding is None:
encoding = detect_encoding(s)
s = s.decode(encoding)
return _json.loads(s, **kwargs)


Expand Down
24 changes: 11 additions & 13 deletions flask/wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,30 +50,28 @@ def _get_data_for_json(self, cache):
return self.get_data(cache=cache)

def get_json(self, force=False, silent=False, cache=True):
"""Parse and return the data as JSON. If the mimetype does not indicate
JSON (:mimetype:`application/json`, see :meth:`is_json`), this returns
``None`` unless ``force`` is true. If parsing fails,
:meth:`on_json_loading_failed` is called and its return value is used
as the return value.
"""Parse and return the data as JSON. If the mimetype does not
indicate JSON (:mimetype:`application/json`, see
:meth:`is_json`), this returns ``None`` unless ``force`` is
true. If parsing fails, :meth:`on_json_loading_failed` is called
and its return value is used as the return value.

:param force: Ignore the mimetype and always try to parse JSON.
:param silent: Silence parsing errors and return ``None`` instead.
:param cache: Store the parsed JSON to return for subsequent calls.
:param silent: Silence parsing errors and return ``None``
instead.
:param cache: Store the parsed JSON to return for subsequent
calls.
"""
if cache and self._cached_json[silent] is not Ellipsis:
return self._cached_json[silent]

if not (force or self.is_json):
return None

# We accept MIME charset against the specification as certain clients
# have used this in the past. For responses, we assume that if the
# charset is set then the data has been encoded correctly as well.
charset = self.mimetype_params.get('charset')
data = self._get_data_for_json(cache=cache)

try:
data = self._get_data_for_json(cache=cache)
rv = json.loads(data, encoding=charset)
rv = json.loads(data)
except ValueError as e:
if silent:
rv = None
Expand Down
30 changes: 19 additions & 11 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,13 @@
import pytest
from werkzeug.datastructures import Range
from werkzeug.exceptions import BadRequest, NotFound
from werkzeug.http import http_date, parse_cache_control_header, \
from werkzeug.http import (
http_date, parse_cache_control_header,
parse_options_header
)

import flask
from flask import json
from flask._compat import StringIO, text_type
from flask.helpers import get_debug_flag, get_env

Expand Down Expand Up @@ -55,6 +58,21 @@ def dst(self, dt):


class TestJSON(object):
@pytest.mark.parametrize('value', (
    1, 't', True, False, None,
    [], [1, 2, 3],
    {}, {'foo': u'🐍'},
))
@pytest.mark.parametrize('encoding', (
    'utf-8', 'utf-8-sig',
    'utf-16-le', 'utf-16-be', 'utf-16',
    'utf-32-le', 'utf-32-be', 'utf-32',
))
def test_detect_encoding(self, value, encoding):
    """Every supported UTF codec is detected and decodes correctly.

    Each JSON value is serialized, encoded with each codec, and then
    checked twice: the detected codec name must equal the one used, and
    loading the raw bytes must give back the original value.
    """
    encoded = json.dumps(value).encode(encoding)
    detected = json.detect_encoding(encoded)
    assert detected == encoding
    assert json.loads(encoded) == value

def test_ignore_cached_json(self, app):
with app.test_request_context('/', method='POST', data='malformed',
content_type='application/json'):
Expand Down Expand Up @@ -121,16 +139,6 @@ def return_json():
rv = client.post('/json', data='"foo"', content_type='application/x+json')
assert rv.data == b'foo'

def test_json_body_encoding(self, app, client):

@app.route('/')
def index():
return flask.request.get_json()

resp = client.get('/', data=u'"Hällo Wörld"'.encode('iso-8859-15'),
content_type='application/json; charset=iso-8859-15')
assert resp.data == u'Hällo Wörld'.encode('utf-8')

@pytest.mark.parametrize('test_value,expected', [(True, '"\\u2603"'), (False, u'"\u2603"')])
def test_json_as_unicode(self, test_value, expected, app, app_ctx):

Expand Down