Skip to content

Commit 0e1e9a0

Browse files
committed
detect UTF encodings when loading json
1 parent aab4c8c commit 0e1e9a0

File tree

3 files changed

+67
-23
lines changed

3 files changed

+67
-23
lines changed

flask/json.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
:copyright: (c) 2015 by Armin Ronacher.
99
:license: BSD, see LICENSE for more details.
1010
"""
11+
import codecs
1112
import io
1213
import uuid
1314
from datetime import date
@@ -108,6 +109,49 @@ def _load_arg_defaults(kwargs):
108109
kwargs.setdefault('cls', JSONDecoder)
109110

110111

112+
def detect_encoding(data):
    """Detect which UTF codec was used to encode the given bytes.

    The latest JSON standard (:rfc:`8259`) suggests that only UTF-8 is
    accepted. Older documents allowed 8, 16, or 32. 16 and 32 can be big
    or little endian. Some editors or libraries may prepend a BOM.

    A BOM, when present, decides the result. Without one, the guess is
    based on where NUL bytes appear around the *first* character only.
    A JSON document always starts with an ASCII character, but the
    characters after it do not have to be ASCII (for example a top-level
    string whose content is non-ASCII), so no assumption is made about
    the second character.

    :param data: Bytes in unknown UTF encoding.
    :return: UTF encoding name. One of ``'utf-8'``, ``'utf-8-sig'``,
        ``'utf-16'``, ``'utf-16-le'``, ``'utf-16-be'``, ``'utf-32'``,
        ``'utf-32-le'`` or ``'utf-32-be'``. Falls back to ``'utf-8'``
        when nothing else matches, including for empty input.
    """
    head = data[:4]
    nul = b'\x00'

    # BOMs are checked before any NUL heuristics so that a BOM is never
    # ignored. UTF-32 has to come before UTF-16 because BOM_UTF32_LE
    # begins with the same two bytes as BOM_UTF16_LE.
    if head in (codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE):
        return 'utf-32'

    if head[:2] in (codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE):
        return 'utf-16'

    if head[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'

    # Slices are compared instead of single items: indexing a bytes
    # object yields an int on Python 3, while a one-byte slice always
    # compares equal to a one-byte bytes literal.
    if len(head) == 4:
        if head[:1] == nul:
            # 00 00 -- --  -> utf-32-be
            # 00 XX -- --  -> utf-16-be
            return 'utf-32-be' if head[1:2] == nul else 'utf-16-be'

        if head[1:2] == nul:
            # XX 00 00 00  -> utf-32-le
            # XX 00 XX --  -> utf-16-le
            # XX 00 00 XX  -> utf-16-le
            return 'utf-32-le' if head[2:] == nul * 2 else 'utf-16-le'
    elif len(head) == 2:
        # A single UTF-16 code unit without a BOM.
        if head[:1] == nul:
            return 'utf-16-be'

        if head[1:] == nul:
            return 'utf-16-le'

    return 'utf-8'
153+
154+
111155
def dumps(obj, **kwargs):
112156
"""Serialize ``obj`` to a JSON formatted ``str`` by using the application's
113157
configured encoder (:attr:`~flask.Flask.json_encoder`) if there is an
@@ -142,7 +186,10 @@ def loads(s, **kwargs):
142186
"""
143187
_load_arg_defaults(kwargs)
144188
if isinstance(s, bytes):
145-
s = s.decode(kwargs.pop('encoding', None) or 'utf-8')
189+
encoding = kwargs.pop('encoding', None)
190+
if encoding is None:
191+
encoding = detect_encoding(s)
192+
s = s.decode(encoding)
146193
return _json.loads(s, **kwargs)
147194

148195

flask/wrappers.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -144,17 +144,10 @@ def get_json(self, force=False, silent=False, cache=True):
144144
if not (force or self.is_json):
145145
return None
146146

147-
# We accept a request charset against the specification as
148-
# certain clients have been using this in the past. This
149-
# fits our general approach of being nice in what we accept
150-
# and strict in what we send out.
151-
request_charset = self.mimetype_params.get('charset')
147+
data = _get_data(self, cache)
148+
152149
try:
153-
data = _get_data(self, cache)
154-
if request_charset is not None:
155-
rv = json.loads(data, encoding=request_charset)
156-
else:
157-
rv = json.loads(data)
150+
rv = json.loads(data)
158151
except ValueError as e:
159152
if silent:
160153
rv = None

tests/test_helpers.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
from werkzeug.exceptions import BadRequest, NotFound
2222
from werkzeug.http import parse_cache_control_header, parse_options_header
2323
from werkzeug.http import http_date
24+
25+
from flask import json
2426
from flask._compat import StringIO, text_type
2527

2628

@@ -34,6 +36,20 @@ def has_encoding(name):
3436

3537

3638
class TestJSON(object):
39+
@pytest.mark.parametrize('value', (
    1, 't', True, False, None,
    [], [1, 2, 3],
    {}, {'foo': u'🐍'},
))
@pytest.mark.parametrize('encoding', (
    'utf-8', 'utf-8-sig',
    'utf-16-le', 'utf-16-be', 'utf-16',
    'utf-32-le', 'utf-32-be', 'utf-32',
))
def test_detect_encoding(self, value, encoding):
    """Check encoding detection for every value/encoding combination.

    The value is serialized with ``json.dumps``, encoded with the
    parametrized codec, and the resulting bytes must both be identified
    as that codec and load back to the original value.
    """
    payload = json.dumps(value).encode(encoding)
    guessed = json.detect_encoding(payload)
    assert guessed == encoding
    # No encoding is passed here, so loading relies on detection alone.
    assert json.loads(payload) == value
3753

3854
def test_ignore_cached_json(self):
3955
app = flask.Flask(__name__)
@@ -85,18 +101,6 @@ def return_json():
85101
rv = c.post('/json', data='"foo"', content_type='application/x+json')
86102
assert rv.data == b'foo'
87103

88-
def test_json_body_encoding(self):
89-
app = flask.Flask(__name__)
90-
app.testing = True
91-
@app.route('/')
92-
def index():
93-
return flask.request.get_json()
94-
95-
c = app.test_client()
96-
resp = c.get('/', data=u'"Hällo Wörld"'.encode('iso-8859-15'),
97-
content_type='application/json; charset=iso-8859-15')
98-
assert resp.data == u'Hällo Wörld'.encode('utf-8')
99-
100104
def test_json_as_unicode(self):
101105
app = flask.Flask(__name__)
102106

0 commit comments

Comments
 (0)