Skip to content

Commit 25401fd

Browse files
committed
Use six.PY2 instead of six.PY3 for Python version variations
Also, don't test the passed encoding against 'utf8'; just assume that if encoding failed, it must have been with another encoding.
1 parent 68dedf5 commit 25401fd

File tree

1 file changed

+13
-15
lines changed

1 file changed

+13
-15
lines changed

scrapy/utils/url.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,16 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
83
83
try:
8484
scheme, netloc, path, params, query, fragment = _safe_ParseResult(
8585
parse_url(url), encoding=encoding)
86-
except UnicodeError as e:
87-
if encoding != 'utf8':
88-
scheme, netloc, path, params, query, fragment = _safe_ParseResult(
89-
parse_url(url), encoding='utf8')
90-
else:
91-
raise
86+
except UnicodeEncodeError as e:
87+
scheme, netloc, path, params, query, fragment = _safe_ParseResult(
88+
parse_url(url), encoding='utf8')
9289

9390
# 1. decode query-string as UTF-8 (or keep raw bytes),
9491
# sort values,
9592
# and percent-encode them back
96-
if not six.PY2:
93+
if six.PY2:
94+
keyvals = parse_qsl(query, keep_blank_values)
95+
else:
9796
# Python3's urllib.parse.parse_qsl does not work as wanted
9897
# for percent-encoded characters that do not match passed encoding,
9998
# they get lost.
@@ -118,8 +117,6 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
118
117
# IRIs (namely, to be able to include non-ASCII characters) can only be
119118
# used if the query part is encoded in UTF-8.
120119
keyvals = parse_qsl_to_bytes(query, keep_blank_values)
121-
else:
122-
keyvals = parse_qsl(query, keep_blank_values)
123120
keyvals.sort()
124121
query = urlencode(keyvals)
125122

@@ -138,16 +135,17 @@ def _unquotepath(path):
138135
for reserved in ('2f', '2F', '3f', '3F'):
139136
path = path.replace('%' + reserved, '%25' + reserved.upper())
140137

141-
if six.PY3:
142-
# standard lib's unquote() does not work in Python 3
143-
# for non-UTF-8 percent-escaped characters, they get lost.
138+
if six.PY2:
139+
# in Python 2, '%a3' becomes '\xa3', which is what we want
140+
return unquote(path)
141+
else:
142+
# in Python 3,
143+
# standard lib's unquote() does not work for non-UTF-8
144+
# percent-escaped characters, they get lost.
144145
# e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
145146
#
146147
# unquote_to_bytes() returns raw bytes instead
147148
return unquote_to_bytes(path)
148-
else:
149-
# in Python 2, '%a3' becomes '\xa3', which is what we want
150-
return unquote(path)
151149

152150

153151
def parse_url(url, encoding=None):

0 commit comments

Comments
 (0)