Skip to content

Commit 68dedf5

Browse files
committed
Fix canonicalize_url() on Python 3 and re-enable tests
1 parent 73a5571 commit 68dedf5

File tree

2 files changed

+215
-22
lines changed

2 files changed

+215
-22
lines changed

scrapy/utils/url.py

Lines changed: 144 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,18 @@
77
"""
88
import posixpath
99
import re
10+
import six
1011
from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
1112
urlparse, parse_qsl, urlencode,
12-
unquote)
13+
quote, unquote)
14+
if six.PY3:
15+
from urllib.parse import unquote_to_bytes
1316

1417
# scrapy.utils.url was moved to w3lib.url and import * ensures this
1518
# move doesn't break old code
1619
from w3lib.url import *
1720
from w3lib.url import _safe_chars
18-
from scrapy.utils.python import to_native_str
21+
from scrapy.utils.python import to_bytes, to_native_str, to_unicode
1922

2023

2124
def url_is_from_any_domain(url, domains):
@@ -37,42 +40,114 @@ def url_has_any_extension(url, extensions):
3740
return posixpath.splitext(parse_url(url).path)[1].lower() in extensions
3841

3942

43+
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
44+
return (
45+
to_native_str(parts.scheme),
46+
to_native_str(parts.netloc.encode('idna')),
47+
48+
# default encoding for path component SHOULD be UTF-8
49+
quote(to_bytes(parts.path, path_encoding), _safe_chars),
50+
quote(to_bytes(parts.params, path_encoding), _safe_chars),
51+
52+
# encoding of query and fragment follows page encoding
53+
# or form-charset (if known and passed)
54+
quote(to_bytes(parts.query, encoding), _safe_chars),
55+
quote(to_bytes(parts.fragment, encoding), _safe_chars)
56+
)
57+
58+
4059
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
4160
encoding=None):
4261
"""Canonicalize the given url by applying the following procedures:
4362
4463
- sort query arguments, first by key, then by value
45-
- percent encode paths and query arguments. non-ASCII characters are
46-
percent-encoded using UTF-8 (RFC-3986)
64+
- percent encode paths; non-ASCII characters are percent-encoded
65+
using UTF-8 (RFC-3986)
66+
- percent encode query arguments; non-ASCII characters are percent-encoded
67+
using passed `encoding` (UTF-8 by default)
4768
- normalize all spaces (in query arguments) to '+' (plus symbol)
4869
- normalize percent encodings case (%2f -> %2F)
49-
- remove query arguments with blank values (unless keep_blank_values is True)
50-
- remove fragments (unless keep_fragments is True)
70+
- remove query arguments with blank values (unless `keep_blank_values` is True)
71+
- remove fragments (unless `keep_fragments` is True)
5172
52-
The url passed can be a str or unicode, while the url returned is always a
53-
str.
73+
The url passed can be bytes or unicode, while the url returned is
74+
always a native str (bytes in Python 2, unicode in Python 3).
5475
5576
For examples see the tests in tests/test_utils_url.py
5677
"""
78+
# If supplied `encoding` is not compatible with all characters in `url`,
79+
# fallback to UTF-8 as safety net.
80+
# UTF-8 can handle all Unicode characters,
81+
# so URL normalization is still covered,
82+
# even if the result is not the exact URL expected by the remote website.
83+
try:
84+
scheme, netloc, path, params, query, fragment = _safe_ParseResult(
85+
parse_url(url), encoding=encoding)
86+
except UnicodeError as e:
87+
if encoding != 'utf8':
88+
scheme, netloc, path, params, query, fragment = _safe_ParseResult(
89+
parse_url(url), encoding='utf8')
90+
else:
91+
raise
5792

58-
scheme, netloc, path, params, query, fragment = parse_url(url)
59-
keyvals = parse_qsl(query, keep_blank_values)
93+
# 1. decode query-string as UTF-8 (or keep raw bytes),
94+
# sort values,
95+
# and percent-encode them back
96+
if not six.PY2:
97+
# Python3's urllib.parse.parse_qsl does not work as wanted
98+
# for percent-encoded characters that do not match passed encoding,
99+
# they get lost.
100+
#
101+
# e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
102+
# (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
103+
# instead of \xa3 that you get with Python2's parse_qsl)
104+
#
105+
# what we want here is to keep raw bytes, and percent encode them
106+
# so as to preserve whatever encoding was originally used.
107+
#
108+
# See https://tools.ietf.org/html/rfc3987#section-6.4:
109+
#
110+
# For example, it is possible to have a URI reference of
111+
# "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
112+
# document name is encoded in iso-8859-1 based on server settings, but
113+
# where the fragment identifier is encoded in UTF-8 according to
114+
# [XPointer]. The IRI corresponding to the above URI would be (in XML
115+
# notation)
116+
# "http://www.example.org/r%E9sum%E9.xml#résumé".
117+
# Similar considerations apply to query parts. The functionality of
118+
# IRIs (namely, to be able to include non-ASCII characters) can only be
119+
# used if the query part is encoded in UTF-8.
120+
keyvals = parse_qsl_to_bytes(query, keep_blank_values)
121+
else:
122+
keyvals = parse_qsl(query, keep_blank_values)
60123
keyvals.sort()
61124
query = urlencode(keyvals)
62125

63-
# XXX: copied from w3lib.url.safe_url_string to add encoding argument
64-
# path = to_native_str(path, encoding)
65-
# path = moves.urllib.parse.quote(path, _safe_chars, encoding='latin1') or '/'
126+
# 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
127+
# and percent-encode path again (this normalizes to upper-case %XX)
128+
uqp = _unquotepath(path)
129+
path = quote(uqp, _safe_chars) or '/'
66130

67-
path = safe_url_string(_unquotepath(path)) or '/'
68131
fragment = '' if not keep_fragments else fragment
132+
133+
# every part should be safe already
69134
return urlunparse((scheme, netloc.lower(), path, params, query, fragment))
70135

71136

72137
def _unquotepath(path):
73138
for reserved in ('2f', '2F', '3f', '3F'):
74139
path = path.replace('%' + reserved, '%25' + reserved.upper())
75-
return unquote(path)
140+
141+
if six.PY3:
142+
# standard lib's unquote() does not work in Python 3
143+
# for non-UTF-8 percent-escaped characters, they get lost.
144+
# e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
145+
#
146+
# unquote_to_bytes() returns raw bytes instead
147+
return unquote_to_bytes(path)
148+
else:
149+
# in Python 2, '%a3' becomes '\xa3', which is what we want
150+
return unquote(path)
76151

77152

78153
def parse_url(url, encoding=None):
@@ -81,7 +156,60 @@ def parse_url(url, encoding=None):
81156
"""
82157
if isinstance(url, ParseResult):
83158
return url
84-
return urlparse(to_native_str(url, encoding))
159+
return urlparse(to_unicode(url, encoding))
160+
161+
162+
if six.PY3:
163+
from urllib.parse import _coerce_args, unquote_to_bytes
164+
165+
def parse_qsl_to_bytes(qs, keep_blank_values=False, strict_parsing=False):
166+
"""Parse a query given as a string argument.
167+
168+
Data are returned as a list of name, value pairs as bytes.
169+
170+
Arguments:
171+
172+
qs: percent-encoded query string to be parsed
173+
174+
keep_blank_values: flag indicating whether blank values in
175+
percent-encoded queries should be treated as blank strings. A
176+
true value indicates that blanks should be retained as blank
177+
strings. The default false value indicates that blank values
178+
are to be ignored and treated as if they were not included.
179+
180+
strict_parsing: flag indicating what to do with parsing errors. If
181+
false (the default), errors are silently ignored. If true,
182+
errors raise a ValueError exception.
183+
184+
"""
185+
# This code is the same as Python3's parse_qsl()
186+
# (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
187+
# except for the unquote(s, encoding, errors) calls replaced
188+
# with unquote_to_bytes(s)
189+
qs, _coerce_result = _coerce_args(qs)
190+
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
191+
r = []
192+
for name_value in pairs:
193+
if not name_value and not strict_parsing:
194+
continue
195+
nv = name_value.split('=', 1)
196+
if len(nv) != 2:
197+
if strict_parsing:
198+
raise ValueError("bad query field: %r" % (name_value,))
199+
# Handle case of a control-name with no equal sign
200+
if keep_blank_values:
201+
nv.append('')
202+
else:
203+
continue
204+
if len(nv[1]) or keep_blank_values:
205+
name = nv[0].replace('+', ' ')
206+
name = unquote_to_bytes(name)
207+
name = _coerce_result(name)
208+
value = nv[1].replace('+', ' ')
209+
value = unquote_to_bytes(value)
210+
value = _coerce_result(value)
211+
r.append((name, value))
212+
return r
85213

86214

87215
def escape_ajax(url):

tests/test_utils_url.py

Lines changed: 71 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
import unittest
33

44
import six
5+
from six.moves.urllib.parse import urlparse
6+
57
from scrapy.spiders import Spider
68
from scrapy.utils.url import (url_is_from_any_domain, url_is_from_spider,
79
canonicalize_url, add_http_if_no_scheme,
8-
guess_scheme)
10+
guess_scheme, parse_url)
911

1012
__doctests__ = ['scrapy.utils.url']
1113

@@ -123,16 +125,55 @@ def test_spaces(self):
123125
self.assertEqual(canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
124126
"http://www.example.com/do?a=1&q=a+space")
125127

126-
@unittest.skipUnless(six.PY2, "TODO")
128+
def test_canonicalize_url_unicode_path(self):
129+
self.assertEqual(canonicalize_url(u"http://www.example.com/résumé"),
130+
"http://www.example.com/r%C3%A9sum%C3%A9")
131+
132+
def test_canonicalize_url_unicode_query_string(self):
133+
# default encoding for path and query is UTF-8
134+
self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé"),
135+
"http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
136+
137+
# passed encoding will affect query string
138+
self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé", encoding='latin1'),
139+
"http://www.example.com/r%C3%A9sum%C3%A9?q=r%E9sum%E9")
140+
141+
self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='cp1251'),
142+
"http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%EE%F1%F1%E8%FF")
143+
144+
def test_canonicalize_url_unicode_query_string_wrong_encoding(self):
145+
# trying to encode with wrong encoding
146+
# fallback to UTF-8
147+
self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?currency=€", encoding='latin1'),
148+
"http://www.example.com/r%C3%A9sum%C3%A9?currency=%E2%82%AC")
149+
150+
self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='latin1'),
151+
"http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F")
152+
127153
def test_normalize_percent_encoding_in_paths(self):
154+
self.assertEqual(canonicalize_url("http://www.example.com/r%c3%a9sum%c3%a9"),
155+
"http://www.example.com/r%C3%A9sum%C3%A9")
156+
157+
# non-UTF8 encoded sequences: they should be kept untouched, only upper-cased
158+
# 'latin1'-encoded sequence in path
128159
self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
129-
"http://www.example.com/a%A3do"),
160+
"http://www.example.com/a%A3do")
161+
162+
# 'latin1'-encoded path, UTF-8 encoded query string
163+
self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9"),
164+
"http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
165+
166+
# 'latin1'-encoded path and query string
167+
self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%e9sum%e9"),
168+
"http://www.example.com/a%A3do?q=r%E9sum%E9")
130169

131-
@unittest.skipUnless(six.PY2, "TODO")
132170
def test_normalize_percent_encoding_in_query_arguments(self):
133171
self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
134172
"http://www.example.com/do?k=b%A3")
135173

174+
self.assertEqual(canonicalize_url("http://www.example.com/do?k=r%c3%a9sum%c3%a9"),
175+
"http://www.example.com/do?k=r%C3%A9sum%C3%A9")
176+
136177
def test_non_ascii_percent_encoding_in_paths(self):
137178
self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
138179
"http://www.example.com/a%20do?a=1"),
@@ -144,7 +185,7 @@ def test_non_ascii_percent_encoding_in_paths(self):
144185
"http://www.example.com/a%20do%C2%A3.html?a=1")
145186

146187
def test_non_ascii_percent_encoding_in_query_arguments(self):
147-
self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=\xa3500&a=5&z=3"),
188+
self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=£500&a=5&z=3"),
148189
u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
149190
self.assertEqual(canonicalize_url(b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
150191
"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
@@ -167,7 +208,6 @@ def test_dont_convert_safe_characters(self):
167208
"http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
168209
"http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
169210

170-
@unittest.skipUnless(six.PY2, "TODO")
171211
def test_safe_characters_unicode(self):
172212
# urllib.quote uses a mapping cache of encoded characters. when parsing
173213
# an already percent-encoded url, it will fail if that url was not
@@ -181,12 +221,37 @@ def test_domains_are_case_insensitive(self):
181221
self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
182222
"http://www.example.com/")
183223

224+
def test_canonicalize_idns(self):
225+
self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'),
226+
'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')
227+
# Japanese (+ reordering query parameters)
228+
self.assertEqual(canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
229+
'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')
230+
184231
def test_quoted_slash_and_question_sign(self):
185232
self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
186233
"http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
187234
self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
188235
"http://foo.com/AC%2FDC/")
189236

237+
def test_canonicalize_urlparsed(self):
238+
# canonicalize_url() can be passed an already urlparse'd URL
239+
self.assertEqual(canonicalize_url(urlparse(u"http://www.example.com/résumé?q=résumé")),
240+
"http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
241+
self.assertEqual(canonicalize_url(urlparse('http://www.example.com/caf%e9-con-leche.htm')),
242+
'http://www.example.com/caf%E9-con-leche.htm')
243+
self.assertEqual(canonicalize_url(urlparse("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
244+
"http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
245+
246+
def test_canonicalize_parse_url(self):
247+
# parse_url() wraps urlparse and is used in link extractors
248+
self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
249+
"http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
250+
self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
251+
'http://www.example.com/caf%E9-con-leche.htm')
252+
self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
253+
"http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
254+
190255

191256
class AddHttpIfNoScheme(unittest.TestCase):
192257

0 commit comments

Comments
 (0)