"""
import posixpath
import re
+ import six
from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
                                    urlparse, parse_qsl, urlencode,
-                                     unquote)
+                                     quote, unquote)
+ if six.PY3:
+     from urllib.parse import unquote_to_bytes


# scrapy.utils.url was moved to w3lib.url and import * ensures this
# move doesn't break old code
from w3lib.url import *
from w3lib.url import _safe_chars
- from scrapy.utils.python import to_native_str
+ from scrapy.utils.python import to_bytes, to_native_str, to_unicode


def url_is_from_any_domain(url, domains):
@@ -37,42 +40,114 @@ def url_has_any_extension(url, extensions):
    return posixpath.splitext(parse_url(url).path)[1].lower() in extensions


+ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
+     return (
+         to_native_str(parts.scheme),
+         to_native_str(parts.netloc.encode('idna')),
+
+         # default encoding for path component SHOULD be UTF-8
+         quote(to_bytes(parts.path, path_encoding), _safe_chars),
+         quote(to_bytes(parts.params, path_encoding), _safe_chars),
+
+         # encoding of query and fragment follows page encoding
+         # or form-charset (if known and passed)
+         quote(to_bytes(parts.query, encoding), _safe_chars),
+         quote(to_bytes(parts.fragment, encoding), _safe_chars)
+     )
+
+
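An illustrative sketch (not part of this change) of what _safe_ParseResult is expected to produce, following the per-component rules in the comments above; the hostname, path and expected outputs below are assumptions made up for the example:

# illustrative only: non-ASCII host, path and query, with a latin1 page encoding
from six.moves.urllib.parse import urlparse

parts = urlparse(u'http://www.münchen.de/búsqueda?q=münchen')
_safe_ParseResult(parts, encoding='latin1')
# roughly expected:
# ('http',
#  'www.xn--mnchen-3ya.de',   # netloc is IDNA-encoded
#  '/b%C3%BAsqueda',          # path is always percent-encoded as UTF-8
#  '',                        # params
#  'q=m%FCnchen',             # query follows the page encoding (latin1 here)
#  '')                        # fragment
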
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                     encoding=None):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
-     - percent encode paths and query arguments. non-ASCII characters are
-       percent-encoded using UTF-8 (RFC-3986)
+     - percent encode paths; non-ASCII characters are percent-encoded
+       using UTF-8 (RFC-3986)
+     - percent encode query arguments; non-ASCII characters are percent-encoded
+       using the passed `encoding` (UTF-8 by default)
    - normalize all spaces (in query arguments) '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
-     - remove query arguments with blank values (unless keep_blank_values is True)
-     - remove fragments (unless keep_fragments is True)
+     - remove query arguments with blank values (unless `keep_blank_values` is True)
+     - remove fragments (unless `keep_fragments` is True)

-     The url passed can be a str or unicode, while the url returned is always a
-     str.
+     The url passed can be bytes or unicode, while the url returned is
+     always a native str (bytes in Python 2, unicode in Python 3).

    For examples see the tests in tests/test_utils_url.py
    """
+     # If the supplied `encoding` is not compatible with all characters in `url`,
+     # fall back to UTF-8 as a safety net.
+     # UTF-8 can handle all Unicode characters, so we should be covered
+     # regarding URL normalization, if not for the exact URL expected by
+     # the remote website.
+     try:
+         scheme, netloc, path, params, query, fragment = _safe_ParseResult(
+             parse_url(url), encoding=encoding)
+     except UnicodeError as e:
+         if encoding != 'utf8':
+             scheme, netloc, path, params, query, fragment = _safe_ParseResult(
+                 parse_url(url), encoding='utf8')
+         else:
+             raise
-     scheme, netloc, path, params, query, fragment = parse_url(url)
-     keyvals = parse_qsl(query, keep_blank_values)
+     # 1. decode query-string as UTF-8 (or keep raw bytes),
+     #    sort values,
+     #    and percent-encode them back
+     if not six.PY2:
+         # Python 3's urllib.parse.parse_qsl does not work as wanted
+         # for percent-encoded characters that do not match the passed encoding:
+         # they get lost.
+         #
+         # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
+         # (i.e. with 'REPLACEMENT CHARACTER' (U+FFFD),
+         #  instead of the \xa3 that you get with Python 2's parse_qsl)
+         #
+         # what we want here is to keep the raw bytes, and percent-encode them,
+         # so as to preserve whatever encoding was originally used.
+         #
+         # See https://tools.ietf.org/html/rfc3987#section-6.4:
+         #
+         #   For example, it is possible to have a URI reference of
+         #   "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
+         #   document name is encoded in iso-8859-1 based on server settings, but
+         #   where the fragment identifier is encoded in UTF-8 according to
+         #   [XPointer]. The IRI corresponding to the above URI would be (in XML
+         #   notation)
+         #   "http://www.example.org/r%E9sum%E9.xml#résumé".
+         #   Similar considerations apply to query parts. The functionality of
+         #   IRIs (namely, to be able to include non-ASCII characters) can only be
+         #   used if the query part is encoded in UTF-8.
+         keyvals = parse_qsl_to_bytes(query, keep_blank_values)
+     else:
+         keyvals = parse_qsl(query, keep_blank_values)
    keyvals.sort()
    query = urlencode(keyvals)

-     # XXX: copied from w3lib.url.safe_url_string to add encoding argument
-     # path = to_native_str(path, encoding)
-     # path = moves.urllib.parse.quote(path, _safe_chars, encoding='latin1') or '/'
+     # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
+     #    and percent-encode the path again (this normalizes to upper-case %XX)
+     uqp = _unquotepath(path)
+     path = quote(uqp, _safe_chars) or '/'

-     path = safe_url_string(_unquotepath(path)) or '/'
    fragment = '' if not keep_fragments else fragment
+
+     # every part should be safe already
    return urlunparse((scheme, netloc.lower(), path, params, query, fragment))


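A rough usage sketch (not part of the diff) of the behaviors listed in the docstring above; the URLs are made up for illustration and the expected outputs follow the stated rules (the authoritative examples are in tests/test_utils_url.py):

from scrapy.utils.url import canonicalize_url

# query arguments sorted by key, then value; blank values kept by default
canonicalize_url('http://www.example.com/do?b=2&c=&a=1')
# -> 'http://www.example.com/do?a=1&b=2&c='

# percent-encoding case normalized; fragment dropped unless keep_fragments=True
canonicalize_url('http://www.example.com/a%2fb#frag')
# -> 'http://www.example.com/a%2Fb'

# an encoding that cannot represent the URL falls back to UTF-8 instead of raising
canonicalize_url(u'http://www.example.com/résumé', encoding='ascii')
# -> 'http://www.example.com/r%C3%A9sum%C3%A9'
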
def _unquotepath(path):
    for reserved in ('2f', '2F', '3f', '3F'):
        path = path.replace('%' + reserved, '%25' + reserved.upper())
-     return unquote(path)
+
+     if six.PY3:
+         # the standard library's unquote() does not work in Python 3
+         # for non-UTF-8 percent-escaped characters: they get lost.
+         # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
+         #
+         # unquote_to_bytes() returns the raw bytes instead
+         return unquote_to_bytes(path)
+     else:
+         # in Python 2, '%a3' becomes '\xa3', which is what we want
+         return unquote(path)


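A small sketch (illustrative, not in the commit) of what _unquotepath is expected to return, per the comments above; the example paths are assumptions:

_unquotepath('/a%2fb%20c')
# -> b'/a%2Fb c' on Python 3 (raw bytes), '/a%2Fb c' on Python 2
# %2f/%3f stay escaped so that decoding cannot change how the path splits

_unquotepath('/caf%e9')
# -> b'/caf\xe9' on Python 3, '/caf\xe9' on Python 2
# the non-UTF-8 escape is kept as a raw byte instead of becoming U+FFFD
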
def parse_url(url, encoding=None):
@@ -81,7 +156,60 @@ def parse_url(http://github.com/url,%20encoding=None):
    """
    if isinstance(url, ParseResult):
        return url
-     return urlparse(to_native_str(url, encoding))
+     return urlparse(to_unicode(url, encoding))
+
+
+ if six.PY3:
+     from urllib.parse import _coerce_args, unquote_to_bytes
+
+     def parse_qsl_to_bytes(qs, keep_blank_values=False, strict_parsing=False):
+         """Parse a query given as a string argument.
+
+         Data are returned as a list of name, value pairs as bytes.
+
+         Arguments:
+
+         qs: percent-encoded query string to be parsed
+
+         keep_blank_values: flag indicating whether blank values in
+             percent-encoded queries should be treated as blank strings.
+             A true value indicates that blanks should be retained as blank
+             strings. The default false value indicates that blank values
+             are to be ignored and treated as if they were not included.
+
+         strict_parsing: flag indicating what to do with parsing errors. If
+             false (the default), errors are silently ignored. If true,
+             errors raise a ValueError exception.
+
+         """
+         # This code is the same as Python 3's parse_qsl()
+         # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
+         # except for the unquote(s, encoding, errors) calls replaced
+         # with unquote_to_bytes(s)
+         qs, _coerce_result = _coerce_args(qs)
+         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
+         r = []
+         for name_value in pairs:
+             if not name_value and not strict_parsing:
+                 continue
+             nv = name_value.split('=', 1)
+             if len(nv) != 2:
+                 if strict_parsing:
+                     raise ValueError("bad query field: %r" % (name_value,))
+                 # Handle case of a control-name with no equal sign
+                 if keep_blank_values:
+                     nv.append('')
+                 else:
+                     continue
+             if len(nv[1]) or keep_blank_values:
+                 name = nv[0].replace('+', ' ')
+                 name = unquote_to_bytes(name)
+                 name = _coerce_result(name)
+                 value = nv[1].replace('+', ' ')
+                 value = unquote_to_bytes(value)
+                 value = _coerce_result(value)
+                 r.append((name, value))
+         return r
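
An illustrative comparison (not part of the change), assuming Python 3, of the stdlib parse_qsl against the parse_qsl_to_bytes variant above, using the 'q=b%a3' example quoted in the earlier comment:

from urllib.parse import parse_qsl

parse_qsl('q=b%a3')           # -> [('q', 'b\ufffd')]   the non-UTF-8 byte is lost
parse_qsl_to_bytes('q=b%a3')  # -> [(b'q', b'b\xa3')]   the raw byte is preserved
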


def escape_ajax(url):