java2man
diff --git a/‎scrapy/utils/url.py
Lines changed: 144 additions & 16 deletions b/‎scrapy/utils/url.py
Lines changed: 144 additions & 16 deletions
diff --git a/‎tests/test_utils_url.py
Lines changed: 71 additions & 6 deletions b/‎tests/test_utils_url.py
Lines changed: 71 additions & 6 deletions
@@ -7,15 +7,18 @@
 """
 import posixpath
 import re
+import six
 from six.moves.urllib.parse import (ParseResult, urlunparse, urldefrag,
                                     urlparse, parse_qsl, urlencode,
-                                    unquote)
+                                    quote, unquote)
+if six.PY3:
+    from urllib.parse import unquote_to_bytes
 
 # scrapy.utils.url was moved to w3lib.url and import * ensures this
 # move doesn't break old code
 from w3lib.url import *
 from w3lib.url import _safe_chars
-from scrapy.utils.python import to_native_str
+from scrapy.utils.python import to_bytes, to_native_str, to_unicode
 
 
 def url_is_from_any_domain(url, domains):
@@ -37,42 +40,114 @@ def url_has_any_extension(url, extensions):
     return posixpath.splitext(parse_url(url).path)[1].lower() in extensions
 
 
+def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
+    return (
+        to_native_str(parts.scheme),
+        to_native_str(parts.netloc.encode('idna')),
+
+        # default encoding for path component SHOULD be UTF-8
+        quote(to_bytes(parts.path, path_encoding), _safe_chars),
+        quote(to_bytes(parts.params, path_encoding), _safe_chars),
+
+        # encoding of query and fragment follows page encoding
+        # or form-charset (if known and passed)
+        quote(to_bytes(parts.query, encoding), _safe_chars),
+        quote(to_bytes(parts.fragment, encoding), _safe_chars)
+    )
+
+
 def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                      encoding=None):
     """Canonicalize the given url by applying the following procedures:
 
     - sort query arguments, first by key, then by value
-    - percent encode paths and query arguments. non-ASCII characters are
-      percent-encoded using UTF-8 (RFC-3986)
+    - percent encode paths ; non-ASCII characters are percent-encoded
+      using UTF-8 (RFC-3986)
+    - percent encode query arguments ; non-ASCII characters are percent-encoded
+      using passed `encoding` (UTF-8 by default)
     - normalize all spaces (in query arguments) '+' (plus symbol)
     - normalize percent encodings case (%2f -> %2F)
-    - remove query arguments with blank values (unless keep_blank_values is True)
-    - remove fragments (unless keep_fragments is True)
+    - remove query arguments with blank values (unless `keep_blank_values` is True)
+    - remove fragments (unless `keep_fragments` is True)
 
-    The url passed can be a str or unicode, while the url returned is always a
-    str.
+    The url passed can be bytes or unicode, while the url returned is
+    always a native str (bytes in Python 2, unicode in Python 3).
 
     For examples see the tests in tests/test_utils_url.py
     """
+    # If supplied `encoding` is not compatible with all characters in `url`,
+    # fallback to UTF-8 as safety net.
+    # UTF-8 can handle all Unicode characters,
+    # so we should be covered regarding URL normalization,
+    # if not for proper URL expected by remote website.
+    try:
+        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
+            parse_url(url), encoding=encoding)
+    except UnicodeError as e:
+        if encoding != 'utf8':
+            scheme, netloc, path, params, query, fragment = _safe_ParseResult(
+                parse_url(url), encoding='utf8')
+        else:
+            raise
 
-    scheme, netloc, path, params, query, fragment = parse_url(url)
-    keyvals = parse_qsl(query, keep_blank_values)
+    # 1. decode query-string as UTF-8 (or keep raw bytes),
+    #    sort values,
+    #    and percent-encode them back
+    if not six.PY2:
+        # Python3's urllib.parse.parse_qsl does not work as wanted
+        # for percent-encoded characters that do not match passed encoding,
+        # they get lost.
+        #
+        # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
+        # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
+        #      instead of \xa3 that you get with Python2's parse_qsl)
+        #
+        # what we want here is to keep raw bytes, and percent encode them
+        # so as to preserve whatever encoding what originally used.
+        #
+        # See https://tools.ietf.org/html/rfc3987#section-6.4:
+        #
+        # For example, it is possible to have a URI reference of
+        # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
+        # document name is encoded in iso-8859-1 based on server settings, but
+        # where the fragment identifier is encoded in UTF-8 according to
+        # [XPointer]. The IRI corresponding to the above URI would be (in XML
+        # notation)
+        # "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
+        # Similar considerations apply to query parts.  The functionality of
+        # IRIs (namely, to be able to include non-ASCII characters) can only be
+        # used if the query part is encoded in UTF-8.
+        keyvals = parse_qsl_to_bytes(query, keep_blank_values)
+    else:
+        keyvals = parse_qsl(query, keep_blank_values)
     keyvals.sort()
     query = urlencode(keyvals)
 
-    # XXX: copied from w3lib.url.safe_url_string to add encoding argument
-    # path = to_native_str(path, encoding)
-    # path = moves.urllib.parse.quote(path, _safe_chars, encoding='latin1') or '/'
+    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
+    #    and percent-encode path again (this normalizes to upper-case %XX)
+    uqp = _unquotepath(path)
+    path = quote(uqp, _safe_chars) or '/'
 
-    path = safe_url_string(_unquotepath(path)) or '/'
     fragment = '' if not keep_fragments else fragment
+
+    # every part should be safe already
     return urlunparse((scheme, netloc.lower(), path, params, query, fragment))
 
 
 def _unquotepath(path):
     for reserved in ('2f', '2F', '3f', '3F'):
         path = path.replace('%' + reserved, '%25' + reserved.upper())
-    return unquote(path)
+
+    if six.PY3:
+        # standard lib's unquote() does not work in Python 3
+        # for non-UTF-8 percent-escaped characters, they get lost.
+        # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
+        #
+        # unquote_to_bytes() returns raw bytes instead
+        return unquote_to_bytes(path)
+    else:
+        # in Python 2, '%a3' becomes '\xa3', which is what we want
+        return unquote(path)
 
 
 def parse_url(url, encoding=None):
@@ -81,7 +156,60 @@ def parse_url(http://github.com/url,%20encoding=None):
     """
     if isinstance(url, ParseResult):
         return url
-    return urlparse(to_native_str(url, encoding))
+    return urlparse(to_unicode(url, encoding))
+
+
+if six.PY3:
+    from urllib.parse import _coerce_args, unquote_to_bytes
+
+    def parse_qsl_to_bytes(qs, keep_blank_values=False, strict_parsing=False):
+        """Parse a query given as a string argument.
+
+        Data are returned as a list of name, value pairs as bytes.
+
+        Arguments:
+
+        qs: percent-encoded query string to be parsed
+
+        keep_blank_values: flag indicating whether blank values in
+            percent-encoded queries should be treated as blank strings.  A
+            true value indicates that blanks should be retained as blank
+            strings.  The default false value indicates that blank values
+            are to be ignored and treated as if they were  not included.
+
+        strict_parsing: flag indicating what to do with parsing errors. If
+            false (the default), errors are silently ignored. If true,
+            errors raise a ValueError exception.
+
+        """
+        # This code is the same as Python3's parse_qsl()
+        # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
+        # except for the unquote(s, encoding, errors) calls replaced
+        # with unquote_to_bytes(s)
+        qs, _coerce_result = _coerce_args(qs)
+        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
+        r = []
+        for name_value in pairs:
+            if not name_value and not strict_parsing:
+                continue
+            nv = name_value.split('=', 1)
+            if len(nv) != 2:
+                if strict_parsing:
+                    raise ValueError("bad query field: %r" % (name_value,))
+                # Handle case of a control-name with no equal sign
+                if keep_blank_values:
+                    nv.append('')
+                else:
+                    continue
+            if len(nv[1]) or keep_blank_values:
+                name = nv[0].replace('+', ' ')
+                name = unquote_to_bytes(name)
+                name = _coerce_result(name)
+                value = nv[1].replace('+', ' ')
+                value = unquote_to_bytes(value)
+                value = _coerce_result(value)
+                r.append((name, value))
+        return r
 
 
 def escape_ajax(url):
 
@@ -2,10 +2,12 @@
 import unittest
 
 import six
+from six.moves.urllib.parse import urlparse
+
 from scrapy.spiders import Spider
 from scrapy.utils.url import (url_is_from_any_domain, url_is_from_spider,
                               canonicalize_url, add_http_if_no_scheme,
-                              guess_scheme)
+                              guess_scheme, parse_url)
 
 __doctests__ = ['scrapy.utils.url']
 
@@ -123,16 +125,55 @@ def test_spaces(self):
         self.assertEqual(canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
                                           "http://www.example.com/do?a=1&q=a+space")
 
-    @unittest.skipUnless(six.PY2, "TODO")
+    def test_canonicalize_url_unicode_path(self):
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé"),
+                                          "http://www.example.com/r%C3%A9sum%C3%A9")
+
+    def test_canonicalize_url_unicode_query_string(self):
+        # default encoding for path and query is UTF-8
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé"),
+                                          "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
+
+        # passed encoding will affect query string
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé", encoding='latin1'),
+                                          "http://www.example.com/r%C3%A9sum%C3%A9?q=r%E9sum%E9")
+
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='cp1251'),
+                                          "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%EE%F1%F1%E8%FF")
+
+    def test_canonicalize_url_unicode_query_string_wrong_encoding(self):
+        # trying to encode with wrong encoding
+        # fallback to UTF-8
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?currency=€", encoding='latin1'),
+                                          "http://www.example.com/r%C3%A9sum%C3%A9?currency=%E2%82%AC")
+
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='latin1'),
+                                          "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F")
+
     def test_normalize_percent_encoding_in_paths(self):
+        self.assertEqual(canonicalize_url("http://www.example.com/r%c3%a9sum%c3%a9"),
+                                          "http://www.example.com/r%C3%A9sum%C3%A9")
+
+        # non-UTF8 encoded sequences: they should be kept untouched, only upper-cased
+        # 'latin1'-encoded sequence in path
         self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
-                                          "http://www.example.com/a%A3do"),
+                                          "http://www.example.com/a%A3do")
+
+        # 'latin1'-encoded path, UTF-8 encoded query string
+        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9"),
+                                          "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
+
+        # 'latin1'-encoded path and query string
+        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%e9sum%e9"),
+                                          "http://www.example.com/a%A3do?q=r%E9sum%E9")
 
-    @unittest.skipUnless(six.PY2, "TODO")
     def test_normalize_percent_encoding_in_query_arguments(self):
         self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
                                           "http://www.example.com/do?k=b%A3")
 
+        self.assertEqual(canonicalize_url("http://www.example.com/do?k=r%c3%a9sum%c3%a9"),
+                                          "http://www.example.com/do?k=r%C3%A9sum%C3%A9")
+
     def test_non_ascii_percent_encoding_in_paths(self):
         self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
                                           "http://www.example.com/a%20do?a=1"),
@@ -144,7 +185,7 @@ def test_non_ascii_percent_encoding_in_paths(self):
                                           "http://www.example.com/a%20do%C2%A3.html?a=1")
 
     def test_non_ascii_percent_encoding_in_query_arguments(self):
-        self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=%3Cspan%20class="pl-cce x x-first">\xa3500&a=5&z=3"),
+        self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=%3Cspan%20class="x x-first x-last">£500&a=5&z=3"),
                                           u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
         self.assertEqual(canonicalize_url(b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
                                           "http://www.example.com/do?a=5&price=%C2%A3500&z=3")
@@ -167,7 +208,6 @@ def test_dont_convert_safe_characters(self):
             "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
             "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
 
-    @unittest.skipUnless(six.PY2, "TODO")
     def test_safe_characters_unicode(self):
         # urllib.quote uses a mapping cache of encoded characters. when parsing
         # an already percent-encoded url, it will fail if that url was not
@@ -181,12 +221,37 @@ def test_domains_are_case_insensitive(self):
         self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
                                           "http://www.example.com/")
 
+    def test_canonicalize_idns(self):
+        self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'),
+                                           'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')
+        # Japanese (+ reordering query parameters)
+        self.assertEqual(canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
+                                           'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')
+
     def test_quoted_slash_and_question_sign(self):
         self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
                          "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
         self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
                          "http://foo.com/AC%2FDC/")
 
+    def test_canonicalize_urlparsed(self):
+        # canonicalize_url() can be passed an already urlparse'd URL
+        self.assertEqual(canonicalize_url(urlparse(u"http://www.example.com/résumé?q=résumé")),
+                                          "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
+        self.assertEqual(canonicalize_url(urlparse('http://www.example.com/caf%e9-con-leche.htm')),
+                                          'http://www.example.com/caf%E9-con-leche.htm')
+        self.assertEqual(canonicalize_url(urlparse("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
+                                          "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
+
+    def test_canonicalize_parse_url(self):
+        # parse_url() wraps urlparse and is used in link extractors
+        self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
+                                          "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
+        self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
+                                          'http://www.example.com/caf%E9-con-leche.htm')
+        self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
+                                          "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
+
 
 class AddHttpIfNoScheme(unittest.TestCase):