
Commit 1b9b211
Merge pull request scrapy#1415 from scrapy/nyov-py3

nyov's PY3 changes

2 parents: 57fafc7 + 93accb7
22 files changed: +96 additions, -54 deletions

requirements-py3.txt
Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+Twisted >= 15.1.0
+lxml>=3.2.4
+pyOpenSSL>=0.13.1
+cssselect>=0.9
+queuelib>=1.1.1
+w3lib>=1.8.0

scrapy/cmdline.py
Lines changed: 3 additions & 3 deletions

@@ -18,10 +18,10 @@ def _iter_command_classes(module_name):
     # TODO: add `name` attribute to commands and and merge this function with
     # scrapy.utils.spider.iter_spider_classes
     for module in walk_modules(module_name):
-        for obj in vars(module).itervalues():
+        for obj in vars(module).values():
             if inspect.isclass(obj) and \
-                issubclass(obj, ScrapyCommand) and \
-                obj.__module__ == module.__name__:
+                    issubclass(obj, ScrapyCommand) and \
+                    obj.__module__ == module.__name__:
                 yield obj


 def _get_commands_from_module(module, inproject):
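
Python 3 drops dict.iterkeys()/itervalues()/iteritems(), so `values()` is the portable spelling; on py2 it builds a list, which is fine for a small namespace like vars(module). A minimal sketch of the two idioms, using an illustrative dict:

    import six

    commands = {'crawl': object, 'shell': object}

    # portable: a list on py2, a cheap view on py3
    for obj in commands.values():
        print(obj)

    # if py2 laziness matters, six offers an explicit iterator
    for obj in six.itervalues(commands):
        print(obj)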

scrapy/core/downloader/handlers/s3.py
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-from urlparse import unquote
+from six.moves.urllib.parse import unquote

 from scrapy.exceptions import NotConfigured
 from scrapy.utils.httpobj import urlparse_cached
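
The `urlparse` module became `urllib.parse` in Python 3; `six.moves` resolves the import to whichever layout the running interpreter provides, so one line covers both:

    # py2: from urlparse import unquote
    # py3: from urllib.parse import unquote
    from six.moves.urllib.parse import unquote

    print(unquote('foo%20bar'))  # -> 'foo bar'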

scrapy/core/downloader/middleware.py
Lines changed: 4 additions & 4 deletions

@@ -3,7 +3,7 @@

 See documentation in docs/topics/downloader-middleware.rst
 """
-
+import six
 from scrapy.http import Request, Response
 from scrapy.middleware import MiddlewareManager
 from scrapy.utils.defer import mustbe_deferred

@@ -32,7 +32,7 @@ def process_request(request):
                response = method(request=request, spider=spider)
                assert response is None or isinstance(response, (Response, Request)), \
                    'Middleware %s.process_request must return None, Response or Request, got %s' % \
-                   (method.im_self.__class__.__name__, response.__class__.__name__)
+                   (six.get_method_self(method).__class__.__name__, response.__class__.__name__)
                if response:
                    return response
            return download_func(request=request, spider=spider)

@@ -46,7 +46,7 @@ def process_response(response):
                response = method(request=request, response=response, spider=spider)
                assert isinstance(response, (Response, Request)), \
                    'Middleware %s.process_response must return Response or Request, got %s' % \
-                   (method.im_self.__class__.__name__, type(response))
+                   (six.get_method_self(method).__class__.__name__, type(response))
                if isinstance(response, Request):
                    return response
            return response

@@ -57,7 +57,7 @@ def process_exception(_failure):
                response = method(request=request, exception=exception, spider=spider)
                assert response is None or isinstance(response, (Response, Request)), \
                    'Middleware %s.process_exception must return None, Response or Request, got %s' % \
-                   (method.im_self.__class__.__name__, type(response))
+                   (six.get_method_self(method).__class__.__name__, type(response))
                if response:
                    return response
            return _failure
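
Bound methods lost their `im_self`/`im_func` attributes in Python 3 (renamed `__self__`/`__func__`); `six.get_method_self()` and `six.get_method_function()` abstract over both spellings. The same idiom recurs in scrapy/core/spidermw.py below. A small sketch with a throwaway class:

    import six

    class Middleware(object):
        def process_request(self, request):
            return None

    method = Middleware().process_request
    # py2 only: method.im_self / method.im_func
    # py3 only: method.__self__ / method.__func__
    print(six.get_method_self(method).__class__.__name__)  # -> 'Middleware'
    print(six.get_method_function(method).__name__)        # -> 'process_request'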

scrapy/core/spidermw.py
Lines changed: 4 additions & 2 deletions

@@ -3,7 +3,7 @@

 See documentation in docs/topics/spider-middleware.rst
 """
-
+import six
 from twisted.python.failure import Failure
 from scrapy.middleware import MiddlewareManager
 from scrapy.utils.defer import mustbe_deferred

@@ -33,7 +33,9 @@ def _add_middleware(self, mw):
         self.methods['process_start_requests'].insert(0, mw.process_start_requests)

     def scrape_response(self, scrape_func, response, request, spider):
-        fname = lambda f:'%s.%s' % (f.im_self.__class__.__name__, f.im_func.__name__)
+        fname = lambda f:'%s.%s' % (
+            six.get_method_self(f).__class__.__name__,
+            six.get_method_function(f).__name__)

         def process_spider_input(response):
             for method in self.methods['process_spider_input']:

scrapy/linkextractors/htmlparser.py
Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 """

 import warnings
-from HTMLParser import HTMLParser
+from six.moves.html_parser import HTMLParser
 from six.moves.urllib.parse import urljoin

 from w3lib.url import safe_url_string

scrapy/pipelines/files.py
Lines changed: 13 additions & 4 deletions

@@ -26,6 +26,7 @@
 from scrapy.http import Request
 from scrapy.utils.misc import md5sum
 from scrapy.utils.log import failure_to_exc_info
+from scrapy.utils.python import to_bytes, to_native_str

 logger = logging.getLogger(__name__)

@@ -198,7 +199,7 @@ def _onsuccess(result):
             if age_days > self.EXPIRES:
                 return  # returning None force download

-            referer = request.headers.get('Referer')
+            referer = _get_referer(request)
             logger.debug(
                 'File (uptodate): Downloaded %(medianame)s from %(request)s '
                 'referred in <%(referer)s>',

@@ -224,7 +225,7 @@ def _onsuccess(result):

     def media_failed(self, failure, request, info):
         if not isinstance(failure.value, IgnoreRequest):
-            referer = request.headers.get('Referer')
+            referer = _get_referer(request)
             logger.warning(
                 'File (unknown-error): Error downloading %(medianame)s from '
                 '%(request)s referred in <%(referer)s>: %(exception)s',

@@ -236,7 +237,7 @@ def media_failed(self, failure, request, info):
             raise FileException

     def media_downloaded(self, response, request, info):
-        referer = request.headers.get('Referer')
+        referer = _get_referer(request)

         if response.status != 200:
             logger.warning(

@@ -330,11 +331,19 @@ def _warn():
             return self.file_key(url)
         ## end of deprecation warning block

-        media_guid = hashlib.sha1(url).hexdigest()  # change to request.url after deprecation
+        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
         media_ext = os.path.splitext(url)[1]  # change to request.url after deprecation
         return 'full/%s%s' % (media_guid, media_ext)

     # deprecated
     def file_key(self, url):
         return self.file_path(url)
     file_key._base = True
+
+
+def _get_referer(request):
+    """ Return Referer HTTP header suitable for logging """
+    referrer = request.headers.get('Referer')
+    if referrer is None:
+        return referrer
+    return to_native_str(referrer, errors='replace')
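
`to_bytes()` is needed here because Python 3's hashlib refuses str input (hashing a str raises TypeError), while py2 happily hashed byte strings. A sketch of the idiom, with a simplified stand-in for scrapy.utils.python.to_bytes:

    import hashlib

    def to_bytes(text, encoding='utf-8'):
        # simplified stand-in for scrapy.utils.python.to_bytes
        return text if isinstance(text, bytes) else text.encode(encoding)

    url = u'http://www.example.com/files/report.pdf'
    # hashlib.sha1(url) raises TypeError on py3; encoding first works on both
    media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
    print('full/%s.pdf' % media_guid)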

scrapy/pipelines/images.py
Lines changed: 3 additions & 2 deletions

@@ -15,6 +15,7 @@
 from PIL import Image

 from scrapy.utils.misc import md5sum
+from scrapy.utils.python import to_bytes
 from scrapy.http import Request
 from scrapy.exceptions import DropItem
 #TODO: from scrapy.pipelines.media import MediaPipeline

@@ -138,7 +139,7 @@ def _warn():
             return self.image_key(url)
         ## end of deprecation warning block

-        image_guid = hashlib.sha1(url).hexdigest()  # change to request.url after deprecation
+        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
         return 'full/%s.jpg' % (image_guid)

     def thumb_path(self, request, thumb_id, response=None, info=None):

@@ -163,7 +164,7 @@ def _warn():
             return self.thumb_key(url, thumb_id)
         ## end of deprecation warning block

-        thumb_guid = hashlib.sha1(url).hexdigest()  # change to request.url after deprecation
+        thumb_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
         return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)

     # deprecated

scrapy/utils/testproc.py
Lines changed: 2 additions & 2 deletions

@@ -35,8 +35,8 @@ class TestProcessProtocol(protocol.ProcessProtocol):

     def __init__(self):
         self.deferred = defer.Deferred()
-        self.out = ''
-        self.err = ''
+        self.out = b''
+        self.err = b''
         self.exitcode = None

     def outReceived(self, data):
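
Twisted delivers child-process stdout/stderr chunks to outReceived()/errReceived() as bytes, so on Python 3 the accumulators must start as b'' or the concatenation fails. Roughly:

    out = b''                   # '' would break on py3
    chunk = b'spider output\n'  # as delivered by Twisted
    out += chunk                # str += bytes raises TypeError on py3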

tests/py3-ignores.txt
Lines changed: 0 additions & 3 deletions

@@ -1,12 +1,9 @@
 tests/test_closespider.py
-tests/test_cmdline/__init__.py
 tests/test_command_fetch.py
 tests/test_command_shell.py
 tests/test_commands.py
-tests/test_command_version.py
 tests/test_exporters.py
 tests/test_linkextractors.py
-tests/test_loader.py
 tests/test_crawl.py
 tests/test_crawler.py
 tests/test_downloader_handlers.py

tests/requirements-py3.txt
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+pytest>=2.6.0
+pytest-twisted
+testfixtures
+jmespath

tests/test_cmdline/__init__.py
Lines changed: 27 additions & 2 deletions

@@ -1,20 +1,30 @@
+import os
 import sys
+import shutil
+import pstats
+import tempfile
 from subprocess import Popen, PIPE
 import unittest
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from io import StringIO

 from scrapy.utils.test import get_testenv

+
 class CmdlineTest(unittest.TestCase):

     def setUp(self):
         self.env = get_testenv()
         self.env['SCRAPY_SETTINGS_MODULE'] = 'tests.test_cmdline.settings'

     def _execute(self, *new_args, **kwargs):
+        encoding = getattr(sys.stdout, 'encoding') or 'utf-8'
         args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
         proc = Popen(args, stdout=PIPE, stderr=PIPE, env=self.env, **kwargs)
-        comm = proc.communicate()
-        return comm[0].strip()
+        comm = proc.communicate()[0].strip()
+        return comm.decode(encoding)

     def test_default_settings(self):
         self.assertEqual(self._execute('settings', '--get', 'TEST1'), \

@@ -29,3 +39,18 @@ def test_override_settings_using_envvar(self):
         self.assertEqual(self._execute('settings', '--get', 'TEST1'), \
             'override')

+    def test_profiling(self):
+        path = tempfile.mkdtemp()
+        filename = os.path.join(path, 'res.prof')
+        try:
+            self._execute('version', '--profile', filename)
+            self.assertTrue(os.path.exists(filename))
+            out = StringIO()
+            stats = pstats.Stats(filename, stream=out)
+            stats.print_stats()
+            out.seek(0)
+            stats = out.read()
+            self.assertIn('scrapy/commands/version.py', stats)
+            self.assertIn('tottime', stats)
+        finally:
+            shutil.rmtree(path)
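
Two portability idioms appear in this file: `cStringIO` no longer exists on Python 3, hence the try/except ImportError fallback to io.StringIO; and Popen pipes yield bytes, which must be decoded before comparing against str (sys.stdout.encoding can be None when stdout is a pipe, so a utf-8 fallback is used). A self-contained sketch:

    import sys
    from subprocess import Popen, PIPE

    try:
        from cStringIO import StringIO  # py2, C-accelerated
    except ImportError:
        from io import StringIO         # py3

    def run(*args):
        encoding = getattr(sys.stdout, 'encoding') or 'utf-8'
        proc = Popen((sys.executable,) + args, stdout=PIPE, stderr=PIPE)
        out = proc.communicate()[0].strip()
        return out.decode(encoding)

    print(run('-c', 'print("hello")'))  # -> hello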

tests/test_command_version.py
Lines changed: 3 additions & 1 deletion

@@ -1,3 +1,4 @@
+import sys
 from twisted.trial import unittest
 from twisted.internet import defer

@@ -11,5 +12,6 @@ class VersionTest(ProcessTest, unittest.TestCase):

     @defer.inlineCallbacks
     def test_output(self):
+        encoding = getattr(sys.stdout, 'encoding') or 'utf-8'
         _, out, _ = yield self.execute([])
-        self.assertEqual(out.strip(), "Scrapy %s" % scrapy.__version__)
+        self.assertEqual(out.strip().decode(encoding), "Scrapy %s" % scrapy.__version__)

tests/test_crawl.py
Lines changed: 1 addition & 1 deletion

@@ -141,7 +141,7 @@ def test_start_requests_dupes(self):
     def test_unbounded_response(self):
         # Completeness of responses without Content-Length or Transfer-Encoding
         # can not be determined, we treat them as valid but flagged as "partial"
-        from urllib import urlencode
+        from six.moves.urllib.parse import urlencode
         query = urlencode({'raw': '''\
 HTTP/1.1 200 OK
 Server: Apache-Coyote/1.1

tests/test_loader.py
Lines changed: 7 additions & 6 deletions

@@ -1,4 +1,5 @@
 import unittest
+import six
 from functools import partial

 from scrapy.loader import ItemLoader

@@ -141,7 +142,7 @@ def test_replace_value(self):

     def test_get_value(self):
         il = NameItemLoader()
-        self.assertEqual(u'FOO', il.get_value([u'foo', u'bar'], TakeFirst(), unicode.upper))
+        self.assertEqual(u'FOO', il.get_value([u'foo', u'bar'], TakeFirst(), six.text_type.upper))
         self.assertEqual([u'foo', u'bar'], il.get_value([u'name:foo', u'name:bar'], re=u'name:(.*)$'))
         self.assertEqual(u'foo', il.get_value([u'name:foo', u'name:bar'], TakeFirst(), re=u'name:(.*)$'))

@@ -242,15 +243,15 @@ class IdentityDefaultedItemLoader(DefaultedItemLoader):

     def test_extend_custom_input_processors(self):
         class ChildItemLoader(TestItemLoader):
-            name_in = MapCompose(TestItemLoader.name_in, unicode.swapcase)
+            name_in = MapCompose(TestItemLoader.name_in, six.text_type.swapcase)

         il = ChildItemLoader()
         il.add_value('name', u'marta')
         self.assertEqual(il.get_output_value('name'), [u'mARTA'])

     def test_extend_default_input_processors(self):
         class ChildDefaultedItemLoader(DefaultedItemLoader):
-            name_in = MapCompose(DefaultedItemLoader.default_input_processor, unicode.swapcase)
+            name_in = MapCompose(DefaultedItemLoader.default_input_processor, six.text_type.swapcase)

         il = ChildDefaultedItemLoader()
         il.add_value('name', u'marta')

@@ -423,7 +424,7 @@ def test_join(self):
         self.assertRaises(TypeError, proc, [None, '', 'hello', 'world'])
         self.assertEqual(proc(['', 'hello', 'world']), u' hello world')
         self.assertEqual(proc(['hello', 'world']), u'hello world')
-        self.assert_(isinstance(proc(['hello', 'world']), unicode))
+        self.assert_(isinstance(proc(['hello', 'world']), six.text_type))

     def test_compose(self):
         proc = Compose(lambda v: v[0], str.upper)

@@ -435,13 +436,13 @@ def test_compose(self):

     def test_mapcompose(self):
         filter_world = lambda x: None if x == 'world' else x
-        proc = MapCompose(filter_world, unicode.upper)
+        proc = MapCompose(filter_world, six.text_type.upper)
         self.assertEqual(proc([u'hello', u'world', u'this', u'is', u'scrapy']),
                          [u'HELLO', u'THIS', u'IS', u'SCRAPY'])


 class SelectortemLoaderTest(unittest.TestCase):
-    response = HtmlResponse(url="", body="""
+    response = HtmlResponse(url="", encoding='utf-8', body=b"""
     <html>
     <body>
     <div id="id">marta</div>
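
The `unicode` builtin does not exist on Python 3; `six.text_type` aliases `unicode` on py2 and `str` on py3, so unbound-method references such as `unicode.upper` become `six.text_type.upper`. For instance:

    import six

    print(six.text_type.upper(u'marta'))        # -> MARTA
    print(isinstance(u'hello', six.text_type))  # -> True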

tests/test_pipeline_files.py
Lines changed: 2 additions & 1 deletion

@@ -12,6 +12,7 @@
 from scrapy.item import Item, Field
 from scrapy.http import Request, Response
 from scrapy.settings import Settings
+from scrapy.utils.python import to_bytes

 from tests import mock

@@ -103,7 +104,7 @@ def test_file_expired(self):

 class DeprecatedFilesPipeline(FilesPipeline):
     def file_key(self, url):
-        media_guid = hashlib.sha1(url).hexdigest()
+        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
         media_ext = os.path.splitext(url)[1]
         return 'empty/%s%s' % (media_guid, media_ext)

tests/test_pipeline_images.py
Lines changed: 3 additions & 2 deletions

@@ -10,6 +10,7 @@
 from scrapy.http import Request, Response
 from scrapy.settings import Settings
 from scrapy.pipelines.images import ImagesPipeline
+from scrapy.utils.python import to_bytes

 skip = False
 try:

@@ -100,11 +101,11 @@ def file_key(self, url):
         return self.image_key(url)

     def image_key(self, url):
-        image_guid = hashlib.sha1(url).hexdigest()
+        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()
         return 'empty/%s.jpg' % (image_guid)

     def thumb_key(self, url, thumb_id):
-        thumb_guid = hashlib.sha1(url).hexdigest()
+        thumb_guid = hashlib.sha1(to_bytes(url)).hexdigest()
         return 'thumbsup/%s/%s.jpg' % (thumb_id, thumb_guid)

tests/test_selector_csstranslator.py
Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@
 from cssselect.xpath import ExpressionError


-HTMLBODY = '''
+HTMLBODY = b'''
 <html>
 <body>
 <div>
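
The `b` prefix matters because this markup is fed to Scrapy's response/selector machinery, which works on raw bytes; on Python 3 a bare literal would be str and need encoding first. A sketch, assuming a Scrapy install (the URL is illustrative):

    from scrapy.http import HtmlResponse

    body = b'<html><body><div>marta</div></body></html>'
    # pass encoding explicitly so the bytes body is decoded consistently
    response = HtmlResponse(url='http://example.com', body=body,
                            encoding='utf-8')
    print(response.css('div::text').extract())  # -> [u'marta']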
