Skip to content

Commit fa123b3

Browse files
committed
Merge pull request scrapy#1421 from dangra/nyov/lazyload-downloadhandlers
lazy-load downloadhandlers (continuation of scrapy#1357)
2 parents fa3d84b + 7717501 commit fa123b3

File tree

2 files changed: 53 additions and 17 deletions.

2 files changed: 53 additions and 17 deletions.

scrapy/core/downloader/handlers/__init__.py

Lines changed: 43 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Download handlers for different schemes"""
22

3+
import logging
34
from twisted.internet import defer
45
import six
56
from scrapy.exceptions import NotSupported, NotConfigured
@@ -8,37 +9,63 @@
89
from scrapy import signals
910

1011

12+
logger = logging.getLogger(__name__)
13+
14+
1115
class DownloadHandlers(object):
1216

1317
def __init__(self, crawler):
14-
self._handlers = {}
15-
self._notconfigured = {}
18+
self._crawler = crawler
19+
self._schemes = {} # stores acceptable schemes on instancing
20+
self._handlers = {} # stores instanced handlers for schemes
21+
self._notconfigured = {} # remembers failed handlers
1622
handlers = crawler.settings.get('DOWNLOAD_HANDLERS_BASE')
1723
handlers.update(crawler.settings.get('DOWNLOAD_HANDLERS', {}))
1824
for scheme, clspath in six.iteritems(handlers):
1925
# Allow to disable a handler just like any other
2026
# component (extension, middleware, etc).
2127
if clspath is None:
2228
continue
23-
cls = load_object(clspath)
24-
try:
25-
dh = cls(crawler.settings)
26-
except NotConfigured as ex:
27-
self._notconfigured[scheme] = str(ex)
28-
else:
29-
self._handlers[scheme] = dh
29+
self._schemes[scheme] = clspath
3030

3131
crawler.signals.connect(self._close, signals.engine_stopped)
3232

33+
def _get_handler(self, scheme):
34+
"""Lazy-load the downloadhandler for a scheme
35+
only on the first request for that scheme.
36+
"""
37+
if scheme in self._handlers:
38+
return self._handlers[scheme]
39+
if scheme in self._notconfigured:
40+
return None
41+
if scheme not in self._schemes:
42+
self._notconfigured[scheme] = 'no handler available for that scheme'
43+
return None
44+
45+
path = self._schemes[scheme]
46+
try:
47+
dhcls = load_object(path)
48+
dh = dhcls(self._crawler.settings)
49+
except NotConfigured as ex:
50+
self._notconfigured[scheme] = str(ex)
51+
return None
52+
except Exception as ex:
53+
logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
54+
{"clspath": path, "scheme": scheme},
55+
exc_info=True, extra={'crawler': self._crawler})
56+
self._notconfigured[scheme] = str(ex)
57+
return None
58+
else:
59+
self._handlers[scheme] = dh
60+
return self._handlers[scheme]
61+
3362
def download_request(self, request, spider):
3463
scheme = urlparse_cached(request).scheme
35-
try:
36-
handler = self._handlers[scheme].download_request
37-
except KeyError:
38-
msg = self._notconfigured.get(scheme, \
39-
'no handler available for that scheme')
40-
raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, msg))
41-
return handler(request, spider)
64+
handler = self._get_handler(scheme)
65+
if not handler:
66+
raise NotSupported("Unsupported URL scheme '%s': %s" %
67+
(scheme, self._notconfigured[scheme]))
68+
return handler.download_request(request, spider)
4269

4370
@defer.inlineCallbacks
4471
def _close(self, *_a, **_kw):

tests/test_downloader_handlers.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,22 +52,31 @@ def test_enabled_handler(self):
5252
handlers = {'scheme': 'tests.test_downloader_handlers.DummyDH'}
5353
crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
5454
dh = DownloadHandlers(crawler)
55+
self.assertIn('scheme', dh._schemes)
56+
for scheme in handlers: # force load handlers
57+
dh._get_handler(scheme)
5558
self.assertIn('scheme', dh._handlers)
5659
self.assertNotIn('scheme', dh._notconfigured)
5760

5861
def test_not_configured_handler(self):
5962
handlers = {'scheme': 'tests.test_downloader_handlers.OffDH'}
6063
crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
6164
dh = DownloadHandlers(crawler)
65+
self.assertIn('scheme', dh._schemes)
66+
for scheme in handlers: # force load handlers
67+
dh._get_handler(scheme)
6268
self.assertNotIn('scheme', dh._handlers)
6369
self.assertIn('scheme', dh._notconfigured)
6470

6571
def test_disabled_handler(self):
6672
handlers = {'scheme': None}
6773
crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
6874
dh = DownloadHandlers(crawler)
75+
self.assertNotIn('scheme', dh._schemes)
76+
for scheme in handlers: # force load handlers
77+
dh._get_handler(scheme)
6978
self.assertNotIn('scheme', dh._handlers)
70-
self.assertNotIn('scheme', dh._notconfigured)
79+
self.assertIn('scheme', dh._notconfigured)
7180

7281

7382
class FileTestCase(unittest.TestCase):

0 commit comments

Comments
 (0)