|
1 | 1 | """Download handlers for different schemes"""
|
2 | 2 |
|
| 3 | +import logging |
3 | 4 | from twisted.internet import defer
|
4 | 5 | import six
|
5 | 6 | from scrapy.exceptions import NotSupported, NotConfigured
|
|
8 | 9 | from scrapy import signals
|
9 | 10 |
|
10 | 11 |
|
| 12 | +logger = logging.getLogger(__name__) |
| 13 | + |
| 14 | + |
11 | 15 | class DownloadHandlers(object):
|
12 | 16 |
|
def __init__(self, crawler):
    """Record which handler class path serves each URL scheme.

    No handler is instantiated here: only the scheme -> class path
    mapping is stored, and ``_get_handler`` builds each handler the
    first time its scheme is requested.

    :param crawler: object exposing ``settings`` (read for
        ``DOWNLOAD_HANDLERS_BASE`` and ``DOWNLOAD_HANDLERS``) and
        ``signals`` (used to register ``_close`` on ``engine_stopped``).
    """
    self._crawler = crawler
    self._schemes = {}  # scheme -> handler class path, as configured
    self._handlers = {}  # scheme -> instantiated handler
    self._notconfigured = {}  # scheme -> reason the handler is unavailable

    # Work on a copy: calling update() on the object returned by the
    # settings would write the user overrides into the stored base dict.
    handlers = dict(crawler.settings.get('DOWNLOAD_HANDLERS_BASE') or {})
    handlers.update(crawler.settings.get('DOWNLOAD_HANDLERS', {}))
    for scheme, clspath in six.iteritems(handlers):
        # Allow to disable a handler just like any other
        # component (extension, middleware, etc).
        if clspath is None:
            continue
        self._schemes[scheme] = clspath

    crawler.signals.connect(self._close, signals.engine_stopped)
|
32 | 32 |
|
def _get_handler(self, scheme):
    """Return the download handler for ``scheme``, building it on first use.

    The outcome of the first attempt is remembered: a successfully built
    handler is cached in ``self._handlers``, and a failure reason is
    cached in ``self._notconfigured`` so the load is not retried.

    :param scheme: URL scheme used as the lookup key.
    :returns: the handler instance, or ``None`` when no handler is
        configured for the scheme or it could not be loaded.
    """
    if scheme in self._handlers:
        return self._handlers[scheme]
    if scheme in self._notconfigured:
        # An earlier attempt already failed; the reason is on record.
        return None

    try:
        clspath = self._schemes[scheme]
    except KeyError:
        self._notconfigured[scheme] = 'no handler available for that scheme'
        return None

    try:
        handler_cls = load_object(clspath)
        handler = handler_cls(self._crawler.settings)
    except NotConfigured as exc:
        # The handler declined to run with these settings; keep its message.
        self._notconfigured[scheme] = str(exc)
        return None
    except Exception as exc:
        # Any other failure is logged with its traceback and then handled
        # the same way as an unconfigured handler.
        logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
                     {"clspath": clspath, "scheme": scheme},
                     exc_info=True, extra={'crawler': self._crawler})
        self._notconfigured[scheme] = str(exc)
        return None

    self._handlers[scheme] = handler
    return handler
| 61 | + |
def download_request(self, request, spider):
    """Download ``request`` with the handler registered for its URL scheme.

    :param request: request whose URL scheme selects the handler.
    :param spider: passed through to the handler unchanged.
    :returns: whatever the handler's ``download_request`` returns.
    :raises NotSupported: when no handler is available for the scheme;
        the message includes the reason stored in ``self._notconfigured``.
    """
    scheme = urlparse_cached(request).scheme
    handler = self._get_handler(scheme)
    # _get_handler signals failure with None specifically. Testing
    # truthiness instead would reject a handler object that happens to be
    # falsy and then fail on the _notconfigured lookup below.
    if handler is None:
        raise NotSupported("Unsupported URL scheme '%s': %s" %
                           (scheme, self._notconfigured[scheme]))
    return handler.download_request(request, spider)
42 | 69 |
|
43 | 70 | @defer.inlineCallbacks
|
44 | 71 | def _close(self, *_a, **_kw):
|
|
0 commit comments