
Commit 89faf52

Fix start_requests laziness and early hangs
- Remove the public methods added by scrapy#330 to Crawler and CrawlerProcess
- Add a test for lazy evaluation of start_requests (see the sketch below)
- Fix, and add a test for, hangs when start_requests errored before returning the generator
- Add a test for start_requests failing while generating requests
- Simplify the Crawler and CrawlerProcess implementation, taking into account that only one spider can be attached per Crawler, as required by SEP-019
- The "scrapy settings" command no longer requires starting a Crawler
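A minimal, framework-free sketch of the lazy pattern described above. The class and method names below are illustrative stand-ins, not Scrapy API; the real logic is in the scrapy/crawler.py hunks further down.

class LazyCrawler(object):
    # Illustrative stand-in for scrapy.crawler.Crawler (simplified).

    def __init__(self):
        self._start_requests = lambda: ()   # default: nothing to crawl
        self._spider = None

    def crawl(self, spider, requests=None):
        assert self._spider is None, 'Spider already attached'
        self._spider = spider
        if requests is None:
            # Store the bound method; the generator is NOT created or consumed here.
            self._start_requests = spider.start_requests
        else:
            self._start_requests = lambda: requests

    def start(self):
        if self._spider:
            # The iterable is produced only now, when the "engine" opens the
            # spider, so requests are pulled on demand and a failure inside
            # start_requests surfaces as a crawl error instead of an early hang.
            for request in self._start_requests():
                print 'downloading', request


class OneSeedSpider(object):
    # Illustrative stand-in for a spider with a generator-based start_requests.
    def start_requests(self):
        yield 'http://localhost:8998/follow?seed=0'


crawler = LazyCrawler()
crawler.crawl(OneSeedSpider())   # start_requests is not touched yet
crawler.start()                  # generator created and drained only here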
1 parent 614627b commit 89faf52

5 files changed: +114 −81 lines

scrapy/commands/settings.py

Lines changed: 1 addition & 3 deletions
@@ -25,9 +25,7 @@ def add_options(self, parser):
             help="print setting value, intepreted as an float")
 
     def run(self, args, opts):
-        crawler = self.crawler_process.create_crawler()
-        crawler.configure()
-        settings = crawler.settings
+        settings = self.crawler_process.settings
         if opts.get:
             print settings.get(opts.get)
         elif opts.getbool:
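For context, this is roughly what run() looks like after the change; only the --get branch appears in the hunk above, and the remaining elif branch is an assumption about the unchanged rest of the file:

    def run(self, args, opts):
        # Settings come straight from the process; no Crawler is created or configured.
        settings = self.crawler_process.settings
        if opts.get:
            print settings.get(opts.get)
        elif opts.getbool:
            # Assumed continuation: the command exposes the typed getters of the
            # Settings object (getbool, getint, getfloat) in the same way.
            print settings.getbool(opts.getbool)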

scrapy/crawler.py

Lines changed: 56 additions & 73 deletions
@@ -19,17 +19,20 @@ def __init__(self, settings):
         self.settings = settings
         self.signals = SignalManager(self)
         self.stats = load_object(settings['STATS_CLASS'])(self)
-
+        self._start_requests = lambda: ()
+        self._spider = None
+        # TODO: move SpiderManager to CrawlerProcess
         spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
         self.spiders = spman_cls.from_crawler(self)
-        self._scheduled = {}
 
     def install(self):
+        # TODO: remove together with scrapy.project.crawler usage
         import scrapy.project
         assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
         scrapy.project.crawler = self
 
     def uninstall(self):
+        # TODO: remove together with scrapy.project.crawler usage
         import scrapy.project
         assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
         del scrapy.project.crawler
@@ -45,19 +48,13 @@ def configure(self):
         self.engine = ExecutionEngine(self, self._spider_closed)
 
     def crawl(self, spider, requests=None):
+        assert self._spider is None, 'Spider already attached'
+        self._spider = spider
         spider.set_crawler(self)
-        if self.configured and self.engine.running:
-            assert not self._scheduled
-            return self._schedule(spider, requests)
-        elif requests is None:
-            self._scheduled[spider] = None
+        if requests is None:
+            self._start_requests = spider.start_requests
         else:
-            self._scheduled.setdefault(spider, []).append(requests)
-
-    def _schedule(self, spider, batches=()):
-        requests = chain.from_iterable(batches) \
-            if batches else spider.start_requests()
-        return self.engine.open_spider(spider, requests)
+            self._start_requests = lambda: requests
 
     def _spider_closed(self, spider=None):
         if not self.engine.open_spiders:
@@ -66,47 +63,40 @@ def _spider_closed(self, spider=None):
     @defer.inlineCallbacks
     def start(self):
         yield defer.maybeDeferred(self.configure)
-
-        for spider, batches in self._scheduled.iteritems():
-            yield self._schedule(spider, batches)
-
+        if self._spider:
+            yield self.engine.open_spider(self._spider, self._start_requests())
         yield defer.maybeDeferred(self.engine.start)
 
     @defer.inlineCallbacks
     def stop(self):
-        if self.engine.running:
+        if self.configured and self.engine.running:
             yield defer.maybeDeferred(self.engine.stop)
 
 
-class ProcessMixin(object):
-    """ Mixin which provides automatic control of the Twisted reactor and
-    installs some convenient signals for shutting it down
-    """
+class CrawlerProcess(object):
+    """ A class to run multiple scrapy crawlers in a process sequentially"""
 
-    def __init__(self, *a, **kw):
+    def __init__(self, settings):
         install_shutdown_handlers(self._signal_shutdown)
+        self.settings = settings
+        self.crawlers = {}
+        self.stopping = False
+
+    def create_crawler(self, name=None):
+        if name not in self.crawlers:
+            self.crawlers[name] = Crawler(self.settings)
+
+        return self.crawlers[name]
 
     def start(self):
         if self.start_crawling():
             self.start_reactor()
 
-    def start_reactor(self):
-        if self.settings.getbool('DNSCACHE_ENABLED'):
-            reactor.installResolver(CachingThreadedResolver(reactor))
-        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
-        reactor.run(installSignalHandlers=False) # blocking call
-
-    def start_crawling(self):
-        raise NotImplementedError
-
+    @defer.inlineCallbacks
     def stop(self):
-        raise NotImplementedError
-
-    def stop_reactor(self, _=None):
-        try:
-            reactor.stop()
-        except RuntimeError: # raised if already stopped or in shutdown stage
-            pass
+        self.stopping = True
+        for crawler in self.crawlers.itervalues():
+            yield crawler.stop()
 
     def _signal_shutdown(self, signum, _):
         install_shutdown_handlers(self._signal_kill)
@@ -120,27 +110,26 @@ def _signal_kill(self, signum, _):
         signame = signal_names[signum]
         log.msg(format='Received %(signame)s twice, forcing unclean shutdown',
                 level=log.INFO, signame=signame)
-        reactor.callFromThread(self.stop_reactor)
-
-
-class CrawlerProcess(ProcessMixin):
-    """ A class to run multiple scrapy crawlers in a process sequentially
-    """
-
-    def __init__(self, settings):
-        super(CrawlerProcess, self).__init__(settings)
-
-        self.settings = settings
-        self.crawlers = {}
-        self.stopping = False
-
-    def create_crawler(self, name=None):
-        if name not in self.crawlers:
-            self.crawlers[name] = Crawler(self.settings)
+        reactor.callFromThread(self._stop_reactor)
+
+    # ------------------------------------------------------------------------#
+    # The following public methods can't be considered stable and may change at
+    # any moment.
+    #
+    # start_crawling and start_reactor are called from scrapy.commands.shell
+    # They are splitted because reactor is started on a different thread than IPython shell.
+    #
+    def start_crawling(self):
+        log.scrapy_info(self.settings)
+        return self._start_crawler() is not None
 
-        return self.crawlers[name]
+    def start_reactor(self):
+        if self.settings.getbool('DNSCACHE_ENABLED'):
+            reactor.installResolver(CachingThreadedResolver(reactor))
+        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
+        reactor.run(installSignalHandlers=False) # blocking call
 
-    def start_crawler(self):
+    def _start_crawler(self):
         if self.crawlers and not self.stopping:
             name, crawler = self.crawlers.popitem()
 
@@ -151,23 +140,17 @@ def start_crawler(self):
             if sflo:
                 crawler.signals.connect(sflo.stop, signals.engine_stopped)
 
-            crawler.signals.connect(self.check_done, signals.engine_stopped)
+            crawler.signals.connect(self._check_done, signals.engine_stopped)
             crawler.start()
 
             return name, crawler
 
-    def check_done(self, **kwargs):
-        if not self.start_crawler():
-            self.stop_reactor()
-
-    def start_crawling(self):
-        log.scrapy_info(self.settings)
-        return self.start_crawler() is not None
-
-    @defer.inlineCallbacks
-    def stop(self):
-        self.stopping = True
+    def _check_done(self, **kwargs):
+        if not self._start_crawler():
+            self._stop_reactor()
 
-        for crawler in self.crawlers.itervalues():
-            if crawler.configured:
-                yield crawler.stop()
+    def _stop_reactor(self, _=None):
+        try:
+            reactor.stop()
+        except RuntimeError: # raised if already stopped or in shutdown stage
+            pass
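Taken together, the new surface is small: a CrawlerProcess owns the settings and hands out Crawler objects, and each Crawler accepts exactly one spider. A hedged usage sketch; the settings object and spider class are placeholders, and the tests' docrawl() helper below follows the same crawl()/start() sequence on a single Crawler:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings)   # `settings` is a placeholder Settings object
crawler = process.create_crawler()   # unnamed Crawler, created on first call
crawler.crawl(MySpider())            # `MySpider` is a placeholder; only one spider
                                     # may be attached per Crawler (SEP-019)
process.start()                      # start_crawling() + start_reactor(): configures
                                     # the crawler, opens the spider lazily and runs
                                     # the Twisted reactor until the crawl finishes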

scrapy/tests/spiders.py

Lines changed: 29 additions & 0 deletions
@@ -103,3 +103,32 @@ def parse(self, response):
         for request in super(ErrorSpider, self).parse(response):
             yield request
         self.raise_exception()
+
+
+class BrokenStartRequestsSpider(FollowAllSpider):
+
+    fail_before_yield = False
+    fail_yielding = False
+
+    def __init__(self, *a, **kw):
+        super(BrokenStartRequestsSpider, self).__init__(*a, **kw)
+        self.seedsseen = []
+
+    def start_requests(self):
+        if self.fail_before_yield:
+            1 / 0
+
+        for s in xrange(100):
+            qargs = {'total': 10, 'seed': s}
+            url = "http://localhost:8998/follow?%s" % urlencode(qargs, doseq=1)
+            yield Request(url, meta={'seed': s})
+            if self.fail_yielding:
+                2 / 0
+
+        assert self.seedsseen, \
+                'All start requests consumed before any download happened'
+
+    def parse(self, response):
+        self.seedsseen.append(response.meta.get('seed'))
+        for req in super(BrokenStartRequestsSpider, self).parse(response):
+            yield req

scrapy/tests/test_cmdline/__init__.py

Lines changed: 3 additions & 3 deletions
@@ -18,14 +18,14 @@ def _execute(self, *new_args, **kwargs):
 
     def test_default_settings(self):
         self.assertEqual(self._execute('settings', '--get', 'TEST1'), \
-                         'default + started')
+                         'default')
 
     def test_override_settings_using_set_arg(self):
         self.assertEqual(self._execute('settings', '--get', 'TEST1', '-s', 'TEST1=override'), \
-                         'override + started')
+                         'override')
 
     def test_override_settings_using_envvar(self):
        self.env['SCRAPY_TEST1'] = 'override'
        self.assertEqual(self._execute('settings', '--get', 'TEST1'), \
-                         'override + started')
+                         'override')

scrapy/tests/test_crawl.py

Lines changed: 25 additions & 2 deletions
@@ -1,13 +1,13 @@
 from twisted.internet import defer
 from twisted.trial.unittest import TestCase
 from scrapy.utils.test import get_crawler, get_testlog
-from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider
+from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
+        BrokenStartRequestsSpider
 from scrapy.tests.mockserver import MockServer
 
 
 def docrawl(spider, settings=None):
     crawler = get_crawler(settings)
-    crawler.configure()
     crawler.crawl(spider)
     return crawler.start()
 
@@ -90,6 +90,29 @@ def test_retry_dns_error(self):
         yield docrawl(spider)
         self._assert_retried()
 
+    @defer.inlineCallbacks
+    def test_start_requests_bug_before_yield(self):
+        spider = BrokenStartRequestsSpider(fail_before_yield=1)
+        yield docrawl(spider)
+        errors = self.flushLoggedErrors(ZeroDivisionError)
+        self.assertEqual(len(errors), 1)
+
+    @defer.inlineCallbacks
+    def test_start_requests_bug_yielding(self):
+        spider = BrokenStartRequestsSpider(fail_yielding=1)
+        yield docrawl(spider)
+        errors = self.flushLoggedErrors(ZeroDivisionError)
+        self.assertEqual(len(errors), 1)
+
+    @defer.inlineCallbacks
+    def test_start_requests_lazyness(self):
+        settings = {"CONCURRENT_REQUESTS": 1}
+        spider = BrokenStartRequestsSpider()
+        yield docrawl(spider, settings)
+        #self.assertTrue(False, spider.seedsseen)
+        #self.assertTrue(spider.seedsseen.index(None) < spider.seedsseen.index(99),
+        #                spider.seedsseen)
+
     @defer.inlineCallbacks
     def test_unbounded_response(self):
         # Completeness of responses without Content-Length or Transfer-Encoding
