Skip to content

Commit e04b0af

Browse files
committed
An attempt to resolve scrapy#977: add a signal to be sent when a request is dropped by the scheduler
1 parent c31fb87 commit e04b0af

File tree

5 files changed

+33
-2
lines changed

5 files changed

+33
-2
lines changed

docs/topics/signals.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,23 @@ request_scheduled
200200
:param spider: the spider that yielded the request
201201
:type spider: :class:`~scrapy.spider.Spider` object
202202

203+
request_dropped
204+
---------------
205+
206+
.. signal:: request_dropped
207+
.. function:: request_dropped(request, spider)
208+
209+
Sent when a :class:`~scrapy.http.Request`, scheduled by the engine to be
210+
downloaded later, is rejected by the scheduler.
211+
212+
The signal does not support returning deferreds from its handlers.
213+
214+
:param request: the request that reached the scheduler
215+
:type request: :class:`~scrapy.http.Request` object
216+
217+
:param spider: the spider that yielded the request
218+
:type spider: :class:`~scrapy.spider.Spider` object
219+
203220
response_received
204221
-----------------
205222

scrapy/core/engine.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,9 @@ def crawl(self, request, spider):
173173
def schedule(self, request, spider):
    """Hand *request* to the scheduler, emitting the relevant signals.

    Fires ``request_scheduled`` before enqueueing, and ``request_dropped``
    when the scheduler refuses the request (``enqueue_request`` returned a
    falsy value, e.g. a dupefilter hit).

    :param request: the request to enqueue
    :param spider: the spider that yielded the request
    :return: the scheduler's verdict -- truthy if the request was enqueued,
        falsy if it was dropped.  Returning the result preserves the
        previous ``schedule`` contract, which exposed the enqueue outcome
        to callers; the signal-only version silently returned ``None``.
    """
    self.signals.send_catch_log(signal=signals.request_scheduled,
                                request=request, spider=spider)
    enqueued = self.slot.scheduler.enqueue_request(request)
    if not enqueued:
        # The request will never be downloaded; let listeners know.
        self.signals.send_catch_log(signal=signals.request_dropped,
                                    request=request, spider=spider)
    return enqueued
177179

178180
def download(self, request, spider):
179181
slot = self.slot

scrapy/core/scheduler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,15 @@ def close(self, reason):
4747
def enqueue_request(self, request):
    """Add *request* to the scheduler queues.

    Requests already seen by the dupefilter are discarded unless the
    request sets ``dont_filter``.

    :return: True when the request was queued (disk or memory),
        False when it was dropped by the dupefilter.
    """
    # Check dont_filter first: request_seen() also records the request,
    # so it must not run for requests that opt out of filtering.
    if not request.dont_filter and self.df.request_seen(request):
        self.df.log(request, self.spider)
        return False
    if self._dqpush(request):
        # Persisted to the disk queue.
        self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
    else:
        # No disk queue available (or push failed): keep it in memory.
        self._mqpush(request)
        self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
    self.stats.inc_value('scheduler/enqueued', spider=self.spider)
    return True
5859

5960
def next_request(self):
6061
request = self.mqs.pop()

scrapy/signals.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# Signal sentinels: dispatch compares these by identity, so a bare,
# unique ``object()`` per signal is all that is needed.
spider_closed = object()
spider_error = object()
request_scheduled = object()
# Sent when the scheduler refuses to enqueue a request.
request_dropped = object()
response_received = object()
response_downloaded = object()
item_scraped = object()

tests/test_engine.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def __init__(self):
7979
self.spider = None
8080
self.respplug = []
8181
self.reqplug = []
82+
self.reqdropped = []
8283
self.itemresp = []
8384
self.signals_catched = {}
8485

@@ -95,6 +96,7 @@ def run(self):
9596
self.crawler = get_crawler(TestSpider)
9697
self.crawler.signals.connect(self.item_scraped, signals.item_scraped)
9798
self.crawler.signals.connect(self.request_scheduled, signals.request_scheduled)
99+
self.crawler.signals.connect(self.request_dropped, signals.request_dropped)
98100
self.crawler.signals.connect(self.response_downloaded, signals.response_downloaded)
99101
self.crawler.crawl(start_urls=start_urls)
100102
self.spider = self.crawler.spider
@@ -123,6 +125,9 @@ def item_scraped(self, item, spider, response):
123125
def request_scheduled(self, request, spider):
    """Signal handler: record every (request, spider) pair the engine schedules."""
    entry = (request, spider)
    self.reqplug.append(entry)
125127

128+
def request_dropped(self, request, spider):
    """Signal handler: record every (request, spider) pair the scheduler rejected."""
    entry = (request, spider)
    self.reqdropped.append(entry)
130+
126131
def response_downloaded(self, response, spider):
127132
self.respplug.append((response, spider))
128133

@@ -161,6 +166,11 @@ def _assert_scheduled_requests(self):
161166
urls_requested = set([rq[0].url for rq in self.run.reqplug])
162167
urls_expected = set([self.run.geturl(p) for p in paths_expected])
163168
assert urls_expected <= urls_requested
169+
scheduled_requests_count = len(self.run.reqplug)
170+
dropped_requests_count = len(self.run.reqdropped)
171+
responses_count = len(self.run.respplug)
172+
self.assertEqual(scheduled_requests_count,
173+
dropped_requests_count + responses_count)
164174

165175
def _assert_downloaded_responses(self):
166176
# response tests

0 commit comments

Comments
 (0)