
Commit 1b03b12

Merge pull request scrapy#961 from ldmberman/task_977_request_dropped
An attempt to resolve scrapy#957: add a signal that is sent when a request is dropped by the scheduler.
2 parents c31fb87 + fdb6bb0 commit 1b03b12

5 files changed: +56, -9 lines

docs/topics/signals.rst

Lines changed: 17 additions & 0 deletions
@@ -200,6 +200,23 @@ request_scheduled
     :param spider: the spider that yielded the request
     :type spider: :class:`~scrapy.spider.Spider` object
 
+request_dropped
+-----------------
+
+.. signal:: request_dropped
+.. function:: request_dropped(request, spider)
+
+    Sent when a :class:`~scrapy.http.Request`, scheduled by the engine to be
+    downloaded later, is rejected by the scheduler.
+
+    The signal does not support returning deferreds from their handlers.
+
+    :param request: the request that reached the scheduler
+    :type request: :class:`~scrapy.http.Request` object
+
+    :param spider: the spider that yielded the request
+    :type spider: :class:`~scrapy.spider.Spider` object
+
 response_received
 -----------------
 
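
As a usage illustration (not part of this change), a minimal extension could listen for the new signal roughly like this; the class name and log message are hypothetical:

    import logging

    from scrapy import signals

    logger = logging.getLogger(__name__)


    class DroppedRequestLogger(object):
        """Hypothetical extension that logs every request rejected by the scheduler."""

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            # Wire the handler to the signal introduced by this commit.
            crawler.signals.connect(ext.request_dropped, signal=signals.request_dropped)
            return ext

        def request_dropped(self, request, spider):
            # Receives the rejected request and the spider that yielded it.
            logger.info("Request dropped by the scheduler: %s", request.url)

Such an extension would still need to be enabled through the EXTENSIONS setting; the snippet only shows the signal wiring.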

scrapy/core/engine.py

Lines changed: 3 additions & 1 deletion
@@ -173,7 +173,9 @@ def crawl(self, request, spider):
     def schedule(self, request, spider):
         self.signals.send_catch_log(signal=signals.request_scheduled,
                 request=request, spider=spider)
-        return self.slot.scheduler.enqueue_request(request)
+        if not self.slot.scheduler.enqueue_request(request):
+            self.signals.send_catch_log(signal=signals.request_dropped,
+                    request=request, spider=spider)
 
     def download(self, request, spider):
         slot = self.slot
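
The engine still announces every request through request_scheduled and now additionally fires request_dropped when enqueue_request() returns a falsy value; both are dispatched with send_catch_log, which is why handlers cannot return deferreds. As a hedged sketch (the class name and stat keys below are illustrative, not part of this change), the two signals can be combined with the stats collector to keep the accepted and dropped counts side by side:

    from scrapy import signals


    class SchedulingStatsMonitor(object):
        """Hypothetical extension counting scheduled vs. dropped requests."""

        def __init__(self, stats):
            self.stats = stats

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls(crawler.stats)
            crawler.signals.connect(ext.request_scheduled, signal=signals.request_scheduled)
            crawler.signals.connect(ext.request_dropped, signal=signals.request_dropped)
            return ext

        def request_scheduled(self, request, spider):
            # Fired for every request handed to the scheduler.
            self.stats.inc_value('monitor/requests_scheduled', spider=spider)

        def request_dropped(self, request, spider):
            # Fired only when the scheduler refused to enqueue the request.
            self.stats.inc_value('monitor/requests_dropped', spider=spider)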

scrapy/core/scheduler.py

Lines changed: 2 additions & 1 deletion
@@ -47,14 +47,15 @@ def close(self, reason):
     def enqueue_request(self, request):
         if not request.dont_filter and self.df.request_seen(request):
             self.df.log(request, self.spider)
-            return
+            return False
         dqok = self._dqpush(request)
         if dqok:
             self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
         else:
             self._mqpush(request)
             self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
         self.stats.inc_value('scheduler/enqueued', spider=self.spider)
+        return True
 
     def next_request(self):
         request = self.mqs.pop()
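
The boolean return value becomes part of the contract between the engine and the scheduler: enqueue_request() returns False when the request is rejected (here, filtered by the dupefilter) and True once it has been queued. A minimal sketch of a custom scheduler honoring that contract, assuming the usual scheduler interface (this is not the built-in implementation; class and attribute names are illustrative):

    from collections import deque


    class SimpleMemoryScheduler(object):
        """Hypothetical FIFO scheduler that rejects URLs it has already seen."""

        def __init__(self):
            self.queue = deque()
            self.seen_urls = set()

        @classmethod
        def from_crawler(cls, crawler):
            return cls()

        def open(self, spider):
            self.spider = spider

        def close(self, reason):
            pass

        def has_pending_requests(self):
            return bool(self.queue)

        def __len__(self):
            return len(self.queue)

        def enqueue_request(self, request):
            if not request.dont_filter and request.url in self.seen_urls:
                return False  # rejected: the engine will emit request_dropped
            self.seen_urls.add(request.url)
            self.queue.append(request)
            return True  # accepted

        def next_request(self):
            return self.queue.popleft() if self.queue else None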

scrapy/signals.py

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@
 spider_closed = object()
 spider_error = object()
 request_scheduled = object()
+request_dropped = object()
 response_received = object()
 response_downloaded = object()
 item_scraped = object()

tests/test_engine.py

Lines changed: 33 additions & 7 deletions
@@ -59,6 +59,12 @@ def parse_item(self, response):
             item['price'] = m.group(1)
         return item
 
+
+class TestDupeFilterSpider(TestSpider):
+    def make_requests_from_url(self, url):
+        return Request(url) # dont_filter=False
+
+
 def start_test_site(debug=False):
     root_dir = os.path.join(tests_datadir, "test_site")
     r = static.File(root_dir)
@@ -75,26 +81,31 @@ def start_test_site(debug=False):
 class CrawlerRun(object):
     """A class to run the crawler and keep track of events occurred"""
 
-    def __init__(self):
+    def __init__(self, with_dupefilter=False):
         self.spider = None
         self.respplug = []
         self.reqplug = []
+        self.reqdropped = []
         self.itemresp = []
         self.signals_catched = {}
+        self.spider_class = TestSpider if not with_dupefilter else \
+            TestDupeFilterSpider
 
     def run(self):
         self.port = start_test_site()
         self.portno = self.port.getHost().port
 
-        start_urls = [self.geturl("/"), self.geturl("/redirect")]
+        start_urls = [self.geturl("/"), self.geturl("/redirect"),
+                      self.geturl("/redirect")] # a duplicate
 
         for name, signal in vars(signals).items():
            if not name.startswith('_'):
                 dispatcher.connect(self.record_signal, signal)
 
-        self.crawler = get_crawler(TestSpider)
+        self.crawler = get_crawler(self.spider_class)
         self.crawler.signals.connect(self.item_scraped, signals.item_scraped)
         self.crawler.signals.connect(self.request_scheduled, signals.request_scheduled)
+        self.crawler.signals.connect(self.request_dropped, signals.request_dropped)
         self.crawler.signals.connect(self.response_downloaded, signals.response_downloaded)
         self.crawler.crawl(start_urls=start_urls)
         self.spider = self.crawler.spider
@@ -123,6 +134,9 @@ def item_scraped(self, item, spider, response):
     def request_scheduled(self, request, spider):
         self.reqplug.append((request, spider))
 
+    def request_dropped(self, request, spider):
+        self.reqdropped.append((request, spider))
+
     def response_downloaded(self, response, spider):
         self.respplug.append((response, spider))
 
@@ -141,10 +155,14 @@ def test_crawler(self):
         self.run = CrawlerRun()
         yield self.run.run()
         self._assert_visited_urls()
-        self._assert_scheduled_requests()
+        self._assert_scheduled_requests(urls_to_visit=8)
         self._assert_downloaded_responses()
         self._assert_scraped_items()
         self._assert_signals_catched()
+        self.run = CrawlerRun(with_dupefilter=True)
+        yield self.run.run()
+        self._assert_scheduled_requests(urls_to_visit=7)
+        self._assert_dropped_requests()
 
     def _assert_visited_urls(self):
         must_be_visited = ["/", "/redirect", "/redirected",
@@ -153,18 +171,26 @@ def _assert_visited_urls(self):
         urls_expected = set([self.run.geturl(p) for p in must_be_visited])
         assert urls_expected <= urls_visited, "URLs not visited: %s" % list(urls_expected - urls_visited)
 
-    def _assert_scheduled_requests(self):
-        self.assertEqual(6, len(self.run.reqplug))
+    def _assert_scheduled_requests(self, urls_to_visit=None):
+        self.assertEqual(urls_to_visit, len(self.run.reqplug))
 
         paths_expected = ['/item999.html', '/item2.html', '/item1.html']
 
         urls_requested = set([rq[0].url for rq in self.run.reqplug])
         urls_expected = set([self.run.geturl(p) for p in paths_expected])
         assert urls_expected <= urls_requested
+        scheduled_requests_count = len(self.run.reqplug)
+        dropped_requests_count = len(self.run.reqdropped)
+        responses_count = len(self.run.respplug)
+        self.assertEqual(scheduled_requests_count,
+                         dropped_requests_count + responses_count)
+
+    def _assert_dropped_requests(self):
+        self.assertEqual(len(self.run.reqdropped), 1)
 
     def _assert_downloaded_responses(self):
         # response tests
-        self.assertEqual(6, len(self.run.respplug))
+        self.assertEqual(8, len(self.run.respplug))
 
         for response, _ in self.run.respplug:
             if self.run.getpath(response.url) == '/item999.html':
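
TestDupeFilterSpider exists to re-enable duplicate filtering in the second crawler run: by building plain Request objects (dont_filter left at its default of False), the duplicate /redirect start URL is rejected by the dupefilter, so exactly one request_dropped signal is recorded. A small illustration of the flag the test relies on (the URLs are placeholders):

    from scrapy.http import Request

    # Default behaviour (dont_filter=False): if the dupefilter has already
    # seen an equivalent request, enqueue_request() returns False and the
    # engine fires request_dropped for it.
    filtered = Request('http://example.com/redirect')

    # dont_filter=True bypasses the dupefilter, so an identical request is
    # enqueued again and no request_dropped signal is sent.
    not_filtered = Request('http://example.com/redirect', dont_filter=True)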

0 commit comments
