@@ -59,6 +59,12 @@ def parse_item(self, response):
             item['price'] = m.group(1)
         return item
 
+
+class TestDupeFilterSpider(TestSpider):
+    def make_requests_from_url(self, url):
+        return Request(url)  # dont_filter=False
+
+
 def start_test_site(debug=False):
     root_dir = os.path.join(tests_datadir, "test_site")
     r = static.File(root_dir)
@@ -75,26 +81,31 @@ def start_test_site(debug=False):
 class CrawlerRun(object):
     """A class to run the crawler and keep track of events occurred"""
 
-    def __init__(self):
+    def __init__(self, with_dupefilter=False):
         self.spider = None
         self.respplug = []
         self.reqplug = []
+        self.reqdropped = []
         self.itemresp = []
         self.signals_catched = {}
+        self.spider_class = TestSpider if not with_dupefilter else \
+            TestDupeFilterSpider
 
     def run(self):
         self.port = start_test_site()
         self.portno = self.port.getHost().port
 
-        start_urls = [self.geturl("/"), self.geturl("/redirect")]
+        start_urls = [self.geturl("/"), self.geturl("/redirect"),
+                      self.geturl("/redirect")]  # a duplicate
 
         for name, signal in vars(signals).items():
             if not name.startswith('_'):
                 dispatcher.connect(self.record_signal, signal)
 
-        self.crawler = get_crawler(TestSpider)
+        self.crawler = get_crawler(self.spider_class)
         self.crawler.signals.connect(self.item_scraped, signals.item_scraped)
         self.crawler.signals.connect(self.request_scheduled, signals.request_scheduled)
+        self.crawler.signals.connect(self.request_dropped, signals.request_dropped)
         self.crawler.signals.connect(self.response_downloaded, signals.response_downloaded)
         self.crawler.crawl(start_urls=start_urls)
         self.spider = self.crawler.spider
@@ -123,6 +134,9 @@ def item_scraped(self, item, spider, response):
     def request_scheduled(self, request, spider):
         self.reqplug.append((request, spider))
 
+    def request_dropped(self, request, spider):
+        self.reqdropped.append((request, spider))
+
     def response_downloaded(self, response, spider):
         self.respplug.append((response, spider))
 
@@ -141,10 +155,14 @@ def test_crawler(self):
         self.run = CrawlerRun()
         yield self.run.run()
         self._assert_visited_urls()
-        self._assert_scheduled_requests()
+        self._assert_scheduled_requests(urls_to_visit=8)
         self._assert_downloaded_responses()
         self._assert_scraped_items()
         self._assert_signals_catched()
+        self.run = CrawlerRun(with_dupefilter=True)
+        yield self.run.run()
+        self._assert_scheduled_requests(urls_to_visit=7)
+        self._assert_dropped_requests()
 
     def _assert_visited_urls(self):
         must_be_visited = ["/", "/redirect", "/redirected",
@@ -153,18 +171,26 @@ def _assert_visited_urls(self):
         urls_expected = set([self.run.geturl(p) for p in must_be_visited])
         assert urls_expected <= urls_visited, "URLs not visited: %s" % list(urls_expected - urls_visited)
 
-    def _assert_scheduled_requests(self):
-        self.assertEqual(6, len(self.run.reqplug))
+    def _assert_scheduled_requests(self, urls_to_visit=None):
+        self.assertEqual(urls_to_visit, len(self.run.reqplug))
 
         paths_expected = ['/item999.html', '/item2.html', '/item1.html']
 
         urls_requested = set([rq[0].url for rq in self.run.reqplug])
         urls_expected = set([self.run.geturl(p) for p in paths_expected])
         assert urls_expected <= urls_requested
+        scheduled_requests_count = len(self.run.reqplug)
+        dropped_requests_count = len(self.run.reqdropped)
+        responses_count = len(self.run.respplug)
+        self.assertEqual(scheduled_requests_count,
+                         dropped_requests_count + responses_count)
+
+    def _assert_dropped_requests(self):
+        self.assertEqual(len(self.run.reqdropped), 1)
 
     def _assert_downloaded_responses(self):
         # response tests
-        self.assertEqual(6, len(self.run.respplug))
+        self.assertEqual(8, len(self.run.respplug))
 
         for response, _ in self.run.respplug:
             if self.run.getpath(response.url) == '/item999.html':
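
For reference, a minimal standalone sketch of the behaviour the new test exercises: requests that keep the default dont_filter=False are rejected by the scheduler's dupefilter when their URL repeats, and the request_dropped signal fires for each rejected request. The spider name and URLs below are illustrative placeholders, assuming a current Scrapy release; this is not part of the patch above.

# sketch: observing dropped duplicate requests via the request_dropped signal
import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess


class DupeAwareSpider(scrapy.Spider):
    name = "dupe_aware"  # placeholder name
    start_urls = ["http://example.com/", "http://example.com/"]  # second URL is a duplicate

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(DupeAwareSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Listen for requests rejected by the scheduler (e.g. by the dupefilter).
        crawler.signals.connect(spider.on_request_dropped, signal=signals.request_dropped)
        return spider

    def start_requests(self):
        # Plain Requests keep the default dont_filter=False, so the
        # duplicate start URL can be filtered out by the dupefilter.
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def on_request_dropped(self, request, spider):
        spider.logger.info("Dropped duplicate request: %s", request.url)

    def parse(self, response):
        self.logger.info("Visited %s", response.url)


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(DupeAwareSpider)
    process.start()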