@@ -19,17 +19,20 @@ def __init__(self, settings):
         self.settings = settings
         self.signals = SignalManager(self)
         self.stats = load_object(settings['STATS_CLASS'])(self)
-
+        self._start_requests = lambda: ()
+        self._spider = None
+        # TODO: move SpiderManager to CrawlerProcess
         spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
         self.spiders = spman_cls.from_crawler(self)
-        self._scheduled = {}

     def install(self):
+        # TODO: remove together with scrapy.project.crawler usage
         import scrapy.project
         assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
         scrapy.project.crawler = self

     def uninstall(self):
+        # TODO: remove together with scrapy.project.crawler usage
         import scrapy.project
         assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
         del scrapy.project.crawler
@@ -45,19 +48,13 @@ def configure(self):
         self.engine = ExecutionEngine(self, self._spider_closed)

     def crawl(self, spider, requests=None):
+        assert self._spider is None, 'Spider already attached'
+        self._spider = spider
         spider.set_crawler(self)
-        if self.configured and self.engine.running:
-            assert not self._scheduled
-            return self._schedule(spider, requests)
-        elif requests is None:
-            self._scheduled[spider] = None
+        if requests is None:
+            self._start_requests = spider.start_requests
         else:
-            self._scheduled.setdefault(spider, []).append(requests)
-
-    def _schedule(self, spider, batches=()):
-        requests = chain.from_iterable(batches) \
-            if batches else spider.start_requests()
-        return self.engine.open_spider(spider, requests)
+            self._start_requests = lambda: requests

     def _spider_closed(self, spider=None):
         if not self.engine.open_spiders:
@@ -66,47 +63,40 @@ def _spider_closed(self, spider=None):
     @defer.inlineCallbacks
     def start(self):
         yield defer.maybeDeferred(self.configure)
-
-        for spider, batches in self._scheduled.iteritems():
-            yield self._schedule(spider, batches)
-
+        if self._spider:
+            yield self.engine.open_spider(self._spider, self._start_requests())
         yield defer.maybeDeferred(self.engine.start)

     @defer.inlineCallbacks
     def stop(self):
-        if self.engine.running:
+        if self.configured and self.engine.running:
             yield defer.maybeDeferred(self.engine.stop)


-class ProcessMixin(object):
-    """ Mixin which provides automatic control of the Twisted reactor and
-    installs some convenient signals for shutting it down
-    """
+class CrawlerProcess(object):
+    """ A class to run multiple scrapy crawlers in a process sequentially"""

-    def __init__(self, *a, **kw):
+    def __init__(self, settings):
         install_shutdown_handlers(self._signal_shutdown)
+        self.settings = settings
+        self.crawlers = {}
+        self.stopping = False
+
+    def create_crawler(self, name=None):
+        if name not in self.crawlers:
+            self.crawlers[name] = Crawler(self.settings)
+
+        return self.crawlers[name]

     def start(self):
         if self.start_crawling():
             self.start_reactor()

-    def start_reactor(self):
-        if self.settings.getbool('DNSCACHE_ENABLED'):
-            reactor.installResolver(CachingThreadedResolver(reactor))
-        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
-        reactor.run(installSignalHandlers=False)  # blocking call
-
-    def start_crawling(self):
-        raise NotImplementedError
-
+    @defer.inlineCallbacks
     def stop(self):
-        raise NotImplementedError
-
-    def stop_reactor(self, _=None):
-        try:
-            reactor.stop()
-        except RuntimeError:  # raised if already stopped or in shutdown stage
-            pass
+        self.stopping = True
+        for crawler in self.crawlers.itervalues():
+            yield crawler.stop()

     def _signal_shutdown(self, signum, _):
         install_shutdown_handlers(self._signal_kill)
@@ -120,27 +110,26 @@ def _signal_kill(self, signum, _):
         signame = signal_names[signum]
         log.msg(format='Received %(signame)s twice, forcing unclean shutdown',
                 level=log.INFO, signame=signame)
-        reactor.callFromThread(self.stop_reactor)
-
-
-class CrawlerProcess(ProcessMixin):
-    """ A class to run multiple scrapy crawlers in a process sequentially
-    """
-
-    def __init__(self, settings):
-        super(CrawlerProcess, self).__init__(settings)
-
-        self.settings = settings
-        self.crawlers = {}
-        self.stopping = False
-
-    def create_crawler(self, name=None):
-        if name not in self.crawlers:
-            self.crawlers[name] = Crawler(self.settings)
+        reactor.callFromThread(self._stop_reactor)
+
+    # ------------------------------------------------------------------------#
+    # The following public methods can't be considered stable and may change at
+    # any moment.
+    #
+    # start_crawling and start_reactor are called from scrapy.commands.shell
+    # They are splitted because reactor is started on a different thread than IPython shell.
+    #
+    def start_crawling(self):
+        log.scrapy_info(self.settings)
+        return self._start_crawler() is not None

-        return self.crawlers[name]
+    def start_reactor(self):
+        if self.settings.getbool('DNSCACHE_ENABLED'):
+            reactor.installResolver(CachingThreadedResolver(reactor))
+        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
+        reactor.run(installSignalHandlers=False)  # blocking call

-    def start_crawler(self):
+    def _start_crawler(self):
         if self.crawlers and not self.stopping:
             name, crawler = self.crawlers.popitem()

@@ -151,23 +140,17 @@ def start_crawler(self):
             if sflo:
                 crawler.signals.connect(sflo.stop, signals.engine_stopped)

-            crawler.signals.connect(self.check_done, signals.engine_stopped)
+            crawler.signals.connect(self._check_done, signals.engine_stopped)
             crawler.start()

             return name, crawler

-    def check_done(self, **kwargs):
-        if not self.start_crawler():
-            self.stop_reactor()
-
-    def start_crawling(self):
-        log.scrapy_info(self.settings)
-        return self.start_crawler() is not None
-
-    @defer.inlineCallbacks
-    def stop(self):
-        self.stopping = True
+    def _check_done(self, **kwargs):
+        if not self._start_crawler():
+            self._stop_reactor()

-        for crawler in self.crawlers.itervalues():
-            if crawler.configured:
-                yield crawler.stop()
+    def _stop_reactor(self, _=None):
+        try:
+            reactor.stop()
+        except RuntimeError:  # raised if already stopped or in shutdown stage
+            pass
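
For context, here is a minimal, hypothetical sketch of driving the refactored CrawlerProcess from this diff. The ExampleSpider class, the import paths, and the get_project_settings() call are assumptions for illustration only and are not part of the change:

    from scrapy.spider import BaseSpider
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    class ExampleSpider(BaseSpider):
        # Placeholder spider; any spider instance accepted by Crawler.crawl() would do.
        name = 'example'
        start_urls = ['http://example.com']

    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler()  # creates and caches a single unnamed Crawler
    crawler.crawl(ExampleSpider())      # attach the spider; a second crawl() would trip the new assert
    process.start()                     # start_crawling() then start_reactor(); blocks until the reactor stops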