
Commit c09d414

alexcepoi authored and dangra committed
command with multiple crawlers, fix check command
1 parent 9c15851 commit c09d414

File tree

5 files changed: +116 −36 lines changed

    scrapy/cmdline.py
    scrapy/command.py
    scrapy/commands/check.py
    scrapy/crawler.py
    scrapy/log.py

scrapy/cmdline.py

Lines changed: 11 additions & 5 deletions

@@ -5,7 +5,7 @@
 import pkg_resources
 
 import scrapy
-from scrapy.crawler import CrawlerProcess
+from scrapy.crawler import CrawlerProcess, MultiCrawlerProcess
 from scrapy.xlib import lsprofcalltree
 from scrapy.command import ScrapyCommand
 from scrapy.exceptions import UsageError
@@ -81,7 +81,7 @@ def _print_commands(settings, inproject):
 def _print_unknown_command(settings, cmdname, inproject):
     _print_header(settings, inproject)
     print "Unknown command: %s\n" % cmdname
-    print 'Use "scrapy" to see available commands'
+    print 'Use "scrapy" to see available commands'
 
 def _run_print_help(parser, func, *a, **kw):
     try:
@@ -117,8 +117,6 @@ def execute(argv=None, settings=None):
     conf.settings = settings
     # ------------------------------------------------------------------
 
-    crawler = CrawlerProcess(settings)
-    crawler.install()
     inproject = inside_project()
     cmds = _get_commands_dict(settings, inproject)
     cmdname = _pop_command_name(argv)
@@ -139,7 +137,15 @@
     cmd.add_options(parser)
     opts, args = parser.parse_args(args=argv[1:])
     _run_print_help(parser, cmd.process_options, args, opts)
-    cmd.set_crawler(crawler)
+
+    if cmd.multi_crawlers:
+        process = MultiCrawlerProcess(settings)
+        cmd.process = process
+    else:
+        process = CrawlerProcess(settings)
+        process.install()
+        cmd.set_crawler(process)
+
     _run_print_help(parser, _run_command, cmd, args, opts)
     sys.exit(cmd.exitcode)
 

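Note (a reading aid, not part of the commit): after this change the dispatcher builds the crawling process per command instead of unconditionally installing a single CrawlerProcess. A command opts into the multi-crawler path by setting the multi_crawlers class attribute and then drives self.process itself. A minimal hypothetical command could look like the sketch below; get_spider is an invented helper, and the spider instance is assumed to exist:

    from scrapy.command import ScrapyCommand

    class Command(ScrapyCommand):
        # Opting in: scrapy/cmdline.py assigns a MultiCrawlerProcess to self.process
        # instead of calling self.set_crawler().
        multi_crawlers = True

        def run(self, args, opts):
            spider = self.get_spider(args)                      # hypothetical helper returning a spider instance
            crawler = self.process.create_crawler(spider.name)  # one Crawler per logical crawl
            crawler.crawl(spider)                               # requests are queued until the crawler starts
            self.process.start()                                # blocking: runs the crawlers, then stops the reactor
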
scrapy/command.py

Lines changed: 4 additions & 4 deletions

@@ -13,6 +13,7 @@
 class ScrapyCommand(object):
 
     requires_project = False
+    multi_crawlers = False
 
     # default settings to be used for this command instead of global defaults
     default_settings = {}
@@ -21,18 +22,17 @@ class ScrapyCommand(object):
 
     def __init__(self):
         self.settings = None # set in scrapy.cmdline
-        self.configured = False
 
     def set_crawler(self, crawler):
         assert not hasattr(self, '_crawler'), "crawler already set"
         self._crawler = crawler
 
     @property
     def crawler(self):
-        if not self.configured:
+        if not self.multi_crawlers and not self._crawler.configured:
             log.start_from_crawler(self._crawler)
             self._crawler.configure()
-            self.configured = True
+
         return self._crawler
 
     def syntax(self):
@@ -83,7 +83,7 @@ def add_options(self, parser):
             help="set/override setting (may be repeated)")
         group.add_option("--pdb", action="store_true", help="enable pdb on failure")
         parser.add_option_group(group)
-
+
     def process_options(self, args, opts):
         try:
             self.settings.overrides.update(arglist_to_dict(opts.set))

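For context (an illustration under assumed names, not code from the commit): the crawler property no longer tracks configuration state on the command; it defers to the crawler's own configured flag and skips configuration entirely for multi-crawler commands, so repeated access stays idempotent:

    from scrapy.crawler import CrawlerProcess

    cmd = SomeCommand()                          # hypothetical ScrapyCommand subclass, multi_crawlers left False
    cmd.set_crawler(CrawlerProcess(settings))    # as scrapy.cmdline now does in the single-crawler branch
    c1 = cmd.crawler    # first access: starts logging and calls configure() on the crawler
    c2 = cmd.crawler    # crawler.configured is now True, so it is returned untouched
    assert c1 is c2
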
scrapy/commands/check.py

Lines changed: 12 additions & 11 deletions

@@ -2,13 +2,11 @@
 from functools import wraps
 from unittest import TextTestRunner
 
-from scrapy import signals
 from scrapy.command import ScrapyCommand
 from scrapy.contracts import ContractsManager
 from scrapy.utils.misc import load_object
 from scrapy.utils.spider import iterate_spider_output
 from scrapy.utils.conf import build_component_list
-from scrapy.xlib.pydispatch import dispatcher
 
 
 def _generate(cb):
@@ -22,7 +20,8 @@ def wrapper(response):
 
 class Command(ScrapyCommand):
     requires_project = True
-    default_settings = {'LOG_ENABLED': False}
+    multi_crawlers = True
+    default_settings = {'LOG_ENABLED': True}
 
     def syntax(self):
         return "[options] <spider>"
@@ -48,17 +47,20 @@ def run(self, args, opts):
 
         # contract requests
        contract_reqs = defaultdict(list)
-        self.crawler.engine.has_capacity = lambda: True
 
-        for spider in args or self.crawler.spiders.list():
-            spider = self.crawler.spiders.create(spider)
+        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
+        spiders = spman_cls.from_settings(self.settings)
+
+        for spider in args or spiders.list():
+            spider = spiders.create(spider)
             requests = self.get_requests(spider)
 
             if opts.list:
                 for req in requests:
                     contract_reqs[spider.name].append(req.callback.__name__)
-            else:
-                self.crawler.crawl(spider, requests)
+            elif requests:
+                crawler = self.process.create_crawler(spider.name)
+                crawler.crawl(spider, requests)
 
         # start checks
         if opts.list:
@@ -67,9 +69,8 @@
             for method in sorted(methods):
                 print ' * %s' % method
         else:
-            dispatcher.connect(self.results.printErrors,
-                signals.engine_stopped)
-            self.crawler.start()
+            self.process.start()
+            self.results.printErrors()
 
     def get_requests(self, spider):
         requests = []

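Put together, the new run() reads roughly as follows once the patch is applied (a paraphrase for readability, not verbatim code): the command loads the spider manager directly from SPIDER_MANAGER_CLASS, gives each spider that actually produces contract requests its own crawler via self.process.create_crawler(), and prints contract errors after the whole process has finished:

    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    spiders = spman_cls.from_settings(self.settings)

    for name in args or spiders.list():
        spider = spiders.create(name)
        requests = self.get_requests(spider)
        if requests:                                         # spiders without contracts get no crawler
            crawler = self.process.create_crawler(spider.name)
            crawler.crawl(spider, requests)                  # queued; run sequentially by start()

    self.process.start()                                     # blocks until every queued crawler finishes
    self.results.printErrors()                               # report once the reactor has stopped
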
scrapy/crawler.py

Lines changed: 83 additions & 13 deletions

@@ -20,6 +20,8 @@ def __init__(self, settings):
         self.signals = SignalManager(self)
         self.stats = load_object(settings['STATS_CLASS'])(self)
 
+        self.scheduled = {}
+
     def install(self):
         import scrapy.project
         assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
@@ -47,7 +49,12 @@ def crawl(self, spider, requests=None):
         spider.set_crawler(self)
         if requests is None:
             requests = spider.start_requests()
-        return self.engine.open_spider(spider, requests)
+
+        if self.configured and self.engine.running:
+            assert not self.scheduled
+            return self.engine.open_spider(spider, requests)
+        else:
+            self.scheduled.setdefault(spider, []).extend(requests)
 
     def _spider_closed(self, spider=None):
         if not self.engine.open_spiders:
@@ -56,6 +63,10 @@ def _spider_closed(self, spider=None):
     @defer.inlineCallbacks
     def start(self):
         yield defer.maybeDeferred(self.configure)
+
+        for spider, requests in self.scheduled.iteritems():
+            yield self.engine.open_spider(spider, requests)
+
         yield defer.maybeDeferred(self.engine.start)
 
     @defer.inlineCallbacks
@@ -64,33 +75,27 @@ def stop(self):
         yield defer.maybeDeferred(self.engine.stop)
 
 
-class CrawlerProcess(Crawler):
-    """A class to run a single Scrapy crawler in a process. It provides
-    automatic control of the Twisted reactor and installs some convenient
-    signals for shutting down the crawl.
+class ProcessMixin(object):
+    """ Mixin which provides automatic control of the Twisted reactor and
+    installs some convenient signals for shutting it down
     """
 
     def __init__(self, *a, **kw):
-        super(CrawlerProcess, self).__init__(*a, **kw)
-        self.signals.connect(self.stop, signals.engine_stopped)
         install_shutdown_handlers(self._signal_shutdown)
 
     def start(self):
-        super(CrawlerProcess, self).start()
         if self.settings.getbool('DNSCACHE_ENABLED'):
             reactor.installResolver(CachingThreadedResolver(reactor))
         reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
-        reactor.run(installSignalHandlers=False) # blocking call
+        reactor.run(installSignalHandlers=False)  # blocking call
 
     def stop(self):
-        d = super(CrawlerProcess, self).stop()
-        d.addBoth(self._stop_reactor)
-        return d
+        raise NotImplementedError
 
     def _stop_reactor(self, _=None):
         try:
             reactor.stop()
-        except RuntimeError: # raised if already stopped or in shutdown stage
+        except RuntimeError:  # raised if already stopped or in shutdown stage
             pass
 
     def _signal_shutdown(self, signum, _):
@@ -106,3 +111,68 @@ def _signal_kill(self, signum, _):
         log.msg(format='Received %(signame)s twice, forcing unclean shutdown',
             level=log.INFO, signame=signame)
         reactor.callFromThread(self._stop_reactor)
+
+
+class CrawlerProcess(Crawler, ProcessMixin):
+    """ A class to run a single Scrapy crawler in a process
+    """
+
+    def __init__(self, *a, **kw):
+        Crawler.__init__(self, *a, **kw)
+        ProcessMixin.__init__(self, *a, **kw)
+        self.signals.connect(self.stop, signals.engine_stopped)
+
+    def start(self):
+        Crawler.start(self)
+        ProcessMixin.start(self)
+
+    def stop(self):
+        d = Crawler.stop(self)
+        d.addBoth(self._stop_reactor)
+        return d
+
+
+class MultiCrawlerProcess(ProcessMixin):
+    """ A class to run multiple scrapy crawlers in a process sequentially
+    """
+
+    def __init__(self, settings):
+        super(MultiCrawlerProcess, self).__init__(settings)
+
+        self.settings = settings
+        self.crawlers = {}
+        self.stopping = False
+
+    def create_crawler(self, name):
+        if name not in self.crawlers:
+            self.crawlers[name] = Crawler(self.settings)
+
+        return self.crawlers[name]
+
+    def start_crawler(self):
+        name, crawler = self.crawlers.popitem()
+
+        crawler.sflo = log.start_from_crawler(crawler)
+        crawler.signals.connect(crawler.sflo.stop, signals.engine_stopped)
+        crawler.signals.connect(self.check_done, signals.engine_stopped)
+        crawler.start()
+
+        return name, crawler
+
+    def check_done(self, **kwargs):
+        if self.crawlers and not self.stopping:
+            self.start_crawler()
+        else:
+            self._stop_reactor()
+
+    def start(self):
+        self.start_crawler()
+        super(MultiCrawlerProcess, self).start()
+
+    @defer.inlineCallbacks
+    def stop(self):
+        self.stopping = True
+
+        for crawler in self.crawlers.itervalues():
+            if crawler.configured:
+                yield crawler.stop()

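As a usage sketch (assuming a standalone script with a populated settings object and existing spider instances; spider_a and spider_b are illustrative names): MultiCrawlerProcess keeps one Crawler per name, each crawl() is stored in the crawler's new scheduled dict until that crawler starts, and crawlers are chained on engine_stopped so they run one after another before the reactor is stopped:

    from scrapy.crawler import MultiCrawlerProcess

    process = MultiCrawlerProcess(settings)          # settings: a populated Scrapy settings object
    for spider in (spider_a, spider_b):              # spider_a, spider_b: hypothetical spider instances
        crawler = process.create_crawler(spider.name)
        crawler.crawl(spider)                        # queued in crawler.scheduled until the crawler starts
    process.start()                                  # runs the crawlers sequentially, then stops the reactor
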
scrapy/log.py

Lines changed: 6 additions & 3 deletions

@@ -1,4 +1,4 @@
-"""
+"""
 Scrapy logging facility
 
 See documentation in docs/topics/logging.rst
@@ -11,7 +11,7 @@
 
 import scrapy
 from scrapy.utils.python import unicode_to_str
-
+
 # Logging levels
 DEBUG = logging.DEBUG
 INFO = logging.INFO
@@ -138,9 +138,12 @@ def start_from_crawler(crawler):
     if not settings.getbool('LOG_ENABLED'):
         return
 
-    start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'],
+    sflo = start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'],
         settings['LOG_ENCODING'], crawler)
+
     msg("Scrapy %s started (bot: %s)" % (scrapy.__version__, \
         settings['BOT_NAME']))
     msg("Optional features available: %s" % ", ".join(scrapy.optional_features),
        level=DEBUG)
+
+    return sflo

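The return value matters for the multi-crawler case: start_from_crawler() now hands back the log observer so a caller can detach that crawler's logging when its engine stops, mirroring MultiCrawlerProcess.start_crawler. Roughly (a sketch; crawler is assumed to be an existing Crawler instance):

    from scrapy import log, signals

    sflo = log.start_from_crawler(crawler)                           # returns None when LOG_ENABLED is false
    if sflo:
        crawler.signals.connect(sflo.stop, signals.engine_stopped)   # stop this crawler's log observer when it finishes
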