1
+ from __future__ import absolute_import
1
2
import random
2
3
import warnings
3
4
from time import time
5
+ from datetime import datetime
4
6
from collections import deque
5
7
6
8
from twisted .internet import reactor , defer , task
7
9
8
10
from scrapy .utils .defer import mustbe_deferred
9
11
from scrapy .utils .httpobj import urlparse_cached
10
12
from scrapy .resolver import dnscache
11
- from scrapy .exceptions import ScrapyDeprecationWarning
12
13
from scrapy import signals
13
14
from .middleware import DownloaderMiddlewareManager
14
15
from .handlers import DownloadHandlers
17
18
class Slot (object ):
18
19
"""Downloader slot"""
19
20
20
- def __init__ (self , concurrency , delay , settings ):
21
+ def __init__ (self , concurrency , delay , randomize_delay ):
21
22
self .concurrency = concurrency
22
23
self .delay = delay
23
- self .randomize_delay = settings .getbool ('RANDOMIZE_DOWNLOAD_DELAY' )
24
+ self .randomize_delay = randomize_delay
25
+
24
26
self .active = set ()
25
27
self .queue = deque ()
26
28
self .transferring = set ()
@@ -39,6 +41,21 @@ def close(self):
39
41
if self .latercall and self .latercall .active ():
40
42
self .latercall .cancel ()
41
43
44
+ def __repr__ (self ):
45
+ cls_name = self .__class__ .__name__
46
+ return "%s(concurrency=%r, delay=%0.2f, randomize_delay=%r)" % (
47
+ cls_name , self .concurrency , self .delay , self .randomize_delay )
48
+
49
+ def __str__ (self ):
50
+ return (
51
+ "<downloader.Slot concurrency=%r delay=%0.2f randomize_delay=%r "
52
+ "len(active)=%d len(queue)=%d len(transferring)=%d lastseen=%s>" % (
53
+ self .concurrency , self .delay , self .randomize_delay ,
54
+ len (self .active ), len (self .queue ), len (self .transferring ),
55
+ datetime .fromtimestamp (self .lastseen ).isoformat ()
56
+ )
57
+ )
58
+
42
59
43
60
def _get_concurrency_delay (concurrency , spider , settings ):
44
61
delay = settings .getfloat ('DOWNLOAD_DELAY' )
@@ -66,6 +83,7 @@ def __init__(self, crawler):
66
83
self .total_concurrency = self .settings .getint ('CONCURRENT_REQUESTS' )
67
84
self .domain_concurrency = self .settings .getint ('CONCURRENT_REQUESTS_PER_DOMAIN' )
68
85
self .ip_concurrency = self .settings .getint ('CONCURRENT_REQUESTS_PER_IP' )
86
+ self .randomize_delay = self .settings .getbool ('RANDOMIZE_DOWNLOAD_DELAY' )
69
87
self .middleware = DownloaderMiddlewareManager .from_crawler (crawler )
70
88
self ._slot_gc_loop = task .LoopingCall (self ._slot_gc )
71
89
self ._slot_gc_loop .start (60 )
@@ -87,7 +105,7 @@ def _get_slot(self, request, spider):
87
105
if key not in self .slots :
88
106
conc = self .ip_concurrency if self .ip_concurrency else self .domain_concurrency
89
107
conc , delay = _get_concurrency_delay (conc , spider , self .settings )
90
- self .slots [key ] = Slot (conc , delay , self .settings )
108
+ self .slots [key ] = Slot (conc , delay , self .randomize_delay )
91
109
92
110
return key , self .slots [key ]
93
111
0 commit comments