
Commit caeeb09

POC for asyncio+aiohttp
1 parent 280eab2 commit caeeb09

File tree

5 files changed: +73 -5 lines

requirements-py3.txt

Lines changed: 1 addition & 0 deletions

@@ -5,3 +5,4 @@ cssselect>=0.9
 queuelib>=1.1.1
 w3lib>=1.8.0
 service_identity
+https://github.com/dangra/txtulip/archive/0a30192.zip#egg=txtulip

scrapy/_monkeypatches.py

Lines changed: 5 additions & 0 deletions

@@ -1,4 +1,5 @@
 import sys
+import six
 from six.moves import copyreg
 
 if sys.version_info[0] == 2:
@@ -14,6 +15,10 @@
     from urlparse import uses_query
     uses_query.append('s3')
 
+# Enable asyncio reactor
+if six.PY3:
+    from txtulip.reactor import install
+    install()
 
 # Undo what Twisted's perspective broker adds to pickle register
 # to prevent bugs like Twisted#7989 while serializing requests
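Note on the reactor monkeypatch: txtulip provides a Twisted reactor implemented on top of the asyncio event loop, so installing it lets Twisted-driven code (Scrapy's engine) and asyncio-driven code (aiohttp) share a single loop. A minimal standalone sketch of the idea, not part of this commit, assuming the txtulip pin from requirements-py3.txt above; install() must run before anything imports twisted.internet.reactor:

import asyncio
from txtulip.reactor import install
install()  # replaces the default reactor; must happen before importing it

from twisted.internet import reactor

# Assuming txtulip drives the default asyncio loop, both callbacks below
# are scheduled on the same underlying event loop.
reactor.callLater(0, print, 'twisted callback')
asyncio.get_event_loop().call_soon(print, 'asyncio callback')

reactor.callLater(0.1, reactor.stop)
reactor.run()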
scrapy/core/downloader/handlers/aiohttp.py

Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+"""Download handlers for http and https schemes"""
+import logging
+import asyncio
+
+import aiohttp
+from twisted.internet import defer
+
+from scrapy.http import Headers
+from scrapy.responsetypes import responsetypes
+
+logger = logging.getLogger(__name__)
+
+
+class HTTPDownloadHandler(object):
+
+    def __init__(self, settings):
+        self.settings = settings
+
+    def download_request(self, request, spider):
+        """Return a deferred for the HTTP download"""
+        headers = list((k.decode('latin1'), v.decode('latin1'))
+                       for k, vs in request.headers.items()
+                       for v in vs)
+
+        dfd = _force_deferred(
+            aiohttp.request(
+                method=request.method,
+                url=request.url,
+                data=request.body,
+                allow_redirects=False,
+                headers=headers,
+            ))
+
+        def _on_response(aioresponse):
+            return _force_deferred(aioresponse.read()).addCallback(
+                _on_body, aioresponse=aioresponse)
+
+        def _on_body(body, aioresponse):
+            url = request.url
+            status = aioresponse.status
+            headers = Headers(
+                (k.encode('latin1'), [v.encode('latin1')])
+                for k, v in aioresponse.headers.items()
+            )
+            respcls = responsetypes.from_args(headers=headers, url=url)
+            return respcls(url=url, status=status, headers=headers, body=body,
+                           flags=[])
+
+        return dfd.addCallback(_on_response)
+
+
+def _force_deferred(coro):
+    dfd = defer.Deferred().addCallback(lambda f: f.result())
+    future = asyncio.async(coro)
+    future.add_done_callback(dfd.callback)
+    return dfd
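The _force_deferred helper is the asyncio/Twisted bridge: asyncio.async() (later renamed asyncio.ensure_future) schedules the coroutine and returns a Future, the Future's done-callback fires the Deferred with the Future object itself, and the Deferred's first callback unwraps it with Future.result(), which returns the value or re-raises the exception so it becomes a Twisted Failure. A usage sketch, not part of the commit; fetch_status is a hypothetical coroutine written against the same aiohttp 0.x module-level API the handler uses, and the txtulip reactor is assumed to be installed and running:

import asyncio
import aiohttp

@asyncio.coroutine
def fetch_status(url):
    # Old aiohttp 0.x API, as used in download_request() above.
    response = yield from aiohttp.request('GET', url)
    response.close()
    return response.status

# Returns a twisted.internet.defer.Deferred that fires with the HTTP
# status, or errbacks if the coroutine raised.
dfd = _force_deferred(fetch_status('http://example.com/'))
dfd.addCallback(print)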

scrapy/downloadermiddlewares/redirect.py

Lines changed: 9 additions & 3 deletions

@@ -60,24 +60,30 @@ def process_response(self, request, response, spider):
 
         if request.method == 'HEAD':
             if response.status in [301, 302, 303, 307] and 'Location' in response.headers:
-                redirected_url = urljoin(request.url, response.headers['location'])
+                redirected_url = self._urljoin_location(request, response)
                 redirected = request.replace(url=redirected_url)
                 return self._redirect(redirected, request, spider, response.status)
             else:
                 return response
 
         if response.status in [302, 303] and 'Location' in response.headers:
-            redirected_url = urljoin(request.url, response.headers['location'])
+            redirected_url = self._urljoin_location(request, response)
             redirected = self._redirect_request_using_get(request, redirected_url)
             return self._redirect(redirected, request, spider, response.status)
 
         if response.status in [301, 307] and 'Location' in response.headers:
-            redirected_url = urljoin(request.url, response.headers['location'])
+            redirected_url = self._urljoin_location(request, response)
             redirected = request.replace(url=redirected_url)
             return self._redirect(redirected, request, spider, response.status)
 
         return response
 
+    def _urljoin_location(self, request, response):
+        return urljoin(
+            request.url,
+            response.headers['location'].decode('latin1')
+        )
+
 
 class MetaRefreshMiddleware(BaseRedirectMiddleware):
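Why the decode in _urljoin_location: on Python 3, Scrapy's Headers stores raw bytes, and urljoin() refuses to mix bytes with a str base URL. A minimal illustration with assumed values:

from urllib.parse import urljoin

base = 'http://example.com/start'
location = b'/next-page'  # what response.headers['location'] holds on Python 3

# urljoin(base, location) raises TypeError: Cannot mix str and non-str arguments
print(urljoin(base, location.decode('latin1')))  # http://example.com/next-page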
scrapy/settings/default_settings.py

Lines changed: 2 additions & 2 deletions

@@ -66,8 +66,8 @@
 DOWNLOAD_HANDLERS = {}
 DOWNLOAD_HANDLERS_BASE = {
     'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
-    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
-    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
+    'http': 'scrapy.core.downloader.handlers.aiohttp.HTTPDownloadHandler',
+    'https': 'scrapy.core.downloader.handlers.aiohttp.HTTPDownloadHandler',
     's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
     'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
 }
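For experiments that do not want to patch Scrapy's defaults, the same swap can be made per project; this is a sketch of the standard DOWNLOAD_HANDLERS override in a project's settings.py, whose keys merge over DOWNLOAD_HANDLERS_BASE:

# settings.py of a Scrapy project
DOWNLOAD_HANDLERS = {
    'http': 'scrapy.core.downloader.handlers.aiohttp.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.aiohttp.HTTPDownloadHandler',
}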
