author: 专注Python实战,分享爬虫与数据分析干货
title: Python爬虫实战⑮|爬虫效率提升,多线程与异步爬虫
update: 2026-04-26
tags: Python,爬虫,多线程,asyncio,异步爬虫,aiohttp,协程,并发
作者:专注Python实战,分享爬虫与数据分析干货
更新时间:2026年4月
适合人群:有Python基础、想提升爬虫速度的开发者
前言:requests为什么慢?
用requests写爬虫,每个请求都是"等待-下载-处理"串行执行:
请求1:|----下载20秒----|
请求2:                 |----下载20秒----|
请求3:                                  |----下载20秒----|
10个请求:总耗时 = 10 × 20秒 = 200秒
异步爬虫 = 同时发送多个请求,效率翻10倍!
请求1:|----下载20秒----|
请求2:|----下载20秒----|
请求3:|----下载20秒----|
10个请求:总耗时 = 20秒(并发执行)
一、多线程爬虫
1.1 threading基础
import threading
import requests
import time
def fetch_page(url):
    """Fetch a single page and return its body text.

    Sends a blocking GET request with a 10-second timeout, prints the first
    30 characters of the URL together with the HTTP status code, and returns
    the response body as text. Exceptions from ``requests.get`` are not
    caught here.
    """
    response = requests.get(url, timeout=10)
    print(f"完成: {url[:30]} - {response.status_code}")
    return response.text
# Sequential fetch: each request starts only after the previous one returned.
urls = [f"https://example.com/page{i}" for i in range(1, 11)]
start = time.time()
for url in urls:
    fetch_page(url)
print(f"串行耗时: {time.time() - start:.2f}秒")

# Threaded fetch: one thread per URL, all started before any is joined.
start = time.time()
threads = []
for url in urls:
    t = threading.Thread(target=fetch_page, args=(url,))
    t.start()
    threads.append(t)
for t in threads:
    t.join()  # wait for every thread before stopping the clock
print(f"多线程耗时: {time.time() - start:.2f}秒")
1.2 线程池爬虫
import threading
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
def fetch_page(url, timeout=10):
    """Fetch a page and report the outcome as a dict instead of raising.

    Args:
        url: Address to request.
        timeout: Value passed to ``requests.get`` as its ``timeout``.

    Returns:
        On success, a dict with ``url``, ``status`` (the HTTP status code)
        and ``content`` (the first 100 characters of the body). If anything
        raises, a dict with ``url``, ``status`` set to ``"error"`` and
        ``error`` holding the exception message.
    """
    try:
        response = requests.get(url, timeout=timeout)
        return {"url": url, "status": response.status_code, "content": response.text[:100]}
    except Exception as e:
        # Deliberately broad: one failing URL must not abort the whole batch.
        return {"url": url, "status": "error", "error": str(e)}
# Twenty httpbin URLs whose server-side delay cycles through 1, 2, 3, 4, 0 seconds.
urls = [f"https://httpbin.org/delay/{i % 5}" for i in range(1, 21)]
# Sum of the requested delays: what a strictly serial run would wait at minimum.
total_delay = sum(i % 5 for i in range(1, 21))
max_workers = 5

# Thread pool: at most `max_workers` requests are in flight at any moment.
start = time.time()
results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(fetch_page, url): url for url in urls}
    # as_completed yields futures in finish order, not submission order.
    for future in as_completed(futures):
        result = future.result()
        results.append(result)
        print(f" 完成: {result['url']}")
elapsed = time.time() - start
print(f"\n{len(urls)}个URL,{max_workers}线程并发,耗时: {elapsed:.2f}秒")
# The ideal lower bound is the total delay divided evenly across the workers.
print(
    f"平均每个: {elapsed/len(urls):.2f}秒"
    f"(理论下限约 {total_delay}秒÷{max_workers}线程≈{total_delay/max_workers:.0f}秒)"
)
1.3 完整多线程爬虫
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
import time
import csv
class MultiThreadCrawler:
    """Crawl a list of URLs with a thread pool and collect the parsed rows.

    Attributes:
        max_workers: Number of worker threads used by ``crawl``.
        delay: Seconds a worker waits before sending each request.
        session: ``requests.Session`` used for every request, configured
            with a browser-like User-Agent header.
        results: Rows gathered so far; ``crawl`` extends it and ``save``
            writes it out.
    """

    def __init__(self, max_workers=5, delay=1):
        self.max_workers = max_workers
        self.delay = delay
        # NOTE(review): this one Session is used from every worker thread;
        # confirm that is safe for the installed requests version.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })
        self.results = []

    def fetch(self, url):
        """Download one page and return it parsed.

        Returns:
            ``{"url", "status": 200, "soup"}`` on success, or
            ``{"url", "status": "error", "error"}`` if anything raises
            (including a non-2xx response via ``raise_for_status``).
        """
        # Throttle here, in the worker, before the request is sent. All tasks
        # are submitted up front, so a pause in the result loop would only
        # slow down result handling, not the requests. With this placement
        # the pool sends at most max_workers requests per `delay` seconds.
        time.sleep(self.delay)
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "html.parser")
            return {"url": url, "status": 200, "soup": soup}
        except Exception as e:
            # Deliberately broad: a failed page is reported, not raised.
            return {"url": url, "status": "error", "error": str(e)}

    def parse(self, soup):
        """Extract one row per ``.article-item`` element.

        Each row has the keys ``标题`` and ``价格``, taken from the item's
        ``.title`` and ``.price`` children; a missing child yields ``""``.
        """
        items = []
        for item in soup.select(".article-item"):
            title = item.select_one(".title")
            price = item.select_one(".price")
            items.append({
                "标题": title.text.strip() if title else "",
                "价格": price.text.strip() if price else "",
            })
        return items

    def crawl(self, urls, callback=None):
        """Fetch all URLs concurrently and accumulate their parsed rows.

        Args:
            urls: Iterable of page URLs.
            callback: Optional callable invoked with the list of rows of
                each successfully fetched page.

        Returns:
            ``self.results`` after the rows of every successful page have
            been appended.
        """
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = {executor.submit(self.fetch, url): url for url in urls}
            # Results are handled in completion order on the calling thread,
            # so self.results is only ever mutated from one thread.
            for future in as_completed(futures):
                result = future.result()
                if result["status"] == 200:
                    items = self.parse(result["soup"])
                    self.results.extend(items)
                    if callback:
                        callback(items)
                    print(f" ✓ {result['url']} - {len(items)}条数据")
                else:
                    print(f" ✗ {result['url']} - {result.get('error', '未知错误')}")
        return self.results

    def save(self, filename="results.csv"):
        """Write ``self.results`` to a CSV file; do nothing if it is empty.

        The header comes from the keys of the first row. The file is written
        as UTF-8 with a BOM (``utf-8-sig``).
        """
        if not self.results:
            return
        with open(filename, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.DictWriter(f, fieldnames=self.results[0].keys())
            writer.writeheader()
            writer.writerows(self.results)
        print(f"已保存 {len(self.results)} 条数据到 {filename}")
# Example run: crawl product listing pages 1-20 and export the rows to CSV.
urls = [
    f"https://example.com/products?page={page}"
    for page in range(1, 21)
]
crawler = MultiThreadCrawler(
    max_workers=5,
    delay=0.5,
)
results = crawler.crawl(urls)
crawler.save("products.csv")
二、asyncio异步爬虫(推荐)
2.1 asyncio基础概念
import asyncio
async def say_hello():
    """Print "Hello", suspend for one second, then print "World"."""
    print("Hello")
    # asyncio.sleep suspends only this coroutine; other tasks keep running.
    await asyncio.sleep(1)
    print("World")
async def main():
    """Run three ``say_hello`` coroutines concurrently and wait for all of them."""
    await asyncio.gather(
        say_hello(),
        say_hello(),
        say_hello(),
    )
asyncio.run(main())
# Expected output:
# Hello
# Hello
# Hello
# (one-second pause)
# World
# World
# World
2.2 aiohttp异步HTTP客户端
pip install aiohttp -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install aiofiles -i https://pypi.tuna.tsinghua.edu.cn/simple
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import time
async def fetch(session, url):
    """Request one page asynchronously and report the outcome as a dict.

    Args:
        session: Object whose ``get`` is used as an async context manager;
            ``main`` passes an ``aiohttp.ClientSession``.
        url: Address to request, with a 15-second total timeout.

    Returns:
        ``{"url", "status": <HTTP status>, "text": <body>}`` on success,
        ``{"url", "status": "timeout", "text": ""}`` on a timeout, or
        ``{"url", "status": "error", "text": "", "error"}`` on any other
        exception.
    """
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as response:
            text = await response.text()
            return {"url": url, "status": response.status, "text": text}
    except asyncio.TimeoutError:
        return {"url": url, "status": "timeout", "text": ""}
    except Exception as e:
        # Deliberately broad: a failed page is reported, not raised.
        return {"url": url, "status": "error", "text": "", "error": str(e)}
async def parse(html):
    """Return the stripped text of every ``.article-title`` element in *html*.

    Declared ``async`` so callers can ``await`` it, but it contains no
    ``await`` itself: the parsing runs synchronously.
    """
    soup = BeautifulSoup(html, "html.parser")
    titles = soup.select(".article-title")
    return [t.text.strip() for t in titles]
async def main():
    """Fetch twenty pages concurrently and print how many titles each yielded."""
    urls = [f"https://example.com/page{i}" for i in range(1, 21)]

    # One ClientSession for the whole run so connections are reused.
    async with aiohttp.ClientSession() as session:
        # Start every request at once and wait for all of them.
        tasks = [fetch(session, url) for url in urls]
        results = await asyncio.gather(*tasks)

    # Parsing needs only the downloaded text, so it runs after the session
    # has been closed.
    all_titles = []
    for result in results:
        if result["status"] == 200:
            titles = await parse(result["text"])
            all_titles.extend(titles)
            print(f"✓ {result['url']}: {len(titles)}条")
        else:
            print(f"✗ {result['url']}: {result['status']}")
    print(f"\n共获取 {len(all_titles)} 条标题")
# Time the whole run, event-loop startup and shutdown included.
start = time.time()
asyncio.run(main())
elapsed = time.time() - start
print(f"总耗时: {elapsed:.2f}秒")
2.3 带并发控制的异步爬虫
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import time
import csv
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class AsyncCrawler:
    """Asynchronous crawler that caps the number of in-flight requests."""

    max_concurrent: int = 10  # maximum number of requests in flight at once
    delay: float = 0.5  # seconds a task waits, holding its slot, before requesting
    total_request_time: float = 0.0  # summed per-URL time of the latest crawl

    async def fetch(self, session, url, semaphore):
        """Request one page while holding a semaphore slot.

        Args:
            session: Object whose ``get`` is used as an async context
                manager; ``crawl`` passes an ``aiohttp.ClientSession``.
            url: Address to request, with a 15-second total timeout.
            semaphore: Shared semaphore limiting concurrent requests.

        Returns:
            A dict with ``url``, ``status`` (HTTP status, or ``"error"``),
            ``text`` (body, ``""`` on failure), ``elapsed`` (seconds spent
            on this URL including the delay) and, on failure, ``error``.
        """
        async with semaphore:  # limits how many requests run at once
            started = time.perf_counter()
            await asyncio.sleep(self.delay)  # pause before each request
            try:
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as response:
                    text = await response.text()
                    result = {"url": url, "status": response.status, "text": text}
            except Exception as e:
                # Some exceptions (timeouts in particular) stringify to "",
                # so fall back to the class name to keep the report useful.
                reason = str(e) or type(e).__name__
                result = {"url": url, "status": "error", "text": "", "error": reason}
            result["elapsed"] = time.perf_counter() - started
            return result

    async def parse(self, html: str) -> List[dict]:
        """Extract one row per ``.product-item`` element.

        Each row has the keys ``标题``, ``价格`` and ``评分``, taken from the
        item's ``.title``, ``.price`` and ``.rating`` children; a missing
        child yields ``""``. Contains no ``await``; runs synchronously.
        """
        soup = BeautifulSoup(html, "html.parser")
        items = []
        for item in soup.select(".product-item"):
            title = item.select_one(".title")
            price = item.select_one(".price")
            rating = item.select_one(".rating")
            items.append({
                "标题": title.text.strip() if title else "",
                "价格": price.text.strip() if price else "",
                "评分": rating.text.strip() if rating else "",
            })
        return items

    async def crawl(self, urls: List[str]) -> List[dict]:
        """Fetch all URLs concurrently and return the parsed rows.

        Also stores the sum of the per-URL times in
        ``self.total_request_time``: roughly what fetching the same URLs
        one after another would have cost.
        """
        semaphore = asyncio.Semaphore(self.max_concurrent)
        async with aiohttp.ClientSession() as session:
            tasks = [self.fetch(session, url, semaphore) for url in urls]
            results = await asyncio.gather(*tasks)
        self.total_request_time = sum(result["elapsed"] for result in results)

        # Parse every successful page; report the rest with a reason.
        all_items = []
        for result in results:
            if result["status"] == 200:
                items = await self.parse(result["text"])
                all_items.extend(items)
                print(f"✓ {result['url']}: {len(items)}条")
            else:
                # A non-200 response carries no "error" key; show its status.
                reason = result.get("error") or f"HTTP {result['status']}"
                print(f"✗ {result['url']}: {reason}")
        return all_items

    def save(self, items: List[dict], filename: str):
        """Write *items* to a CSV file; do nothing if *items* is empty.

        The header comes from the keys of the first row. The file is written
        as UTF-8 with a BOM (``utf-8-sig``).
        """
        if not items:
            return
        with open(filename, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.DictWriter(f, fieldnames=items[0].keys())
            writer.writeheader()
            writer.writerows(items)
        print(f"已保存 {len(items)} 条数据")


async def main():
    """Crawl fifty product pages, print timing statistics and save the rows."""
    crawler = AsyncCrawler(max_concurrent=10, delay=0.3)
    urls = [f"https://example.com/products?page={i}" for i in range(1, 51)]
    print(f"开始爬取 {len(urls)} 个页面,并发数={crawler.max_concurrent}")
    start = time.time()
    items = await crawler.crawl(urls)
    elapsed = time.time() - start
    # Serial baseline = measured per-URL times added up. Deriving it from
    # `elapsed` itself would always print a speedup of exactly 1.0.
    serial_estimate = crawler.total_request_time
    print(f"\n总耗时: {elapsed:.2f}秒")
    print(f"平均每页: {elapsed/len(urls):.2f}秒")
    print(f"理论串行耗时: {serial_estimate:.0f}秒")
    print(f"效率提升: {serial_estimate/elapsed:.1f}倍")
    crawler.save(items, "async_products.csv")
# Script entry point: run the main() coroutine with asyncio.run.
asyncio.run(main())
三、性能对比
import time
import requests
import asyncio
import aiohttp
def benchmark_sequential(urls):
    """Request every URL one after another and return the elapsed seconds.

    Each request is a blocking ``requests.get`` with a 10-second timeout;
    the responses are discarded.
    """
    start = time.time()
    for url in urls:
        requests.get(url, timeout=10)
    return time.time() - start
async def benchmark_async(urls):
    """Request every URL concurrently and return the elapsed seconds.

    Each request reads the full response body inside ``async with``, so the
    response is released when done and the work matches the sequential
    benchmark, where ``requests.get`` also downloads the body.
    """

    async def _download(session, url):
        # Awaiting session.get() on its own would leave the body unread and
        # the response unreleased.
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as response:
            await response.read()

    start = time.time()
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(_download(session, url) for url in urls))
    return time.time() - start
# Ten identical requests to an endpoint that answers after one second.
urls = ["https://httpbin.org/delay/1"] * 10

# Run the sequential baseline first, then the concurrent variant.
t1 = benchmark_sequential(urls)
t2 = asyncio.run(benchmark_async(urls))

print(f"串行: {t1:.1f}秒")
print(f"异步: {t2:.1f}秒")
print(f"速度提升: {t1/t2:.1f}倍")
四、知识卡
| 概念 | 说明 |
|---|---|
| threading.Thread | 创建线程 |
| ThreadPoolExecutor | 线程池,并发执行 |
| as_completed | 任务完成时取结果 |
| asyncio | Python异步编程库 |
| async/await | async 定义异步函数(协程),await 等待协程/异步操作的结果 |
| aiohttp | 异步HTTP客户端 |
| asyncio.Semaphore | 并发数限制 |
| asyncio.gather | 并发执行多个任务 |
| ClientSession | 异步Session,复用连接 |
五、课后作业
必做题:
- 用ThreadPoolExecutor实现多线程爬虫
- 用aiohttp实现异步爬虫
- 对比串行、线程池、异步三种方式的性能
选做题:
- 实现带断点续传功能的异步爬虫
- 用asyncio+aiohttp爬取真实网站数据
有问题欢迎评论区留言,大家一起讨论!
标签:Python | 多线程 | asyncio | 异步爬虫 | aiohttp | 协程 | 并发
5397

被折叠的 条评论
为什么被折叠?



