Python网络爬虫实战:从Requests到Scrapy的完整指南
引言
网络爬虫是数据采集和分析的重要工具。作为从Python转向Rust的后端开发者,我发现Python的爬虫生态非常成熟,从简单的Requests到强大的Scrapy框架,能够满足各种爬虫需求。本文将从实战角度出发,深入探讨Python网络爬虫的最佳实践,帮助你构建高效、稳定的爬虫系统。
一、网络爬虫概述
1.1 爬虫类型
| 类型 | 特点 | 适用场景 |
|---|---|---|
| 静态爬虫 | 爬取静态HTML页面 | 简单网站、数据采集 |
| 动态爬虫 | 处理JavaScript渲染 | 现代SPA应用 |
| 增量爬虫 | 定期更新数据 | 新闻、博客监控 |
| 分布式爬虫 | 多节点协作 | 大规模数据采集 |
1.2 爬虫架构
┌─────────────────────────────────────────────────────┐
│ 调度层 │
│ URL队列 → 调度器 → 请求分发 │
├─────────────────────────────────────────────────────┤
│ 抓取层 │
│ 请求模块 → 页面解析 → 数据提取 │
├─────────────────────────────────────────────────────┤
│ 存储层 │
│ 数据清洗 → 数据存储 → 数据备份 │
└─────────────────────────────────────────────────────┘
二、Requests基础爬虫
2.1 基本请求
import requests
# Fetch the page. requests applies no timeout by default, so pass one
# explicitly to keep the call from blocking forever on a silent server.
url = 'https://example.com'
response = requests.get(url, timeout=10)
print(f"状态码: {response.status_code}")
print(f"响应头: {response.headers}")
# Preview only the first 500 characters of the body.
print(f"响应内容: {response.text[:500]}")
2.2 请求参数
# Query parameters handed to requests through ``params=``.
params = dict(key1='value1', key2='value2')

# Request headers, filled in one entry at a time.
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
headers['Referer'] = 'https://example.com'

# Same GET as a single call: endpoint, query parameters, headers, 10 s timeout.
response = requests.get('https://api.example.com/data', params=params, headers=headers, timeout=10)
2.3 会话管理
# Use the session as a context manager so it is closed when the block ends.
with requests.Session() as session:
    session.headers.update({'User-Agent': 'MyBot/1.0'})
    # Log in once; the session keeps the cookies it receives.
    login = session.post(
        'https://example.com/login',
        data={'username': 'user', 'password': 'pass'},
        timeout=10,
    )
    # Fail loudly on a rejected login instead of scraping an error page.
    login.raise_for_status()
    # Later requests on the same session carry the login cookies.
    response = session.get('https://example.com/dashboard', timeout=10)
三、BeautifulSoup解析
3.1 HTML解析
from bs4 import BeautifulSoup
html = response.text
soup = BeautifulSoup(html, 'html.parser')
# Look up the <title> tag; soup.title is None when the page has none.
title = soup.title.string if soup.title is not None else None
print(f"页面标题: {title}")
# Find anchors that actually carry an href attribute.
links = soup.find_all('a', href=True)
for link in links[:5]:
    print(f"链接: {link['href']} - {link.get_text()}")
# CSS selectors: one <article class="post"> per entry.
articles = soup.select('article.post')
for article in articles:
    heading = article.select_one('h2.title')
    blurb = article.select_one('p.summary')
    # select_one returns None when nothing matches; skip incomplete entries
    # rather than crashing on .get_text().
    if heading is None or blurb is None:
        continue
    title = heading.get_text()
    summary = blurb.get_text()
    print(f"{title}: {summary}")
3.2 数据提取实战
def extract_news_items(html):
    """Extract news entries from an HTML document.

    Args:
        html: Markup containing zero or more ``div.news-item`` elements.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``date`` and
        ``category``. A value is ``None`` when the corresponding element
        (or the link's ``href`` attribute) is missing from an entry, so one
        malformed entry no longer aborts the whole extraction.
    """
    def text_of(node, selector):
        # select_one returns None when nothing matches the selector.
        found = node.select_one(selector)
        return found.get_text(strip=True) if found is not None else None

    soup = BeautifulSoup(html, 'html.parser')
    news_items = []
    for item in soup.select('div.news-item'):
        link = item.select_one('a')
        news_items.append({
            'title': text_of(item, 'h3'),
            'url': link.get('href') if link is not None else None,
            'date': text_of(item, 'span.date'),
            'category': text_of(item, 'span.category'),
        })
    return news_items
四、Scrapy框架
4.1 创建项目
scrapy startproject my_spider
cd my_spider
scrapy genspider example example.com
4.2 编写爬虫
import scrapy
class ExampleSpider(scrapy.Spider):
    """Spider that scrapes article entries from the example.com news listing."""

    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/news']

    def parse(self, response):
        """Yield one dict per ``article.post`` element, then follow pagination.

        Each dict has the keys ``title``, ``url``, ``summary`` and ``date``;
        a value is ``None`` when the selector matches nothing.
        """
        for article in response.css('article.post'):
            href = article.css('a::attr(href)').get()
            yield {
                'title': article.css('h2.title::text').get(),
                # Resolve relative links against the page URL so stored URLs
                # are absolute; keep None when the entry has no link.
                'url': response.urljoin(href) if href else href,
                'summary': article.css('p.summary::text').get(),
                'date': article.css('time::attr(datetime)').get(),
            }
        # Pagination: queue the next listing page with the same callback.
        next_page = response.css('a.next-page::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
4.3 配置文件
# settings.py
# Crawl behaviour: robots.txt handling, download delay, request concurrency.
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 2
CONCURRENT_REQUESTS = 8

# User-Agent string the crawler identifies itself with.
USER_AGENT = 'MySpider/1.0 (+http://www.example.com)'

# Enabled item pipelines, keyed by import path, each with its order value.
ITEM_PIPELINES = {'my_spider.pipelines.MySpiderPipeline': 300}
4.4 数据管道
class MySpiderPipeline:
    """Item pipeline that trims text fields and hands the item to storage."""

    def process_item(self, item, spider):
        """Clean ``title`` and ``summary`` in place, store the item, return it.

        Fields that are missing or not strings (for example ``None`` when a
        selector matched nothing) are left untouched instead of raising.

        Args:
            item: Dict-like scraped item.
            spider: The spider that produced the item (unused here).

        Returns:
            The same ``item`` object.
        """
        # Data cleaning
        for field in ('title', 'summary'):
            value = item.get(field)
            if isinstance(value, str):
                item[field] = value.strip()
        # Data storage
        self.store_item(item)
        return item

    def store_item(self, item):
        """Persist ``item`` to a database or file; placeholder that does nothing."""
        pass
五、动态页面爬取
5.1 使用Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
try:
    driver.get('https://example.com/dynamic-page')
    # Wait up to 10 seconds for the content element to appear in the DOM.
    wait = WebDriverWait(driver, 10)
    element = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.content'))
    )
    # Extract the data
    content = element.text
    print(content)
finally:
    # Always shut the browser down, even when the wait times out or the
    # page fails to load; otherwise the browser process is left running.
    driver.quit()
5.2 使用Playwright
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        page = browser.new_page()
        page.goto('https://example.com/dynamic-page')
        # Wait until the network is idle
        page.wait_for_load_state('networkidle')
        # Extract the data
        items = page.query_selector_all('div.item')
        for item in items:
            heading = item.query_selector('h3')
            # query_selector returns None when the item has no <h3>.
            if heading is None:
                continue
            title = heading.inner_text()
            print(title)
    finally:
        # Close the browser even if navigation or extraction raised.
        browser.close()
六、反爬策略
6.1 请求频率控制
import time
from random import randint
class RateLimiter:
    """Pause for a random interval between requests.

    Args:
        min_delay: Shortest pause in seconds (int or float).
        max_delay: Longest pause in seconds (int or float).
    """

    def __init__(self, min_delay=1, max_delay=3):
        self.min_delay = min_delay
        self.max_delay = max_delay

    def wait(self):
        """Sleep for a random delay between the bounds, at millisecond granularity."""
        # randint only accepts integers (float arguments raise TypeError on
        # Python 3.12+), so convert the bounds to whole milliseconds first.
        low_ms = round(self.min_delay * 1000)
        high_ms = round(self.max_delay * 1000)
        delay = randint(low_ms, high_ms) / 1000
        time.sleep(delay)
rate_limiter = RateLimiter()
# Pause before the request so consecutive requests are spaced out.
rate_limiter.wait()
# Explicit timeout, matching the other request examples in this file.
response = requests.get(url, timeout=10)
6.2 User-Agent轮换
# random.choice is used below, so the module itself must be imported.
import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
]
# Pick one User-Agent at random for this request.
headers = {'User-Agent': random.choice(USER_AGENTS)}
response = requests.get(url, headers=headers)
6.3 使用代理
# The dict key selects which target URLs go through the proxy; the scheme in
# the value is how the client connects to the proxy itself. An ordinary HTTP
# proxy is reached over http:// even for HTTPS targets; use an https:// proxy
# URL only when the proxy itself speaks TLS.
# Replace "proxy-server:port" with the real host and port.
proxies = {
    'http': 'http://proxy-server:port',
    'https': 'http://proxy-server:port'
}
response = requests.get(url, proxies=proxies)
七、实战:完整爬虫系统
7.1 项目结构
my_crawler/
├── crawler/
│ ├── __init__.py
│ ├── spiders/
│ │ ├── news_spider.py
│ │ └── product_spider.py
│ ├── pipelines/
│ │ └── database_pipeline.py
│ └── settings.py
├── data/
├── logs/
└── main.py
7.2 主程序
from scrapy.crawler import CrawlerProcess
from crawler.settings import Settings
from crawler.spiders.news_spider import NewsSpider
def main():
    """Build a CrawlerProcess from the project Settings and run NewsSpider."""
    crawl_settings = Settings()
    runner = CrawlerProcess(settings=crawl_settings)
    runner.crawl(NewsSpider)
    runner.start()


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
7.3 数据库存储
import sqlite3
class DatabasePipeline:
    """Item pipeline that stores news items in a SQLite ``news`` table.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``data/crawler.db``.
    """

    # Column order shared by the INSERT statement and the row tuple.
    _COLUMNS = ('title', 'url', 'summary', 'date', 'category')

    def __init__(self, db_path='data/crawler.db'):
        self.conn = sqlite3.connect(db_path)
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        """Create the ``news`` table if it does not exist yet."""
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS news (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                url TEXT UNIQUE,
                summary TEXT,
                date TEXT,
                category TEXT
            )
        ''')
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert ``item`` into ``news`` and return it.

        Rows whose ``url`` already exists are ignored (``INSERT OR IGNORE``
        on the UNIQUE column). Fields absent from the item are stored as
        NULL. Database errors are logged through ``spider.logger`` and the
        item is still returned.
        """
        # .get() so an item lacking a field (e.g. 'category') is stored with
        # NULL instead of raising KeyError and being dropped.
        row = tuple(item.get(column) for column in self._COLUMNS)
        try:
            self.cursor.execute('''
                INSERT OR IGNORE INTO news
                (title, url, summary, date, category)
                VALUES (?, ?, ?, ?, ?)
            ''', row)
            self.conn.commit()
        except sqlite3.Error as e:
            spider.logger.error(f"存储失败: {e}")
        return item

    def close_spider(self, spider):
        """Close the database connection; Scrapy calls this when the spider closes."""
        self.conn.close()
八、爬虫最佳实践
8.1 遵守robots.txt
# 检查robots.txt
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
# Point the parser at the site's robots.txt, then download and parse it.
rp.set_url('https://example.com/robots.txt')
rp.read()
if rp.can_fetch('MyBot', 'https://example.com/news'):
    # Crawling this URL is allowed for the 'MyBot' user agent.
    response = requests.get('https://example.com/news')
else:
    print("该页面禁止爬取")
8.2 设置合理的请求头
headers = {
    'User-Agent': 'MyCrawler/1.0 (+https://example.com/crawler)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    # Advertise only encodings requests can always decode. Brotli ("br")
    # responses are decoded only when a brotli package is installed, so
    # listing it unconditionally can produce unreadable response bodies.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
}
8.3 错误处理
# Start from None so later code can tell a failed request apart from a
# successful one; otherwise `response` is unbound (or left over from an
# earlier request) after an exception.
response = None
try:
    reply = requests.get(url, timeout=10)
    # Turn 4xx/5xx status codes into exceptions.
    reply.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"请求失败: {e}")
    # Retry or skip the URL here.
else:
    response = reply
九、总结
Python的爬虫生态非常强大,从简单的Requests到专业的Scrapy框架,能够满足各种数据采集需求。作为后端开发者,掌握爬虫技能不仅能够帮助我们获取数据,还能为数据分析和机器学习提供数据支持。
关键要点:
- 选择合适的工具:根据需求选择Requests、BeautifulSoup、Scrapy、Selenium或Playwright
- 遵守规则:尊重网站的robots.txt和使用条款
- 反爬应对:实现请求频率控制、User-Agent轮换、代理使用
- 数据存储:合理设计数据存储方案
- 错误处理:完善的异常处理机制
从Python转向Rust后,我发现Rust的reqwest库在性能方面有很大优势,适合构建高性能的爬虫系统。
延伸阅读
- Scrapy官方文档
- Requests官方文档
- BeautifulSoup教程
- Playwright官方指南
2356

被折叠的 条评论
为什么被折叠?



