author: 专注Python实战,分享爬虫与数据分析干货
title: Python爬虫实战⑩|数据存储方案,CSV/Excel/MySQL全搞定
update: 2026-04-26
tags: Python,爬虫,数据存储,CSV,Excel,MySQL,SQLite,json
作者:专注Python实战,分享爬虫与数据分析干货
更新时间:2026年4月
适合人群:已掌握爬虫抓取、想把数据存下来的开发者
前言:数据存哪?这是爬虫的最后一公里
抓到了数据,存在哪?
| 存储方式 | 适合场景 | 容量 |
|---|---|---|
| CSV | 小数据、Excel打开 | 百万行以内 |
| Excel | 需要格式化展示 | 100万行以内 |
| JSON | API数据、嵌套结构 | 灵活 |
| SQLite | 单机应用、离线查询 | TB级 |
| MySQL | 多人协作、线上服务 | TB级 |
今天一次性搞定所有存储方案!
一、CSV存储
1.1 基础写入
import csv
data = [
{"标题": "Python入门", "价格": 59.9, "评分": 9.2},
{"标题": "爬虫实战", "价格": 49.9, "评分": 9.0},
{"标题": "数据分析", "价格": 69.9, "评分": 8.8},
]
# 写入CSV(最基础)
with open("books.csv", "w", newline="", encoding="utf-8-sig") as f:
writer = csv.DictWriter(f, fieldnames=["标题", "价格", "评分"])
writer.writeheader()
writer.writerows(data)
print("写入完成")
1.2 追加写入
# 追加模式(不写表头)
new_data = [
{"标题": "机器学习", "价格": 79.9, "评分": 9.5},
]
with open("books.csv", "a", newline="", encoding="utf-8-sig") as f:
writer = csv.DictWriter(f, fieldnames=["标题", "价格", "评分"])
# 检查文件是否为空,决定是否写表头
import os
if os.path.getsize("books.csv") == 0:
writer.writeheader()
writer.writerows(new_data)
1.3 用Pandas写入(推荐)
import pandas as pd
df = pd.DataFrame(data)
# 写入CSV
df.to_csv("books.csv", index=False, encoding="utf-8-sig")
# 追加
df.to_csv("books.csv", mode="a", header=False, index=False, encoding="utf-8-sig")
二、Excel存储
2.1 openpyxl写入
pip install openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
wb = Workbook()
ws = wb.active
ws.title = "商品数据"
# 表头
headers = ["标题", "价格", "评分", "销量"]
ws.append(headers)
# 设置表头样式
header_font = Font(bold=True, color="FFFFFF", size=12)
header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
for col in range(1, len(headers) + 1):
cell = ws.cell(row=1, column=col)
cell.font = header_font
cell.fill = header_fill
cell.alignment = Alignment(horizontal="center")
# 数据
products = [
["Python入门", 59.9, 9.2, 1200],
["爬虫实战", 49.9, 9.0, 800],
["数据分析", 69.9, 8.8, 600],
]
for product in products:
ws.append(product)
# 自动列宽
for col in ws.columns:
max_length = max(len(str(cell.value or "")) for cell in col)
ws.column_dimensions[col[0].column_letter].width = max_length + 4
wb.save("products.xlsx")
print("Excel写入完成")
2.2 用Pandas写入(推荐)
import pandas as pd
df = pd.DataFrame(products, columns=headers)
df.to_excel("products.xlsx", index=False, sheet_name="商品数据")
# 多sheet写入
with pd.ExcelWriter("products.xlsx", engine="openpyxl") as writer:
df.to_excel(writer, sheet_name="商品", index=False)
df2.to_excel(writer, sheet_name="订单", index=False)
df3.to_excel(writer, sheet_name="评论", index=False)
三、JSON存储
3.1 基础写入
import json
data = [
{"标题": "Python入门", "价格": 59.9, "评分": 9.2},
{"标题": "爬虫实战", "价格": 49.9, "评分": 9.0},
]
# 写入JSON
with open("books.json", "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
# 读取JSON
with open("books.json", "r", encoding="utf-8") as f:
loaded = json.load(f)
print(f"读取到 {len(loaded)} 条数据")
3.2 JSON Lines(大数据推荐)
# 写入JSONL(每行一条,适合流式写入)
with open("books.jsonl", "w", encoding="utf-8") as f:
for item in data:
f.write(json.dumps(item, ensure_ascii=False) + "\n")
# 读取JSONL
with open("books.jsonl", "r", encoding="utf-8") as f:
loaded = [json.loads(line) for line in f if line.strip()]
四、SQLite存储
4.1 基础操作
import sqlite3
# 创建/连接数据库
conn = sqlite3.connect("crawler.db")
cursor = conn.cursor()
# 创建表
cursor.execute("""
CREATE TABLE IF NOT EXISTS products (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT NOT NULL,
price REAL,
rating REAL,
url TEXT UNIQUE,
crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# 插入数据
cursor.execute("""
INSERT OR IGNORE INTO products (title, price, rating, url)
VALUES (?, ?, ?, ?)
""", ("Python入门", 59.9, 9.2, "https://example.com/1"))
# 批量插入
products = [
("爬虫实战", 49.9, 9.0, "https://example.com/2"),
("数据分析", 69.9, 8.8, "https://example.com/3"),
]
cursor.executemany("""
INSERT OR IGNORE INTO products (title, price, rating, url)
VALUES (?, ?, ?, ?)
""", products)
conn.commit()
# 查询
cursor.execute("SELECT * FROM products WHERE rating > 9.0")
results = cursor.fetchall()
for row in results:
print(row)
conn.close()
4.2 爬虫SQLite存储类
import sqlite3
from datetime import datetime
class CrawlerDB:
"""爬虫数据库管理"""
def __init__(self, db_path="crawler.db"):
self.conn = sqlite3.connect(db_path)
self.cursor = self.conn.cursor()
self._create_tables()
def _create_tables(self):
self.cursor.execute("""
CREATE TABLE IF NOT EXISTS pages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE,
title TEXT,
content TEXT,
crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
self.conn.commit()
def insert(self, url, title, content):
try:
self.cursor.execute("""
INSERT OR REPLACE INTO pages (url, title, content)
VALUES (?, ?, ?)
""", (url, title, content))
self.conn.commit()
return True
except Exception as e:
print(f"插入失败: {e}")
return False
def exists(self, url):
self.cursor.execute("SELECT 1 FROM pages WHERE url=?", (url,))
return self.cursor.fetchone() is not None
def get_unvisited(self, limit=100):
self.cursor.execute("""
SELECT url FROM pages WHERE content IS NULL LIMIT ?
""", (limit,))
return [row[0] for row in self.cursor.fetchall()]
def close(self):
self.conn.close()
五、MySQL存储
5.1 安装依赖
pip install pymysql -i https://pypi.tuna.tsinghua.edu.cn/simple
5.2 MySQL存储类
import pymysql
from datetime import datetime
class MySQLStorage:
"""MySQL存储"""
def __init__(self, host="localhost", port=3306,
user="root", password="", database="crawler_db"):
self.conn = pymysql.connect(
host=host, port=port, user=user,
password=password, database=database,
charset="utf8mb4", autocommit=False,
)
self.cursor = self.conn.cursor()
self._create_table()
def _create_table(self):
self.cursor.execute("""
CREATE TABLE IF NOT EXISTS products (
id INT AUTO_INCREMENT PRIMARY KEY,
title VARCHAR(500) NOT NULL,
price DECIMAL(10,2),
rating DECIMAL(3,1),
url VARCHAR(1000) UNIQUE,
source VARCHAR(100),
crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
INDEX idx_rating (rating),
INDEX idx_source (source)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
""")
self.conn.commit()
def insert(self, item):
try:
self.cursor.execute("""
INSERT INTO products (title, price, rating, url, source)
VALUES (%s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
price=VALUES(price), rating=VALUES(rating)
""", (item["title"], item.get("price"), item.get("rating"),
item.get("url"), item.get("source")))
self.conn.commit()
return True
except Exception as e:
self.conn.rollback()
print(f"MySQL插入失败: {e}")
return False
def batch_insert(self, items):
"""批量插入"""
try:
data = [(i["title"], i.get("price"), i.get("rating"),
i.get("url"), i.get("source")) for i in items]
self.cursor.executemany("""
INSERT INTO products (title, price, rating, url, source)
VALUES (%s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
price=VALUES(price), rating=VALUES(rating)
""", data)
self.conn.commit()
print(f"批量插入 {len(items)} 条")
return True
except Exception as e:
self.conn.rollback()
print(f"批量插入失败: {e}")
return False
def query(self, sql, params=None):
self.cursor.execute(sql, params)
return self.cursor.fetchall()
def close(self):
self.cursor.close()
self.conn.close()
六、通用存储管理器
import csv
import json
import pandas as pd
from datetime import datetime
class DataStorage:
"""通用数据存储管理器"""
def __init__(self, filename, format="csv"):
self.filename = filename
self.format = format
self.data = []
def add(self, item):
"""添加一条数据"""
item["_crawl_time"] = datetime.now().isoformat()
self.data.append(item)
def add_many(self, items):
"""批量添加"""
for item in items:
self.add(item)
def save(self):
"""保存数据"""
if not self.data:
print("没有数据需要保存")
return
if self.format == "csv":
self._save_csv()
elif self.format == "json":
self._save_json()
elif self.format == "excel":
self._save_excel()
else:
raise ValueError(f"不支持的格式: {self.format}")
def _save_csv(self):
df = pd.DataFrame(self.data)
df.to_csv(self.filename, index=False, encoding="utf-8-sig")
print(f"已保存 {len(self.data)} 条到 {self.filename}")
def _save_json(self):
with open(self.filename, "w", encoding="utf-8") as f:
json.dump(self.data, f, ensure_ascii=False, indent=2)
print(f"已保存 {len(self.data)} 条到 {self.filename}")
def _save_excel(self):
df = pd.DataFrame(self.data)
df.to_excel(self.filename, index=False)
print(f"已保存 {len(self.data)} 条到 {self.filename}")
# 使用示例
storage = DataStorage("products.csv", format="csv")
storage.add({"标题": "Python入门", "价格": 59.9})
storage.add({"标题": "爬虫实战", "价格": 49.9})
storage.save()
七、知识卡
| 方式 | 写入方法 | 特点 |
|---|---|---|
| CSV | csv.DictWriter / pd.to_csv | 通用、Excel可打开 |
| Excel | openpyxl / pd.to_excel | 支持格式化、多sheet |
| JSON | json.dump | 嵌套结构、API友好 |
| JSONL | 逐行json.dumps | 流式写入、大数据 |
| SQLite | sqlite3 | 轻量数据库、SQL查询 |
| MySQL | pymysql | 企业级、多人协作 |
| utf-8-sig | encoding参数 | 中文CSV不乱码 |
八、课后作业
必做题:
- 将爬虫数据分别存为CSV、Excel、JSON
- 用SQLite存储爬虫数据并实现查询
- 编写通用存储管理器
选做题:
- 搭建MySQL存储方案
- 实现断点续存(已爬URL去重)
有问题欢迎评论区留言,大家一起讨论!
标签:Python | 数据存储 | CSV | Excel | MySQL | SQLite | JSON
1997

被折叠的 条评论
为什么被折叠?



