Python爬虫实战⑩｜数据存储方案，CSV与Excel与MySQL全搞定

最新推荐文章于 2026-06-18 15:17:50 发布

原创最新推荐文章于 2026-06-18 15:17:50 发布 · 322 阅读

4 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

标签

#python #爬虫 #excel

Python爬虫实战专栏收录该内容

30 篇文章

订阅专栏

author: 专注Python实战，分享爬虫与数据分析干货
title: Python爬虫实战⑩｜数据存储方案，CSV/Excel/MySQL全搞定
update: 2026-04-26
tags: Python,爬虫,数据存储,CSV,Excel,MySQL,SQLite,json

作者：专注Python实战，分享爬虫与数据分析干货
更新时间：2026年4月
适合人群：已掌握爬虫抓取、想把数据存下来的开发者

前言：数据存哪？这是爬虫的最后一公里

抓到了数据，存在哪？

存储方式	适合场景	容量
CSV	小数据、Excel打开	百万行以内
Excel	需要格式化展示	100万行以内
JSON	API数据、嵌套结构	灵活
SQLite	单机应用、离线查询	TB级
MySQL	多人协作、线上服务	TB级

今天一次性搞定所有存储方案！

一、CSV存储

1.1 基础写入

import csv

data = [
    {"标题": "Python入门", "价格": 59.9, "评分": 9.2},
    {"标题": "爬虫实战", "价格": 49.9, "评分": 9.0},
    {"标题": "数据分析", "价格": 69.9, "评分": 8.8},
]

# 写入CSV（最基础）
with open("books.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.DictWriter(f, fieldnames=["标题", "价格", "评分"])
    writer.writeheader()
    writer.writerows(data)

print("写入完成")

1.2 追加写入

# 追加模式（不写表头）
new_data = [
    {"标题": "机器学习", "价格": 79.9, "评分": 9.5},
]

with open("books.csv", "a", newline="", encoding="utf-8-sig") as f:
    writer = csv.DictWriter(f, fieldnames=["标题", "价格", "评分"])
    # 检查文件是否为空，决定是否写表头
    import os
    if os.path.getsize("books.csv") == 0:
        writer.writeheader()
    writer.writerows(new_data)

1.3 用Pandas写入（推荐）

import pandas as pd

df = pd.DataFrame(data)

# 写入CSV
df.to_csv("books.csv", index=False, encoding="utf-8-sig")

# 追加
df.to_csv("books.csv", mode="a", header=False, index=False, encoding="utf-8-sig")

二、Excel存储

2.1 openpyxl写入

pip install openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple

from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment

wb = Workbook()
ws = wb.active
ws.title = "商品数据"

# 表头
headers = ["标题", "价格", "评分", "销量"]
ws.append(headers)

# 设置表头样式
header_font = Font(bold=True, color="FFFFFF", size=12)
header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
for col in range(1, len(headers) + 1):
    cell = ws.cell(row=1, column=col)
    cell.font = header_font
    cell.fill = header_fill
    cell.alignment = Alignment(horizontal="center")

# 数据
products = [
    ["Python入门", 59.9, 9.2, 1200],
    ["爬虫实战", 49.9, 9.0, 800],
    ["数据分析", 69.9, 8.8, 600],
]

for product in products:
    ws.append(product)

# 自动列宽
for col in ws.columns:
    max_length = max(len(str(cell.value or "")) for cell in col)
    ws.column_dimensions[col[0].column_letter].width = max_length + 4

wb.save("products.xlsx")
print("Excel写入完成")

2.2 用Pandas写入（推荐）

import pandas as pd

df = pd.DataFrame(products, columns=headers)
df.to_excel("products.xlsx", index=False, sheet_name="商品数据")

# 多sheet写入
with pd.ExcelWriter("products.xlsx", engine="openpyxl") as writer:
    df.to_excel(writer, sheet_name="商品", index=False)
    df2.to_excel(writer, sheet_name="订单", index=False)
    df3.to_excel(writer, sheet_name="评论", index=False)

三、JSON存储

3.1 基础写入

import json

data = [
    {"标题": "Python入门", "价格": 59.9, "评分": 9.2},
    {"标题": "爬虫实战", "价格": 49.9, "评分": 9.0},
]

# 写入JSON
with open("books.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# 读取JSON
with open("books.json", "r", encoding="utf-8") as f:
    loaded = json.load(f)
    print(f"读取到 {len(loaded)} 条数据")

3.2 JSON Lines（大数据推荐）

# 写入JSONL（每行一条，适合流式写入）
with open("books.jsonl", "w", encoding="utf-8") as f:
    for item in data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

# 读取JSONL
with open("books.jsonl", "r", encoding="utf-8") as f:
    loaded = [json.loads(line) for line in f if line.strip()]

四、SQLite存储

4.1 基础操作

import sqlite3

# 创建/连接数据库
conn = sqlite3.connect("crawler.db")
cursor = conn.cursor()

# 创建表
cursor.execute("""
CREATE TABLE IF NOT EXISTS products (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    price REAL,
    rating REAL,
    url TEXT UNIQUE,
    crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")

# 插入数据
cursor.execute("""
INSERT OR IGNORE INTO products (title, price, rating, url)
VALUES (?, ?, ?, ?)
""", ("Python入门", 59.9, 9.2, "https://example.com/1"))

# 批量插入
products = [
    ("爬虫实战", 49.9, 9.0, "https://example.com/2"),
    ("数据分析", 69.9, 8.8, "https://example.com/3"),
]
cursor.executemany("""
INSERT OR IGNORE INTO products (title, price, rating, url)
VALUES (?, ?, ?, ?)
""", products)

conn.commit()

# 查询
cursor.execute("SELECT * FROM products WHERE rating > 9.0")
results = cursor.fetchall()
for row in results:
    print(row)

conn.close()

4.2 爬虫SQLite存储类

import sqlite3
from datetime import datetime

class CrawlerDB:
    """爬虫数据库管理"""

    def __init__(self, db_path="crawler.db"):
        self.conn = sqlite3.connect(db_path)
        self.cursor = self.conn.cursor()
        self._create_tables()

    def _create_tables(self):
        self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS pages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE,
            title TEXT,
            content TEXT,
            crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """)
        self.conn.commit()

    def insert(self, url, title, content):
        try:
            self.cursor.execute("""
            INSERT OR REPLACE INTO pages (url, title, content)
            VALUES (?, ?, ?)
            """, (url, title, content))
            self.conn.commit()
            return True
        except Exception as e:
            print(f"插入失败: {e}")
            return False

    def exists(self, url):
        self.cursor.execute("SELECT 1 FROM pages WHERE url=?", (url,))
        return self.cursor.fetchone() is not None

    def get_unvisited(self, limit=100):
        self.cursor.execute("""
        SELECT url FROM pages WHERE content IS NULL LIMIT ?
        """, (limit,))
        return [row[0] for row in self.cursor.fetchall()]

    def close(self):
        self.conn.close()

五、MySQL存储

5.1 安装依赖

pip install pymysql -i https://pypi.tuna.tsinghua.edu.cn/simple

5.2 MySQL存储类

import pymysql
from datetime import datetime

class MySQLStorage:
    """MySQL存储"""

    def __init__(self, host="localhost", port=3306,
                 user="root", password="", database="crawler_db"):
        self.conn = pymysql.connect(
            host=host, port=port, user=user,
            password=password, database=database,
            charset="utf8mb4", autocommit=False,
        )
        self.cursor = self.conn.cursor()
        self._create_table()

    def _create_table(self):
        self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS products (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(500) NOT NULL,
            price DECIMAL(10,2),
            rating DECIMAL(3,1),
            url VARCHAR(1000) UNIQUE,
            source VARCHAR(100),
            crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            INDEX idx_rating (rating),
            INDEX idx_source (source)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """)
        self.conn.commit()

    def insert(self, item):
        try:
            self.cursor.execute("""
            INSERT INTO products (title, price, rating, url, source)
            VALUES (%s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                price=VALUES(price), rating=VALUES(rating)
            """, (item["title"], item.get("price"), item.get("rating"),
                  item.get("url"), item.get("source")))
            self.conn.commit()
            return True
        except Exception as e:
            self.conn.rollback()
            print(f"MySQL插入失败: {e}")
            return False

    def batch_insert(self, items):
        """批量插入"""
        try:
            data = [(i["title"], i.get("price"), i.get("rating"),
                     i.get("url"), i.get("source")) for i in items]
            self.cursor.executemany("""
            INSERT INTO products (title, price, rating, url, source)
            VALUES (%s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                price=VALUES(price), rating=VALUES(rating)
            """, data)
            self.conn.commit()
            print(f"批量插入 {len(items)} 条")
            return True
        except Exception as e:
            self.conn.rollback()
            print(f"批量插入失败: {e}")
            return False

    def query(self, sql, params=None):
        self.cursor.execute(sql, params)
        return self.cursor.fetchall()

    def close(self):
        self.cursor.close()
        self.conn.close()

六、通用存储管理器

import csv
import json
import pandas as pd
from datetime import datetime

class DataStorage:
    """通用数据存储管理器"""

    def __init__(self, filename, format="csv"):
        self.filename = filename
        self.format = format
        self.data = []

    def add(self, item):
        """添加一条数据"""
        item["_crawl_time"] = datetime.now().isoformat()
        self.data.append(item)

    def add_many(self, items):
        """批量添加"""
        for item in items:
            self.add(item)

    def save(self):
        """保存数据"""
        if not self.data:
            print("没有数据需要保存")
            return

        if self.format == "csv":
            self._save_csv()
        elif self.format == "json":
            self._save_json()
        elif self.format == "excel":
            self._save_excel()
        else:
            raise ValueError(f"不支持的格式: {self.format}")

    def _save_csv(self):
        df = pd.DataFrame(self.data)
        df.to_csv(self.filename, index=False, encoding="utf-8-sig")
        print(f"已保存 {len(self.data)} 条到 {self.filename}")

    def _save_json(self):
        with open(self.filename, "w", encoding="utf-8") as f:
            json.dump(self.data, f, ensure_ascii=False, indent=2)
        print(f"已保存 {len(self.data)} 条到 {self.filename}")

    def _save_excel(self):
        df = pd.DataFrame(self.data)
        df.to_excel(self.filename, index=False)
        print(f"已保存 {len(self.data)} 条到 {self.filename}")


# 使用示例
storage = DataStorage("products.csv", format="csv")
storage.add({"标题": "Python入门", "价格": 59.9})
storage.add({"标题": "爬虫实战", "价格": 49.9})
storage.save()

七、知识卡

方式	写入方法	特点
CSV	csv.DictWriter / pd.to_csv	通用、Excel可打开
Excel	openpyxl / pd.to_excel	支持格式化、多sheet
JSON	json.dump	嵌套结构、API友好
JSONL	逐行json.dumps	流式写入、大数据
SQLite	sqlite3	轻量数据库、SQL查询
MySQL	pymysql	企业级、多人协作
utf-8-sig	encoding参数	中文CSV不乱码