一:实操代码
"""
@Created on : 2026/6/12 13:49
@creator : er_nao
@File :day_91.py
@Description :文本转向量
"""
import json
import os
from typing import Dict, List, Optional

import dashscope
import numpy as np
import pandas as pd
import pdfplumber
from dashscope import TextEmbedding

from config import TONGYI_API_KEY
# Hand the API key imported from the local ``config`` module to the DashScope SDK.
dashscope.api_key = TONGYI_API_KEY
# Input PDF and output files. These are Windows paths, so every backslash is
# written as an escaped "\\"; a lone "\D" is an invalid escape sequence that
# newer Python versions warn about.
pdf_file_path = "C:\\Users\\hp\\Desktop\\NLP学习数据\\arctle2.pdf"
output_json_path = "C:\\Users\\hp\\Desktop\\NLP学习数据\\Day91_文本转向量结果.json"
output_excel_path = "C:\\Users\\hp\\Desktop\\NLP学习数据\\Day91_文本转向量结果.xlsx"

# Chunking parameters used by the pipeline entry point.
chunk_size = 500        # sliding-window width, in characters
overlap_ratio = 0.2     # fraction of the window shared by neighbouring chunks
min_para_length = 50    # paragraphs shorter than this many characters are dropped
def extract_pdf_text(pdf_path: str) -> str:
    """Read a PDF with pdfplumber and return the text of its pages as one string.

    Pages whose extracted text is empty or whitespace-only are skipped; the
    remaining page texts are joined with a single newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The joined text, or an empty string when the path does not exist or
        any exception is raised while reading.
    """
    if not os.path.exists(pdf_path):
        print(f"PDF文件不存在:{pdf_path}")
        return ""

    try:
        with pdfplumber.open(pdf_path) as document:
            extracted_pages = [
                page.extract_text(x_tolerance=3, y_tolerance=3, keep_blank_chars=False)
                for page in document.pages
            ]
        # Drop pages that produced nothing usable before joining.
        usable_pages = [text for text in extracted_pages if text and text.strip()]
        full_text = "\n".join(usable_pages)
        print(f" PDF提取完成,总字符数:{len(full_text)}")
        return full_text
    except Exception as e:
        # Best-effort by design: any failure is reported and mapped to "".
        print(f"PDF提取失败:{str(e)}")
        return ""
def _merge_short_lines(lines: List[str], min_length: int) -> List[str]:
    """Group consecutive lines so each emitted piece reaches ``min_length`` characters."""
    pieces = []
    pending = []
    pending_length = 0
    for line in lines:
        clean_line = line.strip()
        if not clean_line:
            continue
        # Count the "\n" that will join this line to the ones already pending.
        pending_length += len(clean_line) + (1 if pending else 0)
        pending.append(clean_line)
        if pending_length >= min_length:
            pieces.append("\n".join(pending))
            pending, pending_length = [], 0
    if pending:
        # Trailing short lines are attached to the previous piece instead of
        # being thrown away; if nothing was emitted they are kept on their own.
        leftover = "\n".join(pending)
        if pieces:
            pieces[-1] = f"{pieces[-1]}\n{leftover}"
        else:
            pieces.append(leftover)
    return pieces


def split_text_to_paragraphs(full_text: str, min_para_length: int = 50) -> List[str]:
    """Split text into paragraphs of at least ``min_para_length`` characters.

    The text is split on blank lines ("\\n\\n"). Stripped paragraphs shorter
    than ``min_para_length`` are dropped. A paragraph longer than 2000
    characters is split again on single newlines; a line that is long enough
    stays a piece of its own, while shorter lines are merged with the lines
    that follow them, so no content of an oversized paragraph is lost.

    Args:
        full_text: The text to split.
        min_para_length: Minimum length, in characters, of a kept paragraph.

    Returns:
        The list of paragraphs, in document order. Empty for empty input.
    """
    if not full_text:
        return []

    max_para_length = 2000  # paragraphs above this size are re-split on "\n"

    final_paragraphs = []
    for raw_paragraph in full_text.split("\n\n"):
        paragraph = raw_paragraph.strip()
        if len(paragraph) < min_para_length:
            continue
        if len(paragraph) > max_para_length:
            final_paragraphs.extend(
                _merge_short_lines(paragraph.split("\n"), min_para_length)
            )
        else:
            final_paragraphs.append(paragraph)

    print(f"文本分段完成,共 {len(final_paragraphs)} 个有效段落")
    return final_paragraphs
def sliding_window_chunking(paragraphs: str, chunk_size: int = 500, overlap_radio: float = 0.2) -> List[str]:
    """Cut one text into overlapping fixed-size character windows.

    Consecutive windows start ``chunk_size - int(chunk_size * overlap_radio)``
    characters apart. Iteration stops as soon as a window reaches the end of
    the text, so no trailing chunk is produced that is already fully contained
    in the previous one. Whitespace-only windows are skipped.

    Args:
        paragraphs: The text to chunk (a single string despite the plural name).
        chunk_size: Window width in characters; must be positive.
        overlap_radio: Fraction of the window shared with the next window;
            must satisfy ``0 <= overlap_radio < 1``.

    Returns:
        The chunks in order. Empty for empty input.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap_radio`` is out of range.
    """
    if not paragraphs:
        return []
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap_radio < 1:
        # A ratio of 1 or more gives a zero/negative step; a negative ratio
        # would leave gaps between windows.
        raise ValueError(f"overlap_radio must be in [0, 1), got {overlap_radio}")

    overlap_length = int(chunk_size * overlap_radio)
    step_length = chunk_size - overlap_length
    para_length = len(paragraphs)

    chunk_list = []
    for start_index in range(0, para_length, step_length):
        end_index = min(start_index + chunk_size, para_length)
        current_chunk = paragraphs[start_index:end_index]
        if current_chunk.strip():
            chunk_list.append(current_chunk)
        if end_index == para_length:
            # The text is fully covered; a further window would only repeat
            # the tail of this one.
            break
    return chunk_list
def batch_chunk_paragraphs(paragraph_list: List[str], chunk_size: int = 500, overlap_ratio: float = 0.2) -> List[Dict]:
    """Chunk every paragraph with ``sliding_window_chunking`` and tag each chunk.

    Args:
        paragraph_list: Paragraphs to chunk.
        chunk_size: Window width passed through to the chunker.
        overlap_ratio: Overlap fraction passed through to the chunker.

    Returns:
        One dict per chunk with the keys ``paragraph_id`` and ``chunk_id``
        (both 1-based; ``chunk_id`` restarts for every paragraph),
        ``chunk_content`` and ``chunk_length``. Empty for empty input.
    """
    if not paragraph_list:
        return []

    all_chunks = [
        {
            "paragraph_id": paragraph_number,
            "chunk_id": chunk_number,
            "chunk_content": piece,
            "chunk_length": len(piece),
        }
        for paragraph_number, paragraph in enumerate(paragraph_list, start=1)
        for chunk_number, piece in enumerate(
            sliding_window_chunking(paragraph, chunk_size, overlap_ratio), start=1
        )
    ]
    print(f" 文本切块完成,共 {len(all_chunks)} 个有效文本块")
    return all_chunks
def generate_tongyi_embedding(text: str) -> Optional[np.ndarray]:
    """Request an embedding for one text from the ``text_embedding_v2`` model.

    The text is sent as-is: this function does not truncate, split or
    normalise anything, so callers must chunk long text beforehand.

    Args:
        text: The text to embed.

    Returns:
        A numpy array built from the first embedding in the response, or
        ``None`` when the text is empty/whitespace-only, the response status
        is not 200, or any exception is raised during the call or while
        reading the response.
    """
    if not text or not text.strip():
        print(" 输入文本为空,跳过向量化")
        return None

    try:
        response = TextEmbedding.call(
            model=TextEmbedding.Models.text_embedding_v2,
            input=text
        )
        if response.status_code != 200:
            print(f" 向量化失败,错误码:{response.status_code},错误信息:{response.message}")
            return None
        # Only one text was sent, so only the first returned embedding is read.
        return np.array(response.output["embeddings"][0]["embedding"])
    except Exception as e:
        # Best-effort by design: the batch caller skips chunks that yield None.
        print(f" 向量化异常,错误信息:{str(e)}")
        return None
def batch_text_to_embedding(chunk_list: List[Dict]) -> List[Dict]:
    """Embed every chunk, one call per chunk, and return the enriched records.

    Args:
        chunk_list: Chunk dicts carrying ``chunk_id``, ``paragraph_id``,
            ``chunk_content`` and ``chunk_length``.

    Returns:
        One dict per chunk whose embedding call returned a vector, holding
        the four input fields plus ``embedding_vector`` (a plain list),
        ``embedding_dim`` and ``create_time``. Chunks whose embedding is
        ``None`` are left out. Empty for empty input.
    """
    if not chunk_list:
        print(" 输入文本块列表为空")
        return []

    total_count = len(chunk_list)
    batch_result = []
    print(f" 开始批量文本转向量,共 {total_count} 个文本块")

    for idx, chunk in enumerate(chunk_list):
        chunk_content = chunk["chunk_content"]
        embedding = generate_tongyi_embedding(chunk_content)

        if embedding is not None:
            record = {
                "chunk_id": chunk["chunk_id"],
                "paragraph_id": chunk["paragraph_id"],
                "chunk_content": chunk_content,
                "chunk_length": chunk["chunk_length"],
                "embedding_vector": embedding.tolist(),
                "embedding_dim": len(embedding),
                "create_time": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
            }
            batch_result.append(record)

        # Progress line after every fifth chunk processed, successful or not.
        if (idx + 1) % 5 == 0:
            print(f"✅ 已完成 {idx + 1}/{total_count} 个文本块的向量化")

    print(f"\n🎉 批量文本转向量完成!共成功生成 {len(batch_result)} 个文本块的向量")
    return batch_result
def save_embedding_result(result_list: List[Dict], json_path: str, excel_path: str) -> bool:
    """Write the embedding records to a JSON file and to an Excel file.

    The JSON file receives the records unchanged. For the Excel file the
    ``embedding_vector`` column is converted to its string form first.

    Args:
        result_list: Records to save.
        json_path: Destination of the JSON file.
        excel_path: Destination of the Excel file.

    Returns:
        ``True`` when both files were written; ``False`` when the list is
        empty or any exception is raised (the JSON file may already exist
        on disk in the latter case).
    """
    if not result_list:
        print(" 结果列表为空,无需保存")
        return False

    try:
        with open(json_path, "w", encoding="utf-8") as json_file:
            json.dump(result_list, json_file, ensure_ascii=False, indent=2)
        print(f" 向量化结果已保存为JSON文件:{json_path}")

        # NOTE(review): a stringified vector is very long for a single
        # spreadsheet cell — confirm it survives the Excel round trip.
        table = pd.DataFrame(result_list)
        table["embedding_vector"] = table["embedding_vector"].astype(str)
        table.to_excel(excel_path, index=False)
        print(f" 向量化结果已保存为Excel文件:{excel_path}")
    except Exception as e:
        print(f" 结果保存失败:{str(e)}")
        return False
    else:
        return True
def verify_embedding_similarity(result_list: List[Dict], test_text1: str, test_text2: str) -> float:
    """Embed two test sentences and report their cosine similarity.

    Args:
        result_list: Not used by this function; kept so existing callers
            keep working.
        test_text1: First sentence to embed.
        test_text2: Second sentence to embed.

    Returns:
        The cosine similarity rounded to four decimals, or ``0.0`` when
        either sentence could not be embedded.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    vector1 = generate_tongyi_embedding(test_text1)
    vector2 = generate_tongyi_embedding(test_text2)
    if vector1 is None or vector2 is None:
        # The embedding helper returns None on any failure; calling
        # ``.reshape`` on it would crash with AttributeError.
        print(" 相似度验证失败:测试文本向量化未成功")
        return 0.0

    # cosine_similarity expects 2-D inputs, hence the (1, -1) reshape.
    embedding1 = vector1.reshape(1, -1)
    embedding2 = vector2.reshape(1, -1)
    similarity = float(cosine_similarity(embedding1, embedding2)[0][0])

    print(f"\n向量相似度验证结果:")
    print(f"文本1:{test_text1}")
    print(f"文本2:{test_text2}")
    print(f"语义相似度:{round(similarity, 4)}")
    print(
        f"验证结论:{'✅ 向量符合预期,语义越像相似度越高' if similarity > 0.7 else '⚠️ 向量相似度偏低,建议检查文本内容'}")
    return round(similarity, 4)
def main_text_to_embedding_pipeline():
    """Run the pipeline: extract, split, chunk, embed, save, then verify.

    Every stage reads the module-level configuration. The run stops early,
    with a message, as soon as a stage returns an empty result. Nothing is
    returned; progress and the final summary are printed.
    """
    banner = "=" * 80
    print(banner)
    print(" Day91 文本转向量全流程开始执行")
    print(banner)

    full_text = extract_pdf_text(pdf_file_path)
    if not full_text:
        print(" 全流程终止:PDF提取失败")
        return

    paragraph_list = split_text_to_paragraphs(full_text, min_para_length)
    if not paragraph_list:
        print(" 全流程终止:文本分段失败")
        return

    chunk_list = batch_chunk_paragraphs(paragraph_list, chunk_size, overlap_ratio)
    if not chunk_list:
        print(" 全流程终止:文本切块失败")
        return

    embedding_result = batch_text_to_embedding(chunk_list)
    if not embedding_result:
        print(" 全流程终止:文本转向量失败")
        return

    # The outcome of saving is not checked; the summary below is printed
    # either way.
    save_embedding_result(embedding_result, output_json_path, output_excel_path)
    verify_embedding_similarity(
        embedding_result,
        test_text1="Day91的学习内容是文本转向量",
        test_text2="今天要学的是RAG项目的文本转向量全流程"
    )

    print("\n" + banner)
    print(" Day91 文本转向量全流程执行完成!")
    print(banner)
    summary_lines = (
        f" 最终成果:",
        f" - 成功处理PDF文件:{pdf_file_path}",
        f" - 生成有效文本块:{len(chunk_list)} 个",
        f" - 成功生成向量:{len(embedding_result)} 个",
        f" - 结果文件1:{output_json_path}(用于RAG向量库搭建)",
        f" - 结果文件2:{output_excel_path}(用于人工查看核对)",
    )
    for summary_line in summary_lines:
        print(summary_line)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main_text_to_embedding_pipeline()

二:常见报错与解决方案
| 常见报错 | 解决方案 |
|---|---|
| API-KEY 无效 / 认证失败 | 检查你复制的 API-KEY 是否正确,有没有多余的空格;确认通义千问服务已经开通 |
| 调用额度不足 | 阿里云通义千问提供大量免费额度,学习阶段完全够用;若额度不足,可在控制台申领免费额度或购买额度包 |
| 网络连接超时 | 通义向量 API 部署在国内阿里云服务器上,正常情况下很少超时;若出现超时,请检查网络是否正常、是否开启了代理 |
| 文本过长报错 | 通义向量 V2 模型单条文本最大支持 2048 个 token(8192 是 text-embedding-v3 的上限);本文代码会先按 chunk_size 分块再逐块向量化,若仍然报错,请调小 chunk_size 参数 |
| 包导入失败 | 重新执行依赖安装命令,用国内清华镜像,确保所有依赖都安装成功 |
三:4.3 核心知识点沉淀(一句话总结,面试 / 求职直接用)
- 文本转向量是 RAG 项目的核心环节,承上启下,连接文本分块和向量库搭建,是 RAG 系统能精准检索的底层基础
- 企业级 RAG 项目中,文本转向量必须是工程化、可复用、可批量处理的,不能是零散的单文本调用
- 向量化结果必须标准化,包含完整的元数据(文本内容、ID、长度、生成时间)和向量信息,可直接用于向量库搭建
- 中文 RAG 场景,通义向量模型是企业级项目的主流选型,中文语义理解能力强,长文本支持好,云端 API 调用稳定,无部署成本
- 向量化完成后,必须做相似度验证,确保生成的向量符合语义逻辑,避免生成无效向量影响后续检索效果