用 Python 实现 Word 文档比对的功能较简单,笔者这里将其界面化,可以指定待比对的文档、最小相似度比率、最小相同字符数等参数。输出的结果以 Word 文档的形式保存,重复部分会被标出,基本实现了商业软件的功能。
先看界面

这里不废话了,直接给出全部源码,觉得好的点个赞。程序打包的话,自己百度。
from tkinter import Tk, Button, Label, filedialog, Entry, Frame, TOP, LEFT, RIGHT, X, HORIZONTAL
from tkinter.ttk import Progressbar
from tkinter import messagebox
from docx import Document
from docx.shared import RGBColor, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.enum.text import WD_COLOR_INDEX
import re, datetime
def getText(wordname):
    """Return the text of every paragraph in a Word document.

    Args:
        wordname: Value passed unchanged to ``Document`` to open the file.

    Returns:
        A list with one string per entry of ``Document.paragraphs``, in
        the order that attribute yields them.
    """
    document = Document(wordname)
    return [paragraph.text for paragraph in document.paragraphs]
def is_Chinese(word):
    """Tell whether *word* contains at least one character in U+4E00..U+9FFF.

    Args:
        word: Any iterable of single-character strings (normally a ``str``).

    Returns:
        ``True`` as soon as one character falls inside the inclusive range
        ``'\\u4e00'``..``'\\u9fff'``; ``False`` otherwise, including for
        empty input.
    """
    return any('\u4e00' <= ch <= '\u9fff' for ch in word)
def msplit(s, seperators=r'[,.?!\uff0c\u3002\uff1f\uff01]'):
    """Split *s* into segments at sentence punctuation.

    Args:
        s: Text to split.
        seperators: Regular expression; every match is a cut point. The
            default matches the ASCII comma, period, question mark and
            exclamation mark plus the full-width comma, ideographic full
            stop, full-width question mark and full-width exclamation mark.

    Returns:
        The list produced by ``re.split``. Empty strings are kept where
        separators are adjacent or sit at either end of *s*.
    """
    # The previous default, ',|\.|\?|,|。|?|!', contained a bare '?'
    # alternative, which the re module rejects with "nothing to repeat",
    # so every call with the default raised. A character class needs no
    # escaping and cannot hit that error.
    return re.split(seperators, s)
def readDocx(docfile):
    """Load a document and break each paragraph into cleaned segments.

    Each paragraph returned by ``getText`` is cut with ``msplit``. Pieces of
    length 2 or less are dropped (the length is measured before spaces are
    removed); the remaining pieces have their ``' '`` characters stripped.
    Paragraphs that end up with no pieces are omitted.

    Progress and elapsed time are printed to stdout.

    Args:
        docfile: Value forwarded to ``getText``.

    Returns:
        A list of paragraphs, each a non-empty list of segment strings.
    """
    print('*' * 80)
    print('文件', docfile, '加载中……')
    started = datetime.datetime.now()

    segs = []
    for paragraph in getText(docfile):
        kept = [piece.replace(' ', "") for piece in msplit(paragraph) if len(piece) > 2]
        if kept:
            segs.append(kept)

    finished = datetime.datetime.now()
    print('加载完成,用时: ', finished - started)
    return segs
def compareParagraph(doc1, i, doc2, j, filter_doc, min_segment=5, min_same_chars=10, min_similarity_ratio=0.5):
    """Compare paragraph ``doc1[i]`` with paragraph ``doc2[j]``.

    A paragraph is a list of segment strings. A pair of segments counts as
    shared when one contains the other; the contained (shorter) segment is
    recorded. The same segment may be recorded more than once if it matches
    several counterparts, so the ratio can exceed 1.0.

    Args:
        doc1: First document, a sequence of paragraphs.
        i: Index of the paragraph in *doc1*.
        doc2: Second document, a sequence of paragraphs.
        j: Index of the paragraph in *doc2*.
        filter_doc: Iterable of containers. A candidate segment is ignored
            when ``segment in p`` holds for any element ``p``; for a list
            of segment lists that is exact membership, for a list of
            strings it is a substring test. ``None`` is treated as empty.
        min_segment: Segments shorter than this are never compared.
        min_same_chars: Both paragraphs must be at least this long in
            total, and the shared character count must exceed it.
        min_similarity_ratio: The shared count divided by the shorter
            paragraph's total length must exceed this.

    Returns:
        ``{}`` when the paragraphs are not similar enough, otherwise a dict
        with keys ``'same_characters'`` (list of shared segments),
        ``'same_chars_count'`` (int) and ``'similarity_ratio'`` (float).
    """
    p1 = doc1[i]
    p2 = doc2[j]
    len1 = sum(len(s) for s in p1)
    len2 = sum(len(s) for s in p2)
    if len1 < min_same_chars or len2 < min_same_chars:
        return {}

    shortest = min(len1, len2)
    if shortest == 0:
        # Only reachable when min_same_chars <= 0; an empty paragraph shares
        # nothing, and dividing by its length would raise ZeroDivisionError.
        return {}

    filter_doc = filter_doc or ()

    def is_filtered(segment):
        return any(segment in p for p in filter_doc)

    # The length filter on p2 does not depend on s1, so apply it once.
    candidates2 = [s2 for s2 in p2 if len(s2) >= min_segment]

    same_characters = []
    for s1 in p1:
        if len(s1) < min_segment:
            continue
        for s2 in candidates2:
            if s2 in s1 and not is_filtered(s2):
                same_characters.append(s2)
            elif s1 in s2 and not is_filtered(s1):
                same_characters.append(s1)

    count = sum(len(s) for s in same_characters)
    ratio = float(count) / shortest
    if count > min_same_chars and ratio > min_similarity_ratio:
        return {
            'same_characters': same_characters,
            'same_chars_count': count,
            'similarity_ratio': ratio,
        }
    return {}
def set_font(cell):
    """Apply the report font to every run in *cell*.

    Each run gets "Times New Roman" as its font name, a 12 pt size, and
    '宋体' as the east-Asian font (written to the ``w:eastAsia`` attribute
    of the run's ``rFonts`` element).

    Formatting only ``paragraphs[0].runs[0]`` left cells built from several
    runs mostly unformatted and raised IndexError on a cell without runs;
    iterating covers both cases.

    Args:
        cell: A table cell exposing ``paragraphs``, each exposing ``runs``.
    """
    for paragraph in cell.paragraphs:
        for run in paragraph.runs:
            run.font.name = "Times New Roman"  # Latin-script font
            run.font.size = Pt(12)
            # Relies on the font.name assignment above having created the
            # rPr/rFonts elements - NOTE(review): confirm against python-docx.
            run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')  # CJK font
def _add_highlighted_runs(paragraph, text, phrases):
    """Append *text* to *paragraph*, shading each occurrence of any phrase."""
    # Longest phrase first so a phrase that contains another wins the match.
    unique = sorted({p for p in phrases if p}, key=len, reverse=True)
    if not unique or not text:
        # Always leave at least one run behind so the cell can be formatted.
        paragraph.add_run(text)
        return
    pattern = re.compile("(" + "|".join(re.escape(p) for p in unique) + ")")
    # With a single capture group, re.split alternates plain text (even
    # indexes) and matched phrases (odd indexes).
    for index, chunk in enumerate(pattern.split(text)):
        if not chunk:
            continue
        run = paragraph.add_run(chunk)
        if index % 2:
            run.font.highlight_color = WD_COLOR_INDEX.GRAY_25


def create_word_table(output_file_path, comparison_results):
    """Write the comparison results to a Word document as one table.

    The table has seven columns: index, first-document paragraph, page,
    second-document paragraph, page, shared character count, and similarity
    percentage. The two page columns are created but left empty because the
    result records carry no page information.

    Changes from the earlier implementation:
      * the table was created with 6 columns while 7 were addressed, which
        raised IndexError on the last header;
      * the second document's paragraph was written under the first '页数'
        header instead of '第二个文档段落';
      * the hand-rolled split/re-add loop duplicated a phrase when the text
        ended with it and only re-split the last run for later phrases.

    Args:
        output_file_path: Destination passed to ``Document.save``.
        comparison_results: Iterable of dicts with keys 'doc1_paragraph'
            and 'doc2_paragraph' (lists of segment strings),
            'same_characters' (list of strings to highlight),
            'same_chars_count' and 'similarity_ratio'.
    """
    headers = (
        ('序号', Pt(15)),
        ('第一个文档段落', Pt(180)),
        ('页数', Pt(5)),
        ('第二个文档段落', Pt(180)),
        ('页数', Pt(5)),
        ('重复的字数', Pt(15)),
        ('文字重复率', Pt(15)),
    )

    doc = Document()
    # Column count follows the header list so the two cannot drift apart.
    table = doc.add_table(rows=1, cols=len(headers))
    table.style = "Table Grid"
    for cell, (title, width) in zip(table.rows[0].cells, headers):
        cell.width = width
        cell.text = title
        set_font(cell)

    for idx, result in enumerate(comparison_results, start=1):
        row_cells = table.add_row().cells
        row_cells[0].text = str(idx)
        set_font(row_cells[0])

        for column, key in ((1, 'doc1_paragraph'), (3, 'doc2_paragraph')):
            cell = row_cells[column]
            _add_highlighted_runs(
                cell.paragraphs[0],
                ",".join(result[key]),
                result['same_characters'],
            )
            set_font(cell)

        row_cells[5].text = str(result['same_chars_count'])
        set_font(row_cells[5])
        row_cells[6].text = "{:.2f}%".format(result['similarity_ratio'] * 100)
        set_font(row_cells[6])

    doc.save(output_file_path)
def selectFile(label):
    """Ask the user for a file and put the chosen path into *label*.

    If the dialog is dismissed without a choice, the widget keeps its
    current content instead of being cleared, matching ``save_file``.

    Args:
        label: Widget supporting ``delete(0, "end")`` and ``insert(0, text)``
            (the callers in this file pass an ``Entry``).

    Returns:
        Whatever ``filedialog.askopenfilename`` returned; a falsy value
        means nothing was selected.
    """
    filename = filedialog.askopenfilename()
    if filename:
        label.delete(0, "end")
        label.insert(0, filename)
    return filename
def save_file(label):
    """Ask the user where to save the report and show the path in *label*.

    The dialog offers "*.docx" and "*.*" filters and uses '.docx' as the
    default extension. When a path is chosen it replaces the widget content
    and is echoed to stdout; otherwise nothing happens.

    Args:
        label: Widget supporting ``delete(0, "end")`` and ``insert(0, text)``.
    """
    chosen = filedialog.asksaveasfilename(
        filetypes=[("Word files", "*.docx"), ("All files", "*.*")],
        defaultextension='.docx',
    )
    if not chosen:
        return
    label.delete(0, "end")
    label.insert(0, chosen)
    print(f"File path chosen: {chosen}")
def startComparison(doc1_entry, doc2_entry, filter_entry, output_entry, min_chars_entry, min_ratio_entry, progress_bar):
    """Run the comparison configured in the GUI and write the report.

    Reads both documents (and the optional filter document), compares every
    paragraph of the first with every paragraph of the second, saves the
    matches with ``create_word_table`` and reports completion in a dialog.

    Args:
        doc1_entry: Widget whose ``get()`` yields the first document path.
        doc2_entry: Widget whose ``get()`` yields the second document path.
        filter_entry: Widget yielding the filter document path; empty means
            no filtering.
        output_entry: Widget yielding the report path; when empty a name is
            derived from the two input file names.
        min_chars_entry: Widget yielding the minimum shared character count
            (parsed with ``int``).
        min_ratio_entry: Widget yielding the minimum similarity ratio
            (parsed with ``float``).
        progress_bar: Widget supporting ``["value"] = n`` and ``update()``.
    """
    d1 = doc1_entry.get()
    d2 = doc2_entry.get()
    filter_doc = filter_entry.get()

    # Validate before any file is loaded so a typo does not waste the load.
    if not d1 or not d2:
        messagebox.showerror("错误", "请先选择要比对的两个文档")
        return
    try:
        min_same_chars = int(min_chars_entry.get())
        min_similarity_ratio = float(min_ratio_entry.get())
    except ValueError:
        messagebox.showerror("错误", "最小相同字符数必须是整数,最小相似度比率必须是数字")
        return

    doc1 = readDocx(d1)
    doc2 = readDocx(d2)
    doc3 = readDocx(filter_doc) if filter_doc else []

    output_file_path = output_entry.get()
    if output_file_path == "":
        output_file_path = f"compare_{d1.split('/')[-1]}_{d2.split('/')[-1]}.docx"

    comparison_results = []
    total_steps = len(doc1)
    for i, p1 in enumerate(doc1):
        progress_bar["value"] = int(((i + 1) / total_steps) * 100)
        progress_bar.update()
        for j, p2 in enumerate(doc2):
            # Keyword arguments are required here: passed positionally, the
            # thresholds landed one slot early (min_same_chars was taken as
            # min_segment and the ratio as min_same_chars), so the ratio
            # entered by the user was never applied.
            comparison_result = compareParagraph(
                doc1, i, doc2, j, doc3,
                min_same_chars=min_same_chars,
                min_similarity_ratio=min_similarity_ratio,
            )
            if comparison_result:
                comparison_results.append({
                    'doc1_paragraph': p1,
                    'doc2_paragraph': p2,
                    'same_characters': comparison_result['same_characters'],
                    'same_chars_count': comparison_result['same_chars_count'],
                    'similarity_ratio': comparison_result['similarity_ratio'],
                })

    create_word_table(output_file_path, comparison_results)
    progress_bar["value"] = 100
    progress_bar.update()
    messagebox.showinfo("提示", "比对完成")
    print("比对完成")
if __name__ == "__main__":
    # Build the main window: four path rows, one thresholds row, a progress
    # bar and the start button, packed top to bottom in that order.
    root = Tk()
    root.title("文档比对工具")
    root.geometry("600x196")

    def _path_row(caption, left_pad, chooser):
        """Pack one label / entry / button row and return its Entry."""
        row = Frame(root)
        row.pack(fill=X)
        Label(row, text=caption).pack(side=LEFT, padx=(left_pad, 0))
        entry = Entry(row, width=61)
        entry.pack(side=LEFT)
        Button(row, text="选择", command=lambda: chooser(entry)).pack(side=RIGHT, padx=(0, 5))
        return entry

    doc1_entry = _path_row("请选择第一个文档:", 5, selectFile)
    doc2_entry = _path_row("请选择第二个文档:", 5, selectFile)
    filter_entry = _path_row("请选择过滤的文档:", 5, selectFile)
    output_entry = _path_row("输出文件路径:", 29, save_file)

    # Both thresholds share a single row.
    min_chars_frame = Frame(root)
    min_chars_frame.pack(fill=X)
    Label(min_chars_frame, text="最小相同字符数:").pack(side=LEFT, padx=(17, 0))
    min_chars_entry = Entry(min_chars_frame, width=23)
    min_chars_entry.insert(0, "10")
    min_chars_entry.pack(side=LEFT)
    # Second pack() call on the same frame, retained from the original
    # layout code - NOTE(review): appears redundant; confirm before removing.
    min_chars_frame.pack(fill=X)
    Label(min_chars_frame, text="最小相似度比率:").pack(side=LEFT)
    min_ratio_entry = Entry(min_chars_frame, width=23)
    min_ratio_entry.insert(0, "0.5")
    min_ratio_entry.pack(side=LEFT)

    progress_bar = Progressbar(root, orient=HORIZONTAL, length=480, mode='determinate')
    progress_bar.pack(fill=X)

    compare_button = Button(
        root,
        text="开始比对",
        command=lambda: startComparison(
            doc1_entry, doc2_entry, filter_entry, output_entry,
            min_chars_entry, min_ratio_entry, progress_bar,
        ),
    )
    compare_button.pack(fill=X)

    root.mainloop()

被折叠的 条评论
为什么被折叠?



