有一次碰到一个 C 项目,源码不是 UTF-8 编码:有的文件使用 GBK 编码,有的使用 GB2312 编码,导致很多中文注释显示为乱码。于是用 Python 写了下面这个小脚本,在源码目录下运行它,即可批量地把源码转换成 UTF-8 编码。
import os
import re
import chardet
def show_files(path, all_files):
    """Recursively collect every non-directory entry found under *path*.

    Args:
        path: Directory to scan.
        all_files: List used as an accumulator; discovered paths are
            appended to it in place.

    Returns:
        The same ``all_files`` list that was passed in, so the call can be
        used either for its side effect or for its return value.

    Raises:
        OSError: Propagated from ``os.listdir`` when a directory cannot be
            listed (missing, not a directory, permission denied).
    """
    for entry in os.listdir(path):
        cur_path = os.path.join(path, entry)
        # os.path.isdir follows symbolic links, so a linked directory is
        # descended into just like a real one.
        if os.path.isdir(cur_path):
            show_files(cur_path, all_files)
        else:
            all_files.append(cur_path)
    return all_files
def _rewrite_as_utf8(path, raw, codecs):
    """Overwrite *path* with *raw* re-encoded as UTF-8.

    Each codec in *codecs* is tried in order; the first one that decodes
    *raw* without error is used and the file is rewritten in place.

    Returns:
        True if the file was rewritten, False if no codec could decode *raw*
        (in which case the file is left untouched).
    """
    for codec in codecs:
        try:
            text = raw.decode(codec)
        except UnicodeDecodeError:
            continue
        with open(path, 'wb') as out:
            out.write(text.encode('utf8'))
        return True
    return False


# Batch-convert every C source/header below the current directory to UTF-8,
# based on the encoding guessed by chardet.
contents = show_files("./", [])
for content in contents:
    # Only paths ending in ".c" or ".h" are processed.
    if re.match(r'.+\.[ch]$', content) is None:
        continue
    with open(content, 'rb') as f:
        data = f.read()
    encode = chardet.detect(data)
    if encode['encoding'] == 'GB2312':
        # Try strict GB2312 first, then fall back to GBK for files that
        # contain characters outside the GB2312 repertoire.
        if not _rewrite_as_utf8(content, data, ('gb2312', 'gbk')):
            print("decode content err, %s, %s" % (content, encode['encoding']))
    elif encode['encoding'] == 'utf-8' or encode['encoding'] == 'ascii':
        # Already valid UTF-8 (ASCII is a subset); nothing to do.
        pass
    elif encode['encoding'] == 'ISO-8859-1':
        # NOTE(review): ISO-8859-1 maps every byte value, so this decode
        # cannot fail; if chardet misdetected a GBK file, the result is
        # mojibake written back to disk. Confirm this branch is wanted.
        if not _rewrite_as_utf8(content, data, ('iso-8859-1',)):
            print("decode content err, %s, %s" % (content, encode['encoding']))
    else:
        # Unhandled (or undetected) encoding: report the file and the guess
        # so it can be converted by hand.
        print(content)
        print(encode['encoding'])
1023

被折叠的 条评论
为什么被折叠?



