把非 UTF-8 编码的 C 语言源文件转成 UTF-8 编码的小工具

原创已于 2025-09-01 10:14:20 修改 · 236 阅读

0 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

收录于

python

于 2025-08-18 16:26:20 首次发布

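有一次碰到一个 C 项目，源码不是 UTF-8 编码：有的文件是 GBK 编码，有的是 GB2312 编码，导致很多中文注释显示为乱码。于是用 Python 写了下面这个小脚本（依赖第三方库 chardet，需先执行 pip install chardet），在源码目录下运行此脚本，即可批量地把目录及其子目录中的 .c 和 .h 文件转成 UTF-8 编码。注意：脚本会直接覆盖原文件，运行前请先备份或提交到版本控制。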

import os
import re
import chardet

def show_files(path, all_files):
    """Recursively collect every non-directory entry found under *path*.

    Args:
        path: Directory to scan.
        all_files: List that receives the results; it is extended in place.

    Returns:
        The same ``all_files`` list, now also holding the joined path of
        each non-directory entry below ``path`` (sub-directories are
        descended into, not listed).
    """
    for entry_name in os.listdir(path):
        entry_path = os.path.join(path, entry_name)
        if not os.path.isdir(entry_path):
            all_files.append(entry_path)
            continue
        # Directories are descended into; their contents land in the same list.
        show_files(entry_path, all_files)

    return all_files

def _convert_to_utf8(path):
    """Re-encode the file at *path* to UTF-8 in place, guided by chardet.

    The raw bytes are read, chardet guesses their encoding, and then:

    * ``utf-8`` / ``ascii``: the file is left untouched.
    * ``GB2312``: decoded as gb2312, falling back to gbk if that fails,
      then rewritten as UTF-8.
    * ``ISO-8859-1``: decoded as iso-8859-1 and rewritten as UTF-8.
    * anything else: the path and the guessed encoding are printed and the
      file is left untouched.

    A file whose bytes cannot be decoded with any candidate codec is
    reported on stdout and left untouched.

    Args:
        path: Path of the file to convert; it is overwritten on success.
    """
    # Read everything and release the handle before the file is reopened
    # for writing.
    with open(path, 'rb') as src:
        data = src.read()

    detected = chardet.detect(data)['encoding']
    # Compare case-insensitively: the labels matched below already mix
    # cases ('GB2312' vs 'utf-8').  The emptiness check guards against a
    # missing guess (chardet is documented to report None when it cannot
    # decide -- confirm against the installed version).
    label = detected.lower() if detected else ''

    if label in ('utf-8', 'ascii'):
        return
    if label == 'gb2312':
        # gbk is tried second because files reported as GB2312 may contain
        # characters that only the gbk codec can decode.
        candidates = ('gb2312', 'gbk')
    elif label == 'iso-8859-1':
        candidates = ('iso-8859-1',)
    else:
        print(path)
        print(detected)
        return

    for codec in candidates:
        try:
            text = data.decode(codec)
        except UnicodeDecodeError:
            continue
        with open(path, 'wb') as dst:
            dst.write(text.encode('utf8'))
        return

    print("decode content err, %s, %s" % (path, detected))


def main():
    """Convert every ``.c`` / ``.h`` file under the current directory to UTF-8."""
    for path in show_files("./", []):
        # Only C sources and headers are converted; everything else is skipped.
        if path.endswith(('.c', '.h')):
            _convert_to_utf8(path)


# Run only when executed as a script, so importing this module does not
# rewrite files in the current directory.
if __name__ == "__main__":
    main()

标签

#windows #服务器 #linux