# ----------------------------------------------------------------------
# NOTE: the physical lines replaced here were a flattened git patch that
# added two independent Python scripts (concat2.py, conver_format.py) and
# began a list of binary .xlsx additions.  The code of each script is
# restored below under its own banner.
# ----------------------------------------------------------------------

# ======================= concat2.py (new file) ========================
# Merge the data rows of every sheet of every workbook found in ``path``
# into a single new .xlsx file.
import xlrd
import xlsxwriter
import os

# Directory that holds the workbooks to merge.
path = "./data(3)/SplitExcel/"


def get_allxls():
    """Return the path of every entry found in the ``path`` directory.

    Entries are not filtered: whatever ``os.listdir`` reports is returned,
    each name joined onto ``path``.
    """
    return [os.path.join(path, f) for f in os.listdir(path)]


def open_xls(file):
    """Open the workbook at *file* with xlrd and return the workbook object."""
    # NOTE(review): xlrd 2.x only reads legacy .xls files -- confirm the
    # installed xlrd version if the inputs are real .xlsx workbooks.
    return xlrd.open_workbook(file)


def getsheet(fh):
    """Return all sheets of the opened workbook *fh*."""
    return fh.sheets()


def getnrows(fh, sheet):
    """Return the number of rows of the sheet at index *sheet* in *fh*."""
    return fh.sheets()[sheet].nrows


def getFilect(file, shnum, datavalue=None):
    """Read the rows of one sheet and append them to an accumulator.

    Args:
        file: path of the workbook to open.
        shnum: zero-based index of the sheet to read.
        datavalue: list the rows are appended to.  A fresh list is used
            when omitted, so the function no longer depends on a global
            that only existed when the module ran as a script.

    Returns:
        The accumulator list, with one list of cell values per row read.
        Row 0 of the sheet is skipped (presumably the header row).
    """
    rows = [] if datavalue is None else datavalue
    table = open_xls(file).sheets()[shnum]
    rows.extend(table.row_values(row) for row in range(1, table.nrows))
    return rows


def getshnum(fh):
    """Return the number of sheets in the opened workbook *fh*."""
    return len(getsheet(fh))


if __name__ == '__main__':
    allxls = get_allxls()  # workbooks to merge
    datavalue = []  # every row read so far, across all files and sheets
    for fl in allxls:
        fh = open_xls(fl)
        for shnum in range(getshnum(fh)):
            print("正在读取文件:" + str(fl) + "的第" + str(shnum) + "个sheet表的内容...")
            getFilect(fl, shnum, datavalue)

    endfile = "./data(3)/Supplemental_Dataset_1(4).xlsx"  # merged output file
    wb1 = xlsxwriter.Workbook(endfile)
    ws = wb1.add_worksheet()
    # Iterate the accumulator itself: the previous code read a variable that
    # was unbound when the input directory held no sheets.
    for a, row in enumerate(datavalue):
        for b, c in enumerate(row):
            ws.write(a, b, c)
    wb1.close()
    print("excel合并完成")


# ==================== conver_format.py (new file) =====================
# -*- coding:utf-8 -*-
# Batch-convert the ``sample_name`` column of every workbook in a folder
# to the string type, rewriting each workbook in place.
import os
import pandas as pd
import numpy as np


def convert(src_data):
    """Load the workbook at *src_data* with ``sample_name`` cast to ``str``.

    Args:
        src_data: path of the Excel file to read.

    Returns:
        The loaded DataFrame; blank cells are kept as-is because the file is
        read with ``keep_default_na=False``.

    Raises:
        KeyError: if the workbook has no ``sample_name`` column.
    """
    df = pd.read_excel(src_data, keep_default_na=False)
    df['sample_name'] = df['sample_name'].astype(str)
    return df


if __name__ == "__main__":
    path1 = './data(4)/txt_data(2)/'  # folder holding all source workbooks
    fileList = os.listdir(path1)  # every entry in that folder
    total = len(fileList)
    for i, file_name in enumerate(fileList, start=1):
        src_data = os.path.join(path1, file_name)
        try:
            df = convert(src_data)
            # The former debug print of row 5 raised KeyError on workbooks
            # with fewer than six rows, which made them skip conversion.
            # The context manager replaces writer.save(), which pandas 2.0
            # removed, and closes the file even when to_excel fails.
            with pd.ExcelWriter(src_data, engine='xlsxwriter') as writer:
                df.to_excel(writer, index=False)
        except Exception as result:
            # Best-effort batch: report the reason and move on.
            print("{0}文件有问题: {1}".format(file_name, result))
            continue
        print("第{0}个文件已经转换完毕,一共有{1}个文件".format(i, total))


# ----------------------------------------------------------------------
# Binary files added by the same patch (header data only, no text):
#   data(2)/dst_P_1634.xlsx                             index 0000000..717bfa7
#   data(2)/test.xlsx                                   index 0000000..83f15f8
#   data(2)/test2.xlsx                                  index 0000000..c9ca1b5
#   data(2)/test3.xlsx                                  index 0000000..0db7036
#   data(2)/txt_data(2)/P_10955_45204796_raw_meta.xlsx  index 0000000..ca29b67
#   data(2)/txt_data(2)/P_1841_78606354_raw_meta.xlsx   index 0000000..4b29107
#   data(2)/txt_data(2)/P_391_92367644_raw_meta.xlsx    index 0000000..303515e
# ----------------------------------------------------------------------
Binary files /dev/null and b/data(2)/txt_data(2)/P_391_92367644_raw_meta.xlsx differ diff --git a/data(2)/txt_data(2)/P_393_29175597_raw_meta.xlsx b/data(2)/txt_data(2)/P_393_29175597_raw_meta.xlsx new file mode 100644 index 0000000..4b052a5 Binary files /dev/null and b/data(2)/txt_data(2)/P_393_29175597_raw_meta.xlsx differ diff --git a/data(2)/txt_data(2)/P_524_83782505_raw_meta.xlsx b/data(2)/txt_data(2)/P_524_83782505_raw_meta.xlsx new file mode 100644 index 0000000..1d1a2c7 Binary files /dev/null and b/data(2)/txt_data(2)/P_524_83782505_raw_meta.xlsx differ diff --git a/data(2)/txt_data(2)/P_810_41824312_raw_meta.xlsx b/data(2)/txt_data(2)/P_810_41824312_raw_meta.xlsx new file mode 100644 index 0000000..76ce497 Binary files /dev/null and b/data(2)/txt_data(2)/P_810_41824312_raw_meta.xlsx differ diff --git a/data(2)/txt_data(2)/P_945_31683047_raw_meta.xlsx b/data(2)/txt_data(2)/P_945_31683047_raw_meta.xlsx new file mode 100644 index 0000000..5c05efa Binary files /dev/null and b/data(2)/txt_data(2)/P_945_31683047_raw_meta.xlsx differ diff --git a/data(2)/txt_data(3)/P_10955_45204796_raw_meta.xlsx b/data(2)/txt_data(3)/P_10955_45204796_raw_meta.xlsx new file mode 100644 index 0000000..ca29b67 Binary files /dev/null and b/data(2)/txt_data(3)/P_10955_45204796_raw_meta.xlsx differ diff --git a/data(2)/txt_data(3)/P_524_83782505_raw_meta.xlsx b/data(2)/txt_data(3)/P_524_83782505_raw_meta.xlsx new file mode 100644 index 0000000..bddceb0 Binary files /dev/null and b/data(2)/txt_data(3)/P_524_83782505_raw_meta.xlsx differ diff --git a/data(2)/txt_data(3)/P_810_41824312_raw_meta.xlsx b/data(2)/txt_data(3)/P_810_41824312_raw_meta.xlsx new file mode 100644 index 0000000..76ce497 Binary files /dev/null and b/data(2)/txt_data(3)/P_810_41824312_raw_meta.xlsx differ diff --git a/data(2)/txt_data(4)/P_1629_62774027_raw_meta.xlsx b/data(2)/txt_data(4)/P_1629_62774027_raw_meta.xlsx new file mode 100644 index 0000000..12efa6e Binary files /dev/null and 
# ----------------------------------------------------------------------
# NOTE: the physical line replaced here was part of a flattened git patch.
# Binary files added by the patch on this line (header data only):
#   data(2)/txt_data(4)/P_1629_62774027_raw_meta.xlsx
#   data(2)/txt_data(4)/P_1632_16610003_raw_meta.xlsx   index 0000000..09baeb9
#   data(2)/txt_data(4)/P_1634_28155060_raw_meta.xlsx   index 0000000..d0e0268
#   data(2)/txt_data(4)/P_1642_9689365_raw_meta.xlsx    index 0000000..072868a
#   data(2)/txt_data(4)/P_1653_87032667_raw_meta.xlsx   index 0000000..8c21614
# ----------------------------------------------------------------------

# ======================== file_copy.py (new file) ======================
# Import the required libraries
import os
import pandas as pd
import shutil
import stat


def copyFiles(data, sourceDir, targetDir):
    """Copy ``<Proj>_.xlsx`` from *sourceDir* to *targetDir* for each project.

    Args:
        data: table whose ``'Proj'`` column lists the project names; it must
            support ``data['Proj'].drop_duplicates()`` (the script passes a
            pandas DataFrame).  Duplicate names are processed once.
        sourceDir: directory that holds the per-project workbooks.
        targetDir: destination directory; created when the first file is
            copied.

    A regular file is copied when the target is missing or its size differs
    from the source (treated as an incomplete earlier copy); otherwise it is
    skipped.  A source *directory* of that name is copied as a whole tree
    when the target does not exist yet.  Projects without a matching source
    entry are ignored.
    """
    for item in data['Proj'].drop_duplicates():
        # format() instead of "+" so non-string project ids do not raise.
        file_name = '{0}_.xlsx'.format(item)
        sourceFile = os.path.join(sourceDir, file_name)
        targetFile = os.path.join(targetDir, file_name)

        if os.path.isfile(sourceFile):
            os.makedirs(targetDir, exist_ok=True)
            if (not os.path.exists(targetFile)
                    or os.path.getsize(targetFile) != os.path.getsize(sourceFile)):
                # shutil closes both handles and streams the content; the old
                # open(...).write(open(...).read()) leaked them.
                shutil.copyfile(sourceFile, targetFile)
                print(targetFile + " copy succeeded")
        elif os.path.isdir(sourceFile) and not os.path.exists(targetFile):
            # The previous code recursed with two arguments into this
            # three-argument function, which raised TypeError.
            shutil.copytree(sourceFile, targetFile)

# ----------------------------------------------------------------------
# NOTE: the physical lines replaced here were a flattened git patch.  They
# held the entry point of file_copy.py and six further independent
# scripts; the code of each is restored below under its own banner.
# ----------------------------------------------------------------------

# ============ file_copy.py (continued): script entry point =============
# Uses ``pd`` and ``copyFiles`` from the first part of file_copy.py.
if __name__ == "__main__":
    sourceDir = './data(3)/SplitExcel(2)/'
    targetDir = './data(3)/target(2)/'
    data_source = './data(3)/positive_20210518.xlsx'
    data = pd.read_excel(data_source, keep_default_na=False)
    copyFiles(data, sourceDir, targetDir)


# ======================== fill_gis3.py (new file) ======================
# -*- coding: utf-8 -*-
# Copy Latitude/Longitude from the per-project workbooks into the rows of
# the destination workbook that carry the same ID.
import pandas as pd
import os


if __name__ == '__main__':
    sourceDir = './data(3)/target(3)/'
    dst_file = './data(3)/test.xlsx'
    dst = pd.read_excel(dst_file, keep_default_na=False)
    data_list = list(dst['Proj'].drop_duplicates())  # unique project names
    total = len(data_list)

    # Index the destination rows by ID once, so each source row finds its
    # matches directly instead of rescanning the whole destination table.
    rows_by_id = {}
    for d in dst.index:
        rows_by_id.setdefault(dst.at[d, 'ID'], []).append(d)

    count = 0
    for item in data_list:
        file_name = '{0}_.xlsx'.format(item)
        source_data = os.path.join(sourceDir, file_name)
        try:
            # Reading happens inside the try so that one missing or broken
            # source workbook is reported instead of aborting the whole run.
            src = pd.read_excel(source_data, keep_default_na=False)
            for s in src.index:  # every row of this source workbook
                for d in rows_by_id.get(src.at[s, 'ID'], ()):
                    dst.at[d, 'Latitude'] = src.at[s, 'Latitude']
                    dst.at[d, 'Longitude'] = src.at[s, 'Longitude']
        except Exception as result:
            print("{0}文件有问题: {1}".format(file_name, result))
            continue
        count += 1
        print('The outermost loop has been traversed for {0} times,a total of {1} times'.format(count, total))

    # Write the updated table back with the XlsxWriter engine.  The context
    # manager replaces writer.save(), which pandas 2.0 removed.
    with pd.ExcelWriter(dst_file, engine='xlsxwriter') as writer:
        dst.to_excel(writer, sheet_name='positive', index=False)
    print("finished!")


# ========================= fill_na.py (new file) =======================
# Reset the Longitude and Latitude columns of the workbook to the text
# 'NA' and rewrite the workbook in place.
import os
import pandas as pd

src_data = './data(3)/Supplemental_Dataset_1(2).xlsx'
src = pd.read_excel(src_data, keep_default_na=False)

src['Longitude'] = 'NA'
src['Latitude'] = 'NA'

# Write the frame back through XlsxWriter; leaving the with-block saves and
# closes the file (writer.save() was removed in pandas 2.0).
with pd.ExcelWriter(src_data, engine='xlsxwriter') as writer:
    src.to_excel(writer, sheet_name='sample_all', index=False)


# ====================== format_conver.py (new file) ====================
# encoding: utf-8
from ctypes import *
import time
import win32com.client as win32
import os

# Batch format conversion.  The files in the folder were .txt files whose
# extension was changed to .xlsx; they look like xlsx files but are not.
# Re-saving each of them as xlsx through Excel prepares them for later
# processing with pandas.


def transform(parent_path, out_path):
    """Re-save every ``.xlsx``-named file of *parent_path* into *out_path*.

    Each file is opened through the Excel COM application and saved under
    the same base name with ``FileFormat=51`` (xlsx, per the original
    comment).

    Args:
        parent_path: directory holding the files to convert.
        out_path: directory receiving the re-saved workbooks.
    """
    targets = [name for name in os.listdir(parent_path)
               if os.path.splitext(name)[1] == '.xlsx']
    if not targets:
        return  # nothing to convert: do not start Excel at all

    # One Excel instance serves the whole batch and is always shut down,
    # even when a file fails, so no orphaned Excel process is left behind.
    excel = win32.gencache.EnsureDispatch('excel.application')
    try:
        for name in targets:
            transfile1 = os.path.join(parent_path, name)  # workbook to convert
            transfile2 = os.path.join(out_path, os.path.splitext(name)[0])
            pro = excel.Workbooks.Open(transfile1)
            try:
                pro.SaveAs(transfile2 + ".xlsx", FileFormat=51)
            finally:
                pro.Close()
    finally:
        excel.Application.Quit()


if __name__ == '__main__':
    path1 = r"E:\研究生\stage1\pro1\data(1)\txt_data"  # directory of files to convert
    path2 = r"E:\研究生\stage1\pro1\data(1)\txt_data(2)"  # directory for converted files
    transform(path1, path2)


# ===================== format_conver2.py (new file) ====================
import xlwt
import os
import sys


def txt_xls(filename, xlsname, encoding=None):
    """Write a tab-separated text file into a one-sheet xlwt workbook.

    Args:
        filename: path of the text file; each line becomes one row and each
            tab-separated field one cell.
        xlsname: path the workbook is saved to.
        encoding: text encoding of *filename*; ``None`` keeps the platform
            default that was used before.

    The line terminator is stripped before splitting, so the last cell of a
    row no longer carries a trailing newline.
    """
    # NOTE(review): xlwt produces the legacy .xls format whatever the
    # extension of xlsname is -- confirm what downstream readers expect.
    xls = xlwt.Workbook()
    sheet = xls.add_sheet('sheet', cell_overwrite_ok=True)
    with open(filename, encoding=encoding) as f:  # closed even on error
        for x, line in enumerate(f):  # x is the row index
            for i, item in enumerate(line.rstrip('\n').split('\t')):
                sheet.write(x, i, item)  # i is the column index
    xls.save(xlsname)


if __name__ == '__main__':
    filename = './data(4)/P_101_9743352_raw_meta.txt'
    xlsname = './data(4)/P_101_9743352_raw_meta.xlsx'
    txt_xls(filename, xlsname)


# ========================= sep_dst.py (new file) =======================
# -*- coding: utf-8 -*-
# Split the workbook Supplemental_Dataset_1(3).xlsx into small workbooks:
# rows sharing a project number go into one file, e.g. P_77_.xlsx.
import pandas as pd
import os

data = pd.read_excel('./data(3)/Supplemental_Dataset_1(3).xlsx', keep_default_na=False)
data_list = list(data['Proj'].drop_duplicates())  # unique project names
longth = len(data_list)
path = './data(3)/SplitExcel(2)/'  # the split workbooks are saved here
# exist_ok avoids the check-then-create race and also creates parents.
os.makedirs(path, exist_ok=True)
for item in data_list:  # one output workbook per project
    data_select = data[data['Proj'] == item]  # rows of this project
    file_name = '{0}_.xlsx'.format(item)  # works for non-string ids too
    currentdir = os.path.join(path, file_name)
    data_select.to_excel(currentdir, index=False)


# ======================== split_file.py (new file) =====================
import os
import shutil


def splitFile(src, dst1, dst2):
    """Copy the .txt and .gz files of a folder into two separate folders.

    Args:
        src: folder holding the raw data.
        dst1: folder receiving the ``.txt`` files.
        dst2: folder receiving the ``.gz`` files.

    Returns:
        None.  Both target folders are created when missing; entries with
        any other extension are left alone.
    """
    names = os.listdir(src)
    txt = [f for f in names if f.endswith(".txt")]
    gz = [f for f in names if f.endswith(".gz")]

    os.makedirs(dst1, exist_ok=True)
    os.makedirs(dst2, exist_ok=True)

    for t in txt:
        shutil.copy(os.path.join(src, t), dst1)
    for g in gz:
        shutil.copy(os.path.join(src, g), dst2)

    print("finish")


if __name__ == '__main__':
    # os.path.join instead of ".\data(1)": "\d" is an invalid escape
    # sequence and the literal only resolved correctly on Windows.
    base_filename = os.path.join(".", "data(1)")
    src = os.path.join(base_filename, "raw_data")
    dst1 = os.path.join(base_filename, "txt_data")
    dst2 = os.path.join(base_filename, "gz_data")

    splitFile(src, dst1, dst2)