绘制人类各条染色体缺失/重复统计柱状图。
读取数据文件函数
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def read_file(file_path: str):
    """Load a tab-separated or Excel table and fill missing cells with 'NA'.

    Args:
        file_path: Path to a ``.tsv``/``.txt`` file (read as tab-separated)
            or a ``.xls``/``.xlsx`` file. The extension is matched
            case-insensitively.

    Returns:
        A DataFrame whose missing values are replaced by the string ``'NA'``.
        If ``file_path`` does not exist, an empty DataFrame is returned
        instead of raising, so callers can treat "no data" uniformly.

    Raises:
        ValueError: If the extension is not one of the supported formats.
            The extension is validated before the existence check, so an
            unsupported path is rejected even when the file is absent.
    """
    lowered = file_path.lower()
    if lowered.endswith(('.tsv', '.txt')):
        is_excel = False
    elif lowered.endswith(('.xls', '.xlsx')):
        is_excel = True
    else:
        # ValueError is a subclass of Exception, so existing
        # ``except Exception`` handlers keep working.
        raise ValueError(f"ERROR FILE FORMAT: {file_path}")

    if not os.path.exists(file_path):
        return pd.DataFrame()

    if is_excel:
        table = pd.read_excel(file_path)
    else:
        table = pd.read_csv(file_path, sep='\t')
    # Missing values are filled with the literal string 'NA'.
    return table.fillna('NA')
合并绘图数据
# Plotting order of the chromosome columns: chr1..chr22, then chrX and chrY.
list_chrom_all = [f'chr{number}' for number in range(1, 23)] + ['chrX', 'chrY']


def _count_non_na(dataframe, chromosomes):
    """Return, for each chromosome, how many cells in its column are not 'NA'.

    A chromosome with no column in ``dataframe`` counts as 0.
    """
    counts = []
    for chrom in chromosomes:
        if chrom in dataframe.columns:
            counts.append(int((dataframe[chrom] != 'NA').sum()))
        else:
            counts.append(0)
    return counts


def merge_statistics(outdir='./'):
    """Count deletion and duplication entries per chromosome.

    Reads ``data1.tsv`` (counted as deletions) and ``data2.tsv`` (counted as
    duplications) from ``outdir``. Expected layout of both files: one column
    per chromosome, one sample name per cell, e.g.::

        chr1     chr2     chr3    ...  chr22    chrX     chrY
        sample1  sample1  sample3 ...  sample4  sample1  sample1
        sample2  sample1  sample3 ...  sample4  sample1  sample2

    Every cell that is not 'NA' counts as one event for that chromosome.
    The three lists are also printed to stdout.

    Args:
        outdir: Directory containing the two input files. A trailing path
            separator is optional.

    Returns:
        A tuple ``(list_all_values, list_del_values, list_dup_values)`` of
        integer lists ordered like ``list_chrom_all``; the "all" list is the
        element-wise sum of the other two.
    """
    deletions = read_file(file_path=os.path.join(outdir, "data1.tsv"))
    duplications = read_file(file_path=os.path.join(outdir, "data2.tsv"))

    list_del_values = _count_non_na(deletions, list_chrom_all)
    list_dup_values = _count_non_na(duplications, list_chrom_all)
    list_all_values = [
        n_del + n_dup for n_del, n_dup in zip(list_del_values, list_dup_values)
    ]

    print("######### del #########")
    print(list_del_values)
    print("######### dup #########")
    print(list_dup_values)
    print(list_all_values)
    return list_all_values, list_del_values, list_dup_values
绘制24条染色体的柱状图
# Put a data label on top of each bar of a bar chart.
def autolabel(rects):
    """Annotate every bar in ``rects`` with its height.

    Bars whose height is not greater than zero are left unlabeled.

    Args:
        rects: Iterable of bar objects exposing ``get_height()``,
            ``get_x()`` and ``get_width()``.
    """
    for bar in rects:
        bar_height = bar.get_height()
        if not bar_height > 0:
            continue
        top_center = (bar.get_x() + bar.get_width() / 2, bar_height)
        plt.annotate(
            f'{bar_height}',
            xy=top_center,
            xytext=(0, 0.8),  # offset of the label above the bar
            textcoords="offset points",
            ha='center',
            va='bottom',
        )
# Grouped bar chart of deletion/duplication counts per chromosome.
def plot_each_chrom(outdir='./', bar_width=0.35, coeff=1.5):
    """Draw and save a grouped bar chart of per-chromosome statistics.

    For each entry of the module-level ``list_chrom_all`` three bars are
    drawn side by side: all events, duplications and deletions. The figure
    is written to ``staistics_bar.png`` inside ``outdir`` and then shown.

    Args:
        outdir: Directory that holds the input tables read by
            ``merge_statistics`` and receives the output image.
        bar_width: Width of a single bar.
        coeff: Spacing factor between neighbouring chromosome groups.
    """
    categories = list(list_chrom_all)
    # Read from the same directory the figure is saved to.
    values_all, values_del, values_dup = merge_statistics(outdir=outdir)

    plt.figure(figsize=(12, 5))
    positions = np.arange(len(categories)) * coeff
    bars_all = plt.bar(positions, values_all, bar_width, label='All')
    bars_dup = plt.bar(positions + bar_width, values_dup,
                       bar_width, label='Dup', color='#C93838')
    bars_del = plt.bar(positions + bar_width * 2, values_del,
                       bar_width, label='Del', color='#315B96')

    plt.xlabel("Chromosome")
    plt.ylabel('Count')
    plt.title("Chromosome statistics")
    # The three bars are centred at x, x + w and x + 2w, so the middle of
    # each group is x + w; ticks and labels are set in a single call.
    plt.xticks(positions + bar_width, categories, fontsize=9, rotation=45)

    for bars in (bars_all, bars_dup, bars_del):
        autolabel(bars)

    plt.legend()
    # NOTE: the file name (including its spelling) is kept unchanged so
    # anything that consumes the existing output path keeps working.
    plt.savefig(os.path.join(outdir, 'staistics_bar.png'))
    plt.show()
结果图:

执行主程序
# Draw the chart only when this file is run as a script, not when imported.
if __name__ == "__main__":
    plot_each_chrom()
该博客介绍了如何利用Python的matplotlib库,绘制人类24条染色体的缺失和重复统计柱状图。首先,定义了读取数据文件的函数,然后合并绘图所需的数据,接着绘制柱状图,并最终展示结果。
3939

被折叠的 条评论
为什么被折叠?



