人类各染色体缺失/重复统计柱状图（matplotlib）

原创已于 2024-07-15 19:04:57 修改 · 455 阅读

2 ·

CC 4.0 BY-SA版权

文章标签：

#matplotlib #python #数据分析

于 2024-05-17 13:36:47 首次发布

该博客介绍了如何利用Python的matplotlib库，绘制人类24条染色体的缺失和重复统计柱状图。首先，定义了读取数据文件的函数，然后合并绘图所需的数据，接着绘制柱状图，并最终展示结果。

绘制人类各条染色体缺失/重复统计柱状图。

读取数据文件函数

import pandas as pd
import matplotlib as plt

def read_file(file_path: str) -> pd.DataFrame:
    """Load a tab-separated or Excel table, replacing missing cells with 'NA'.

    Args:
        file_path: Path to the table. A ``.tsv``/``.txt`` suffix is read as
            tab-separated text; a ``.xls``/``.xlsx`` suffix is read with
            ``pandas.read_excel``.

    Returns:
        The parsed table with every missing value replaced by the string
        ``'NA'``, or an empty DataFrame when the file does not exist.

    Raises:
        ValueError: If the suffix is not one of the supported ones. The check
            happens before the file is touched, so it is raised even for
            paths that do not exist.
    """
    if file_path.endswith(('.tsv', '.txt')):
        reader, options = pd.read_csv, {'sep': '\t'}
    elif file_path.endswith(('.xls', '.xlsx')):
        reader, options = pd.read_excel, {}
    else:
        raise ValueError("ERROR FILE FORMAT")

    # EAFP: let the reader report a missing file instead of pre-checking with
    # os.path.exists (the `os` module is not imported by this file).
    try:
        table = reader(file_path, **options)
    except FileNotFoundError:
        return pd.DataFrame()
    return table.fillna('NA')

合并绘图数据

def _count_present(table, chroms):
    """Return, for each name in *chroms*, how many cells of that column are not 'NA'.

    A chromosome with no column in *table* counts as 0.
    """
    return [
        int((table[chrom] != 'NA').sum()) if chrom in table.columns else 0
        for chrom in chroms
    ]


def merge_statistics(outdir='./'):
    """Count per-chromosome deletion and duplication entries.

    Reads ``data1.tsv`` (counted as deletions) and ``data2.tsv`` (counted as
    duplications) from *outdir* via ``read_file``. Expected layout of both
    files, one column per chromosome and one entry per cell::

        chr1    chr2    chr3    ... chr22   chrX    chrY
        sample1 sample1 sample3 ... sample4 sample1 sample1
        sample2 sample1 sample3 ... sample4 sample1 sample2

    A cell counts as one event unless it is the string ``'NA'``.

    NOTE(review): ``list_chrom_all`` is a module-level name that is not
    defined in the visible part of this file; it presumably holds
    'chr1' ... 'chr22', 'chrX', 'chrY' -- confirm it is defined before use.

    Args:
        outdir: Directory containing ``data1.tsv`` and ``data2.tsv``.

    Returns:
        A tuple ``(all_counts, del_counts, dup_counts)`` of lists of ints,
        each ordered like ``list_chrom_all``; ``all_counts`` is the
        element-wise sum of the other two.
    """
    # Function-scope import: `os` is not imported at the top of this file.
    import os

    # os.path.join keeps the path valid whether or not outdir ends in a separator.
    del_table = read_file(file_path=os.path.join(outdir, "data1.tsv"))
    dup_table = read_file(file_path=os.path.join(outdir, "data2.tsv"))

    list_del_values = _count_present(del_table, list_chrom_all)
    list_dup_values = _count_present(dup_table, list_chrom_all)

    # Deletion counts
    print("######### del #########")
    print(list_del_values)

    # Duplication counts
    print("######### dup #########")
    print(list_dup_values)

    # Combined counts: element-wise sum of the two lists
    list_all_values = [n1 + n2 for n1, n2 in zip(list_del_values, list_dup_values)]
    print(list_all_values)

    return list_all_values, list_del_values, list_dup_values

绘制24条染色体的柱状图

def autolabel(rects):
    """Write each bar's height as a text label just above the bar.

    Bars whose height is not greater than 0 are left unlabelled.

    Args:
        rects: Iterable of bar patches (for example the container returned by
            ``bar()``). Each item must provide ``get_height()``, ``get_x()``,
            ``get_width()`` and an ``axes`` attribute.
    """
    for rect in rects:
        height = rect.get_height()
        if height <= 0:
            continue
        # Annotate through the axes that owns the bar. The module-level `plt`
        # in this file is bound to the top-level `matplotlib` package, which
        # has no `annotate`, so it cannot be used here.
        rect.axes.annotate(
            '{0}'.format(height),
            xy=(rect.get_x() + rect.get_width() / 2, height),
            xytext=(0, 0.8),  # offset above the bar top, in points
            textcoords="offset points",
            ha='center',
            va='bottom',
        )


def plot_each_chrom(outdir='./', bar_width = 0.35, coeff=1.5):
    """Draw a grouped bar chart of per-chromosome All / Dup / Del counts.

    The counts come from ``merge_statistics(outdir)``. Each chromosome gets a
    group of three adjacent bars; the figure is saved into *outdir* and then
    shown.

    NOTE(review): ``list_chrom_all`` is a module-level name that is not
    defined in the visible part of this file -- confirm it is defined.

    Args:
        outdir: Directory holding the input tables and receiving the image.
        bar_width: Width of a single bar.
        coeff: Spacing factor; group *i* starts at x position ``i * coeff``.
    """
    # Function-scope imports: the module-level `plt` is bound to the top-level
    # `matplotlib` package (not pyplot), and `os` / `numpy` are not imported
    # at the top of this file.
    import os

    import matplotlib.pyplot as plt
    import numpy as np

    categories = tuple(list_chrom_all)

    # Forward outdir so the tables are read from the directory the caller chose.
    all_counts, del_counts, dup_counts = merge_statistics(outdir=outdir)

    plt.figure(figsize=(12, 5))
    group_start = np.arange(len(categories)) * coeff

    bar_all = plt.bar(group_start, all_counts, bar_width, label='All')
    bar_dup = plt.bar(group_start + bar_width, dup_counts,
                      bar_width, label='Dup', color='#C93838')
    bar_del = plt.bar(group_start + bar_width * 2, del_counts,
                      bar_width, label='Del', color='#315B96')

    plt.xlabel("Chromosome")
    plt.ylabel('Count')
    plt.title("Chromosome statistics")
    # Bars are centred on their x positions, so the middle bar's position
    # (start + bar_width) is the centre of each three-bar group.
    plt.xticks(group_start + bar_width, categories, fontsize=9, rotation=45)

    for bars in (bar_all, bar_dup, bar_del):
        autolabel(bars)

    plt.legend()

    # Save the chart. The file name (including its spelling) is kept as-is so
    # anything that looks for the existing output name still finds it.
    plt.savefig(os.path.join(outdir, 'staistics_bar.png'))
    plt.show()