-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdensity_plot_replicates_fitnessfx_v3.py
152 lines (114 loc) · 4.92 KB
/
density_plot_replicates_fitnessfx_v3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import sys
import numpy as np
import copy
import pandas as pd
import seaborn as sns
paper_rc = {'lines.linewidth': 10}
sns.set_context("paper", rc = paper_rc,font_scale=5)
import matplotlib.pyplot as plt
#usage: python3 density_plot_python.py rep_data.txt y.test_test_results
#Parameters
rep_data='37C_2.0_1.1_coeffvarANDreadcutoff_uncollapsedMBA_unpaired_ratiocvfilt_10.0_3.0.filtered_gene_inserts'
stats_data='37C_2.0_1.1_coeffvarANDreadcutoff_uncollapsedMBA_unpaired_ratiocvfilt_10.0_3.0.mwu_test_results'
cutoff=False
cutoff_num = 10 #how many top genes to look at
specific_genes_to_include=['YGR098C','YMR168C','YHR023W','YLR397C']
def ParseFitness(rep_data):
'''get the replicates' fitness data'''
sp_dict={}
sc_dict={}
df=pd.read_csv(rep_data,sep='\t')
print(df.head())
print(df.columns)
fitness_ratio_rep_columns=[col for col in df.columns if '_n_av' in col and '39_28_log2' in col] # get columns with reps' data
#initialize each rep in dict for each allele:
empty_rep_dict={}
for col in fitness_ratio_rep_columns:
empty_rep_dict[col]=[]
all_genes=list(set(df['gene'].to_list()))
for gene in all_genes:
sc_dict[gene]=copy.deepcopy(empty_rep_dict)
sp_dict[gene]=copy.deepcopy(empty_rep_dict)
#add all observations
for index, row in df.iterrows():
allele=row['allele']
gene=row['gene']
if allele=='sc':
for rep_col in fitness_ratio_rep_columns:
## print(sc_dict[gene])
## print(sp_dict[gene])
sc_dict[gene][rep_col].append(row[rep_col])
## print(sc_dict[gene])
## print(sp_dict[gene])
elif allele=='sp':
for rep_col in fitness_ratio_rep_columns:
sp_dict[gene][rep_col].append(row[rep_col])
## print(sc_dict==sp_dict)
## print(sc_dict['YLR397C'])
## print(sp_dict['YLR397C'])
## print(df[df[gene]=='YLR397C'])
#collapse by replicate
for gene in sc_dict:
for rep_col in sc_dict[gene]:
sc_dict[gene][rep_col]=np.nanmean(sc_dict[gene][rep_col])
for gene in sp_dict:
for rep_col in sp_dict[gene]:
sp_dict[gene][rep_col]=np.nanmean(sp_dict[gene][rep_col])
#print(sc_dict==sc_dict)
return sp_dict,sc_dict
def ParseResults(stats_data, cutoff_num):
stats_dict={}
with open (stats_data) as myfile:
#get top # of genes
head = [next(myfile) for x in range(cutoff_num+1)]
for line in head[1:]:
row_data=line.split("\t")
gene = row_data[0]
adj_pval=float(row_data[2].strip('\n'))
stats_dict[gene]=adj_pval
#get specific genes
for line in myfile:
row_data=line.split("\t")
gene = row_data[0]
adj_pval=float(row_data[2].strip('\n'))
if gene in specific_genes_to_include:
stats_dict[gene]=adj_pval
return stats_dict
def PlotDensity(spar_ins, scer_ins,geneName):
figName=geneName+"_py_density_byreplicates.pdf"
fig=plt.figure(figsize=(20,15))
## sns.distplot(spar_ins, hist = False, kde = True, kde_kws = {'linewidth':3},
## label= 'Insert in paradoxus')
##
## sns.distplot(scer_ins, hist = False, kde = True, kde_kws = {'linewidth':3},
## label= 'Insert in cerevisiae')
sns.kdeplot(spar_ins, bw='silverman', kernel='gau',label= 'Insert in paradoxus',color='#08A5CD')
sns.kdeplot(scer_ins, bw='silverman', kernel='gau',label= 'Insert in cerevisiae',color='#F1B629')
plt.title(geneName,fontsize=80,loc='center')
plt.ylabel('Density of Replicates', fontsize=80)
plt.xlabel('Fitness Score', fontsize=80)
plt.rc('xtick',labelsize=80)
plt.rc('ytick',labelsize=80)
plt.savefig(figName, bbox_inches='tight', format='pdf',dpi=1000)
def PlotDensities(sp_dict,sc_dict,stats_dict=None,specific_genes=None):
if stats_dict!=None:
for gene in stats_dict:
PlotDensity(sp_dict[gene], sc_dict[gene],gene)
elif specific_genes!=None:
for gene in specific_genes:
if gene in sp_dict and gene in sc_dict:
PlotDensity(sp_dict[gene], sc_dict[gene],gene)
else:
print(gene+" is not in both alleles")
return 'done plotting'
##RUN###
sp_dict,sc_dict=ParseFitness(rep_data)
print(sp_dict)
if cutoff!=False:
stats_dict=ParseResults(stats_data,cutoff_num)
PlotDensities(sp_dict,sc_dict,stats_dict=stats_dict,specific_genes=specific_genes_to_include)
else:
#print(sp_dict)
#print(sc_dict['YHR023W'])
#print(sp_dict['YHR023W'])
PlotDensities(sp_dict,sc_dict,stats_dict=None,specific_genes=specific_genes_to_include)