-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvolcano.py
135 lines (120 loc) · 5.35 KB
/
volcano.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
class Volcano(object):
#e.g. usage
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
# when using jupyter notebook:
# %matplotlib inline
def __init__(self, ratio, p_val, label=None, s_curve_x_axis_overplot=0.5, s_curve_y_axis_overplot=0.5):
"""
careful use ratio not difference as in Perseus
ratio of 0.5 instead of difference -2
:param ratio: Pandas.Series or Numpy.Array or List of log2(ratios)
:param p_val: Pandas.Series or Numpy.Array or List of -log10(p-values)
:param label: Pandas.Series or Numpy.Array or ListOfString
"""
assert len(ratio) == len(p_val)
self.df = pd.DataFrame({"ratio": ratio, "p_val": p_val})
if label is not None:
self.df["label"] = label
self.s_curve_y_axis_overplot = s_curve_y_axis_overplot
self.p_val_cutoff = self.get_p_val_cutoff()
self.ratio_cutoff = self.get_ratio_cutoff()
self.df["s_val"] = self.df.apply(self.calc_s_from_row, axis=1)
self.ratio_for_s = pd.Series(np.linspace(self.df["ratio"].min() - s_curve_x_axis_overplot, self.df["ratio"].max() + s_curve_x_axis_overplot, num=1000))
self.p_for_s_larger_1 = self.ratio_for_s.apply(self.calc_p_for_s_equals_1)
def get_p_val_cutoff(self):
"""
p_val_cutoff = 0.05
pc = 3.5 + median(p_val(50% lowest log2_ratios)) --> is what Jan uses for whatever reason ???
-log10_pval of 2.0 --> pval of 0.01
"""
### hard coded cutoff of 1%
# return math.log(0.01, 10) * -1
### Jan's cutoff, but how to justify???
quant = self.df["ratio"].quantile(0.5)
return 2.0 + self.df.loc[self.df["ratio"] < quant, "p_val"].median()
def get_ratio_cutoff(self):
"""
log2_ratio_cutoff = 2.0
ratio_cutoff_high = 2 + median(ratio(50% lowest log10_p_values))
ratio_cutoff_low = 0.5 - median(ratio(50% lowest log10_p_values))
"""
### hard coded cutoff of 2 fold enrichment or depletion
# return math.log(0.5, 2), math.log(2, 2)
### Jan's cutoff, how to justify???
quant = self.df["p_val"].quantile(0.5)
median_ = self.df.loc[self.df["p_val"] < quant, "ratio"].median()
ratio_cutoff_high = 2.0 + median_
ratio_cutoff_low = -2.0 - median_
return ratio_cutoff_low, ratio_cutoff_high
def calc_s_from_row(self, row):
p_val = row["p_val"]
ratio = row["ratio"]
return self.calc_s(p_val, ratio)
def calc_s(self, p_val, ratio):
"""
so the algorithmn for finding stuff with s > 1 is:
discard stuff below the ratio_cutoff
discard stuff below the p-val cutoff
do the calcuation for the stuff above BOTH cutoffs and accept all with s > 1
s = (p_val - p_val_cutoff) * (ratio - ratio_cutoff)
:param p_val: Float(-log10 p-value)
:param ratio: Float(log2 ratio)
:return: Float
"""
ratio_delta = 0
ratio_cutoff_low, ratio_cutoff_high = self.ratio_cutoff
if ratio > 0:
ratio_delta = ratio - ratio_cutoff_high
if ratio_delta < 0:
return 0
elif ratio < 0:
ratio_delta = ratio - ratio_cutoff_low
if ratio_delta > 0:
return 0
ratio_delta = abs(ratio_delta)
p_val_delta = p_val - self.p_val_cutoff
if p_val_delta < 0:
return 0
return p_val_delta * ratio_delta
def calc_p_for_s_equals_1(self, ratio):
"""
:param ratio: Float(log2 ratio)
:return: Float
"""
ratio_cutoff_low, ratio_cutoff_high = self.ratio_cutoff
ratio_delta_high = ratio - ratio_cutoff_high
ratio_delta_low = ratio - ratio_cutoff_low
if ratio > ratio_cutoff_high:
return (1.0 / ratio_delta_high) + self.p_val_cutoff
elif ratio < ratio_cutoff_low:
return (1.0 / (ratio_delta_low * -1)) + self.p_val_cutoff
else:
return np.nan
def get_fig(self, title="Volcano plot", s_value_cutoff=1.0):
fig, ax1 = plt.subplots(figsize=(12, 12))
ax1.set_title(title, fontsize=22, fontweight="bold")
cond_p_val = self.p_for_s_larger_1 <= (self.df["p_val"].max() + self.s_curve_y_axis_overplot)
cond_pos = self.ratio_for_s > 0
x1 = self.ratio_for_s[cond_pos & cond_p_val]
y1 = self.p_for_s_larger_1[cond_pos & cond_p_val]
cond_neg = self.ratio_for_s < 0
x2 = self.ratio_for_s[cond_neg & cond_p_val]
y2 = self.p_for_s_larger_1[cond_neg & cond_p_val]
x3 = self.df["ratio"]
y3 = self.df["p_val"]
ax1.plot(x1, y1, 'r-', x2, y2, 'r-', x3, y3, 'b.', markersize=10) # , alpha=0.7)
ax1.set_xlabel('log2(ratio)', fontsize=20)
ax1.set_ylabel('-log10(p-value)', fontsize=20)
if "label" in self.df.columns:
cond = self.df["s_val"] >= s_value_cutoff
for index_, row in self.df[cond].iterrows():
label = row["label"]
x_coord = row["ratio"]
y_coord = row["p_val"]
ax1.annotate(label, xy=(x_coord, y_coord), xycoords='data', xytext=(5, 5),
textcoords='offset points', arrowprops=dict(arrowstyle="-"), fontsize=12)
return fig