-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgen_sim_data.py
116 lines (76 loc) · 2.28 KB
/
gen_sim_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
""" generate simulated data
"""
import numpy as np
__author__ = "Yifeng Tao"
def gen_F(n_comp=3, n_samp=100):
""" generate fraction matrix F
"""
# https://math.stackexchange.com/questions/502583/uniform-sampling-of-points-on-a-simplex
F = np.random.uniform(size=(n_comp,n_samp))
F = -np.log(F)
F_sum = np.sum(F, axis=0)
F = F/F_sum
return F
def gen_C(n_gene, n_comp, n_path, sig):
""" generate clean component matrix C (noise not added yet)
"""
gpg = n_gene // n_path
mu = np.array([0]*gpg)
r = 0.95*np.ones((gpg,gpg))+np.diag([0.05]*gpg)
#r = np.array([
# [ 1, 0.95, 0.95, 0.95, 0.95],
# [ 0.95, 1, 0.95, 0.95, 0.95],
# [ 0.95, 0.95, 1, 0.95, 0.95],
# [ 0.95, 0.95, 0.95, 1, 0.95],
# [ 0.95, 0.95, 0.95, 0.95, 1]
# ])
C = [sig*np.random.multivariate_normal(mu, r, size=3).T+6.0 for _ in range(n_path)]
C = np.vstack(C)
C = 2**C
return C
def gen_noise(n_gene, n_samp, f=0.1):
""" generate noise matrix
"""
noise = np.random.normal(size=(n_gene, n_samp))*f
noise = 2**noise
sgn = 2*np.random.randint(0,2,(n_gene, n_samp))-1
noise = sgn*noise
return noise
def gen_sim_data(n_samp=100, n_comp=3, n_gene=2048, n_modu=64, f=4.0):
""" generate simulated data
Parameters
----------
n_samp: int
number of bulk samples
n_comp: int
number of cell components/populations
n_gene: int
number of genes to be considered
n_modu: int
number of gene modules, the number of genes per module is n_gene//n_modu
f: float
noise level
Returns
-------
Cgt: 2D array of float
ground truth C without noise
Fgt: 2D array of float
ground truth F withou noise
B: 2D array of float
noisy given bulk data
module: list of list of int
each sublist contains indices of genes in the same gene module
"""
gpg = int(n_gene/n_modu)
n_gene = n_modu*gpg
sig = 2.5
# ground truth C and F
Cgt = gen_C(n_gene, n_comp, n_modu, sig)
Fgt = gen_F(n_comp=n_comp, n_samp=n_samp)
noise = gen_noise(n_gene, n_samp, f=f)
# noisy bulk data B
B = np.clip(np.dot(Cgt, Fgt) + noise, 0, None)
module = [
[idx_m*gpg+idx_g for idx_g in range(gpg)]
for idx_m in range(n_modu)]
return Cgt, Fgt, B, module