-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_data.py
72 lines (48 loc) · 1.88 KB
/
generate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
""" generate_data.py generates toy data for the Subtype-Oriented Disease Axes
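(SODA) project. It generates two files:
  * cluster.csv -- one row per id with the class labels Cluster1, Cluster2,
    ...; a missing label is written as NA.
  * data.csv -- one row per id with the features Feature1, Feature2, ...,
    each written to two decimal places.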
"""
import numpy as np
import scipy.stats
# Fix the seed so that repeated runs produce the same toy data.
np.random.seed(0)

# Problem size: number of feature columns, of ids (rows) and of class columns.
n_features = 20
n_id = 100
n_class = 5

# Randomly generate classes and features.
# The first n_class - 1 class columns are binary (labels 1 or 2, drawn with
# probabilities .3 / .7); the last class column has four levels (labels 1..4,
# drawn with probabilities .2 / .2 / .2 / .4).
class_ii = np.zeros([n_id, n_class], int)
class_ii[:, :n_class - 1] = 1 + np.random.choice(2, [n_id, n_class - 1], p=[.3, .7])
class_ii[:, -1] = 1 + np.random.choice(4, n_id, p=[.2, .2, .2, .4])

# Each class column owns a contiguous group of feature_class feature columns.
# Columns left over by the integer division are never assigned and stay zero.
feature_ii = np.zeros([n_id, n_features])
feature_class = n_features // n_class
for cc in range(n_class):
    cols = slice(cc * feature_class, (cc + 1) * feature_class)
    for cccc in np.unique(class_ii[:, cc]):
        # Every label of this class column gets its own Gaussian: a Wishart
        # draw for the covariance and a standard-normal draw for the mean.
        Sigma = scipy.stats.wishart.rvs(feature_class, np.eye(feature_class))
        # The mean needs one entry per feature in the group. It was previously
        # hard-coded to 4, which only matched Sigma while
        # n_features // n_class == 4.
        mu = np.random.normal(0, size=feature_class)
        rows = class_ii[:, cc] == cccc
        feature_ii[rows, cols] = np.random.multivariate_normal(
            mu, Sigma, size=rows.sum()
        )
# Randomly generate missing labels: every (id, class) entry is independently
# flagged with probability .1, and each flagged label is overwritten with -1.
missing = np.random.binomial(1, .1, size=(n_id, n_class)).astype(bool)
class_ii[missing] = -1
# Save to the file cluster.csv
# Layout: an "id" column followed by Cluster1..Cluster<n_class>, one row per
# id; a label equal to -1 (missing) is written as "NA".
header = "id" + "".join(
    ",Cluster{}".format(kk + 1) for kk in range(n_class)
) + "\n"
# The context manager closes the file even if one of the writes raises.
with open("cluster.csv", "w") as f:
    f.write(header)
    for ii in range(n_id):
        # Map the -1 marker value by value rather than with a substring
        # replace over the whole row, which would also rewrite any other
        # text that happens to contain "-1".
        labels = "".join(
            ",NA" if label == -1 else ",{}".format(label)
            for label in class_ii[ii, :]
        )
        f.write("XX{0:06d}".format(ii + 1) + labels + "\n")
# Save to the file data.csv
# Layout: an "id" column followed by Feature1..Feature<n_features>, one row
# per id, each feature value formatted with two decimal places.
header = "id" + "".join(
    ",Feature{}".format(kk + 1) for kk in range(n_features)
) + "\n"
# The context manager closes the file even if one of the writes raises.
with open("data.csv", "w") as f:
    f.write(header)
    for ii in range(n_id):
        values = "".join(",{:.2f}".format(value) for value in feature_ii[ii, :])
        f.write("XX{0:06d}".format(ii + 1) + values + "\n")