-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen_sim.py
202 lines (159 loc) · 5.81 KB
/
gen_sim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# %% [markdown]
# # Libraries
# from Utils.data import Physio3
# general
import numpy as np
import pandas as pd
import argparse
# import custom libraries
import sys
import os
import tqdm
import pickle
import yaml
# %%
# plotly
import plotly.express as px # (version 4.7.0 or higher)
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
# %%
# Folder layout used by this script. All paths are relative to the current
# working directory at import time.
PATH_RAW = "./raw/"
PATH_PROCESSED = "./processed/"
PATH_YAML = "../configs/data/"
# Make sure the raw and processed data folders exist before anything is written.
# PATH_YAML is intentionally not created here.
for _folder in (PATH_RAW, PATH_PROCESSED):
    os.makedirs(_folder, exist_ok=True)
# %%
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
# %%
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
def generate_poisson_binary_vector(length, lambda_value, rng=None):
    """Return a 0/1 vector marking slots that received at least one Poisson event.

    For every slot an independent Poisson(``lambda_value``) count is drawn;
    the slot is 1 when the count is positive and 0 otherwise, so each entry
    is 1 with probability ``1 - exp(-lambda_value)``.

    Args:
        length: Number of entries to draw (forwarded to NumPy as ``size``).
        lambda_value: Expected number of events per slot; must be >= 0
            (NumPy raises ``ValueError`` for negative values).
        rng: Optional random source exposing ``poisson(lam, size)``, e.g. a
            ``numpy.random.Generator`` or ``numpy.random.RandomState``. When
            omitted, the global legacy NumPy RNG is used, so ``np.random.seed``
            controls reproducibility exactly as before.

    Returns:
        numpy.ndarray: Integer array of zeros and ones with ``length`` entries.
    """
    sampler = np.random if rng is None else rng
    event_counts = sampler.poisson(lambda_value, length)
    # Collapse counts to a presence/absence indicator.
    return (event_counts > 0).astype(int)
def _simulate_sinusoids(n_samples, n_timestamps, n_vars, period_min, period_max):
    """Draw noisy sinusoids of shape (n_samples, n_timestamps, n_vars).

    Each variable gets one base period (integer in [period_min, period_max])
    and one phase shared by all samples; every sample then gets its own
    amplitude, baseline, period jitter and additive Gaussian noise.

    All draws come from the global NumPy RNG, in a fixed order, so the result
    is fully determined by the seed set by the caller.
    """
    data = np.zeros((n_samples, n_timestamps, n_vars))
    periods = np.random.randint(period_min, period_max + 1, n_vars)
    t = np.arange(n_timestamps)  # loop-invariant time axis
    for var in range(n_vars):
        phase = np.random.uniform(0, 2 * np.pi, 1)
        for sample in range(n_samples):
            amplitude = np.random.uniform(0.9, 1, 1)
            noise = np.random.normal(0, 0.1, n_timestamps)
            baseline = np.random.uniform(-0.5, 0.5, 1)
            noise_period = np.random.randint(1, 5)
            # This value is never used, but the draw is kept on purpose:
            # dropping it would shift the RNG stream and change every
            # dataset generated with the fixed seed.
            np.random.uniform(0, 2 * np.pi / 50, 1)
            data[sample, :, var] = (
                baseline
                + amplitude
                * np.sin(2 * np.pi * t / (periods[var] + noise_period) + phase)
                + noise
            )
    return data


def _simulate_observation_mask(n_samples, n_timestamps, n_vars, lambda_value):
    """Return a (n_samples, n_timestamps, n_vars) 0/1 mask; 0 marks a missing value.

    One Poisson-derived binary vector of length n_timestamps is drawn per
    (sample, variable) pair, then the axes are rearranged to match the data.
    """
    rows = [
        generate_poisson_binary_vector(n_timestamps, lambda_value)
        for _ in range(n_samples * n_vars)
    ]
    return (
        np.array(rows)
        .reshape(n_samples, n_vars, n_timestamps)
        .transpose(0, 2, 1)
    )


def _build_preview_figure(series, sample_idx=0, max_vars=5):
    """Build a Plotly line chart of the first few variables of one sample.

    At most ``max_vars`` variables are plotted, and never more than the data
    actually has, so datasets with fewer variables do not raise IndexError.
    """
    n_timestamps = series.shape[1]
    fig = go.Figure()
    for var in range(min(max_vars, series.shape[2])):
        fig.add_trace(
            go.Scatter(
                x=np.arange(n_timestamps),
                y=series[sample_idx, :, var],
                mode="lines",
                name=f"Variable {var+1}",
            )
        )
    fig.update_layout(
        title="Simulated Multivariate Time Series with Missing Data",
        xaxis_title="Time Steps",
        yaxis_title="Value",
    )
    return fig


def main():
    """Generate one simulated dataset per (n_vars, lambda) combination.

    Reads the module-level ``opt`` (parsed CLI options with ``n_vars`` and
    ``lambdas`` lists) and ``DATASET`` (dataset name stored in the YAML).
    For every combination it:

    1. re-seeds the global NumPy RNG with 42 so each dataset is reproducible
       on its own,
    2. simulates noisy sinusoids and a Poisson-based missingness mask,
    3. writes the masked data to ``PATH_RAW/sim-l{lambda}-d{n_vars}.csv``
       (samples and time steps flattened into rows, one column per variable),
    4. writes a matching config to ``PATH_YAML/sim-l{lambda}-d{n_vars}.yaml``.
    """
    # Fixed simulation parameters.
    n_samples = 10000
    n_timestamps = 128
    period_min = 16
    period_max = 48
    GRAN = 1

    # The config folder is not created at import time; make sure it exists
    # so the YAML write below cannot fail on a fresh checkout.
    os.makedirs(PATH_YAML, exist_ok=True)

    for n_vars in opt.n_vars:
        for lambda_value in opt.lambdas:
            # Set random seed for reproducibility
            np.random.seed(42)
            TARGET_DIM = 128 if n_vars == 128 else 64

            data = _simulate_sinusoids(
                n_samples, n_timestamps, n_vars, period_min, period_max
            )

            # Simulate random missing values based on a Poisson process
            missing_mask = _simulate_observation_mask(
                n_samples, n_timestamps, n_vars, lambda_value
            )
            mr = (missing_mask == 0).sum() / missing_mask.size * 100
            print(f"missing rate: {mr:.2f}%")

            # apply missing mask
            data_with_missing = np.where(missing_mask == 0, np.nan, data)

            # Quick visual sanity check; call fig.show() to inspect it.
            fig = _build_preview_figure(data_with_missing, sample_idx=0)
            # fig.show()

            # convert to dataframe
            stem = f"sim-l{lambda_value}-d{n_vars}"
            col_names = [f"var_{i}" for i in range(n_vars)]
            df = pd.DataFrame(
                data_with_missing.reshape(-1, n_vars), columns=col_names
            )
            df.to_csv(PATH_RAW + stem + ".csv", index=False)

            # NOTE(review): the YAML paths are rooted at "./data/", while the
            # CSV is written relative to the working directory -- presumably
            # the script is run from inside the data folder; confirm.
            yaml_dict = {
                "name": DATASET,
                "path_raw": "./data/raw",
                "path_processed": f"./data/processed/{stem}",
                "img_size": TARGET_DIM,
                "granularity": GRAN,
                "n_split": 1,
                "seed": 42,
            }
            with open(PATH_YAML + stem + ".yaml", "w") as file:
                yaml.dump(yaml_dict, file)
if __name__ == "__main__":
    # Dataset family name; main() reads this module-level global and stores
    # it under "name" in every generated YAML config.
    DATASET = "sim"

    parser = argparse.ArgumentParser(description="Data Preprocessing")
    # Both options accept one or more values; main() generates one dataset
    # for every (n_vars, lambda) combination.
    parser.add_argument(
        "--n-vars",
        nargs="+",
        help="A list of integers.",
        type=int,
        default=[32],
        dest="n_vars",
    )
    parser.add_argument(
        "--lambdas",
        nargs="+",
        help="A list of floats.",
        type=float,
        default=[0.5],
        dest="lambdas",
    )
    # main() reads the parsed options from this module-level global.
    opt = parser.parse_args()

    # Fail fast on unusable values: otherwise the error only surfaces inside
    # NumPy/plotting after the expensive data generation has already run.
    if any(n < 1 for n in opt.n_vars):
        parser.error("--n-vars values must be positive integers")
    # "not (x >= 0)" also rejects NaN, which a plain "x < 0" would let through.
    if any(not (lam >= 0) for lam in opt.lambdas):
        parser.error("--lambdas values must be non-negative numbers")

    print(opt.n_vars)
    print(opt.lambdas)
    main()