-
Notifications
You must be signed in to change notification settings - Fork 5
/
processing_data.py
127 lines (94 loc) · 4.68 KB
/
processing_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
import os
from dataset import WholeDataset
def load_data(file_fir):
    """Load one stock's CSV file, indexed by its 'Date' column.

    Parameters
    ----------
    file_fir : str or file-like
        Path (or buffer) passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Raw stock data with the 'Date' column as index.

    Raises
    ------
    IOError
        When the file cannot be read. The original code printed and
        then fell through to ``return df_raw``, which raised a
        confusing ``UnboundLocalError`` instead; we now log and
        re-raise the real error.
    """
    try:
        df_raw = pd.read_csv(file_fir, index_col='Date')
    except IOError:
        # Bug fix: previously swallowed the error and referenced the
        # never-assigned df_raw. Keep the log line, surface the cause.
        print("IO ERROR")
        raise
    return df_raw
def costruct_data_warehouse(ROOT_PATH, file_names, predict_day, seq_len):
    """Build per-stock train/validation/test splits for CNN training.

    For each CSV in ``file_names`` (joined onto ``ROOT_PATH``), the
    function derives a binary-like target (close ``predict_day`` days
    ahead divided by today's close, truncated to int), drops the first
    200 rows, scales features, and splits by the hard-coded date
    '2016-04-21' into train/test, with the last 25% of the training
    range (minus a ``seq_len`` overlap for windowing) used as
    validation.

    Parameters
    ----------
    ROOT_PATH : str
        Directory containing the stock CSV files.
    file_names : iterable of str
        CSV file names to load.
    predict_day : int
        Horizon (in rows/days) for the prediction target.
    seq_len : int
        Window length; the validation slice starts ``seq_len`` rows
        early so windowing loses no samples.

    Returns
    -------
    tuple
        ``(data_warehouse, number_of_stocks, number_feature,
        samples_in_each_stock)`` where ``data_warehouse`` maps stock
        name -> [train_data, train_target, test_data, test_target,
        valid_data, valid_target]. The feature/sample counts are from
        the LAST stock processed only.
    """
    number_of_stocks = 0
    data_warehouse = {}
    for stock_file_name in file_names:
        file_dir = os.path.join(ROOT_PATH, stock_file_name)
        ## Loading Data
        try:
            df_raw = load_data(file_dir)
        # NOTE(review): load_data does not raise ValueError on a read
        # failure, so this handler likely never fires; a failed load
        # would crash on the df_raw reference below — confirm intent.
        except ValueError:
            print("Couldn't Read {} file".format(file_dir))
        number_of_stocks += 1
        data = df_raw
        # First row's 'Name' column identifies the stock; the column
        # itself is not a feature.
        df_name = data['Name'][0]
        del data['Name']
        # Ratio of future close to current close; .values on the
        # denominator avoids pandas index alignment. astype(int)
        # truncates the ratio (>= 1 -> 1, < 1 -> 0 for positive prices).
        target = (data['Close'][predict_day:] / data['Close'][:-predict_day].values).astype(int)
        data = data[:-predict_day]
        target.index = data.index
        # Because the 200-day moving average is one of the features,
        # the first 200 rows have no valid value and are dropped.
        data = data[200:]
        data = data.fillna(0)
        # Round-trip through a column so target is trimmed/aligned to
        # the surviving rows of data.
        data['target'] = target
        target = data['target']
        del data['target']
        number_feature = data.shape[1]
        samples_in_each_stock = data.shape[0]
        # Date-based split: everything before 2016-04-21 is train+valid.
        train_data = data[data.index < '2016-04-21']
        train_data1 = scale(train_data)
        train_target1 = target[target.index < '2016-04-21']
        # First 75% of the pre-split range is the training set.
        train_data = train_data1[:int(0.75 * train_data1.shape[0])]
        train_target = train_target1[:int(0.75 * train_target1.shape[0])]
        # NOTE(review): train_data1 is already scaled, so the
        # validation slice is scaled a second time here (and with
        # statistics computed on the validation slice itself) —
        # confirm this is intentional.
        valid_data = scale(train_data1[int(0.75 * train_data1.shape[0]) - seq_len:])
        valid_target = train_target1[int(0.75 * train_target1.shape[0]) - seq_len:]
        # NOTE(review): the full data (train period included) is
        # re-scaled here to produce the test split, so test features
        # use statistics that include future rows — potential leakage
        # inherited from the original design; verify before reuse.
        data = pd.DataFrame(scale(data.values), columns=data.columns)
        data.index = target.index
        test_data = data[data.index >= '2016-04-21']
        test_target = target[target.index >= '2016-04-21']
        data_warehouse[df_name] = [train_data, train_target, np.array(test_data), np.array(test_target), valid_data,
                                   valid_target]
    return data_warehouse, number_of_stocks, number_feature, samples_in_each_stock
def cnn_data_sequence_separately(tottal_data, tottal_target, data, target, seque_len):
    """Accumulate sliding windows of one stock into shared lists.

    Appends every length-``seque_len`` window of ``data`` to
    ``tottal_data`` and the label aligned with each window's LAST row
    to ``tottal_target``, then returns both lists (mutated in place).
    """
    window_count = data.shape[0] - seque_len + 1
    for start in range(window_count):
        stop = start + seque_len
        tottal_data.append(data[start:stop])
        tottal_target.append(target[stop - 1])
    return tottal_data, tottal_target
def cnn_data_sequence(data_warehouse, seq_len):
    """Build pooled train/test/validation window datasets.

    Walks every stock in ``data_warehouse`` (name -> [train_data,
    train_target, test_data, test_target, valid_data, valid_target])
    and concatenates its sliding windows into shared pools, one pool
    per split, returned as numpy arrays.

    Returns
    -------
    tuple of numpy.ndarray
        (train_x, train_y, test_x, test_y, valid_x, valid_y)
    """
    train_x, train_y = [], []
    test_x, test_y = [], []
    valid_x, valid_y = [], []
    for splits in data_warehouse.values():
        train_x, train_y = cnn_data_sequence_separately(
            train_x, train_y, splits[0], splits[1], seq_len)
        test_x, test_y = cnn_data_sequence_separately(
            test_x, test_y, splits[2], splits[3], seq_len)
        valid_x, valid_y = cnn_data_sequence_separately(
            valid_x, valid_y, splits[4], splits[5], seq_len)
    return (np.array(train_x), np.array(train_y),
            np.array(test_x), np.array(test_y),
            np.array(valid_x), np.array(valid_y))
def transforming_data_warehouse(data_warehouse, order_stocks, seq_len):
    """Wrap each stock's splits into ``WholeDataset`` objects.

    For every stock name in ``order_stocks`` (iteration order is
    preserved), windows the stock's (data, target) pairs with
    ``cnn_data_sequence_pre_train`` and wraps each split in a
    ``WholeDataset``.

    Returns
    -------
    dict
        name -> [train, test, validation] WholeDataset triple.
    """
    transformed_data_loader = {}
    for stock_name in order_stocks:
        splits = data_warehouse[stock_name]
        # Split layout is [train_x, train_y, test_x, test_y,
        # valid_x, valid_y]; pair them up in that order.
        transformed_data_loader[stock_name] = [
            WholeDataset(*cnn_data_sequence_pre_train(splits[i], splits[i + 1], seq_len))
            for i in (0, 2, 4)
        ]
    return transformed_data_loader
def cnn_data_sequence_pre_train(data, target, seque_len):
    """Window one (data, target) pair into CNN-ready numpy arrays.

    Each sample is a length-``seque_len`` slice of ``data``; its label
    is the ``target`` entry aligned with the window's last row.

    Returns
    -------
    tuple of numpy.ndarray
        (windows, labels) with ``len == data.shape[0] - seque_len + 1``.
    """
    window_count = data.shape[0] - seque_len + 1
    windows = [data[i:i + seque_len] for i in range(window_count)]
    labels = [target[i + seque_len - 1] for i in range(window_count)]
    return np.array(windows), np.array(labels)