-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTraining.py
531 lines (398 loc) · 19.6 KB
/
Training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 12 15:26:01 2023
@author: ozanbaris
"""
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lmfit import minimize, Parameters
os.environ['MKL_VERBOSE'] = '0'
from sklearn.linear_model import Ridge
def read_csvs_to_dict(main_output_directory, mode):
    """Load every per-house CSV into ``{sensor_count: {house_id: DataFrame}}``.

    Sub-folders are expected to be named ``house_group_<n>``; ``<n>`` is used
    both as the outer dictionary key and as the number of
    ``RemoteSensor<i>_DetectedMotion`` columns whose missing values are
    replaced by 0. Files inside are expected to be named
    ``house_id_<id>.csv``.

    Args:
        main_output_directory: Folder that contains the ``house_group_<n>``
            sub-folders.
        mode: ``'cooling'`` or ``'heating'``; selects whether
            ``CoolingRunTime`` or ``HeatingRunTime`` is renamed to ``RunTime``.

    Returns:
        dict: integer sensor count -> dict of house id (str) -> DataFrame.

    Raises:
        ValueError: If ``mode`` is neither ``'cooling'`` nor ``'heating'``.
        KeyError: If a CSV lacks one of the expected motion columns.
    """
    if mode not in ('cooling', 'heating'):
        raise ValueError("Mode must be either 'cooling' or 'heating'.")
    runtime_column = "CoolingRunTime" if mode == 'cooling' else "HeatingRunTime"

    all_houses_reduced = {}
    # Sorted so that the resulting dict order does not depend on the filesystem.
    for house_group_folder in sorted(os.listdir(main_output_directory)):
        house_group_path = os.path.join(main_output_directory, house_group_folder)
        if not os.path.isdir(house_group_path):
            continue
        try:
            sensor_count = int(house_group_folder.replace("house_group_", ""))
        except ValueError:
            # Not a house-group folder (e.g. ".ipynb_checkpoints"); ignore it
            # instead of aborting the whole load.
            continue

        motion_columns = ['Thermostat_DetectedMotion'] + [
            f'RemoteSensor{i}_DetectedMotion' for i in range(1, sensor_count + 1)
        ]
        houses = {}
        for csv_file in sorted(os.listdir(house_group_path)):
            if not csv_file.endswith('.csv'):
                continue
            house_id = csv_file.replace("house_id_", "").replace(".csv", "")
            df = pd.read_csv(os.path.join(house_group_path, csv_file))
            # Downstream code always reads the equipment runtime as "RunTime".
            df = df.rename(columns={runtime_column: "RunTime"})
            # A missing motion reading is treated as "no motion detected".
            df[motion_columns] = df[motion_columns].fillna(0)
            houses[house_id] = df
        all_houses_reduced[sensor_count] = houses
    return all_houses_reduced
# Folder whose sub-folders are scanned by read_csvs_to_dict.
main_output_directory = "house_data_csvs1"
# Selects which runtime column is renamed to "RunTime" while loading.
mode = "cooling" # or "heating"
# Nested dict: sensor count -> house id -> DataFrame.
all_houses_reduced = read_csvs_to_dict(main_output_directory, mode)
#%%
# Random average across a randomly selected subset of sensors
def random_average(row, columns=None):
    """Return the mean of a random, non-empty subset of the row's sensors.

    Args:
        row: One DataFrame row (a Series) holding the temperature columns.
        columns: Names of the temperature columns to draw from. When omitted,
            the module-level ``sensor_columns`` list is used, which keeps the
            function usable as ``DataFrame.apply(random_average, axis=1)``.

    Returns:
        The mean of the randomly selected non-NaN readings, or the row's
        ``Thermostat_Temperature`` when every listed reading is NaN.
    """
    columns = sensor_columns if columns is None else columns
    # Only sensors that actually reported a value can be drawn.
    available_sensors = row[columns].dropna()
    if available_sensors.empty:
        return row['Thermostat_Temperature']
    # randint's upper bound is exclusive, hence the +1 to allow "all sensors".
    subset_size = np.random.randint(1, len(available_sensors) + 1)
    return available_sensors.sample(n=subset_size).mean()
# Worst-case scenario: alternating between min and max sensor readings
def worst_case_average(idx, row, columns=None):
    """Return the lowest or highest available reading, alternating by ``idx``.

    Args:
        idx: Row label; even values select the minimum, odd values the maximum.
        row: One DataFrame row (a Series) holding the temperature columns.
        columns: Names of the temperature columns to consider. When omitted,
            the module-level ``sensor_columns`` list is used.

    Returns:
        The minimum (even ``idx``) or maximum (odd ``idx``) of the non-NaN
        readings, or the row's ``Thermostat_Temperature`` when every listed
        reading is NaN.
    """
    columns = sensor_columns if columns is None else columns
    available_sensors = row[columns].dropna()
    if available_sensors.empty:
        return row['Thermostat_Temperature']
    return available_sensors.min() if idx % 2 == 0 else available_sensors.max()
def compute_motion_average(house_dict, num_sensors):
    """Add a ``motion_average`` column to every DataFrame in ``house_dict``.

    For each row the value is the mean temperature of the sensors whose
    ``*_DetectedMotion`` flag equals 1; rows where no sensor reports motion
    fall back to ``Thermostat_Temperature``. A NaN temperature on a sensor
    that reports motion makes that row's average NaN.

    Args:
        house_dict: Mapping of house id to DataFrame; the frames are modified
            in place.
        num_sensors: Number of ``RemoteSensor<i>`` column pairs to include in
            addition to the thermostat.

    Returns:
        The same ``house_dict`` object.
    """
    sensor_motion_names = ['Thermostat_DetectedMotion'] + [f'RemoteSensor{i}_DetectedMotion' for i in range(1, num_sensors + 1)]
    sensor_temp_names = ['Thermostat_Temperature'] + [f'RemoteSensor{i}_Temperature' for i in range(1, num_sensors + 1)]

    for house_id, dataset in house_dict.items():
        row_count = len(dataset)
        occupied_total = np.zeros(row_count)
        occupied_count = np.zeros(row_count, dtype=int)
        # Accumulate sensor by sensor: vectorised over rows, and also well
        # defined for an empty frame (row-wise apply is not).
        for temp_name, motion_name in zip(sensor_temp_names, sensor_motion_names):
            occupied = (dataset[motion_name] == 1).to_numpy()
            temperatures = dataset[temp_name].to_numpy(dtype=float)
            occupied_total = occupied_total + np.where(occupied, temperatures, 0.0)
            occupied_count += occupied

        fallback = dataset['Thermostat_Temperature'].to_numpy(dtype=float)
        # np.maximum avoids 0/0 on rows without motion; those rows take the
        # fallback value anyway.
        mean_occupied = occupied_total / np.maximum(occupied_count, 1)
        dataset['motion_average'] = np.where(occupied_count > 0, mean_occupied, fallback)
        house_dict[house_id] = dataset
    return house_dict
def compute_averages(dataframe):
    """Attach three aggregate temperature columns to one house DataFrame.

    Reads the module-level ``sensor_columns`` list (directly and through
    ``random_average`` / ``worst_case_average``) and adds, in this order,
    ``average_all``, ``random_average`` and ``worst_case_average``. The frame
    is modified in place and also returned.
    """
    # Row-wise mean over every listed sensor column.
    dataframe['average_all'] = dataframe[sensor_columns].mean(axis=1)

    # Mean over a randomly drawn subset of the available sensors.
    dataframe['random_average'] = dataframe.apply(random_average, axis=1)

    # Alternating min / max reading, selected by the row label.
    worst_case_values = []
    for row_label, row in dataframe.iterrows():
        worst_case_values.append(worst_case_average(row_label, row))
    dataframe['worst_case_average'] = worst_case_values

    return dataframe
# Apply compute_averages to each DataFrame in all_houses_reduced
for sensor_count, houses in all_houses_reduced.items():
    # compute_averages and the row helpers it calls read this module-level
    # name, so it must be rebound before each group is processed.
    sensor_columns = ['Thermostat_Temperature'] + [f'RemoteSensor{i}_Temperature' for i in range(1, sensor_count + 1)]
    for house_id, house_data in houses.items():
        if house_data.empty:
            print(f"Skipping empty DataFrame for {house_id}")
            continue
        houses[house_id] = compute_averages(house_data)
    # The motion-based average builds its own column lists from sensor_count,
    # so it can run in the same pass.
    all_houses_reduced[sensor_count] = compute_motion_average(houses, sensor_count)
#%%
def ridge_regression(X, y, alpha=1.0):
    """Fit an intercept-free ridge regression and return its coefficients.

    Args:
        X: Feature matrix.
        y: Target values.
        alpha: Regularisation strength handed to ``Ridge``.

    Returns:
        The fitted model's ``coef_`` attribute.
    """
    # No intercept: callers predict with a plain ``X.dot(theta)``.
    model = Ridge(alpha=alpha, fit_intercept=False)
    return model.fit(X, y).coef_
def process_houses(houses_dict, sensor_count):
    """Fit a one-step-ahead ridge model per house and per temperature signal.

    Each model predicts the next sample of a signal from the current
    ``RunTime``, ``Outdoor_Temperature``, ``GHI`` and the signal's own current
    value. Only the first half of the rows is used; the first 87.5 % of that
    half is the training set and the rest the test set.

    Args:
        houses_dict: Mapping of house id to DataFrame. Missing ``RunTime``
            values are replaced by 0 in the frames themselves.
        sensor_count: Number of ``RemoteSensor<i>_Temperature`` columns to
            model in addition to the aggregate and thermostat columns.

    Returns:
        dict: house id -> ``{'models', 'train_errors', 'test_errors'}``, each a
        dict keyed by signal name. Errors are RMSE values. Signals without
        any training rows are omitted.
    """
    results = {}
    sensor_columns = [
        'random_average', 'average_all', 'worst_case_average', 'motion_average',
        'Indoor_AverageTemperature', 'Thermostat_Temperature',
    ] + [f'RemoteSensor{i}_Temperature' for i in range(1, sensor_count + 1)]
    input_cols = ['RunTime', 'Outdoor_Temperature', 'GHI']

    for house_id, data in houses_dict.items():
        print("Processing house:", house_id)
        house_models = {}
        house_train_errors = {}
        house_test_errors = {}

        # Assign the filled column back: an in-place fillna on a column
        # selection is not guaranteed to reach the frame in recent pandas.
        # It does not depend on the signal, so it is done once per house.
        data['RunTime'] = data['RunTime'].fillna(0)
        exogenous = data[input_cols].values[:-1]

        for sensor_col in sensor_columns:
            print("Processing house:", house_id, ", sensor:", sensor_col)
            sensor_series = data[sensor_col].values

            # Row t holds the inputs and the signal at t; the target is the
            # signal at t + 1.
            X = np.hstack([exogenous, sensor_series[:-1].reshape(-1, 1)])
            y = sensor_series[1:]

            # NOTE(review): this only reports NaNs; the fit below is still
            # attempted and will presumably reject them - confirm whether
            # such signals should be skipped instead.
            for col in input_cols + [sensor_col]:
                if data[col].isna().any():
                    print(f"House: {house_id}, Sensor: {sensor_col}, Column '{col}' has NaN values!")
                    print(data[sensor_col])

            # First half of the data only, then 87.5 % / 12.5 % train / test.
            half_index = int(0.5 * len(X))
            first_half_X = X[:half_index]
            first_half_y = y[:half_index]
            split_index = int(0.875 * len(first_half_X))
            X_train, X_test = first_half_X[:split_index], first_half_X[split_index:]
            y_train, y_test = first_half_y[:split_index], first_half_y[split_index:]

            # Covers both "no rows at all" and "too few rows to leave a
            # training set after the two splits".
            if len(X_train) == 0:
                print("No valid data for house:", house_id, ", sensor:", sensor_col)
                continue

            theta = ridge_regression(X_train, y_train)
            house_models[sensor_col] = theta

            # The model has no intercept, so predictions are a plain dot product.
            train_predictions = X_train.dot(theta)
            test_predictions = X_test.dot(theta)
            house_train_errors[sensor_col] = np.sqrt(np.mean((y_train - train_predictions) ** 2))
            house_test_errors[sensor_col] = np.sqrt(np.mean((y_test - test_predictions) ** 2))

        results[house_id] = {
            'models': house_models,
            'train_errors': house_train_errors,
            'test_errors': house_test_errors,
        }
    return results
# One-step ridge results: sensor count -> house id -> models and errors.
results_onestep = {}
for sensor_count in all_houses_reduced:
    house_dict = all_houses_reduced[sensor_count]
    results_onestep[sensor_count] = process_houses(house_dict, sensor_count)
print("Results:", results_onestep)
#%%
def create_regressor(outdoor_temp, sensor_temp, cooling_runtime, solar_data, h, na, nb, nd, k, P, verbose=False):
    """Build the ``P`` regressor rows of a recursive multi-step prediction.

    Row ``i`` is ``[cooling_runtime[k+i], outdoor_temp[k+i], solar_data[k+i],
    previous temperature]``. The previous temperature is the measured
    ``sensor_temp[k]`` for ``i == 0`` and the model's own prediction
    (``row . h``) of the preceding row afterwards.

    Args:
        outdoor_temp, cooling_runtime, solar_data: Input sequences indexed by
            time step.
        sensor_temp: Measured temperature sequence; only element ``k`` is read.
        h: Coefficient vector with four entries, in the row order given above.
        na, nb, nd: Accepted for interface compatibility; not used.
        k: Index of the first time step of the window.
        P: Number of rows (prediction steps) to generate.
        verbose: When True, print the per-step progress lines and the final
            shape. Off by default because this function runs inside the
            optimiser's objective, once per window per evaluation.

    Returns:
        numpy array of shape ``(P, 1, 4)`` (an empty array when ``P`` is 0).
    """
    Z = []
    # Start the recursion from the measured temperature at time k.
    predicted_values = [sensor_temp[k]]
    for i in range(P):
        z = [cooling_runtime[k + i], outdoor_temp[k + i], solar_data[k + i], predicted_values[-1]]
        if verbose:
            print(f"Iteration {i}, Z length: {len(Z)}, predicted_values length: {len(predicted_values)}")
        # Each row is stored as a 1 x 4 list so the result keeps the
        # (P, 1, 4) layout that callers index with Z[i].
        Z.append([z])
        # Feed the prediction of this step into the next row.
        predicted_values.append(np.dot(np.array(z), h))
    if verbose:
        print("Shape of Z:", np.array(Z).shape)
    return np.array(Z)
def objective(params, outdoor_temp, sensor_temp, cooling_runtime, solar_data, y_true, P, N):
    """Return the multi-step prediction residuals for the coefficients in ``params``.

    For every window start ``k`` in ``range(N - P)`` a recursive regressor is
    built with ``create_regressor``; steps ``1 .. P-1`` of each window
    contribute ``Z[i] . h - y_true[k + i]`` to the result (step 0 is not
    included).

    Args:
        params: Mapping with entries ``'h0'`` .. ``'h3'``.
        outdoor_temp, sensor_temp, cooling_runtime, solar_data: Input
            sequences forwarded to ``create_regressor``.
        y_true: Target sequence.
        P: Prediction horizon (rows per window).
        N: Number of available samples.

    Returns:
        list of residuals, in window-then-step order.
    """
    h = np.array([params[name] for name in ('h0', 'h1', 'h2', 'h3')])
    residuals = []
    for k in range(N - P):
        # Regressor rows of the window that starts at k.
        Z = create_regressor(outdoor_temp, sensor_temp, cooling_runtime, solar_data, h, 1, 1, 1, k, P)
        residuals.extend(np.dot(Z[i], h) - y_true[k + i] for i in range(1, P))
    return residuals
def identify_parameters(outdoor_temp, sensor_temp, cooling_runtime, solar_data, y_true, P, init_guess, N):
    """Estimate the four model coefficients by minimising ``objective``.

    Args:
        outdoor_temp, sensor_temp, cooling_runtime, solar_data, y_true, P, N:
            Forwarded unchanged to ``objective``.
        init_guess: Sequence whose first four entries are the starting values
            of ``h0`` .. ``h3``.

    Returns:
        tuple: ``(h, residuals)`` - the fitted coefficients as a numpy array
        and the ``residual`` attribute of the minimiser result.
    """
    coefficient_names = ('h0', 'h1', 'h2', 'h3')

    params = Parameters()
    for position, name in enumerate(coefficient_names):
        params.add(name, value=init_guess[position])

    # The regressor is rebuilt inside the objective on every evaluation.
    fit_inputs = (outdoor_temp, sensor_temp, cooling_runtime, solar_data, y_true, P, N)
    result = minimize(objective, params, args=fit_inputs)

    h = np.array([result.params[name].value for name in coefficient_names])
    return h, result.residual
def compute_error(Z, sensor_temp, h, P):
    """Return the mean squared error of ``Z[j] . h`` against ``sensor_temp[j]``.

    The residuals are collected over sliding windows: for every start ``k`` in
    ``range(len(sensor_temp) - 1 - P)`` the indices ``k .. k + P - 1`` are
    evaluated, so interior indices are counted more than once.

    Args:
        Z: Indexable collection of regressor rows.
        sensor_temp: Reference values, indexed like ``Z``.
        h: Coefficient vector.
        P: Window length.

    Returns:
        The mean of the squared residuals.
    """
    window_count = len(sensor_temp) - 1 - P
    residuals = []
    for k in range(window_count):
        for j in range(k, k + P):
            residuals.append(np.dot(Z[j], h) - sensor_temp[j])
    return np.mean(np.array(residuals) ** 2)
def compute_rmse(errors):
    """Return the root mean squared value of ``errors``."""
    squared_errors = np.array(errors) ** 2
    return np.sqrt(np.mean(squared_errors))
"""
def compute_sensor_rmse(X, y, identified_params, P):
residuals = []
N = len(X) - P
for k in range(N):
Z = create_regressor(X[:, 1], X[:, 3], X[:, 0], X[:, 2],identified_params, 1, 1, 1, k, P)
for i in range(1, P):
y_pred = np.dot(Z[i - 1], identified_params)
residual = y_pred - y[k + i]
residuals.append(residual)
return compute_rmse(residuals)
"""
def compute_sensor_rmse(X, y, identified_params, P):
    """Return the RMSE of the final (``P``-th) step of the recursive prediction.

    One window is evaluated for every start index ``k`` in
    ``range(len(X) - P)``.

    Args:
        X: 2-D array whose columns are read as runtime (0), outdoor
            temperature (1), solar data (2) and sensor temperature (3).
        y: Target sequence aligned with ``X`` so that ``y[t]`` is the value
            predicted from row ``t`` (callers pass the sensor column shifted
            by one step).
        identified_params: Coefficient vector used for the prediction.
        P: Prediction horizon.

    Returns:
        The root mean squared error over all windows.
    """
    residuals = []
    for k in range(len(X) - P):
        Z = create_regressor(X[:, 1], X[:, 3], X[:, 0], X[:, 2], identified_params, 1, 1, 1, k, P)
        # Z[P - 1] is built from the inputs at row k + P - 1, so its
        # prediction belongs to y[k + P - 1]. This is the same alignment that
        # `objective` uses (Z[i] against y_true[k + i]); comparing against
        # y[k + P] would score the prediction one step late.
        y_pred = np.dot(Z[P - 1], identified_params)
        residuals.append(y_pred - y[k + P - 1])
    return compute_rmse(residuals)
#%%
def process_houses_genetic_algorithm(houses_dict, sensor_count, initial_parameters, P=24):
    """Fit a multi-step prediction model per house and per temperature signal.

    NOTE: despite the name, no genetic algorithm is run here; the
    coefficients come from ``identify_parameters``, started from the values
    in ``initial_parameters``.

    Only the first half of each house's rows is used; the first 87.5 % of
    that half is the training set and the rest the test set.

    Args:
        houses_dict: Mapping of house id to DataFrame. Missing ``RunTime``
            values are replaced by 0 in the frames themselves.
        sensor_count: Number of ``RemoteSensor<i>_Temperature`` columns to
            model in addition to the aggregate and thermostat columns.
        initial_parameters: house id -> {signal name: initial coefficient
            vector}. Signals (or whole houses) without an entry are skipped.
        P: Prediction horizon in samples (default 24, the value previously
            hard-coded).

    Returns:
        dict: house id -> ``{'models', 'train_errors', 'test_errors'}``, each a
        dict keyed by signal name; errors come from ``compute_sensor_rmse``.
    """
    results_lm = {}
    sensor_columns = [
        'random_average', 'average_all', 'worst_case_average', 'motion_average',
        'Indoor_AverageTemperature', 'Thermostat_Temperature',
    ] + [f'RemoteSensor{i}_Temperature' for i in range(1, sensor_count + 1)]
    input_cols = ['RunTime', 'Outdoor_Temperature', 'GHI']

    for house_id, data in houses_dict.items():
        house_models = {}
        house_training_errors = {}
        house_testing_errors = {}
        print('house id: ', house_id)

        # Assign the filled column back: an in-place fillna on a column
        # selection is not guaranteed to reach the frame in recent pandas.
        # It does not depend on the signal, so it is done once per house.
        data['RunTime'] = data['RunTime'].fillna(0)
        exogenous = data[input_cols].values[:-1]
        # A house with no entry at all is treated like a house whose every
        # signal lacks an initial guess, instead of raising KeyError.
        house_initial = initial_parameters.get(house_id, {})

        for sensor_col in sensor_columns:
            print(f"Processing sensor: {sensor_col}")
            sensor_series = data[sensor_col].values
            # Target: the signal one step ahead of the regressor row.
            y = sensor_series[1:]
            print(f"y has {np.isnan(y).sum()} nan values out of {len(y)}")

            if sensor_col not in house_initial:
                print(f"No initial parameters found for house {house_id} sensor {sensor_col}. Skipping...")
                continue

            X = np.hstack([exogenous, sensor_series[:-1].reshape(-1, 1)])

            # First half of the data only, then 87.5 % / 12.5 % train / test.
            half_index = int(0.5 * len(X))
            first_half_X = X[:half_index]
            first_half_y = y[:half_index]
            split_index = int(0.875 * len(first_half_X))
            X_train, X_test = first_half_X[:split_index], first_half_X[split_index:]
            y_train, y_test = first_half_y[:split_index], first_half_y[split_index:]

            init_guess = house_initial[sensor_col]
            print(f"Initial guess for {sensor_col}: {init_guess}")
            print('training started for', sensor_col)
            # Column order of X is runtime, outdoor, solar, sensor;
            # identify_parameters takes outdoor, sensor, runtime, solar.
            identified_params, _ = identify_parameters(
                X_train[:, 1], X_train[:, 3], X_train[:, 0], X_train[:, 2],
                y_train, P, init_guess, len(X_train),
            )
            print('Identified parameters:', identified_params)

            train_rmse = compute_sensor_rmse(X_train, y_train, identified_params, P)
            test_rmse = compute_sensor_rmse(X_test, y_test, identified_params, P)
            print(f"Training RMSE for {sensor_col}: {train_rmse}")
            print(f"Testing RMSE for {sensor_col}: {test_rmse}")

            house_training_errors[sensor_col] = train_rmse
            house_testing_errors[sensor_col] = test_rmse
            house_models[sensor_col] = identified_params

        results_lm[house_id] = {
            'models': house_models,
            'train_errors': house_training_errors,
            'test_errors': house_testing_errors,
        }
    return results_lm
#%%
# Starting values for the multi-step fit:
# house id -> {signal name: one-step ridge coefficients}.
initial_params = {}
for house in results_onestep.values():
    for house_id, result_dict in house.items():
        initial_params[house_id] = result_dict['models']
#%%
# Multi-step results: sensor count -> house id -> models and errors.
results_lm = {}
for sensor_count in all_houses_reduced:
    houses = all_houses_reduced[sensor_count]
    results_lm[sensor_count] = process_houses_genetic_algorithm(houses, sensor_count, initial_params)
#%%
def print_house_counts(results_dict, description):
    """Print how many houses ``results_dict`` holds for each sensor count.

    Args:
        results_dict: Mapping of sensor count to a dict keyed by house id.
        description: Label used in the heading line.
    """
    print(f"House counts for {description}:")
    for sensor_count in results_dict:
        # Dict keys are unique, so the inner dict's length is the number of
        # distinct houses.
        unique_house_count = len(results_dict[sensor_count])
        print(f"Sensor Count {sensor_count}: {unique_house_count} houses")
# Report the number of houses held at each stage of the pipeline.
for _stage_results, _stage_label in (
    (results_lm, "Long-term Memory Results"),
    (results_onestep, "One-Step Results"),
    (all_houses_reduced, 'all_houses'),
):
    print_house_counts(_stage_results, _stage_label)
#%%
# Number of houses with multi-step results, per sensor count.
house_counts = {sensor_count: len(results_lm[sensor_count]) for sensor_count in results_lm}
# Ordered by sensor count so the bars appear left to right.
sorted_house_counts = dict(sorted(house_counts.items()))
sensor_counts = list(sorted_house_counts)
number_of_houses = [sorted_house_counts[count] for count in sensor_counts]

plt.figure(figsize=(10, 6))
bars = plt.bar(sensor_counts, number_of_houses, color='salmon')
# Label each bar with its height; the text baseline sits 30 data units below
# the top of the bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval-30, yval, ha='center', va='bottom', fontsize=20, fontweight='bold')

plt.xlabel('Number of Additional Sensors', fontsize=20)
plt.ylabel('Number of Houses', fontsize=20)
#plt.title('Distribution of number of sensors among the houses trained', fontsize=16)
plt.xticks(fontsize=20)
# Hide the y-axis ticks and tick labels; the counts are printed on the bars.
#plt.gca().spines.set_visible(False)
plt.tick_params(axis='y', which='both', left=False, labelleft=False)
# The figure is written to disk before the layout call and before display.
plt.savefig('trained_sensors.pdf', bbox_inches='tight')
plt.tight_layout()
plt.show()
#%%
def extract_errors_by_sensor_count(results_dict):
    """Split a results dictionary into its training and testing errors.

    Args:
        results_dict (dict): sensor count -> house id -> dict containing the
            keys ``'train_errors'`` and ``'test_errors'`` (the layout of
            ``results_onestep`` and ``results_lm``).

    Returns:
        tuple: ``(training_errors_by_sensor_count,
        testing_errors_by_sensor_count)``, each mapping sensor count ->
        house id -> the corresponding error dict.
    """
    training_errors_by_sensor_count = {
        sensor_count: {
            house_id: house_results['train_errors']
            for house_id, house_results in results.items()
        }
        for sensor_count, results in results_dict.items()
    }
    testing_errors_by_sensor_count = {
        sensor_count: {
            house_id: house_results['test_errors']
            for house_id, house_results in results.items()
        }
        for sensor_count, results in results_dict.items()
    }
    return training_errors_by_sensor_count, testing_errors_by_sensor_count
# Train / test errors of the one-step models: sensor count -> house id -> errors.
training_errors_onestep, testing_errors_onestep = extract_errors_by_sensor_count(results_onestep)
# The same split for the results of process_houses_genetic_algorithm.
training_errors_lm, testing_errors_lm = extract_errors_by_sensor_count(results_lm)