-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathget_energy.py
109 lines (78 loc) · 4.03 KB
/
get_energy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
Reading in the data, sklearn style...
The data contains the following columns:
#DateTime Holiday HolidayFlag DayOfWeek WeekOfYear Day Month Year PeriodOfDay ForecastWindProduction SystemLoadEA SMPEA ORKTemperature ORKWindspeed CO2Intensity ActualWindProduction SystemLoadEP2 SMPEP2
#DateTime and Holiday: strings, subsumed by the following features
HolidayFlag: Boolean, identical within each day
DayOfWeek WeekOfYear Day Month Year: discrete, identical within each day
PeriodOfDay: discrete, 0..47
ORKTemperature ORKWindspeed: contain NaN and are questionable (actual values)
ActualWindProduction SystemLoadEP2 SMPEP2: actual values, with SMPEP2 the label
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# prep numpy arrays, Xs will contain groupID as first column
def get_energy(fname=None, trainTestRatio=0.70):
    """Return an ordered train/test split of the energy data as numpy arrays.

    The frame produced by ``get_energy_pandas(fname)`` is split by complete
    groups of 48 consecutive rows: the first ``int(trainTestRatio * n_groups)``
    groups form the training set, the remaining groups form the test set.
    No shuffling is performed.

    Args:
        fname: Path of the data file, forwarded to ``get_energy_pandas``.
        trainTestRatio: Fraction of the groups assigned to the training set.

    Returns:
        Tuple ``(X_1gtrain, y_train, X_1gtest, y_test)``. The X arrays hold
        every column except 'SMPEP2', so 'groupID' is their first column;
        the y arrays hold the 'SMPEP2' values clamped to be non-negative.
    """
    df = get_energy_pandas(fname)
    grouplength = 48
    n_groups = df['groupID'].nunique()

    # numpy arrays, X contains groupID as first column
    X1g = df.loc[:, df.columns != 'SMPEP2'].values

    # No negative values allowed... for now they are just clamped to zero.
    # np.maximum builds a new array, so nothing is written into the array
    # backing the DataFrame (which may be a read-only view).
    y = np.maximum(df.loc[:, 'SMPEP2'].values, 0)

    # ordered split per complete group
    split = grouplength * int(trainTestRatio * n_groups)
    return (X1g[:split], y[:split], X1g[split:], y[split:])
def get_energy_grouped(fname=None):
    """Return a per-group train/test split of the energy data.

    Each group (rows sharing a 'groupID') becomes one sample: its label row
    is the array of the group's 'SMPEP2' values, its feature row is the
    per-column mean of the group's remaining columns.

    Args:
        fname: Path of the data file, forwarded to ``get_energy_pandas``.

    Returns:
        Tuple ``(grpX_train, grpY_train, grpX_test, grpY_test)`` from an
        unshuffled split with 30% of the groups held out for testing.
    """
    df = get_energy_pandas(fname)

    # put the 'y's into columns: one array of SMPEP2 values per group,
    # stacked into a 2D array (one row per group)
    per_group_y = df.groupby('groupID')['SMPEP2'].apply(np.array)
    grpY = np.vstack(per_group_y.values)

    # now something similar for the features... naively just take averages.
    # `.values` replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas releases.
    features = df.loc[:, df.columns != 'SMPEP2']
    grpX = features.groupby('groupID').mean().values

    # train/test splitting, kept in file order
    (grpX_train, grpX_test, grpY_train, grpY_test) = \
        train_test_split(grpX, grpY, test_size=0.3, shuffle=False)

    return (grpX_train, grpY_train, grpX_test, grpY_test)
def get_energy_pandas(fname=None):
    """Read the energy price file and return a cleaned, group-labelled frame.

    Processing steps, in order:

    1. Read the whitespace-delimited file.
    2. Drop '#DateTime', 'Holiday', 'ActualWindProduction', 'SystemLoadEP2',
       'ORKTemperature' and 'ORKWindspeed'.
    3. Treat 'CO2Intensity' values of 0 as missing and fill them by linear
       interpolation.
    4. Remove every block of 48 consecutive rows that still contains a
       missing value.
    5. Drop 'Day', 'Year' and 'PeriodOfDay'.
    6. Insert a 'groupID' column at position 0 that numbers the remaining
       48-row blocks 0, 1, 2, ...

    Args:
        fname: Path of the data file; defaults to "prices2013.dat".

    Returns:
        pandas.DataFrame with the surviving rows (their original row labels
        are kept) and 'groupID' as the first column.

    Raises:
        ValueError: If the number of rows is not a multiple of 48.
    """
    if fname is None:
        fname = "prices2013.dat"
    grouplength = 48

    # sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True
    df = pd.read_csv(fname, sep=r'\s+', quotechar='"')

    # remove unnecessary columns
    df = df.drop(columns=['#DateTime', 'Holiday',
                          'ActualWindProduction', 'SystemLoadEP2'])
    # remove columns with missing values
    df = df.drop(columns=['ORKTemperature', 'ORKWindspeed'])

    # impute missing CO2 intensities linearly; a value of 0 is an oddity and
    # counts as missing. Assigning the result back (instead of an in-place
    # call on a temporary Series) guarantees the frame is actually updated.
    co2 = df['CO2Intensity']
    df['CO2Intensity'] = co2.mask(co2 == 0).interpolate()

    if len(df) % grouplength != 0:
        raise ValueError(
            "expected a multiple of %d rows, got %d" % (grouplength, len(df)))

    # remove remaining days with missing values: flag each row, fold the
    # flags into one per 48-row block, then expand back to a row mask
    row_has_nan = df.isnull().any(axis=1).values
    day_has_nan = row_has_nan.reshape(-1, grouplength).any(axis=1)
    df = df.loc[np.repeat(~day_has_nan, grouplength)].copy()

    # data is sorted by year, month, day, periodofday; don't want learning
    # over this
    df = df.drop(columns=['Day', 'Year', 'PeriodOfDay'])

    # insert group identifier at beginning
    n_groups = len(df) // grouplength
    gids = np.repeat(np.arange(n_groups, dtype=np.int64), grouplength)
    df.insert(0, 'groupID', gids)

    return df
if __name__ == '__main__':
    # Quick smoke check: preview the cleaned frame, then report the size of
    # each part of the ordered train/test split.
    print(get_energy_pandas().head())

    splits = get_energy()
    print([len(part) for part in splits])
### Options to try for learning:
# split DayOfWeek into Weekday/Weekend, perhaps even split up days
# split up Month into seasons
# do use ORK*s but with missing value imputation
# remove WeekOfYear?