"""Process SCADA and downtime data
This script merges all four CSV files containing SCADA and downtime data into
single CSV files. Two files are older datasets, and the other two are newer
datasets. Both old and new datasets have most of their timestamps in common.
The older SCADA datasets were found to have some errors in the rotor speed
readings. This merging replaces the old errorred data points with the new
ones, and removes incomplete rows.
This merging ensures the downtime data has the same range as the SCADA data,
and removes incomplete rows.
"""
# import libraries
import os
import glob
import itertools
import pandas as pd
import numpy as np
# create directory to store processed data
os.makedirs("data/processed/", exist_ok=True)
# Downtime categories
# read the downtime category data
data = pd.read_excel("data/Melogale Downtime Categories.xlsx")
# drop first row
data = data.drop([0])
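# the workbook lays out each category type as a pair of columns:
# "<Type> Categories" and an adjacent unnamed column with the
# category names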
# function to filter data for each category type
catData = {}
def categorise_data(cat, number):
    catData[cat] = data.filter(
        items=[cat + " Categories", "Unnamed: " + str(number)]
    )
    catData[cat].rename(
        columns={
            cat + " Categories": "Category",
            "Unnamed: " + str(number): "Name"
        },
        inplace=True
    )
    catData[cat]["Type"] = cat
    catData[cat].dropna(inplace=True)
# filtering
categorise_data("Turbine", 1)
categorise_data("Environmental", 3)
categorise_data("Grid", 5)
categorise_data("Infrastructure", 7)
categorise_data("Availability", 9)
# concatenate data
data = pd.concat(catData.values(), ignore_index=True)
# save data as CSV
data.to_csv("data/processed/downtime_categories.csv", index=False)
# Downtime time series
dt = {}
dtList = glob.glob("data/*downtime*.csv")
for num, df in enumerate(dtList):
    dt[num] = pd.read_csv(df)
    print(num, df, dt[num].shape)
# drop duplicate columns (pandas suffixes duplicate column names with ".1")
dt[1].drop(columns=list(dt[1].filter(regex=r"\.1$")), inplace=True)
# convert timestamps to datetime data type
for key in dt.keys():
    for col in list(dt[key].filter(regex="timestamp")):
        dt[key][col] = pd.to_datetime(dt[key][col])
# concatenate data
data = pd.concat(dt.values(), join="outer")
# sort by timestamp, then by turbine
data = data.sort_values(["timestamp_start", "timestamp_end", "turbine_id"])
# reset index after sort
data.reset_index(drop=True, inplace=True)
# save dataframe as new CSV
data.to_csv(
    "data/processed/downtime_timeseries.csv", index=False, encoding="utf-8"
)
# SCADA time series
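# (the old and new datasets are loaded separately, the old rotor speed
# readings are kept only where trustworthy, and the two are then merged
# on a full timestamp x turbine grid)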
# old SCADA data
scada = {}
scadaList = glob.glob("data/*SCADA.csv")
for num, df in enumerate(scadaList):
    scada[num] = pd.read_csv(df)
    print(num, df, scada[num].shape)
# rename ap_min in scada[1] to ap_max (the column appears to be
# mislabelled, and renaming aligns it with the other dataset)
scada[1].rename(columns={"ap_min": "ap_max"}, inplace=True)
# fixing rotor speed readings due to errors in data
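# (the erroneous readings are on turbines above 20, so rs_av is kept
# only for turbines 1 to 20; the rest become NaN and are later filled
# from the new data)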
def fix_rs(c):
    if c["turbine"] <= 20:
        return c["rs_av"]
    return None
for key in scada.keys():
    scada[key]["rs_av_old"] = scada[key].apply(fix_rs, axis=1)
    scada[key] = scada[key].drop("rs_av", axis=1)
# concatenate old datasets
scadaOld = pd.concat(scada.values())
# new SCADA data
scada = {}
scadaList = glob.glob("data/NS_SCADA*.csv")
for num, df in enumerate(scadaList):
    scada[num] = pd.read_csv(df)
    print(num, df, scada[num].shape)
# concatenate new datasets
scadaNew = pd.concat(scada.values())
# convert timestamps to datetime data type (the old data uses day-first dates)
scadaNew["timestamp"] = pd.to_datetime(scadaNew["timestamp"])
scadaOld["timestamp"] = pd.to_datetime(scadaOld["timestamp"], dayfirst=True)
# rename columns
scadaNew.rename(columns={"turbine_id": "turbine"}, inplace=True)
scadaNew.rename(columns={"rs_av": "rs_av_new"}, inplace=True)
# sort values and drop duplicates
scadaNew = scadaNew.sort_values(["timestamp", "turbine"])
scadaOld = scadaOld.sort_values(["timestamp", "turbine"])
scadaNew = scadaNew.drop_duplicates(["timestamp", "turbine"], keep="first")
scadaOld = scadaOld.drop_duplicates(["timestamp", "turbine"], keep="first")
# fill missing rows in time series
tmstmp = list(pd.date_range(
    "2014-11-01 00:00:00", "2017-04-30 23:50:00", freq="10min"
))
# list of turbines, numbered 1 to 25
trbn = range(1, 26)
newcols = list(itertools.product(tmstmp, trbn))
cols = pd.DataFrame(newcols)
cols.columns = ["timestamp", "turbine"]
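# ensure ap_max is float so the dtypes match when merging with the old data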
scadaNew["ap_max"] = scadaNew["ap_max"].astype(np.float64)
scadaOld_cols = pd.merge(
    scadaOld, cols, on=["timestamp", "turbine"], how="outer"
)
# merge old and new data by these columns
scada = pd.merge(scadaOld_cols, scadaNew, on=[
    "timestamp", "turbine", "ws_av", "wd_av", "ws_1", "ws_2", "wd_1",
    "wd_2", "gen_sp", "pitch", "reactive_power", "ap_max", "ap_dev",
    "ap_av", "nac_pos"
], how="outer")
# sort and drop duplicates again
scada = scada.sort_values(["timestamp", "turbine"])
scada = scada.drop_duplicates(["timestamp", "turbine"], keep="first")
# merge rotor speed readings
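# (rows with no new reading have NaN in rs_av_new; NaN >= 0 is False,
# so those rows fall back to the old reading)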
def merge_rotor_readings(c):
    if c["rs_av_new"] >= 0:
        return c["rs_av_new"]
    else:
        return c["rs_av_old"]
scada["rs_av"] = scada.apply(merge_rotor_readings, axis=1)
# drop old columns and reset index
scada = scada.drop("rs_av_old", axis=1)
scada = scada.drop("rs_av_new", axis=1)
scada.reset_index(drop=True, inplace=True)
# write to new CSV file
scada.to_csv("data/processed/SCADA_timeseries.csv", index=False)