-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmain.py
executable file
·107 lines (86 loc) · 3.02 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from datetime import datetime, timedelta
import sys
import pandas as pd
import numpy as np
import json
def do_import(filename):
    """Read the dam-levels CSV and write two chart-data JSON files.

    ``all-data.json`` contains every dam; ``main-data.json`` contains the
    same payload with the minor dams removed (see ``_filter_minor_dams``).

    Args:
        filename: path to the source CSV file.
    """
    df = _get_data_frame(filename)
    df = _clean_data(df)
    data = _get_dataset(df)
    metadata = {
        # stamp the export with the generation date
        'updated': datetime.now().date().isoformat(),
    }
    _write_file({
        'metadata': metadata,
        'chart_data': data,
    }, 'all-data.json')
    # Build the filtered payload explicitly.  The original mutated
    # `data['datasets']` in place and re-serialised the first dict,
    # silently relying on `formatted['chart_data']` and `data` being
    # the same object; the output is identical, but the intent is now
    # visible.
    filtered = {
        'labels': data['labels'],
        'datasets': _filter_minor_dams(data['datasets']),
    }
    _write_file({
        'metadata': metadata,
        'chart_data': filtered,
    }, 'main-data.json')
def _get_data_frame(filename):
return pd.read_csv(filename, encoding="ISO-8859-1")
def _clean_data(df):
return df.replace(np.nan, '', regex=True)
def _get_dataset(df):
    """Assemble the chart payload: date labels plus one series per dam."""
    labels = _load_dates(df)
    datasets = _load_data(df)
    return dict(labels=labels, datasets=datasets)
def _write_file(data, filename):
    """Serialise `data` as indented JSON into the front-end directory."""
    target = 'front-end/{}'.format(filename)
    with open(target, 'w') as out:
        json.dump(data, out, indent=2)
def _filter_minor_dams(dam_data):
MINOR_DAMS = set(dam.lower() for dam in [
"Hely-Hutchinson",
"Woodhead",
"Victoria",
"Alexandra",
"De Villiers",
"KleinPlaats",
"Lewis Gay",
])
return list(filter(lambda data: data['label'].lower() not in MINOR_DAMS, dam_data))
def _load_dates(df):
KNOWN_BROKEN_DATES = {
datetime(2017, 8, 8): datetime(2017, 5, 8),
datetime(2018, 5, 19): datetime(2017, 5, 19),
datetime(2019, 5, 20): datetime(2017, 5, 20),
datetime(2020, 5, 21): datetime(2017, 5, 21),
datetime(2021, 5, 22): datetime(2017, 5, 22),
}
dates = df.iloc[:, 0][4:]
dates = pd.to_datetime(dates, dayfirst=True)
# dates = dates.apply(lambda x: x.to_
last_date = None
for i, date in enumerate(dates):
if last_date and date - last_date != timedelta(days=1):
if date in KNOWN_BROKEN_DATES:
date = KNOWN_BROKEN_DATES[date]
else:
assert False, 'date {} found anomalous data! {} is not the day after {}'.format(i, date, last_date)
last_date = date
return [str(d.date()) for d in dates.tolist() if d.date() <= datetime.today().date()]
def _load_data(df):
    """Parse the 13 dam chunks (4 columns each, starting at column 1)."""
    chunk_width = 4
    chunk_count = 13
    starts = range(1, chunk_width * chunk_count, chunk_width)
    return [_parse_chunk(df, start) for start in starts]
def _parse_chunk(df, start_index):
    # Parse one dam's 4-column slice of the sheet into a chart dataset
    # ({'label': name, 'data': [...]}).  Layout assumed from the indexing
    # below — the dam name sits at row 1 of the chunk, the storage series
    # in column 1 starting at row 4; confirm against the source CSV.
    cols = 4
    data = df.iloc[:, start_index:start_index + cols]
    # NOTE(review): chained `iloc[1][0]` positional indexing — works here,
    # but is deprecated in newer pandas in favour of `iloc[1, 0]`.
    dam_name = data.iloc[1][0]
    # hack: fix Voëlvlei which shows up with weird (and changing) encodings
    if dam_name.startswith("VO"):
        dam_name = 'Voëlvlei'.upper()
    # Storage column of the chunk, skipping the 4 header rows.
    storage_data = data.iloc[:, 1][4:]
    # NOTE(review): relies on older pandas where `str.replace` treated the
    # pattern as a regex by default; newer versions default to regex=False,
    # which would leave '\s+' matching literally — verify pandas version.
    storage_data = storage_data.str.replace('\s+', '') # strip spaces
    storage_data = pd.to_numeric(storage_data, errors='coerce')
    # NOTE(review): the forward-fill described in the trailing comment
    # depends on legacy `Series.replace(..., value=None)` pad behaviour;
    # modern pandas changed this — confirm before upgrading.
    storage_data = storage_data.replace(np.nan, None, regex=True) # replace NaN with previous value
    return {
        'label': dam_name,
        'data': storage_data.tolist()
    }
if __name__ == '__main__':
    # Accept an alternate CSV path as the first CLI argument.
    default_file = 'data/Dam levels 2012 to 2021.csv'
    filename = sys.argv[1] if len(sys.argv) > 1 else default_file
    do_import(filename)