-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
228 lines (198 loc) · 8.37 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
configfile: "config/params.yaml"
configfile: "config/config.yaml"
# Convention for files: Use the following pattern:
# something_with_underscores_prefix-WILDCARD_another_prefix-ANOTHER_WILDCARD
# that means:
# - underscores to separate words
# - a prefix word before wildcards
# - a dash between the prefix and the wildcard
# - no dashes or underscores in wildcards
# - no other dashes in file names
#
# This should make the Snakemake filename pattern matching pretty unambiguous. Feel free to define
# additional wildcard_constraints.
wildcard_constraints:
year="\d+",
month="\d+",
technology="[a-z]+",
renewable_scenario="[a-z0-9A-Z]+",
# testmode is a fast run with test data, see config/config.yaml
data_dir = "data" + ("-test" if config["testmode"] else "") + "/"
# Crazy hack: make the snakemake config object globally available to python code in src.
# see src/snakemake_config.py for details
import src.snakemake_config
src.snakemake_config.config = config
# this might allow parallel cdsapi downloads on the VSC, one per node
# XXX but it might be a bit too much to use a separate node just to download some files, right?
# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#resources
resource_scopes:
cdsapi="local",
# TODO do we need to add source files as input files to trigger the computation? Snakemake should
# keep track of it, but somehow it doesn't, right? --list-code-changes does not show changes.
# TODO do we need to add some of the config options as parameters to the rules?
rule all:
localrule: True
input:
expand(
data_dir + "output/network_solution/network_solution_renewables-{renewable_scenario}.nc",
renewable_scenario=config['renewable_params'].keys(),
)
rule download_land_sea_mask:
localrule: True
output:
data_dir + "input/land_sea_mask/land_sea_mask.nc"
run:
from src.download import download_land_sea_mask
download_land_sea_mask(input, output)
rule download_era5:
output:
# note: this is set to protected because we probably want to download it only once, but
# removing it accidentally is kind of a PITA.
protected(data_dir + "input/era5/global-{year}-{month}.nc"),
# Speed here is probably mostly limited by network throughput, so more cores won't help much.
# At the same time, /tmp might run out of space easily if we download too much at the same
# time. We could change the location of temporary files, but we would have to take care about
# removing them afterwards. Weirdly, atlite does not remove temp files if the location is not
# the default one.
# Also the download bar looks ugly on the screen if there are multiple downloads at once.
# Therefore we require this rule to use 1 unit of the resource cdsapi and set 1 available
# cdsapi resource in the profiles for nora and VSC in the profiles in config/*.
# TODO this does work any longer, but why? Jobs do run in parallel again! git blame can help?
resources:
cdsapi=1
run:
from src.download import download_era5
download_era5(
inputs=input,
outputs=output,
year=wildcards.year,
month=wildcards.month,
)
rule generate_renewable_timeseries:
input:
# note: this task also downloads inputs if necessary
era5 = data_dir + "input/era5/global-{year}-{month}.nc",
output:
# remove the temp() here if you want to keep the monthly time series, probably makes only
# sense if you are experimenting with one month only.
renewable_timeseries = temp(
data_dir + "interim/{technology}/" +
"{technology}_renewables-{renewable_scenario}-month_{year}-{month}.nc"),
resources:
mem="22GB", # measured 20.1GB (PV) and 14.8GB (wind) on nora
runtime="15min", # 3min for PV and 24s for wind on nora
run:
from src.renewable_timeseries import generate_renewable_timeseries
generate_renewable_timeseries(
input,
output,
technology=wildcards.technology,
year=wildcards.year,
month=wildcards.month,
renewable_params=config['renewable_params'][wildcards.renewable_scenario],
)
rule concat_renewable_timeseries:
# concatenates monthly files to a yearly file
input:
expand(
rules.generate_renewable_timeseries.output,
# This list comprehension here is a workaround for:
# https://github.com/snakemake/snakemake/issues/2470
# The formatting :02d does not work for inputs, so we add the zero padding here.
month=[f"{m:02d}" for m in range(1, 13)],
allow_missing=True
)
output:
data_dir + "interim/renewable_timeseries/"
"{technology}_renewables-{renewable_scenario}_{year}.nc",
resources:
mem="75GB", # measured 68.1GB on nora (for both, PV and wind)
runtime="20min", # 9min on nora (for both, PV and wind)
run:
from src.renewable_timeseries import concat_renewable_timeseries
concat_renewable_timeseries(
inputs=input,
outputs=output,
technology=wildcards.technology
)
rule optimize_network:
input:
download_land_sea_mask = rules.download_land_sea_mask.output,
pv = expand(
rules.concat_renewable_timeseries.output,
year=config['year_era5'],
technology=['pv'],
allow_missing=True,
),
wind = expand(
rules.concat_renewable_timeseries.output,
year=config['year_era5'],
technology=['wind'],
allow_missing=True,
),
output:
# TODO rename this to chunks
# TODO make this temporary files?
network_solution = data_dir + "interim/network_solution/network_solution_renewables-{renewable_scenario}_chunk-{x_idx}-{y_idx}.nc"
wildcard_constraints:
x_idx="\d+",
y_idx="\d+",
resources:
runtime="20min", # 3min on nora, but multiple parallel jobs might slow down things
mem="8GB", # less than a GB on nora, but make it larger (we had weird RAM issues on the VSC?)
run:
from src.optimize import optimize_network_chunk
optimize_network_chunk(
inputs=input,
outputs=output,
pv_timeseries_fname=input.pv[0],
wind_timeseries_fname=input.wind[0],
x_start_idx=int(wildcards.x_idx),
y_start_idx=int(wildcards.y_idx),
chunk_size=config['chunk_size'],
time_period_h=config['time_period_h'],
)
rule plot_network:
# plot the graph of the network to a PNG file
# note that this is file is not run automatically, but only if you run this rule explicitly:
#
# ./run.sh plot_network
input:
wind = expand(
rules.optimize_network.input.wind,
renewable_scenario=list(config['renewable_params'].keys())[0],
),
pv = expand(
rules.optimize_network.input.pv,
renewable_scenario=list(config['renewable_params'].keys())[0],
),
output:
data_dir + "output/network.png",
run:
from src.plot_network import plot_network
plot_network(
inputs=input,
outputs=output,
pv_timeseries_fname=input.pv[0],
wind_timeseries_fname=input.wind[0],
)
rule concat_solution_chunks:
input:
expand(
rules.optimize_network.output.network_solution,
x_idx=range(*config['x_idx_from_to'], config['chunk_size'][0]),
y_idx=range(*config['y_idx_from_to'], config['chunk_size'][1]),
allow_missing=True,
)
output:
# minor duplication: also in rules.all.input, because rule "all" needs to be the first one
# in the Snakefile to be executed as default rule. At the same time we can refer only to
# rules defined above already. So we cannot refer to an output file in rule "all" from
# another rule.
data_dir + "output/network_solution/network_solution_renewables-{renewable_scenario}.nc",
resources:
runtime="60s", # 12s measured on nora
mem="2GB", # 640MB measured on nora
run:
from src.optimize import concat_solution_chunks
concat_solution_chunks(input, output)