# %%
import os
from pathlib import Path

import dask
import flox  # noqa: F401 -- fail fast if the optional flox groupby backend is missing
import numpy as np
import xarray as xr
from dask.distributed import Client
from xclim import ensembles

from CMIP6_IO import CMIP6_IO
# ### Create ensemble files from the outputs of running CMIP6_light.py and from forcing files
#
# This script loops over all files found in a specific folder `lightpath` and collects files
# of the same variable and scenario to create ensemble files from. The variables to create
# ensembles for include `["uvb", "uv", "uvi", "par", "uva", "tos", "siconc", "sithick",
# "tas", "sisnthick", "sisnconc", "uas", "vas"]`.
#
# The output is stored under the folder `ensemble_path`, and the results are used to create
# the final plots of modeled light for the paper using the notebooks
# `CMIP6_plot_light_results.ipynb` and `CMIP6_calculate_MPIESM2.ipynb`.
#
# This script may require substantial RAM depending on the region you are creating ensembles
# for. For datasets covering the Northern Hemisphere we used a machine with 64 CPUs and
# 120 GB of RAM.
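# %%
# A minimal, self-contained sketch of the xclim ensemble API used below, on synthetic
# data. The variable name, member count, and shapes are made up for illustration; the
# sketch is wrapped in a function so it never runs as part of this script.
def _xclim_ensemble_api_demo():
    time = xr.cftime_range("2000-01-01", periods=12, freq="MS", calendar="noleap")
    # Three fake ensemble members sharing the same time axis.
    members = [
        xr.Dataset({"tos": ("time", np.random.rand(12))}, coords={"time": time})
        for _ in range(3)
    ]
    ens = ensembles.create_ensemble(members)  # stacks members along a "realization" dim
    stats = ensembles.ensemble_mean_std_max_min(ens)  # per-time mean/std/max/min
    perc = ensembles.ensemble_percentiles(ens, values=[2.5, 50.0, 97.5], split=False)
    return stats, perc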
# Define which models, scenarios, and variables to calculate ensemble files for.
source_ids = ["MPI-ESM1-2-HR", "MPI-ESM1-2-LR", "CanESM5", "UKESM1-0-LL"]

# The weighting file is created using CMIP6_create_weights.ipynb together with the
# ClimWIP package. The weights are calculated for the variable tos over the period
# 1993-2020, using ERSSTv5 and CORA5.2 as the observations to compare against.
weights_file = "data/calculated_weights_18092024.nc"

scenarios = ["ssp245", "ssp585"]

# Alternative variable lists are kept for reference; the last assignment is the one used.
# var_names = ["sithick", "tas", "sisnthick", "sisnconc", "uas", "vas"]
# var_names = ["tos", "uvb", "uv", "uva", "siconc", "sithick", "tas", "sisnthick", "sisnconc", "uas", "vas"]
# var_names = ["uvb", "ghi", "osa", "uv", "uvb_srf", "uva", "uvi", "osa", "ghi"]
var_names = ["uv_srf"]

root = "light/ncfiles_nobias"
percentiles = [2.5, 50.0, 97.5]

io = CMIP6_IO()
def get_weights(weights_file: str, f: str):
    """Look up the ClimWIP weight for the model whose name appears in filename `f`."""
    weights = xr.open_dataset(weights_file)["weights"]
    models = weights["model_ensemble"]
    for source_id in source_ids:
        if source_id in f:
            for d, w in zip(models.values, weights.values):
                if source_id in d.split("_")[0]:
                    print(f"Found model {source_id} in {d} with weight {w}")
                    return w
    return None  # no matching model found in the weights file
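# %%
# For reference, get_weights() assumes the weights file has roughly this layout
# (a sketch -- the labels and numbers below are hypothetical; the real file is
# produced by CMIP6_create_weights.ipynb):
#
#   xr.Dataset(
#       {"weights": ("model_ensemble", [0.31, 0.22, 0.47])},
#       coords={"model_ensemble": ["CanESM5_r1i1p2f1", "MPI-ESM1-2-HR_r1i1p1f1",
#                                  "UKESM1-0-LL_r1i1p1f2"]},
#   )
#
# A filename is matched first against source_ids and then against the part of each
# model_ensemble label before the first "_", and the corresponding weight is returned.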
def create_ensemble_files(scenarios, var_names):
    max_models = 3000
    only_upload = False
    # Forcing variables live under light/<scenario>; the calculated light variables
    # live under <root>/<scenario>.
    forcing_vars = ["prw", "clt", "tos", "siconc", "sithick", "tas", "sisnthick", "sisnconc", "uas", "vas"]
    for use_weights in [True, False]:
        for var_name in var_names:
            for scenario in scenarios:
                counter = 0
                ds_list = []
                if var_name in forcing_vars:
                    lightpath = f"light/{scenario}"
                    ensemble_path = f"light/ensemble/{scenario}"
                else:
                    lightpath = f"{root}/{scenario}"
                    ensemble_path = f"{root}/{scenario}/ensemble"

                suffix = "_weighted" if use_weights else ""
                ensemble_stats = f"{ensemble_path}/CMIP6_ensemble_stats_{scenario}_{var_name}{suffix}.nc"
                ensemble_perc = f"{ensemble_path}/CMIP6_ensemble_perc_{scenario}_{var_name}{suffix}.nc"

                os.makedirs(ensemble_path, exist_ok=True)
                if not only_upload:
                    if os.path.exists(ensemble_stats):
                        os.remove(ensemble_stats)
                    if os.path.exists(ensemble_perc):
                        os.remove(ensemble_perc)

                file_on_gcs = io.list_dataset_on_gs(lightpath)

                # Loop over all files and keep those matching the models defined in
                # source_ids. For each model, read the corresponding weight from the
                # weights file. These values form a weights xr.DataArray used to weight
                # each model when calculating the ensemble mean, std, max, min, and
                # percentiles.
                models_weights = []
                model_names = []
                for f in file_on_gcs:
                    # Append an underscore so that e.g. "uv" does not also match
                    # "uva", "uvb", or "uvi" filenames.
                    var_name_mod = var_name if var_name in forcing_vars else f"{var_name}_"
                    # Filter to only the preferred models, variable, and scenario.
                    if not (
                        any(model in f for model in source_ids)
                        and var_name_mod in f
                        and scenario in f
                        and "weight" not in f
                        and "rsus" not in f
                        and "rsds" not in f
                    ):
                        continue
                    if counter >= max_models:
                        continue
                    # Avoid partial name collisions: "osa" must match the exact prefix,
                    # and "uvb" must not pick up "uvb_srf" files.
                    if var_name == "osa" and f"{scenario}/{var_name}_" not in f:
                        continue
                    if var_name == "uvb" and "uvb_srf" in f:
                        continue

                    ds = io.open_dataset_on_gs(f).persist()
                    models_weights.append(get_weights(weights_file, f))
                    model_names.append(f)
                    # Drop the mask variable as we don't need it and it causes problems
                    # when calculating the ensemble mean.
                    if "mask" in ds.variables:
                        ds = ds.drop_vars("mask")
                    ds_list.append(ds)
                    counter += 1

                # Create the final xr.DataArray with the weights. The model names are
                # stored as attrs, since DataArray.from_dict() ignores unknown keys.
                d = {
                    "dims": "realization",
                    "data": np.array(models_weights),
                    "name": "weights",
                    "attrs": {"models": model_names},
                }
                print(f"Number of datasets found: {len(ds_list)}, weights: {d}")
                weights = xr.DataArray.from_dict(d)

                # Warn about duplicate time values, which break ensemble alignment.
                for i, ds in enumerate(ds_list):
                    duplicates = ds["time"].to_index().duplicated()
                    if duplicates.any():
                        print(
                            f"Dataset {i} has {duplicates.sum()} duplicate time values "
                            f"for model {model_names[i]}"
                        )

                print(f"Creating ensemble for {var_name} and scenario {scenario}")
                if var_name in ["tos"]:
                    ens = ensembles.create_ensemble(ds_list, resample_freq="MS")
                else:
                    ens = ensembles.create_ensemble(ds_list)

                if use_weights:
                    ens_stats = ensembles.ensemble_mean_std_max_min(ens, weights=weights)
                    ens_perc = ensembles.ensemble_percentiles(
                        ens, values=percentiles, split=False, weights=weights
                    )
                else:
                    ens_stats = ensembles.ensemble_mean_std_max_min(ens)
                    ens_perc = ensembles.ensemble_percentiles(
                        ens, values=percentiles, split=False
                    )

                # Save to file, upload to GCS, then remove the local copies.
                ens_stats.to_netcdf(ensemble_stats)
                ens_perc.to_netcdf(ensemble_perc)
                if Path(ensemble_stats).exists():
                    io.upload_to_gcs(ensemble_stats)
                    print(f"Created ensemble {ensemble_stats}")
                if Path(ensemble_perc).exists():
                    io.upload_to_gcs(ensemble_perc)
                    print(f"Created ensemble {ensemble_perc}")
                if Path(ensemble_stats).exists():
                    Path(ensemble_stats).unlink()
                if Path(ensemble_perc).exists():
                    Path(ensemble_perc).unlink()
if __name__ == "__main__":
    # Keep large chunks intact when slicing instead of splitting them into many small tasks.
    dask.config.set(**{"array.slicing.split_large_chunks": False})
    with Client() as client:  # set up a local Dask cluster
        create_ensemble_files(scenarios, var_names)
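# %%
# Client() above starts a local Dask cluster with default settings. On a large machine
# (such as the 64-CPU / 120 GB setup mentioned at the top) you may want to size it
# explicitly; a sketch, with hypothetical worker counts and memory limits:
#
#   with Client(n_workers=16, threads_per_worker=4, memory_limit="7GB") as client:
#       create_ensemble_files(scenarios, var_names)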