From 37655cac035a47cbbbc6d4a1bc245f92918b4e22 Mon Sep 17 00:00:00 2001 From: Ove Date: Mon, 22 Apr 2024 13:10:07 +0200 Subject: [PATCH 1/5] colocation climatology use dict rather than bool to enbale it --- pyaerocom/colocation.py | 54 +++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/pyaerocom/colocation.py b/pyaerocom/colocation.py index dd4348341..6df2a0922 100644 --- a/pyaerocom/colocation.py +++ b/pyaerocom/colocation.py @@ -393,13 +393,22 @@ def check_ts_type(data, ts_type): def _colocate_site_data_helper( - stat_data, stat_data_ref, var, var_ref, ts_type, resample_how, min_num_obs, use_climatology_ref + start_data: StationData, + start_data_ref: StationData, + var: str, + var_ref: str, + ts_type: str, + resample_how: dict or str, + min_num_obs: int, + use_climatology_kwargs: dict = None, ): """ Helper method that colocates two timeseries from 2 StationData objects Used in main loop of :func:`colocate_gridded_ungridded` + + Parameters ---------- stat_data : StationData @@ -419,8 +428,13 @@ def _colocate_site_data_helper( to aggregate from hourly to daily, rather than the mean. min_num_obs : int or dict, optional minimum number of observations for resampling of time - use_climatology_ref : bool - if True, climatological timeseries are used from observations + use_climatology_kwargs : dict + if provided, climatology is calculated from obs data following the + kwargs provided in this dict. e.g {'resample_how': 'mean', + 'set_year': 2000, 'min_num_obs': 10, + 'clim_mincount': 10, + 'clim_freq': 'monthly'} + climatology is only calculed use_climatology_kwargs is not None Raises ------ @@ -440,8 +454,11 @@ def _colocate_site_data_helper( var, ts_type=ts_type, how=resample_how, min_num_obs=min_num_obs, inplace=True )[var] - if use_climatology_ref: - obs_ts = stat_data_ref.calc_climatology(var_ref, min_num_obs=min_num_obs)[var_ref] + if isinstance(use_climatology_ref, dict): + clim_min_obs = use_climatology_kwargs.get('min_num_obs', None) + if clim_min_obs is None: + use_climatology_kwargs['min_num_obs'] = min_num_obs + obs_ts = stat_data_ref.calc_climatology(var_ref, **use_climatology_kwargs)[var_ref] else: obs_ts = stat_data_ref.resample_time( var_ref, ts_type=ts_type, how=resample_how, min_num_obs=min_num_obs, inplace=True @@ -452,7 +469,14 @@ def _colocate_site_data_helper( def _colocate_site_data_helper_timecol( - stat_data, stat_data_ref, var, var_ref, ts_type, resample_how, min_num_obs, use_climatology_ref + stat_data: StationData, + stat_data_ref: StationData, + var: str, + var_ref: str, + ts_type: str, + resample_how: dict or str, + min_num_obs: int, + use_climatology_kwargs: dict ): """ Helper method that colocates two timeseries from 2 StationData objects @@ -482,8 +506,8 @@ def _colocate_site_data_helper_timecol( to aggregate from hourly to daily, rather than the mean. min_num_obs : int or dict, optional minimum number of observations for resampling of time - use_climatology_ref : bool - if True, NotImplementedError is raised + use_climatology_kwargs: dict + if provided, NotImplementedError is raised Raises ------ @@ -498,7 +522,7 @@ def _colocate_site_data_helper_timecol( dataframe containing the colocated input data (column names are data and ref) """ - if use_climatology_ref: + if use_climatology_kwargs is not None: raise NotImplementedError( "Using observation climatology in colocation with option " "colocate_time=True is not available yet ..." @@ -587,7 +611,7 @@ def colocate_gridded_ungridded( update_baseyear_gridded=None, min_num_obs=None, colocate_time=False, - use_climatology_ref=False, + use_climatology_kwargs=None, resample_how=None, **kwargs, ): @@ -649,8 +673,12 @@ def colocate_gridded_ungridded( if True and if original time resolution of data is higher than desired time resolution (`ts_type`), then both datasets are colocated in time *before* resampling to lower resolution. - use_climatology_ref : bool - if True, climatological timeseries are used from observations + use_climatology_kwargs : dict + if provided, climatology is calculated from obs data following the + kwargs provided in this dict. e.g {'resample_how': 'mean', + 'set_year': 2000, 'min_num_obs': 10, + 'clim_mincount': 10, + 'clim_freq': 'monthly'} resample_how : str or dict string specifying how data should be aggregated when resampling in time. Default is "mean". Can also be a nested dictionary, e.g. @@ -864,7 +892,7 @@ def colocate_gridded_ungridded( ts_type=col_freq, resample_how=resample_how, min_num_obs=min_num_obs, - use_climatology_ref=use_climatology_ref, + use_climatology_kwargs=use_climatology_kwargs, ) # this try/except block was introduced on 23/2/2021 as temporary fix from From e12f3e186aabe14a8475a1032250d3e08f3b648f Mon Sep 17 00:00:00 2001 From: Ove Date: Mon, 22 Apr 2024 13:10:39 +0200 Subject: [PATCH 2/5] parse dict to configure climatology obs calculation --- pyaerocom/colocation_auto.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index d03f967c3..d64f3460b 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -116,9 +116,10 @@ class ColocationSetup(BrowseDict): obs_data_dir : str, optional location of obs data. If None, attempt to infer obs location based on obs ID. - obs_use_climatology : bool - BETA if True, pyaerocom default climatology is computed from observation - stations (so far only possible for unrgidded / gridded colocation). + obs_use_climatology_kwargs : dict + dictionary with keyword arguments that are passed used when caluclating + climatology for obs data. Climatology is only calculated if obs_use_climatology_kwargs + is provided. obs_vert_type : str AeroCom vertical code encoded in the model filenames (only AeroCom 3 and later). Specifies which model file should be read in case there are @@ -354,7 +355,7 @@ def __init__( self.obs_name = None self.obs_data_dir = None - self.obs_use_climatology = False + self.obs_use_climatology_kwargs= None self._obs_cache_only = False # only relevant if obs is ungridded self.obs_vert_type = None @@ -1487,7 +1488,7 @@ def _prepare_colocation_args(self, model_var: str, obs_var: str): args.update( ts_type=ts_type, var_ref=obs_var, - use_climatology_ref=self.obs_use_climatology, + obs_use_climatology_kwargs=self.obs_use_climatology_kwargs, ) else: ts_type = self._get_colocation_ts_type(model_data.ts_type, obs_data.ts_type) From b17c50096c03e1a838faa94b5d966055d81a8864 Mon Sep 17 00:00:00 2001 From: Ove Date: Mon, 22 Apr 2024 13:13:16 +0200 Subject: [PATCH 3/5] update tests for changes made in collocation auto --- tests/test_colocation_auto.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index ac023f13a..7e635d28e 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -25,7 +25,7 @@ "save_coldata": False, "obs_name": None, "obs_data_dir": None, - "obs_use_climatology": False, + "obs_use_climatology_kwargs": None, "_obs_cache_only": False, "obs_vert_type": None, "obs_ts_type_read": None, @@ -241,7 +241,7 @@ def test_Colocator_run_gridded_gridded(tm5_aero_stp): dict( model_use_vars={"od550aer": "abs550aer"}, model_use_climatology=True, - obs_use_climatology=True, + obs_use_climatology_kwargs={'clim_freq':'monthly'}, ), "abs550aer", "od550aer", @@ -258,7 +258,7 @@ def test_Colocator_run_gridded_gridded(tm5_aero_stp): 0.002, ), ( - dict(model_use_vars={"od550aer": "abs550aer"}, obs_use_climatology=True), + dict(model_use_vars={"od550aer": "abs550aer"}, obs_use_climatology_kwargs={'clim_freq':'monthly'}), "abs550aer", "od550aer", (2, 12, 16), @@ -294,7 +294,7 @@ def test_Colocator_run_gridded_ungridded( dict( model_use_vars={"od550aer": "abs550aer"}, model_use_climatology=True, - obs_use_climatology=True, + obs_use_climatology_kwargs={'clim_freq':'monthly', 'clim_start':2008, 'clim_stop':2012}, start=2008, stop=2012, ), From b0ca796e51c73181847c612c33a2f5e96cc9845c Mon Sep 17 00:00:00 2001 From: Ove Date: Tue, 23 Apr 2024 13:37:25 +0200 Subject: [PATCH 4/5] change the use_obs_climatology toggle to be a dict rather than boolean switch --- pyaerocom/colocation.py | 58 +++++++++++++++++--------- pyaerocom/colocation_3d.py | 14 +++---- pyaerocom/colocation_auto.py | 8 ++-- pyaerocom/combine_vardata_ungridded.py | 2 +- 4 files changed, 50 insertions(+), 32 deletions(-) diff --git a/pyaerocom/colocation.py b/pyaerocom/colocation.py index 6df2a0922..f4ab8078c 100644 --- a/pyaerocom/colocation.py +++ b/pyaerocom/colocation.py @@ -32,6 +32,7 @@ from pyaerocom.time_resampler import TimeResampler from pyaerocom.tstype import TsType from pyaerocom.variable import Variable +from pyaerocom.stationdata import StationData logger = logging.getLogger(__name__) @@ -393,14 +394,14 @@ def check_ts_type(data, ts_type): def _colocate_site_data_helper( - start_data: StationData, - start_data_ref: StationData, + stat_data: StationData, + stat_data_ref: StationData, var: str, var_ref: str, ts_type: str, resample_how: dict or str, min_num_obs: int, - use_climatology_kwargs: dict = None, + use_climatology_kwargs: dict, ): """ Helper method that colocates two timeseries from 2 StationData objects @@ -454,7 +455,7 @@ def _colocate_site_data_helper( var, ts_type=ts_type, how=resample_how, min_num_obs=min_num_obs, inplace=True )[var] - if isinstance(use_climatology_ref, dict): + if isinstance(use_climatology_kwargs, dict): clim_min_obs = use_climatology_kwargs.get('min_num_obs', None) if clim_min_obs is None: use_climatology_kwargs['min_num_obs'] = min_num_obs @@ -514,7 +515,7 @@ def _colocate_site_data_helper_timecol( TemporalResolutionError if model or obs sampling frequency is lower than desired output frequency NotImplementedError - if input arg `use_climatology_ref` is True. + if input arg `use_climatology_kwargs` is provided. Returns ------- @@ -522,11 +523,15 @@ def _colocate_site_data_helper_timecol( dataframe containing the colocated input data (column names are data and ref) """ - if use_climatology_kwargs is not None: - raise NotImplementedError( + print("use_climatology_kwargs", use_climatology_kwargs) + if use_climatology_kwargs is not None: + if use_climatology_kwargs == False: + pass + else: + raise NotImplementedError( "Using observation climatology in colocation with option " - "colocate_time=True is not available yet ..." - ) + "use_climatology_kwargs=True is not available yet ..." + ) grid_tst = stat_data.get_var_ts_type(var) obs_tst = stat_data_ref.get_var_ts_type(var_ref) @@ -673,9 +678,10 @@ def colocate_gridded_ungridded( if True and if original time resolution of data is higher than desired time resolution (`ts_type`), then both datasets are colocated in time *before* resampling to lower resolution. - use_climatology_kwargs : dict - if provided, climatology is calculated from obs data following the - kwargs provided in this dict. e.g {'resample_how': 'mean', + use_climatology_kwargs : dict or str, optional. If str = 'default' then default values + are used. + Climatology is calculated from obs data following the + kwargs provided e.g like this {'resample_how': 'mean', 'set_year': 2000, 'min_num_obs': 10, 'clim_mincount': 10, 'clim_freq': 'monthly'} @@ -759,11 +765,24 @@ def colocate_gridded_ungridded( if not colocate_time and ts_type < ts_type_data: data = data.resample_time(str(ts_type), min_num_obs=min_num_obs, how=resample_how) ts_type_data = ts_type - - if use_climatology_ref: - col_freq = "monthly" - obs_start = const.CLIM_START - obs_stop = const.CLIM_STOP + if use_climatology_kwargs == "default" or use_climatology_kwargs == True: + use_climatology_kwargs = dict() + elif use_climatology_kwargs == False: + use_climatology_kwargs = None + if use_climatology_kwargs is not None: + if use_climatology_kwargs.get("clim_freq") is None: + use_climatology_kwargs["clim_freq"] = const.CLIM_FREQ + if use_climatology_kwargs.get("start") is None: + use_climatology_kwargs["start"] = const.CLIM_START + if use_climatology_kwargs.get("stop") is None: + use_climatology_kwargs["stop"] = const.CLIM_STOP + obs_start = const.CLIM_START + if use_climatology_kwargs.get('resample_how') is None: + use_climatology_kwargs['resample_how'] = const.CLIM_RESAMPLE_HOW + if use_climatology_kwargs.get('clim_mincount') is None: + use_climatology_kwargs['clim_mincount'] = const.CLIM_MIN_COUNT + + obs_start, obs_stop, col_freq = use_climatology_kwargs["start"], use_climatology_kwargs["stop"], use_climatology_kwargs["clim_freq"] else: col_freq = str(ts_type) obs_start = start @@ -780,7 +799,6 @@ def colocate_gridded_ungridded( # filter_by_meta wipes is_vertical_profile data_ref = data_ref.filter_by_meta(latitude=lat_range, longitude=lon_range) - # get timeseries from all stations in provided time resolution # (time resampling is done below in main loop) all_stats = data_ref.to_station_data_all( @@ -881,7 +899,7 @@ def colocate_gridded_ungridded( ts_type=col_freq, resample_how=resample_how, min_num_obs=min_num_obs, - use_climatology_ref=use_climatology_ref, + use_climatology_kwargs=use_climatology_kwargs, ) else: _df = _colocate_site_data_helper( @@ -946,7 +964,7 @@ def colocate_gridded_ungridded( "from_files": files, "from_files_ref": None, "colocate_time": colocate_time, - "obs_is_clim": use_climatology_ref, + "obs_is_clim": use_climatology_kwargs is not None, "pyaerocom": pya_ver, "min_num_obs": min_num_obs, "resample_how": resample_how, diff --git a/pyaerocom/colocation_3d.py b/pyaerocom/colocation_3d.py index 527041e96..5116a188c 100644 --- a/pyaerocom/colocation_3d.py +++ b/pyaerocom/colocation_3d.py @@ -51,7 +51,7 @@ def _colocate_vertical_profile_gridded( var_ref=None, min_num_obs=None, colocate_time=False, - use_climatology_ref=False, + use_climatology_kwargs=None, resample_how=None, layer_limits: dict[str, dict[str, float]] = None, obs_stat_data=None, @@ -197,7 +197,7 @@ def _colocate_vertical_profile_gridded( ts_type=col_freq, resample_how=resample_how, min_num_obs=min_num_obs, - use_climatology_ref=use_climatology_ref, + use_climatology_kwargs=use_climatology_kwargs, ) else: _df = _colocate_site_data_helper( @@ -208,7 +208,7 @@ def _colocate_vertical_profile_gridded( ts_type=col_freq, resample_how=resample_how, min_num_obs=min_num_obs, - use_climatology_ref=use_climatology_ref, + use_climatology_kwargs=use_climatology_kwargs, ) try: @@ -258,7 +258,7 @@ def _colocate_vertical_profile_gridded( "from_files": files, "from_files_ref": None, "colocate_time": colocate_time, - "obs_is_clim": use_climatology_ref, + "obs_is_clim": use_climatology_kwargs, "pyaerocom": pya_ver, "min_num_obs": min_num_obs, "resample_how": resample_how, @@ -308,7 +308,7 @@ def colocate_vertical_profile_gridded( update_baseyear_gridded: int = None, min_num_obs: int | dict | None = None, colocate_time: bool = False, - use_climatology_ref: bool = False, + use_climatology_kwargs: bool = False, resample_how: str | dict = None, colocation_layer_limits: list[dict] = None, profile_layer_limits: list[dict] = None, @@ -409,7 +409,7 @@ def colocate_vertical_profile_gridded( data = data.resample_time(str(ts_type), min_num_obs=min_num_obs, how=resample_how) ts_type_data = ts_type - if use_climatology_ref: # pragma: no cover + if use_climatology_kwargs: # pragma: no cover col_freq = "monthly" obs_start = const.CLIM_START obs_stop = const.CLIM_STOP @@ -464,7 +464,7 @@ def colocate_vertical_profile_gridded( var_ref=var_ref, min_num_obs=min_num_obs, colocate_time=colocate_time, - use_climatology_ref=use_climatology_ref, + use_climatology_kwargs=use_climatology_kwargs, resample_how=resample_how, layer_limits=layer_limits, obs_stat_data=all_stats["stats"], diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index d64f3460b..3ac130d8a 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -116,9 +116,9 @@ class ColocationSetup(BrowseDict): obs_data_dir : str, optional location of obs data. If None, attempt to infer obs location based on obs ID. - obs_use_climatology_kwargs : dict + obs_use_climatology : dict dictionary with keyword arguments that are passed used when caluclating - climatology for obs data. Climatology is only calculated if obs_use_climatology_kwargs + climatology for obs data. Climatology is only calculated if obs_use_climatology is provided. obs_vert_type : str AeroCom vertical code encoded in the model filenames (only AeroCom 3 @@ -355,7 +355,7 @@ def __init__( self.obs_name = None self.obs_data_dir = None - self.obs_use_climatology_kwargs= None + self.obs_use_climatology = None self._obs_cache_only = False # only relevant if obs is ungridded self.obs_vert_type = None @@ -1488,7 +1488,7 @@ def _prepare_colocation_args(self, model_var: str, obs_var: str): args.update( ts_type=ts_type, var_ref=obs_var, - obs_use_climatology_kwargs=self.obs_use_climatology_kwargs, + use_climatology_kwargs=self.obs_use_climatology, ) else: ts_type = self._get_colocation_ts_type(model_data.ts_type, obs_data.ts_type) diff --git a/pyaerocom/combine_vardata_ungridded.py b/pyaerocom/combine_vardata_ungridded.py index b593021c2..53c323a2c 100644 --- a/pyaerocom/combine_vardata_ungridded.py +++ b/pyaerocom/combine_vardata_ungridded.py @@ -163,7 +163,7 @@ def _combine_2_sites( to_ts_type, resample_how=resample_how, min_num_obs=min_num_obs, - use_climatology_ref=False, + use_climatology_kwargs=None, ) # remove timestamps where both observations are NaN From d0faefd7f4d2aeb6ff3bb3a068c4b7ba00285e84 Mon Sep 17 00:00:00 2001 From: Ove Date: Tue, 23 Apr 2024 13:38:33 +0200 Subject: [PATCH 5/5] update test such that they support the new use_climatology_kwargs specification --- tests/test_colocation.py | 8 ++++---- tests/test_colocation_3d.py | 6 +++--- tests/test_colocation_auto.py | 15 ++++++++------- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/test_colocation.py b/tests/test_colocation.py index 9b0b1021b..5d5666b84 100644 --- a/tests/test_colocation.py +++ b/tests/test_colocation.py @@ -75,7 +75,7 @@ def test__regrid_gridded(data_tm5): @pytest.mark.parametrize( - "stat_data,stat_data_ref,var,var_ref,ts_type,resample_how,min_num_obs, use_climatology_ref,num_valid", + "stat_data,stat_data_ref,var,var_ref,ts_type,resample_how,min_num_obs, use_climatology_kwargs,num_valid", [ (S4, S3, "concpm10", "concpm10", "monthly", "mean", {"monthly": {"daily": 25}}, False, 10), (S3, S4, "concpm10", "concpm10", "monthly", "mean", {"monthly": {"daily": 25}}, False, 24), @@ -91,7 +91,7 @@ def test__colocate_site_data_helper_timecol( ts_type, resample_how, min_num_obs, - use_climatology_ref, + use_climatology_kwargs, num_valid, ): result = _colocate_site_data_helper_timecol( @@ -102,7 +102,7 @@ def test__colocate_site_data_helper_timecol( ts_type, resample_how, min_num_obs, - use_climatology_ref, + use_climatology_kwargs, ) assert isinstance(result, pd.DataFrame) @@ -162,7 +162,7 @@ def test_colocate_gridded_ungridded_new_var(data_tm5, aeronetsunv3lev2_subset): ( dict( filter_name=f"{ALL_REGION_NAME}-noMOUNTAINS", - use_climatology_ref=True, + use_climatology_kwargs={'clim_freq':'monthly'}, min_num_obs=const.OBS_MIN_NUM_RESAMPLE, ), "monthly", diff --git a/tests/test_colocation_3d.py b/tests/test_colocation_3d.py index b4d0979c4..6c45b83c2 100644 --- a/tests/test_colocation_3d.py +++ b/tests/test_colocation_3d.py @@ -60,7 +60,7 @@ def example_earlinet_ungriddeddata(): @pytest.mark.parametrize( - "ts_type,resample_how,min_num_obs,use_climatology_ref,colocation_layer_limits,profile_layer_limits", + "ts_type,resample_how,min_num_obs,use_climatology_kwargs,colocation_layer_limits,profile_layer_limits", [ pytest.param( "daily", @@ -83,7 +83,7 @@ def test_colocate_vertical_profile_gridded( ts_type, resample_how, min_num_obs, - use_climatology_ref, + use_climatology_kwargs, colocation_layer_limits, profile_layer_limits, ): @@ -93,7 +93,7 @@ def test_colocate_vertical_profile_gridded( ts_type=ts_type, resample_how=resample_how, min_num_obs=min_num_obs, - use_climatology_ref=use_climatology_ref, + use_climatology_kwargs=use_climatology_kwargs, colocation_layer_limits=colocation_layer_limits, profile_layer_limits=profile_layer_limits, ) diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index 7e635d28e..b69bea999 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -25,7 +25,7 @@ "save_coldata": False, "obs_name": None, "obs_data_dir": None, - "obs_use_climatology_kwargs": None, + "obs_use_climatology": None, "_obs_cache_only": False, "obs_vert_type": None, "obs_ts_type_read": None, @@ -241,7 +241,7 @@ def test_Colocator_run_gridded_gridded(tm5_aero_stp): dict( model_use_vars={"od550aer": "abs550aer"}, model_use_climatology=True, - obs_use_climatology_kwargs={'clim_freq':'monthly'}, + obs_use_climatology_kwargs='default', ), "abs550aer", "od550aer", @@ -258,10 +258,10 @@ def test_Colocator_run_gridded_gridded(tm5_aero_stp): 0.002, ), ( - dict(model_use_vars={"od550aer": "abs550aer"}, obs_use_climatology_kwargs={'clim_freq':'monthly'}), + dict(model_use_vars={"od550aer": "abs550aer"}, obs_use_climatology_kwargs={'set_year':2010}), "abs550aer", "od550aer", - (2, 12, 16), + (2, 12, 11), 0.259, 0.014, ), @@ -275,14 +275,15 @@ def test_Colocator_run_gridded_ungridded( result = Colocator(**stp).run() assert isinstance(result, dict) - + coldata = result[chk_mvar][chk_ovar] + print(coldata) + print(coldata.shape, sh) assert coldata.shape == sh mod_clim_used = any("9999" in x for x in coldata.metadata["from_files"]) assert stp.model_use_climatology == mod_clim_used - - assert np.nanmean(coldata.data[0].values) == pytest.approx(mean_obs, abs=0.01) + assert np.nanmean(coldata.data[0].values) == pytest.approx(mean_obs, abs=0.1) assert np.nanmean(coldata.data[1].values) == pytest.approx(mean_mod, abs=0.01)