Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Average time #211

Merged
merged 8 commits into from
Feb 10, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ v0.8.0 (unreleased)
New Features
^^^^^^^^^^^^
* ``clisops.core.average.average_shape`` copies the global and variable attributes from the input data to the results.
* ``clisops.ops.average.average_time`` and ``clisops.core.average.average_time`` added. Allowing averaging over time frequencies of day, month and year.

Bug fixes
^^^^^^^^^
Expand Down
71 changes: 66 additions & 5 deletions clisops/core/average.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
"""Average module."""
from pathlib import Path
from typing import Tuple, Union, Sequence
import warnings
import numpy as np
from pathlib import Path
from typing import Sequence, Tuple, Union

import geopandas as gpd
import numpy as np
import xarray as xr
from roocs_utils.exceptions import InvalidParameterValue
from roocs_utils.xarray_utils.xarray_utils import get_coord_type, known_coord_types
from roocs_utils.xarray_utils.xarray_utils import (
get_coord_by_type,
get_coord_type,
known_coord_types,
)

from .subset import shape_bbox_indexer

__all__ = ["average_over_dims", "average_shape"]
__all__ = ["average_over_dims", "average_shape", "average_time"]

# see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
freqs = {"day": "1D", "month": "1MS", "year": "1AS"}


def average_shape(
Expand Down Expand Up @@ -216,3 +225,55 @@ def average_over_dims(
if isinstance(ds, xr.Dataset):
return xr.merge((ds_averaged_over_dims, untouched_ds))
return ds_averaged_over_dims


def average_time(
ds: Union[xr.DataArray, xr.Dataset],
freq: str,
) -> Union[xr.DataArray, xr.Dataset]:
"""
Average a DataArray or Dataset over the time frequency specified.

Parameters
----------
ds : Union[xr.DataArray, xr.Dataset]
Input values.
freq: str
The frequency to average over. One of "month", "year".

Returns
-------
Union[xr.DataArray, xr.Dataset]
New Dataset or DataArray object averaged over the indicated time frequency.

Examples
--------
>>> from clisops.core.average import average_time # doctest: +SKIP
>>> pr = xr.open_dataset(path_to_pr_file).pr # doctest: +SKIP
...
# Average data array over each month
>>> prAvg = average_time(pr, freq='month') # doctest: +SKIP
"""

if not freq:
raise InvalidParameterValue(
"At least one frequency for averaging must be provided"
)

if freq not in list(freqs.keys()):
raise InvalidParameterValue(
f"Time frequency for averaging must be one of {list(freqs.keys())}."
)

# check time coordinate exists and get name
t = get_coord_by_type(ds, "time", ignore_aux_coords=False)

if t is None:
raise Exception("Time dimension could not be found")

# resample and average over time
ds_avg_over_time = ds.resample(indexer={t.name: freqs[freq]}).mean(
dim=t.name, skipna=True, keep_attrs=True
)

return ds_avg_over_time
77 changes: 74 additions & 3 deletions clisops/ops/average.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import List, Optional, Tuple, Union

import xarray as xr
from roocs_utils.exceptions import InvalidParameterValue
from roocs_utils.parameter.dimension_parameter import DimensionParameter
from roocs_utils.xarray_utils.xarray_utils import (
convert_coord_to_axis,
Expand All @@ -17,9 +18,7 @@
from clisops.utils.file_namers import get_file_namer
from clisops.utils.output_utils import get_output, get_time_slices

__all__ = [
"average_over_dims",
]
__all__ = ["average_over_dims", "average_time"]

LOGGER = logging.getLogger(__file__)

Expand Down Expand Up @@ -97,3 +96,75 @@ def average_over_dims(
"""
op = Average(**locals())
return op.process()


class AverageTime(Operation):
def _resolve_params(self, **params):
freq = params.get("freq", None)

if not freq:
raise InvalidParameterValue(
"At least one frequency for averaging must be provided"
)

if freq not in list(average.freqs.keys()):
raise InvalidParameterValue(
f"Time frequency for averaging must be one of {list(average.freqs.keys())}."
)

self.params = {"freq": freq}

def _get_file_namer(self):
extra = f"_avg-{self.params.get('freq')}"
namer = get_file_namer(self._file_namer)(extra=extra)

return namer

def _calculate(self):
avg_ds = average.average_time(
self.ds,
self.params.get("freq", None),
)

return avg_ds


def average_time(
ds,
freq: str,
output_dir: Optional[Union[str, Path]] = None,
output_type="netcdf",
split_method="time:auto",
file_namer="standard",
) -> List[Union[xr.Dataset, str]]:
"""

Parameters
----------
ds: Union[xr.Dataset, str]
freq: str
The frequency to average over. One of "month", "year".
output_dir: Optional[Union[str, Path]] = None
output_type: {"netcdf", "nc", "zarr", "xarray"}
split_method: {"time:auto"}
file_namer: {"standard", "simple"}

Returns
-------
List[Union[xr.Dataset, str]]
A list of the outputs in the format selected; str corresponds to file paths if the
output format selected is a file.

Examples
--------
| ds: xarray Dataset or "cmip5.output1.MOHC.HadGEM2-ES.rcp85.mon.atmos.Amon.r1i1p1.latest.tas"
| dims: ['latitude', 'longitude']
| ignore_undetected_dims: False
| output_dir: "/cache/wps/procs/req0111"
| output_type: "netcdf"
| split_method: "time:auto"
| file_namer: "standard"

"""
op = AverageTime(**locals())
return op.process()
4 changes: 2 additions & 2 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ Subset operation
:show-inheritance:


Average operation
=================
Average operations
==================

.. automodule:: clisops.ops.average
:noindex:
Expand Down
53 changes: 53 additions & 0 deletions tests/core/test_average.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
import xarray as xr
from pkg_resources import parse_version
from roocs_utils.exceptions import InvalidParameterValue
from roocs_utils.xarray_utils import xarray_utils as xu

from clisops.core import average
from clisops.utils import get_file

from .._common import CMIP5_RH, CMIP6_SICONC_DAY
from .._common import XCLIM_TESTS_DATA as TESTS_DATA

try:
Expand Down Expand Up @@ -173,3 +175,54 @@ def test_average_wrong_format(self):
str(exc.value)
== "Dimensions for averaging must be one of ['time', 'level', 'latitude', 'longitude']"
)


class TestAverageTime:
month_ds = CMIP5_RH
day_ds = CMIP6_SICONC_DAY

def test_average_month(self):
ds = xu.open_xr_dataset(self.day_ds)
assert ds.time.shape == (60225,)

avg_ds = average.average_time(ds, freq="month")
assert avg_ds.time.shape == (1980,)

def test_average_year(self):
ds = xu.open_xr_dataset(self.month_ds)
assert ds.time.shape == (1752,)

avg_ds = average.average_time(ds, freq="year")
assert avg_ds.time.shape == (147,)

def test_no_freq(self):
ds = xu.open_xr_dataset(self.month_ds)

with pytest.raises(InvalidParameterValue) as exc:
average.average_time(ds, freq=None)
assert str(exc.value) == "At least one frequency for averaging must be provided"

def test_incorrect_freq(self):
ds = xu.open_xr_dataset(self.month_ds)

with pytest.raises(InvalidParameterValue) as exc:
average.average_time(ds, freq="wrong")
assert (
str(exc.value)
== "Time frequency for averaging must be one of ['day', 'month', 'year']."
)

def test_freq_wrong_format(self):
ds = xu.open_xr_dataset(self.month_ds)

with pytest.raises(InvalidParameterValue) as exc:
average.average_time(ds, freq=0)
assert str(exc.value) == "At least one frequency for averaging must be provided"

def test_no_time(self):
ds = xu.open_xr_dataset(self.month_ds)
ds = ds.drop_dims("time")

with pytest.raises(Exception) as exc:
average.average_time(ds, freq="year")
assert str(exc.value) == "Time dimension could not be found"
113 changes: 110 additions & 3 deletions tests/ops/test_average.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@

import clisops
from clisops import CONFIG
from clisops.ops.average import average_over_dims
from clisops.ops.average import average_over_dims, average_time

from .._common import CMIP5_TAS
from .._common import C3S_CORDEX_EUR_ZG500, CMIP5_TAS, CMIP6_SICONC_DAY


def _check_output_nc(result, fname="output_001.nc"):
assert fname in [os.path.basename(_) for _ in result]


def _load_ds(fpath):
return xr.open_mfdataset(fpath)
return xr.open_mfdataset(fpath, use_cftime=True)


def test_average_basic_data_array(cmip5_tas_file):
Expand Down Expand Up @@ -217,3 +217,110 @@ def test_aux_variables():
)

assert "do_i_get_written" in result[0].variables


def test_average_over_years():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ellesmith88 this is all looking good and the tests seem to cover everything we'd need.
There is one issue that we might need to consider philosophically (about what xarray is doing):

In the test test_average_over_years():

  • the input data starts on 16/12/2005 and ends 16/12/2299.
  • the output data starts on 01/01/2005 and ends on 01/01/2299

In CF terms, we can think of the new time array as being a series of time periods rather than time points (i.e. time point 1 represents the entire year of 2005). In which case, the expected output array would typically be defined with a time axis and time_bounds bounds array.

And typically the first time point would be:

time[0] = 2005-07-01 (i.e. halfway(ish) through the year)
time_bounds[0] = [2005-01-01, 2299-12-31]

However, that is not a definitive rule. I think the first thing to do is to test the values of the bounds. If the bounds come out correctly then we can probably just accept what xarray is doing.

Please can you extend this test (and some of the others) to include a check for bounds values for time in a way that is logical.

Note that xarray has all these different frequencies:

https://github.com/pydata/xarray/blob/main/xarray/coding/cftime_offsets.py#L605-L608

I'm not sure if the YearEnd and YearBegin options might be relevant to this issue in some way.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@agstephens having a look at the output arrays, it seems that there are no time bounds.

For example, from the test test_average_over_years(), the output dataset is:

<xarray.Dataset>
Dimensions:   (time: 295, lat: 2, bnds: 2, lon: 2)
Coordinates:
  * time      (time) object 2005-01-01 00:00:00 ... 2299-01-01 00:00:00
    height    float64 1.5
  * lat       (lat) float64 -90.0 35.0
  * lon       (lon) float64 0.0 187.5
Dimensions without coordinates: bnds
Data variables:
    lat_bnds  (time, lat, bnds) float64 dask.array<chunksize=(1, 2, 2), meta=np.ndarray>
    lon_bnds  (time, lon, bnds) float64 dask.array<chunksize=(1, 2, 2), meta=np.ndarray>
    tas       (time, lat, lon) float32 dask.array<chunksize=(1, 2, 2), meta=np.ndarray>

The frequencies I'm using for calculating the time average are:
month: 1MS
year: 1AS

and I think the 'S' indicates using the start of the year or start of the month for resampling. I think this is probably what we want it to do, but I might be confused!
Other options include one such as AS-JUN, but I think this means the average is calculated over June - May rather than over Jan - Dec.

There are options in xarray resample (https://xarray.pydata.org/en/stable/generated/xarray.Dataset.resample.html) to change the label for the output times, but only to the left or right of the interval, so this isn't very helpful.

I can't see anything else about changing the label/ indicating the time period each time point covers, but will have another look to see what other options there might be

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @ellesmith88, thanks for looking into this.

I wonder if they have looked into this within the CDS code. Please can you get in touch with James (ECMWF) to ask for his thoughts on this issue. I wonder if they have come across and tackled this problem - or whether they are just happy to accept what xarray produces.

Thanks

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ellesmith88 Just looking at your work so far on this. I would propose that we might want to re-use a create_time_bounds(...) function in other code (in future). So please can you move out that functionality into a separate function. I think it might get used outside of clisops.core and clisops.ops so you could put them in a new module: clisops.utils.time_utils.

ds = _load_ds(CMIP5_TAS) # monthly dataset

# check initial dataset
assert ds.time.shape == (3530,)
assert ds.time.values[0].isoformat() == "2005-12-16T00:00:00"
assert ds.time.values[-1].isoformat() == "2299-12-16T00:00:00"

result = average_time(
CMIP5_TAS,
freq="year",
output_type="xarray",
)

time_length = ds.time.values[-1].year - ds.time.values[0].year + 1

assert result[0].time.shape == (time_length,) # get number of years
assert result[0].time.values[0].isoformat() == "2005-01-01T00:00:00"
assert result[0].time.values[-1].isoformat() == "2299-01-01T00:00:00"


def test_average_over_months():
ds = _load_ds(CMIP6_SICONC_DAY) # monthly dataset

# check initial dataset
assert ds.time.shape == (60225,)
assert ds.time.values[0].isoformat() == "1850-01-01T12:00:00"
assert ds.time.values[-1].isoformat() == "2014-12-31T12:00:00"

# average over time
result = average_time(
CMIP6_SICONC_DAY,
freq="month",
output_type="xarray",
)

time_length = (
ds.time.values[-1].year - ds.time.values[0].year + 1
) * 12 # get number of months

assert result[0].time.shape == (time_length,)
assert result[0].time.values[0].isoformat() == "1850-01-01T00:00:00"
assert result[0].time.values[-1].isoformat() == "2014-12-01T00:00:00"


def test_average_time_no_freq():
with pytest.raises(InvalidParameterValue) as exc:
# average over time
average_time(
CMIP6_SICONC_DAY,
freq=None,
output_type="xarray",
)
assert str(exc.value) == "At least one frequency for averaging must be provided"


def test_average_time_incorrect_freq():
with pytest.raises(InvalidParameterValue) as exc:
# average over time
average_time(
CMIP6_SICONC_DAY,
freq="week",
output_type="xarray",
)
assert (
str(exc.value)
== "Time frequency for averaging must be one of ['day', 'month', 'year']."
)


def test_average_time_file_name(tmpdir):
result = average_time(
CMIP5_TAS,
freq="year",
output_type="nc",
output_dir=tmpdir,
)

_check_output_nc(
result, fname="tas_mon_HadGEM2-ES_rcp85_r1i1p1_20050101-22990101_avg-year.nc"
)


def test_average_time_cordex():
ds = _load_ds(C3S_CORDEX_EUR_ZG500)

# check initial dataset
assert ds.time.shape == (3653,)
assert ds.time.values[0].isoformat() == "2071-01-01T12:00:00"
assert ds.time.values[-1].isoformat() == "2080-12-31T12:00:00"

# average over time
result = average_time(
C3S_CORDEX_EUR_ZG500,
freq="month",
output_type="xarray",
)

time_length = (
ds.time.values[-1].year - ds.time.values[0].year + 1
) * 12 # get number of months

assert result[0].time.shape == (time_length,)
assert result[0].time.values[0].isoformat() == "2071-01-01T00:00:00"
assert result[0].time.values[-1].isoformat() == "2080-12-01T00:00:00"