From bbfc476f945258b9f2aff4696cf2c0415adcb5d0 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Mon, 10 Jun 2024 20:29:25 +0200 Subject: [PATCH 01/23] Introduce new arguments limit_direction, limit_area, limit_use coordinate --- xarray/core/dataarray.py | 140 ++++++++--- xarray/core/dataset.py | 142 +++++++---- xarray/core/missing.py | 315 +++++++++++++++++------- xarray/core/types.py | 2 + xarray/tests/test_missing.py | 460 +++++++++++++++++++++++++++++++++-- 5 files changed, 867 insertions(+), 192 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 84f229bf575..e48a07cc883 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -103,6 +103,8 @@ ErrorOptions, ErrorOptionsWithWarn, InterpOptions, + LimitAreaOptions, + LimitDirectionOptions, PadModeOptions, PadReflectOptions, QuantileMethods, @@ -3476,10 +3478,21 @@ def fillna(self, value: Any) -> Self: def interpolate_na( self, - dim: Hashable | None = None, + dim: Hashable, method: InterpOptions = "linear", - limit: int | None = None, - use_coordinate: bool | str = True, + use_coordinate: bool | Hashable = True, + limit: ( + None + | int + | float + | str + | pd.Timedelta + | np.timedelta64 + | datetime.timedelta + ) = None, + limit_direction: LimitDirectionOptions = "forward", + limit_area: LimitAreaOptions | None = None, + limit_use_coordinate: bool | Hashable = False, max_gap: ( None | int @@ -3496,7 +3509,7 @@ def interpolate_na( Parameters ---------- - dim : Hashable or None, optional + dim : Hashable Specifies the dimension along which to interpolate. method : {"linear", "nearest", "zero", "slinear", "quadratic", "cubic", "polynomial", \ "barycentric", "krogh", "pchip", "spline", "akima"}, default: "linear" @@ -3511,17 +3524,54 @@ def interpolate_na( - 'barycentric', 'krogh', 'pchip', 'spline', 'akima': use their respective :py:class:`scipy.interpolate` classes. 
- use_coordinate : bool or str, default: True + use_coordinate : bool or Hashable, default: True Specifies which index to use as the x values in the interpolation - formulated as `y = f(x)`. If False, values are treated as if - equally-spaced along ``dim``. If True, the IndexVariable `dim` is - used. If ``use_coordinate`` is a string, it specifies the name of a - coordinate variable to use as the index. - limit : int or None, default: None - Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. This filling is done regardless of the size of - the gap in the data. To only interpolate over gaps less than a given length, + formulated as `y = f(x)`. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. + + limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum number or distance of consecutive NaNs to fill. + Use None for no limit. When interpolating along a datetime64 dimension + and ``limit_use_coordinate=True``, ``limit`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``limit`` must be an int or a float. + If ``limit_use_coordinates=True``, for ``limit_direction=forward`` distance is defined + as the difference between the coordinate at a NaN value and the coordinate of the next valid value + to the left (right for ``limit_direction=backward``). + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. + To only interpolate over gaps less than a given length, see ``max_gap``. 
+ limit_direction: {"forward", "backward", "both"}, default: "forward" + Consecutive NaNs will be filled in this direction. + limit_area: {"inside", "outside"} or None: default: None + Consecutive NaNs will be filled with this restriction. + + - None: No fill restriction. + - "inside": Only fill NaNs surrounded by valid values (interpolate). + - "outside": Only fill NaNs outside valid values (extrapolate). + + limit_use_coordinate : bool or Hashable, default: True + Specifies which index to use for the ``limit`` distance. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. + max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None Maximum size of gap, a continuous sequence of NaNs, that will be filled. Use None for no limit. When interpolating along a datetime64 dimension @@ -3532,8 +3582,8 @@ def interpolate_na( - a :py:class:`pandas.Timedelta` object - a :py:class:`datetime.timedelta` object - Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled - dimensions has not been implemented yet. Gap length is defined as the difference + Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer + index is created. Gap length is defined as the difference between coordinate values at the first data point after a gap and the last value before a gap. For gaps at the beginning (end), gap length is defined as the difference between coordinate values at the first (last) valid data point and the first (last) NaN. @@ -3557,33 +3607,62 @@ def interpolate_na( interpolated: DataArray Filled in DataArray. + Warning + -------- + When passing fill_value as a keyword argument with method="linear", it does not use + ``numpy.interp`` but it uses ``scipy.interpolate.interp1d``, which provides the fill_value parameter. 
+ See Also -------- numpy.interp scipy.interpolate + pandas.DataFrame.interpolate + + Notes + ----- + ``Limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. Examples -------- >>> da = xr.DataArray( - ... [np.nan, 2, 3, np.nan, 0], dims="x", coords={"x": [0, 1, 2, 3, 4]} + ... [np.nan, 2, np.nan, np.nan, 5, np.nan, 0], + ... dims="x", + ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, ... ) >>> da - Size: 40B - array([nan, 2., 3., nan, 0.]) + + array([nan, 2., nan, nan, 5., nan, 0.]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 - + * x (x) int64 0 1 2 3 4 5 6 >>> da.interpolate_na(dim="x", method="linear") - Size: 40B - array([nan, 2. , 3. , 1.5, 0. ]) + + array([nan, 2. , 3. , 4. , 5. , 2.5, 0. ]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 - - >>> da.interpolate_na(dim="x", method="linear", fill_value="extrapolate") - Size: 40B - array([1. , 2. , 3. , 1.5, 0. ]) + * x (x) int64 0 1 2 3 4 5 6 + >>> da.interpolate_na( + ... dim="x", + ... method="linear", + ... limit_direction="both", + ... fill_value="extrapolate", + ... ) + + array([1. , 2. , 3. , 4. , 5. , 2.5, 0. ]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 0 1 2 3 4 5 6 + >>> da.interpolate_na( + ... dim="x", method="linear", limit=1, limit_direction="forward" + ... ) + + array([nan, 2. , 3. , nan, 5. , 2.5, 0. ]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 + >>> da.interpolate_na( + ... dim="x", method="linear", max_gap=2, limit_direction="forward" + ... ) + + array([nan, 2. , nan, nan, 5. , 2.5, 0. 
]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 """ from xarray.core.missing import interp_na @@ -3591,8 +3670,11 @@ def interpolate_na( self, dim=dim, method=method, - limit=limit, use_coordinate=use_coordinate, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + limit_use_coordinate=limit_use_coordinate, max_gap=max_gap, keep_attrs=keep_attrs, **kwargs, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index dbc00a03025..065594481bc 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -156,6 +156,8 @@ ErrorOptionsWithWarn, InterpOptions, JoinOptions, + LimitAreaOptions, + LimitDirectionOptions, PadModeOptions, PadReflectOptions, QueryEngineOptions, @@ -6590,10 +6592,21 @@ def fillna(self, value: Any) -> Self: def interpolate_na( self, - dim: Hashable | None = None, + dim: Hashable, method: InterpOptions = "linear", - limit: int | None = None, use_coordinate: bool | Hashable = True, + limit: ( + None + | int + | float + | str + | pd.Timedelta + | np.timedelta64 + | datetime.timedelta + ) = None, + limit_direction: LimitDirectionOptions = "forward", + limit_area: LimitAreaOptions | None = None, + limit_use_coordinate: bool | Hashable = False, max_gap: ( int | float @@ -6603,13 +6616,14 @@ def interpolate_na( | datetime.timedelta | None ) = None, + keep_attrs: bool | None = None, **kwargs: Any, ) -> Self: """Fill in NaNs by interpolating according to different methods. Parameters ---------- - dim : Hashable or None, optional + dim : Hashable Specifies the dimension along which to interpolate. method : {"linear", "nearest", "zero", "slinear", "quadratic", "cubic", "polynomial", \ "barycentric", "krogh", "pchip", "spline", "akima"}, default: "linear" @@ -6626,15 +6640,52 @@ def interpolate_na( use_coordinate : bool or Hashable, default: True Specifies which index to use as the x values in the interpolation - formulated as `y = f(x)`. If False, values are treated as if - equally-spaced along ``dim``. 
If True, the IndexVariable `dim` is - used. If ``use_coordinate`` is a string, it specifies the name of a - coordinate variable to use as the index. - limit : int, default: None - Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. This filling is done regardless of the size of - the gap in the data. To only interpolate over gaps less than a given length, + formulated as `y = f(x)`. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. + + limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum number or distance of consecutive NaNs to fill. + Use None for no limit. When interpolating along a datetime64 dimension + and ``limit_use_coordinate=True``, ``limit`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``limit`` must be an int or a float. + If ``limit_use_coordinates=True``, for ``limit_direction=forward`` distance is defined + as the difference between the coordinate at a NaN value and the coordinate of the next valid value + to the left (right for ``limit_direction=backward``). + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. + To only interpolate over gaps less than a given length, see ``max_gap``. + limit_direction: {"forward", "backward", "both"}, default: "forward" + Consecutive NaNs will be filled in this direction. + limit_area: {"inside", "outside"} or None: default: None + Consecutive NaNs will be filled with this restriction. + + - None: No fill restriction. 
+ - "inside": Only fill NaNs surrounded by valid values (interpolate). + - "outside": Only fill NaNs outside valid values (extrapolate). + + limit_use_coordinate : bool or Hashable, default: True + Specifies which index to use for the ``limit`` distance. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. + max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta \ or None, default: None Maximum size of gap, a continuous sequence of NaNs, that will be filled. @@ -6646,8 +6697,8 @@ def interpolate_na( - a :py:class:`pandas.Timedelta` object - a :py:class:`datetime.timedelta` object - Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled - dimensions has not been implemented yet. Gap length is defined as the difference + Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer + index is created. Gap length is defined as the difference between coordinate values at the first data point after a gap and the last value before a gap. For gaps at the beginning (end), gap length is defined as the difference between coordinate values at the first (last) valid data point and the first (last) NaN. @@ -6659,6 +6710,10 @@ def interpolate_na( * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + keep_attrs : bool or None, default: None + If True, the dataarray's attributes (`attrs`) will be copied from + the original object to the new one. If False, the new + object will be returned without attributes. 
**kwargs : dict, optional parameters passed verbatim to the underlying interpolation function @@ -6677,49 +6732,50 @@ def interpolate_na( numpy.interp scipy.interpolate + Notes + ----- + ``Limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. + Examples -------- >>> ds = xr.Dataset( ... { - ... "A": ("x", [np.nan, 2, 3, np.nan, 0]), - ... "B": ("x", [3, 4, np.nan, 1, 7]), - ... "C": ("x", [np.nan, np.nan, np.nan, 5, 0]), - ... "D": ("x", [np.nan, 3, np.nan, -1, 4]), + ... "A": ("x", [np.nan, 2, np.nan, np.nan, 5, np.nan, 0]), + ... "B": ("x", [np.nan, 2, np.nan, np.nan, 5, 6, np.nan]), ... }, - ... coords={"x": [0, 1, 2, 3, 4]}, + ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, ... ) >>> ds - Size: 200B - Dimensions: (x: 5) + + Dimensions: (x: 7) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 0 1 2 3 4 5 6 Data variables: - A (x) float64 40B nan 2.0 3.0 nan 0.0 - B (x) float64 40B 3.0 4.0 nan 1.0 7.0 - C (x) float64 40B nan nan nan 5.0 0.0 - D (x) float64 40B nan 3.0 nan -1.0 4.0 - - >>> ds.interpolate_na(dim="x", method="linear") - Size: 200B - Dimensions: (x: 5) + A (x) float64 nan 2.0 nan nan 5.0 nan 0.0 + B (x) float64 nan 2.0 nan nan 5.0 6.0 nan + >>> ds.interpolate_na( + ... dim="x", + ... method="linear", + ... limit_direction="both", + ... fill_value="extrapolate", + ... ) + + Dimensions: (x: 7) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 0 1 2 3 4 5 6 Data variables: - A (x) float64 40B nan 2.0 3.0 1.5 0.0 - B (x) float64 40B 3.0 4.0 2.5 1.0 7.0 - C (x) float64 40B nan nan nan 5.0 0.0 - D (x) float64 40B nan 3.0 1.0 -1.0 4.0 - - >>> ds.interpolate_na(dim="x", method="linear", fill_value="extrapolate") - Size: 200B - Dimensions: (x: 5) + A (x) float64 1.0 2.0 3.0 4.0 5.0 2.5 0.0 + B (x) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 + >>> ds.interpolate_na( + ... 
dim="x", method="linear", limit=1, limit_direction="forward" + ... ) + + Dimensions: (x: 7) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 0 1 2 3 4 5 6 Data variables: - A (x) float64 40B 1.0 2.0 3.0 1.5 0.0 - B (x) float64 40B 3.0 4.0 2.5 1.0 7.0 - C (x) float64 40B 20.0 15.0 10.0 5.0 0.0 - D (x) float64 40B 5.0 3.0 1.0 -1.0 4.0 + A (x) float64 nan 2.0 3.0 nan 5.0 2.5 0.0 + B (x) float64 nan 2.0 3.0 nan 5.0 6.0 nan """ from xarray.core.missing import _apply_over_vars_with_dim, interp_na diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 187a93d322f..2f8993d9705 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -10,7 +10,6 @@ import numpy as np import pandas as pd -from xarray.core import utils from xarray.core.common import _contains_datetime_like_objects, ones_like from xarray.core.computation import apply_ufunc from xarray.core.duck_array_ops import ( @@ -20,7 +19,12 @@ timedelta_to_numeric, ) from xarray.core.options import _get_keep_attrs -from xarray.core.types import Interp1dOptions, InterpOptions +from xarray.core.types import ( + Interp1dOptions, + InterpOptions, + LimitAreaOptions, + LimitDirectionOptions, +) from xarray.core.utils import OrderedSet, is_scalar from xarray.core.variable import Variable, broadcast_variables from xarray.namedarray.parallelcompat import get_chunked_array_type @@ -31,6 +35,84 @@ from xarray.core.dataset import Dataset +def _get_gap_left_edge( + obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False +): + arange = ones_like(obj) * index + left = arange.where(~obj.isnull()).ffill(dim) + if outside: + return left.fillna(index[0]) + return left + + +def _get_gap_right_edge( + obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False +): + arange = ones_like(obj) * index + right = arange.where(~obj.isnull()).bfill(dim) + if outside: + return right.fillna(index[-1]) + return right + + +def _get_gap_dist_to_left_edge( + obj: Dataset | 
DataArray | Variable, dim: Hashable, index: Variable +): + arange = ones_like(obj) * index + return arange - _get_gap_left_edge(obj, dim, index) + + +def _get_gap_dist_to_right_edge( + obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable +): + arange = ones_like(obj) * index + return _get_gap_right_edge(obj, dim, index) - arange + + +def _get_limit_fill_mask( + obj: Dataset | DataArray | Variable, + dim: Hashable, + index: Variable, + limit, + limit_direction, +): + if limit_direction == "forward": + limit_mask = _get_gap_dist_to_left_edge(obj, dim, index) <= limit + elif limit_direction == "backward": + limit_mask = _get_gap_dist_to_right_edge(obj, dim, index) <= limit + elif limit_direction == "both": + limit_mask = (_get_gap_dist_to_left_edge(obj, dim, index) <= limit) | ( + _get_gap_dist_to_right_edge(obj, dim, index) <= limit + ) + else: + raise ValueError( + f"limit_direction must be one of 'forward', 'backward', 'both'. Got {limit_direction}" + ) + return limit_mask + + +def _get_limit_area_mask( + obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, limit_area +): + if limit_area == "inside": + area_mask = ( + _get_gap_left_edge(obj, dim, index).notnull() + & _get_gap_right_edge(obj, dim, index).notnull() + ) + area_mask = area_mask | obj.notnull() + elif limit_area == "outside": + area_mask = ( + _get_gap_left_edge(obj, dim, index).isnull() + | _get_gap_right_edge(obj, dim, index).isnull() + ) + area_mask = area_mask | obj.notnull() + else: + raise ValueError( + f"limit_area must be one of 'inside', 'outside' or None. Got {limit_area}" + ) + return area_mask + + def _get_nan_block_lengths( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable ): @@ -38,26 +120,78 @@ def _get_nan_block_lengths( Return an object where each NaN element in 'obj' is replaced by the length of the gap the element is in. 
""" + return _get_gap_right_edge(obj, dim, index, outside=True) - _get_gap_left_edge( + obj, dim, index, outside=True + ) - # make variable so that we get broadcasting for free - index = Variable([dim], index) - # algorithm from https://github.com/pydata/xarray/pull/3302#discussion_r324707072 - arange = ones_like(obj) * index - valid = obj.notnull() - valid_arange = arange.where(valid) - cumulative_nans = valid_arange.ffill(dim=dim).fillna(index[0]) - - nan_block_lengths = ( - cumulative_nans.diff(dim=dim, label="upper") - .reindex({dim: obj[dim]}) - .where(valid) - .bfill(dim=dim) - .where(~valid, 0) - .fillna(index[-1] - valid_arange.max(dim=[dim])) +def _get_max_gap_mask( + obj: Dataset | DataArray | Variable, + dim: Hashable, + index: Variable, + max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta, +): + nan_block_lengths = _get_nan_block_lengths(obj, dim, index) + return nan_block_lengths <= max_gap + + +def _get_gap_masks( + obj: Dataset | DataArray | Variable, + dim: Hashable, + limit=None, + limit_direction="both", + limit_area=None, + limit_use_coordinate=False, + max_gap=None, + max_gap_use_coordinate=False, +): + # Input checking + ##Limit + if not is_scalar(limit): + raise ValueError("limit must be a scalar.") + + if limit is None: + limit = np.inf + else: + if limit_use_coordinate is False: + if not isinstance(limit, (Number, np.number)): + raise TypeError( + f"Expected integer or floating point limit since limit_use_coordinate=False. Received {type(limit).__name__}." 
+ ) + if _is_time_index(_get_raw_interp_index(obj, dim, limit_use_coordinate)): + limit = timedelta_to_numeric(limit) + + ## Max_gap + if max_gap is not None: + if not is_scalar(max_gap): + raise ValueError("max_gap must be a scalar.") + + if _is_time_index(_get_raw_interp_index(obj, dim, max_gap_use_coordinate)): + max_gap = timedelta_to_numeric(max_gap) + + if not max_gap_use_coordinate: + if not isinstance(max_gap, (Number, np.number)): + raise TypeError( + f"Expected integer or floating point max_gap since use_coordinate=False. Received {type(max_gap).__name__}." + ) + # Calculate indexes + index_limit = get_clean_interp_index(obj, dim, use_coordinate=limit_use_coordinate) + index_max_gap = get_clean_interp_index( + obj, dim, use_coordinate=max_gap_use_coordinate ) + # Calculate fill masks + limit_mask = None + if limit != np.inf or limit_direction != "both": + limit_mask = _get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction) - return nan_block_lengths + limit_area_mask = None + if limit_area is not None: + limit_area_mask = _get_limit_area_mask(obj, dim, index_limit, limit_area) + + max_gap_mask = None + if max_gap is not None: + max_gap_mask = _get_max_gap_mask(obj, dim, index_max_gap, max_gap) + return limit_mask, limit_area_mask, max_gap_mask class BaseInterpolator: @@ -224,8 +358,39 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs): return ds +def _get_raw_interp_index(arr, dim: Hashable, use_coordinate: bool | Hashable = True): + """Return index to use for x values in interpolation or curve fitting. 
+ In comparison to get_clean_interp_index, this function does not convert + to numeric values.""" + + if dim not in arr.dims: + raise ValueError(f"{dim} is not a valid dimension") + + if use_coordinate is False: + return pd.RangeIndex(arr.sizes[dim], name=dim) + + elif use_coordinate is True: + coordinate = arr.coords[ + dim + ] # this will default to a linear coordinate, if no index is present + else: # string/hashable + coordinate = arr.coords[use_coordinate] + if dim not in coordinate.dims: + raise ValueError( + f"Coordinate given by {use_coordinate} must have dimension {dim}." + ) + + if coordinate.ndim != 1: + raise ValueError( + f"Coordinates used for interpolation must be 1D, " + f"{use_coordinate} is {coordinate.ndim}D." + ) + index = coordinate.to_index() + return index + + def get_clean_interp_index( - arr, dim: Hashable, use_coordinate: str | bool = True, strict: bool = True + arr, dim: Hashable, use_coordinate: bool | Hashable = True, strict: bool = True ): """Return index to use for x values in interpolation or curve fitting. @@ -235,7 +400,7 @@ def get_clean_interp_index( Array to interpolate or fit to a curve. dim : str Name of dimension along which to fit. - use_coordinate : str or bool + use_coordinate : bool or hashable If use_coordinate is True, the coordinate that shares the name of the dimension along which interpolation is being performed will be used as the x values. If False, the x values are set as an equally spaced sequence. @@ -253,26 +418,10 @@ def get_clean_interp_index( to time deltas with respect to 1970-01-01. """ - # Question: If use_coordinate is a string, what role does `dim` play? 
from xarray.coding.cftimeindex import CFTimeIndex - if use_coordinate is False: - axis = arr.get_axis_num(dim) - return np.arange(arr.shape[axis], dtype=np.float64) - - if use_coordinate is True: - index = arr.get_index(dim) - - else: # string - index = arr.coords[use_coordinate] - if index.ndim != 1: - raise ValueError( - f"Coordinates used for interpolation must be 1D, " - f"{use_coordinate} is {index.ndim}D." - ) - index = index.to_index() - - # TODO: index.name is None for multiindexes + index = _get_raw_interp_index(arr, dim, use_coordinate) + # index.name is None for multiindexes # set name for nice error messages below if isinstance(index, pd.MultiIndex): index.name = dim @@ -305,51 +454,52 @@ def get_clean_interp_index( f"Index {index.name!r} must be castable to float64 to support " f"interpolation or curve fitting, got {type(index).__name__}." ) - + index = Variable([dim], index) return index +def _is_time_index(index): + from xarray.coding.cftimeindex import CFTimeIndex + + return isinstance(index, (pd.DatetimeIndex, CFTimeIndex)) + + def interp_na( self, dim: Hashable | None = None, - use_coordinate: bool | str = True, method: InterpOptions = "linear", - limit: int | None = None, - max_gap: ( - int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None - ) = None, + use_coordinate: bool | str = True, + limit: int + | float + | str + | pd.Timedelta + | np.timedelta64 + | dt.timedelta + | None = None, + limit_direction: LimitDirectionOptions = "forward", + limit_area: LimitAreaOptions | None = None, + limit_use_coordinate: bool + | str = False, # backward compatibility + pandas (2.1.4) compatibility + max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None = None, keep_attrs: bool | None = None, **kwargs, ): """Interpolate values according to different methods.""" - from xarray.coding.cftimeindex import CFTimeIndex + # Preprocess arguments and do consistency checks if dim is None: raise NotImplementedError("dim is a 
required argument") - if limit is not None: - valids = _get_valid_fill_mask(self, dim, limit) - - if max_gap is not None: - max_type = type(max_gap).__name__ - if not is_scalar(max_gap): - raise ValueError("max_gap must be a scalar.") - - if ( - dim in self._indexes - and isinstance( - self._indexes[dim].to_pandas_index(), pd.DatetimeIndex | CFTimeIndex - ) - and use_coordinate - ): - # Convert to float - max_gap = timedelta_to_numeric(max_gap) - - if not use_coordinate: - if not isinstance(max_gap, Number | np.number): - raise TypeError( - f"Expected integer or floating point max_gap since use_coordinate=False. Received {max_type}." - ) + masks = _get_gap_masks( + self, + dim, + limit, + limit_direction, + limit_area, + limit_use_coordinate, + max_gap, + use_coordinate, + ) # method index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate) @@ -365,7 +515,7 @@ def interp_na( arr = apply_ufunc( interpolator, self, - index, + index.values, input_core_dims=[[dim], [dim]], output_core_dims=[[dim]], output_dtypes=[self.dtype], @@ -374,16 +524,9 @@ def interp_na( keep_attrs=keep_attrs, ).transpose(*self.dims) - if limit is not None: - arr = arr.where(valids) - - if max_gap is not None: - if dim not in self.coords: - raise NotImplementedError( - "max_gap not implemented for unlabeled coordinates yet." - ) - nan_block_lengths = _get_nan_block_lengths(self, dim, index) - arr = arr.where(nan_block_lengths <= max_gap) + for m in masks: + if m is not None: + arr = arr.where(m) return arr @@ -535,20 +678,6 @@ def _get_interpolator_nd(method, **kwargs): return interp_class, kwargs -def _get_valid_fill_mask(arr, dim, limit): - """helper function to determine values that can be filled when limit is not - None""" - kw = {dim: limit + 1} - # we explicitly use construct method to avoid copy. 
- new_dim = utils.get_temp_dimname(arr.dims, "_window") - return ( - arr.isnull() - .rolling(min_periods=1, **kw) - .construct(new_dim, fill_value=False) - .sum(new_dim, skipna=False) - ) <= limit - - def _localize(var, indexes_coords): """Speed up for linear and nearest neighbor method. Only consider a subspace that is needed for the interpolation diff --git a/xarray/core/types.py b/xarray/core/types.py index 3eb97f86c4a..68cc9e07149 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -220,6 +220,8 @@ def copy( ] InterpolantOptions = Literal["barycentric", "krogh", "pchip", "spline", "akima"] InterpOptions = Union[Interp1dOptions, InterpolantOptions] +LimitDirectionOptions = Literal["forward", "backward", "both"] +LimitAreaOptions = Literal["inside", "outside"] DatetimeUnitOptions = Literal[ "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as", None diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index bd75f633b82..ea46f9a11f5 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -11,6 +11,12 @@ NumpyInterpolator, ScipyInterpolator, SplineInterpolator, + _get_gap_dist_to_left_edge, + _get_gap_dist_to_right_edge, + _get_gap_left_edge, + _get_gap_right_edge, + _get_limit_area_mask, + _get_limit_fill_mask, _get_nan_block_lengths, get_clean_interp_index, ) @@ -108,12 +114,9 @@ def test_interpolate_pd_compat(method, fill_value) -> None: for dim in ["time", "x"]: actual = da.interpolate_na(method=method, dim=dim, fill_value=fill_value) - # need limit_direction="both" here, to let pandas fill - # in both directions instead of default forward direction only expected = df.interpolate( method=method, axis=da.get_axis_num(dim), - limit_direction="both", fill_value=fill_value, ) @@ -189,6 +192,48 @@ def test_interpolate_pd_compat_polynomial(): np.testing.assert_allclose(actual.values, expected.values) +@requires_scipy +def test_interpolate_pd_compat_limits(): + shapes = [(7, 7)] + frac_nan = 
0.5 + method = "slinear" # need slinear, since pandas does constant extrapolation for methods 'time', 'index', 'values' + limits = [ + None, + 1, + 3, + ] # pandas 2.1.4 is currently unable to handle coordinate based limits! + limit_directions = [ + "forward", + "backward", + ] # xarray does not support 'None' (pandas: None='forward', unless method='bfill') + limit_areas = [None, "outside", "inside"] + + for shape, limit, limit_direction, limit_area in itertools.product( + shapes, limits, limit_directions, limit_areas + ): + da, df = make_interpolate_example_data(shape, frac_nan, non_uniform=True) + for dim in ["time", "x"]: + actual = da.interpolate_na( + method=method, + dim=dim, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + use_coordinate=True, + limit_use_coordinate=False, + fill_value="extrapolate", + ) + expected = df.interpolate( + method=method, + axis=da.get_axis_num(dim), + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value="extrapolate", + ) + np.testing.assert_allclose(actual.values, expected.values) + + @requires_scipy def test_interpolate_unsorted_index_raises(): vals = np.array([1, 2, 3], dtype=np.float64) @@ -197,12 +242,6 @@ def test_interpolate_unsorted_index_raises(): expected.interpolate_na(dim="x", method="index") -def test_interpolate_no_dim_raises(): - da = xr.DataArray(np.array([1, 2, np.nan, 5], dtype=np.float64), dims="x") - with pytest.raises(NotImplementedError, match=r"dim is a required argument"): - da.interpolate_na(method="linear") - - def test_interpolate_invalid_interpolator_raises(): da = xr.DataArray(np.array([1, 2, np.nan, 5], dtype=np.float64), dims="x") with pytest.raises(ValueError, match=r"not a valid"): @@ -303,18 +342,57 @@ def test_interp1d_fastrack(method, vals): @requires_bottleneck def test_interpolate_limits(): - da = xr.DataArray( - np.array([1, 2, np.nan, np.nan, np.nan, 6], dtype=np.float64), dims="x" + n = np.nan + coord_deltas = 
pd.TimedeltaIndex(unit="H", data=np.arange(8) * 2) + coords = {"yt": ("y", pd.Timestamp("2000-01-01") + coord_deltas)} + da = xr.DataArray([n, n, 2, n, n, 5, n, n], dims=["y"], coords=coords) + + actual = da.interpolate_na(dim="y", limit=None, fill_value="extrapolate") + expected = da.copy(data=[n, n, 2, 3, 4, 5, 6, 7]) + assert_equal(actual, expected) + + actual = da.interpolate_na(dim="y", limit=1, fill_value="extrapolate") + expected = da.copy(data=[n, n, 2, 3, n, 5, 6, n]) + assert_equal(actual, expected) + + actual = da.interpolate_na( + dim="y", + limit=pd.Timedelta("3H"), + limit_use_coordinate="yt", + fill_value="extrapolate", ) + expected = da.copy(data=[n, n, 2, 3, n, 5, 6, n]) + assert_equal(actual, expected) - actual = da.interpolate_na(dim="x", limit=None) - assert actual.isnull().sum() == 0 - actual = da.interpolate_na(dim="x", limit=2) - expected = xr.DataArray( - np.array([1, 2, 3, 4, np.nan, 6], dtype=np.float64), dims="x" +def test_interpolate_double_coordinate(): + # Check if limit is using 'limit_use_coordinate' and max_gap is using 'use_coordinate' + n = np.nan + da = xr.DataArray( + [[1, n, n, 4, n, 6, 7], [1, n, n, n, 5, n, n]], + dims=["x", "y"], + coords={"y1": ("y", np.arange(7)), "y2": ("y", np.arange(7) * 2)}, + ) + actual = da.interpolate_na( + "y", + limit=1, + max_gap=4, + limit_use_coordinate="y1", + use_coordinate="y2", + fill_value="extrapolate", ) + expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) + assert_equal(actual, expected) + actual = da.interpolate_na( + "y", + limit=3, + max_gap=3, + limit_use_coordinate="y2", + use_coordinate="y1", + fill_value="extrapolate", + ) + expected = da.copy(data=[[1, 2, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) assert_equal(actual, expected) @@ -559,6 +637,114 @@ def test_bfill_dataset(ds): ds.ffill(dim="time") +def test_get_gap_left_edge(): + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + + y = np.arange(9) * 3 + da = 
xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_left_edge(da, dim="y", index=index) + expected = da.copy( + data=[[n, 3, 3, 3, 3, 3, 3, 3, 24], [n, n, n, 9, 9, 9, 18, 18, 18]] + ) + assert_equal(actual, expected) + + actual = _get_gap_left_edge(da, dim="y", index=index, outside=True) + expected = da.copy( + data=[[0, 3, 3, 3, 3, 3, 3, 3, 24], [0, 0, 0, 9, 9, 9, 18, 18, 18]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_left_edge(da, dim="y", index=index) + expected = da.copy( + data=[[n, 2, 2, 2, 2, 2, 2, 2, 14], [n, n, n, 6, 6, 6, 10, 10, 10]] + ) + + +def test_get_gap_right_edge(): + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + + y = np.arange(9) * 3 + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_right_edge(da, dim="y", index=index) + expected = da.copy( + data=[[3, 3, 24, 24, 24, 24, 24, 24, 24], [9, 9, 9, 9, 18, 18, 18, n, n]] + ) + assert_equal(actual, expected) + + actual = _get_gap_right_edge(da, dim="y", index=index, outside=True) + expected = da.copy( + data=[[3, 3, 24, 24, 24, 24, 24, 24, 24], [9, 9, 9, 9, 18, 18, 18, 24, 24]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_right_edge(da, dim="y", index=index) + expected = da.copy( + data=[[2, 2, 14, 14, 14, 14, 14, 14, 14], [6, 6, 6, 6, 10, 10, 10, n, n]] + ) + + +def test_get_gap_dist_to_left_edge(): + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, 
n], + ] + + y = np.arange(9) * 3 + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_left_edge(da, dim="y", index=index) + expected = da.copy( + data=[[n, 0, 3, 6, 9, 12, 15, 18, 0], [n, n, n, 0, 3, 6, 0, 3, 6]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_left_edge(da, dim="y", index=index) + expected = da.copy(data=[[n, 0, 3, 4, 5, 6, 8, 10, 0], [n, n, n, 0, 1, 2, 0, 2, 4]]) + + +def test_get_gap_dist_to_right_edge(): + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + + y = np.arange(9) * 3 + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_right_edge(da, dim="y", index=index) + expected = da.copy( + data=[[3, 0, 18, 15, 12, 9, 6, 3, 0], [9, 6, 3, 0, 6, 3, 0, n, n]] + ) + assert_equal(actual, expected) + + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_gap_dist_to_right_edge(da, dim="y", index=index) + expected = da.copy(data=[[2, 0, 9, 8, 7, 6, 4, 2, 0], [5, 3, 0, 4, 3, 2, 0, n, n]]) + + @requires_bottleneck @pytest.mark.parametrize( "y, lengths_expected", @@ -586,6 +772,82 @@ def test_interpolate_na_nan_block_lengths(y, lengths_expected): assert_equal(actual, expected) +def test_get_limit_fill_mask(): + T = True + F = False + n = np.nan + arr = [ + [n, 1, n, n, n, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, 
dim="y", use_coordinate=True) + + with pytest.raises(ValueError, match=r"limit_direction must be one of"): + _get_limit_fill_mask(da, dim="y", index=index, limit=3, limit_direction="cat") + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=3, limit_direction="forward" + ) + expected = da.copy(data=[[F, T, T, F, F, F, F, F, T], [F, F, F, T, T, T, T, T, F]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=3, limit_direction="backward" + ) + expected = da.copy(data=[[T, T, F, F, F, F, F, T, T], [F, F, T, T, T, T, T, F, F]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=3, limit_direction="both" + ) + expected = da.copy(data=[[T, T, T, F, F, F, F, T, T], [F, F, T, T, T, T, T, T, F]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=1, limit_direction="forward" + ) + expected = da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, F, T, T, F, T, F, F]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=1, limit_direction="backward" + ) + expected = da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, T, T, F, F, T, F, F]]) + assert_equal(actual, expected) + + actual = _get_limit_fill_mask( + da, dim="y", index=index, limit=1, limit_direction="both" + ) + expected = da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, T, T, T, F, T, F, F]]) + assert_equal(actual, expected) + + +def test_get_area_mask(): + T = True + F = False + n = np.nan + arr = [ + [n, 1, n, n, 5, n, n, n, 4], + [n, n, n, 1, n, n, 4, n, n], + ] + y = [0, 2, 5, 6, 7, 8, 10, 12, 14] + da = xr.DataArray(arr, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + + with pytest.raises(ValueError, match=r"limit_area must be one of"): + _get_limit_area_mask(da, dim="y", index=index, limit_area="cow") + + actual = 
_get_limit_area_mask(da, dim="y", index=index, limit_area="inside") + expected = da.copy(data=[[F, T, T, T, T, T, T, T, T], [F, F, F, T, T, T, T, F, F]]) + assert_equal(actual, expected) + + actual = _get_limit_area_mask(da, dim="y", index=index, limit_area="outside") + expected = da.copy(data=[[T, T, F, F, T, F, F, F, T], [T, T, T, T, F, F, T, T, T]]) + assert_equal(actual, expected) + + @requires_cftime @pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) def test_get_clean_interp_index_cf_calendar(cf_da, calendar): @@ -635,6 +897,31 @@ def test_get_clean_interp_index_strict(index): assert clean.dtype == np.float64 +def test_get_clean_interp_index_double_coordinate(): + da = xr.DataArray( + np.ones((2, 7)), + dims=["x", "y"], + coords={ + "x": ("x", [10, 20]), + "y1": ("y", np.arange(7) * 2), + "y2": ("y", np.arange(7) * 3), + }, + ) + with pytest.raises(ValueError, match=r"not a valid dimension"): + get_clean_interp_index(da, "y1", use_coordinate=True) + + actual = get_clean_interp_index(da, "y", use_coordinate=True) + expected = xr.Variable(["y"], np.arange(7)) + assert_equal(actual, expected) + + actual = get_clean_interp_index(da, "y", use_coordinate="y1") + expected = xr.Variable(["y"], np.arange(7) * 2) + assert_equal(actual, expected) + + with pytest.raises(ValueError, match=r"must have dimension"): + get_clean_interp_index(da, "x", use_coordinate="y1") + + @pytest.fixture def da_time(): return xr.DataArray( @@ -644,11 +931,6 @@ def da_time(): def test_interpolate_na_max_gap_errors(da_time): - with pytest.raises( - NotImplementedError, match=r"max_gap not implemented for unlabeled coordinates" - ): - da_time.interpolate_na("t", max_gap=1) - with pytest.raises(ValueError, match=r"max_gap must be a scalar."): da_time.interpolate_na("t", max_gap=(1,)) @@ -687,12 +969,16 @@ def test_interpolate_na_max_gap_time_specifier( @pytest.mark.parametrize( "coords", [ - pytest.param(None, marks=pytest.mark.xfail()), + None, {"x": np.arange(4), "y": np.arange(12)}, ], 
) -def test_interpolate_na_2d(coords): +def test_interpolate_na_max_gap_2d(coords): n = np.nan + if coords is None: + use_coordinate = False + else: + use_coordinate = True da = xr.DataArray( [ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], @@ -704,7 +990,7 @@ def test_interpolate_na_2d(coords): coords=coords, ) - actual = da.interpolate_na("y", max_gap=2) + actual = da.interpolate_na("y", use_coordinate=use_coordinate, max_gap=2) expected_y = da.copy( data=[ [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, n], @@ -715,18 +1001,20 @@ def test_interpolate_na_2d(coords): ) assert_equal(actual, expected_y) - actual = da.interpolate_na("y", max_gap=1, fill_value="extrapolate") + actual = da.interpolate_na( + "y", use_coordinate=use_coordinate, max_gap=1, fill_value="extrapolate" + ) expected_y_extra = da.copy( data=[ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], [n, n, 3, n, n, 6, n, n, n, 10, n, n], [n, n, 3, n, n, 6, n, n, n, 10, n, n], - [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], ] ) assert_equal(actual, expected_y_extra) - actual = da.interpolate_na("x", max_gap=3) + actual = da.interpolate_na("x", use_coordinate=use_coordinate, max_gap=3) expected_x = xr.DataArray( [ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], @@ -740,6 +1028,124 @@ def test_interpolate_na_2d(coords): assert_equal(actual, expected_x) +def test_interpolate_na_limit_2d(): + n = np.nan + coord_deltas = pd.TimedeltaIndex(unit="H", data=np.arange(12) * 3) + coords = { + "x": np.arange(3) * 2, + "time": (pd.Timestamp("2000-01-01") + coord_deltas), + } + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + coords=coords, + ) + + actual = da.interpolate_na("time", limit=1, fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, 7, n, n, 10, 11, 12], + [n, n, 3, 4, n, 6, 7, n, n, 10, 11, n], + [n, 2, 3, 4, 5, 6, 7, n, n, 10, 11, 12], + ] + ) + assert_equal(actual, 
expected) + + actual = da.interpolate_na( + "time", limit=2, limit_direction="backward", fill_value="extrapolate" + ) + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, n, 8, 9, 10, 11, n], + [1, 2, 3, 4, 5, 6, n, 8, 9, 10, n, n], + [1, 2, 3, 4, 5, 6, n, 8, 9, 10, 11, n], + ] + ) + assert_equal(actual, expected) + + actual = da.interpolate_na( + "time", + limit=pd.Timedelta("3H"), + limit_direction="backward", + limit_area="inside", + limit_use_coordinate=True, + fill_value="extrapolate", + ) + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, n, n, 9, 10, 11, n], + [n, n, 3, n, 5, 6, n, n, 9, 10, n, n], + [n, 2, 3, 4, 5, 6, n, n, 9, 10, 11, n], + ] + ) + + actual = da.interpolate_na( + "time", + limit=pd.Timedelta("3H"), + limit_direction="backward", + limit_area="outside", + limit_use_coordinate=True, + fill_value="extrapolate", + ) + expected = da.copy( + data=[ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, 2, 3, n, n, 6, n, n, n, 10, n, n], + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ] + ) + assert_equal(actual, expected) + + actual = da.interpolate_na( + "time", + limit=None, + limit_direction="backward", + limit_area="outside", + limit_use_coordinate=True, + fill_value=8, + ) + expected = da.copy( + data=[ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [8, 8, 3, n, n, 6, n, n, n, 10, n, n], + [8, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ] + ) + assert_equal(actual, expected) + + da = xr.DataArray( + [ + [1, 1, n, n, 1, 1], + [n, 2, 2, n, 2, n], + [n, n, 3, 3, n, n], + [n, n, n, 4, 4, 4], + ], + dims=["x", "y"], + coords={"x": np.arange(4) * 2}, + ) + actual = da.interpolate_na( + method="linear", + dim="x", + limit=3, + limit_direction="forward", + limit_area=None, + limit_use_coordinate=True, + fill_value="extrapolate", + ) + expected = da.copy( + data=[ + [1, 1, n, n, 1, 1], + [n, 2, 2, n, 2, 2], + [n, 3, 3, 3, 3, n], + [n, n, 4, 4, 4, 4], + ] + ) + assert_equal(actual, expected) + + @requires_scipy def test_interpolators_complex_out_of_bounds(): """Ensure 
complex nans are used for complex data""" From 16cdf302974fceddf6b419ced4306434891dabcc Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Mon, 10 Jun 2024 20:30:08 +0200 Subject: [PATCH 02/23] Use internal broadcasting and transpose instead of ones_like --- xarray/core/missing.py | 38 +++++++++++++++++++++--------------- xarray/tests/test_missing.py | 38 +++++++++++++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 17 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 2f8993d9705..d46dd9a4be4 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -10,7 +10,7 @@ import numpy as np import pandas as pd -from xarray.core.common import _contains_datetime_like_objects, ones_like +from xarray.core.common import _contains_datetime_like_objects from xarray.core.computation import apply_ufunc from xarray.core.duck_array_ops import ( datetime_to_numeric, @@ -38,8 +38,7 @@ def _get_gap_left_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False ): - arange = ones_like(obj) * index - left = arange.where(~obj.isnull()).ffill(dim) + left = index.where(~obj.isnull()).ffill(dim).transpose(*obj.dims) if outside: return left.fillna(index[0]) return left @@ -48,8 +47,7 @@ def _get_gap_left_edge( def _get_gap_right_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False ): - arange = ones_like(obj) * index - right = arange.where(~obj.isnull()).bfill(dim) + right = index.where(~obj.isnull()).bfill(dim).transpose(*obj.dims) if outside: return right.fillna(index[-1]) return right @@ -58,15 +56,13 @@ def _get_gap_right_edge( def _get_gap_dist_to_left_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable ): - arange = ones_like(obj) * index - return arange - _get_gap_left_edge(obj, dim, index) + return (index - _get_gap_left_edge(obj, dim, index)).transpose(*obj.dims) def _get_gap_dist_to_right_edge( obj: 
Dataset | DataArray | Variable, dim: Hashable, index: Variable ): - arange = ones_like(obj) * index - return _get_gap_right_edge(obj, dim, index) - arange + return (_get_gap_right_edge(obj, dim, index) - index).transpose(*obj.dims) def _get_limit_fill_mask( @@ -174,22 +170,32 @@ def _get_gap_masks( raise TypeError( f"Expected integer or floating point max_gap since use_coordinate=False. Received {type(max_gap).__name__}." ) + # Which masks are really needed? + need_limit_mask = limit != np.inf or limit_direction != "both" + need_area_mask = limit_area is not None + need_max_gap_mask = max_gap is not None # Calculate indexes - index_limit = get_clean_interp_index(obj, dim, use_coordinate=limit_use_coordinate) - index_max_gap = get_clean_interp_index( - obj, dim, use_coordinate=max_gap_use_coordinate - ) + if need_limit_mask or need_area_mask: + index_limit = get_clean_interp_index( + obj, dim, use_coordinate=limit_use_coordinate + ) + # index_limit = ones_like(obj) * index_limit + if need_max_gap_mask: + index_max_gap = get_clean_interp_index( + obj, dim, use_coordinate=max_gap_use_coordinate + ) + # index_max_gap = ones_like(obj) * index_max_gap # Calculate fill masks limit_mask = None - if limit != np.inf or limit_direction != "both": + if need_limit_mask: limit_mask = _get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction) limit_area_mask = None - if limit_area is not None: + if need_area_mask: limit_area_mask = _get_limit_area_mask(obj, dim, index_limit, limit_area) max_gap_mask = None - if max_gap is not None: + if need_max_gap_mask: max_gap_mask = _get_max_gap_mask(obj, dim, index_max_gap, max_gap) return limit_mask, limit_area_mask, max_gap_mask diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index ea46f9a11f5..1d06db31d3a 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -760,7 +760,7 @@ def test_get_gap_dist_to_right_edge(): ], ], ) -def test_interpolate_na_nan_block_lengths(y, 
lengths_expected): +def test_get_nan_block_lengths(y, lengths_expected): arr = [ [np.nan, 1, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 4], [np.nan, np.nan, np.nan, 1, np.nan, np.nan, 4, np.nan, np.nan], @@ -772,6 +772,42 @@ def test_interpolate_na_nan_block_lengths(y, lengths_expected): assert_equal(actual, expected) +def test_get_nan_block_lengths_2d(): + n = np.nan + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + dims=["x", "y"], + coords={"x": np.arange(4), "y": np.arange(12) ** 2}, + ) + index = get_clean_interp_index(da, dim="y", use_coordinate=False) + actual = _get_nan_block_lengths(da, dim="y", index=index) + expected_y = da.copy( + data=[ + [0, 0, 0, 0, 2, 0, 4, 4, 4, 0, 0, 1], + [2, 2, 0, 3, 3, 0, 4, 4, 4, 0, 2, 2], + [2, 2, 0, 3, 3, 0, 4, 4, 4, 0, 2, 2], + [1, 0, 0, 0, 2, 0, 4, 4, 4, 0, 0, 1], + ] + ) + assert_equal(actual, expected_y) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = _get_nan_block_lengths(da, dim="y", index=index) + expected_y = da.copy( + data=[ + [0, 0, 0, 0, 16, 0, 56, 56, 56, 0, 0, 21], + [4, 4, 0, 21, 21, 0, 56, 56, 56, 0, 40, 40], + [4, 4, 0, 21, 21, 0, 56, 56, 56, 0, 40, 40], + [1, 0, 0, 0, 16, 0, 56, 56, 56, 0, 0, 21], + ] + ) + assert_equal(actual, expected_y) + + def test_get_limit_fill_mask(): T = True F = False From 43b716563d5e1aa2f32e176b7aea625ce284441d Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Mon, 10 Jun 2024 20:30:17 +0200 Subject: [PATCH 03/23] Typo: Default False in doc for limit_use_coordinates --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e48a07cc883..f4c7a036860 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3565,7 
+3565,7 @@ def interpolate_na( - "inside": Only fill NaNs surrounded by valid values (interpolate). - "outside": Only fill NaNs outside valid values (extrapolate). - limit_use_coordinate : bool or Hashable, default: True + limit_use_coordinate : bool or Hashable, default: False Specifies which index to use for the ``limit`` distance. - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 065594481bc..a045e96049e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6679,7 +6679,7 @@ def interpolate_na( - "inside": Only fill NaNs surrounded by valid values (interpolate). - "outside": Only fill NaNs outside valid values (extrapolate). - limit_use_coordinate : bool or Hashable, default: True + limit_use_coordinate : bool or Hashable, default: False Specifies which index to use for the ``limit`` distance. - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). From d46baa4cbed51f6d3af13c7c3db36ec27ba2fba1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Jun 2024 18:31:27 +0000 Subject: [PATCH 04/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/missing.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index d46dd9a4be4..d1dffe7910b 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -475,13 +475,9 @@ def interp_na( dim: Hashable | None = None, method: InterpOptions = "linear", use_coordinate: bool | str = True, - limit: int - | float - | str - | pd.Timedelta - | np.timedelta64 - | dt.timedelta - | None = None, + limit: ( + int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None + ) = None, limit_direction: LimitDirectionOptions = "forward", limit_area: LimitAreaOptions | None = None, 
limit_use_coordinate: bool From 1fb77959d0dd3b14802af04633b99da3afe56ce9 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Tue, 11 Jun 2024 22:57:54 +0200 Subject: [PATCH 05/23] Towards masked implementation --- xarray/core/missing.py | 52 +++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index d1dffe7910b..a24ced8c251 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -35,6 +35,37 @@ from xarray.core.dataset import Dataset +class MaskedDataArray: + def __init__(self, da: DataArray, mask: np.ndarray): + self.da = da + self.mask = mask + + +def mask_gaps( + self, + dim: Hashable | None = None, + use_coordinate: bool | str = True, + limit: ( + int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None + ) = None, + limit_direction: LimitDirectionOptions = "forward", + limit_area: LimitAreaOptions | None = None, + max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, +): + """Mask continues gaps in the data, providing functionality to control gap length and offsets""" + + masks = _get_gap_masks( + self, + dim, + limit, + limit_direction, + limit_area, + max_gap, + use_coordinate, + ) + return masks # tbd + + def _get_gap_left_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False ): @@ -73,12 +104,12 @@ def _get_limit_fill_mask( limit_direction, ): if limit_direction == "forward": - limit_mask = _get_gap_dist_to_left_edge(obj, dim, index) <= limit + limit_mask = _get_gap_dist_to_left_edge(obj, dim, index) > limit elif limit_direction == "backward": - limit_mask = _get_gap_dist_to_right_edge(obj, dim, index) <= limit + limit_mask = _get_gap_dist_to_right_edge(obj, dim, index) > limit elif limit_direction == "both": - limit_mask = (_get_gap_dist_to_left_edge(obj, dim, index) <= limit) | ( - _get_gap_dist_to_right_edge(obj, dim, 
index) <= limit + limit_mask = (_get_gap_dist_to_left_edge(obj, dim, index) > limit) & ( + _get_gap_dist_to_right_edge(obj, dim, index) > limit ) else: raise ValueError( @@ -92,16 +123,15 @@ def _get_limit_area_mask( ): if limit_area == "inside": area_mask = ( - _get_gap_left_edge(obj, dim, index).notnull() - & _get_gap_right_edge(obj, dim, index).notnull() + _get_gap_left_edge(obj, dim, index).isnull() + | _get_gap_right_edge(obj, dim, index).isnull() ) - area_mask = area_mask | obj.notnull() elif limit_area == "outside": area_mask = ( - _get_gap_left_edge(obj, dim, index).isnull() - | _get_gap_right_edge(obj, dim, index).isnull() + _get_gap_left_edge(obj, dim, index).notnull() + & _get_gap_right_edge(obj, dim, index).notnull() ) - area_mask = area_mask | obj.notnull() + area_mask = area_mask & obj.isnull() else: raise ValueError( f"limit_area must be one of 'inside', 'outside' or None. Got {limit_area}" @@ -128,7 +158,7 @@ def _get_max_gap_mask( max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta, ): nan_block_lengths = _get_nan_block_lengths(obj, dim, index) - return nan_block_lengths <= max_gap + return nan_block_lengths > max_gap def _get_gap_masks( From 878e6bb147403c1ea21a8c5bf1a613010c78e792 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Thu, 20 Jun 2024 21:04:16 +0200 Subject: [PATCH 06/23] Working fill_gaps implementation --- xarray/core/dataarray.py | 308 +++++++++++++++++++----------- xarray/core/dataset.py | 291 ++++++++++++++++++++-------- xarray/core/missing.py | 209 +++++++++++++-------- xarray/tests/test_missing.py | 354 +++++++++++++++++++++++++---------- 4 files changed, 795 insertions(+), 367 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index f4c7a036860..092a48cf2b6 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -95,6 +95,7 @@ from xarray.core.groupby import DataArrayGroupBy from xarray.core.resample import 
DataArrayResample from xarray.core.rolling import DataArrayCoarsen, DataArrayRolling + from xarray.core.missing import GapMask from xarray.core.types import ( CoarsenBoundaryOptions, DatetimeLike, @@ -3476,6 +3477,7 @@ def fillna(self, value: Any) -> Self: out = ops.fillna(self, value) return out + def interpolate_na( self, dim: Hashable, @@ -3490,9 +3492,6 @@ def interpolate_na( | np.timedelta64 | datetime.timedelta ) = None, - limit_direction: LimitDirectionOptions = "forward", - limit_area: LimitAreaOptions | None = None, - limit_use_coordinate: bool | Hashable = False, max_gap: ( None | int @@ -3516,62 +3515,25 @@ def interpolate_na( String indicating which method to use for interpolation: - 'linear': linear interpolation. Additional keyword - arguments are passed to :py:func:`numpy.interp` + arguments are passed to :py:func:`numpy.interp` - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial': - are passed to :py:func:`scipy.interpolate.interp1d`. If - ``method='polynomial'``, the ``order`` keyword argument must also be - provided. + are passed to :py:func:`scipy.interpolate.interp1d`. If + ``method='polynomial'``, the ``order`` keyword argument must also be + provided. - 'barycentric', 'krogh', 'pchip', 'spline', 'akima': use their - respective :py:class:`scipy.interpolate` classes. + respective :py:class:`scipy.interpolate` classes. - use_coordinate : bool or Hashable, default: True + use_coordinate : bool or str, default: True Specifies which index to use as the x values in the interpolation - formulated as `y = f(x)`. - - - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). - - True: the IndexVariable `dim` is used. - - String: specifies the name of a coordinate variable to use as the index. - - limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None - Maximum number or distance of consecutive NaNs to fill. - Use None for no limit. 
When interpolating along a datetime64 dimension - and ``limit_use_coordinate=True``, ``limit`` can be one of the following: - - - a string that is valid input for pandas.to_timedelta - - a :py:class:`numpy.timedelta64` object - - a :py:class:`pandas.Timedelta` object - - a :py:class:`datetime.timedelta` object - - Otherwise, ``limit`` must be an int or a float. - If ``limit_use_coordinates=True``, for ``limit_direction=forward`` distance is defined - as the difference between the coordinate at a NaN value and the coordinate of the next valid value - to the left (right for ``limit_direction=backward``). - For example, consider:: - - - array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) - Coordinates: - * x (x) int64 0 1 2 3 4 5 6 7 8 - - For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. - To only interpolate over gaps less than a given length, + formulated as `y = f(x)`. If False, values are treated as if + equally-spaced along ``dim``. If True, the IndexVariable `dim` is + used. If ``use_coordinate`` is a string, it specifies the name of a + coordinate variable to use as the index. + limit : int or None, default: None + Maximum number of consecutive NaNs to fill. Must be greater than 0 + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, see ``max_gap``. - limit_direction: {"forward", "backward", "both"}, default: "forward" - Consecutive NaNs will be filled in this direction. - limit_area: {"inside", "outside"} or None: default: None - Consecutive NaNs will be filled with this restriction. - - - None: No fill restriction. - - "inside": Only fill NaNs surrounded by valid values (interpolate). - - "outside": Only fill NaNs outside valid values (extrapolate). - - limit_use_coordinate : bool or Hashable, default: False - Specifies which index to use for the ``limit`` distance. 
- - - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). - - True: the IndexVariable `dim` is used. - - String: specifies the name of a coordinate variable to use as the index. - max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None Maximum size of gap, a continuous sequence of NaNs, that will be filled. Use None for no limit. When interpolating along a datetime64 dimension @@ -3582,8 +3544,8 @@ def interpolate_na( - a :py:class:`pandas.Timedelta` object - a :py:class:`datetime.timedelta` object - Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer - index is created. Gap length is defined as the difference + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled + dimensions has not been implemented yet. Gap length is defined as the difference between coordinate values at the first data point after a gap and the last value before a gap. For gaps at the beginning (end), gap length is defined as the difference between coordinate values at the first (last) valid data point and the first (last) NaN. @@ -3592,7 +3554,7 @@ def interpolate_na( array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 7 8 + * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively keep_attrs : bool or None, default: None @@ -3607,78 +3569,47 @@ def interpolate_na( interpolated: DataArray Filled in DataArray. - Warning - -------- - When passing fill_value as a keyword argument with method="linear", it does not use - ``numpy.interp`` but it uses ``scipy.interpolate.interp1d``, which provides the fill_value parameter. - See Also -------- numpy.interp scipy.interpolate - pandas.DataFrame.interpolate - - Notes - ----- - ``Limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). 
``max_gap`` will prevent *any* filling for gaps larger than the given distance. Examples -------- >>> da = xr.DataArray( - ... [np.nan, 2, np.nan, np.nan, 5, np.nan, 0], - ... dims="x", - ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, + ... [np.nan, 2, 3, np.nan, 0], dims="x", coords={"x": [0, 1, 2, 3, 4]} ... ) >>> da - - array([nan, 2., nan, nan, 5., nan, 0.]) + Size: 40B + array([nan, 2., 3., nan, 0.]) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 + * x (x) int64 40B 0 1 2 3 4 + >>> da.interpolate_na(dim="x", method="linear") - - array([nan, 2. , 3. , 4. , 5. , 2.5, 0. ]) - Coordinates: - * x (x) int64 0 1 2 3 4 5 6 - >>> da.interpolate_na( - ... dim="x", - ... method="linear", - ... limit_direction="both", - ... fill_value="extrapolate", - ... ) - - array([1. , 2. , 3. , 4. , 5. , 2.5, 0. ]) - Coordinates: - * x (x) int64 0 1 2 3 4 5 6 - >>> da.interpolate_na( - ... dim="x", method="linear", limit=1, limit_direction="forward" - ... ) - - array([nan, 2. , 3. , nan, 5. , 2.5, 0. ]) + Size: 40B + array([nan, 2. , 3. , 1.5, 0. ]) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 - >>> da.interpolate_na( - ... dim="x", method="linear", max_gap=2, limit_direction="forward" - ... ) - - array([nan, 2. , nan, nan, 5. , 2.5, 0. ]) + * x (x) int64 40B 0 1 2 3 4 + + >>> da.interpolate_na(dim="x", method="linear", fill_value="extrapolate") + Size: 40B + array([1. , 2. , 3. , 1.5, 0. 
]) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 + * x (x) int64 40B 0 1 2 3 4 """ from xarray.core.missing import interp_na return interp_na( - self, - dim=dim, - method=method, - use_coordinate=use_coordinate, - limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - limit_use_coordinate=limit_use_coordinate, - max_gap=max_gap, - keep_attrs=keep_attrs, - **kwargs, - ) + self, + dim=dim, + method=method, + limit=limit, + use_coordinate=use_coordinate, + max_gap=max_gap, + keep_attrs=keep_attrs, + **kwargs, + ) + def ffill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values forward @@ -3847,6 +3778,161 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: from xarray.core.missing import bfill return bfill(self, dim, limit=limit) + + def fill_gaps( + self, + dim: Hashable, + use_coordinate: bool | Hashable = True, + limit: ( + None + | int + | float + | str + | pd.Timedelta + | np.timedelta64 + | datetime.timedelta + ) = None, + limit_direction: LimitDirectionOptions = "both", + limit_area: LimitAreaOptions | None = None, + max_gap: ( + None + | int + | float + | str + | pd.Timedelta + | np.timedelta64 + | datetime.timedelta + ) = None, + ) -> GapMask: + """Fill in gaps in the data using one of several filling methods. + Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + + Parameters + ---------- + dim : Hashable + Specifies the dimension along which to calculate gap sizes. + use_coordinate : bool or Hashable, default: True + Specifies which index to use when calculating gap sizes. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. 
+ + limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum number or distance of consecutive NaNs to fill. + Use None for no limit. When interpolating along a datetime64 dimension + and ``use_coordinate=True``, ``limit`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``limit`` must be an int or a float. + If ``use_coordinate=True``, for ``limit_direction=forward`` distance is defined + as the difference between the coordinate at a NaN value and the coordinate of the next valid value + to the left (right for ``limit_direction=backward``). + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. + To only fill gaps less than a given length, + see ``max_gap``. + limit_direction : {"forward", "backward", "both"}, default: "both" + Consecutive NaNs will be filled in this direction. + limit_area : {"inside", "outside"} or None, default: None + Consecutive NaNs will be filled with this restriction. + + - None: No fill restriction. + - "inside": Only fill NaNs surrounded by valid values. + - "outside": Only fill NaNs outside valid values (extrapolate). + max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. 
When calculated along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer + index is created. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last valid value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively. + keep_attrs : bool or None, default: None + If True, the dataarray's attributes (`attrs`) will be copied from + the original object to the new one. If False, the new + object will be returned without attributes. + + Returns + ------- + Gap Mask: GapMask + An object where all remaining gaps are masked. Unmasked values can be filled by calling any of the provided methods. + + See Also + -------- + DataArray.fillna + DataArray.ffill + DataArray.bfill + DataArray.interpolate_na + pandas.DataFrame.interpolate + + Notes + ----- + ``limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. + + Examples + -------- + >>> da = xr.DataArray( + ... [np.nan, 2, np.nan, np.nan, 5, np.nan, 0], + ... dims="x", + ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, + ... 
) + >>> da + Size: 56B + array([nan, 2., nan, nan, 5., nan, 0.]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps( + ... dim="x", limit=1, limit_direction="forward" + ... ).interpolate_na(dim="x") + Size: 56B + array([nan, 2. , 3. , nan, 5. , 2.5, 0. ]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps( + ... dim="x", max_gap=2, limit_direction="forward" + ... ).ffill(dim="x") + Size: 56B + array([nan, 2., nan, nan, 5., 5., 0.]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps( + ... dim="x", limit_area="inside" + ... ).fillna(9) + Size: 56B + array([nan, 2., 9., 9., 5., 9., 0.]) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + """ + from xarray.core.missing import mask_gaps + + return mask_gaps(self, dim, use_coordinate=use_coordinate, limit=limit, limit_direction=limit_direction, limit_area=limit_area, max_gap=max_gap) + def combine_first(self, other: Self) -> Self: """Combine two DataArray objects, with union of coordinates. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a045e96049e..f55869d8257 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6623,7 +6623,7 @@ def interpolate_na( Parameters ---------- - dim : Hashable + dim : Hashable or None, optional Specifies the dimension along which to interpolate. method : {"linear", "nearest", "zero", "slinear", "quadratic", "cubic", "polynomial", \ "barycentric", "krogh", "pchip", "spline", "akima"}, default: "linear" @@ -6640,52 +6640,15 @@ def interpolate_na( use_coordinate : bool or Hashable, default: True Specifies which index to use as the x values in the interpolation - formulated as `y = f(x)`. - - - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). - - True: the IndexVariable `dim` is used. - - String: specifies the name of a coordinate variable to use as the index. 
- - limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None - Maximum number or distance of consecutive NaNs to fill. - Use None for no limit. When interpolating along a datetime64 dimension - and ``limit_use_coordinate=True``, ``limit`` can be one of the following: - - - a string that is valid input for pandas.to_timedelta - - a :py:class:`numpy.timedelta64` object - - a :py:class:`pandas.Timedelta` object - - a :py:class:`datetime.timedelta` object - - Otherwise, ``limit`` must be an int or a float. - If ``limit_use_coordinates=True``, for ``limit_direction=forward`` distance is defined - as the difference between the coordinate at a NaN value and the coordinate of the next valid value - to the left (right for ``limit_direction=backward``). - For example, consider:: - - - array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) - Coordinates: - * x (x) int64 0 1 2 3 4 5 6 7 8 - - For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. - To only interpolate over gaps less than a given length, + formulated as `y = f(x)`. If False, values are treated as if + equally-spaced along ``dim``. If True, the IndexVariable `dim` is + used. If ``use_coordinate`` is a string, it specifies the name of a + coordinate variable to use as the index. + limit : int, default: None + Maximum number of consecutive NaNs to fill. Must be greater than 0 + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, see ``max_gap``. - limit_direction: {"forward", "backward", "both"}, default: "forward" - Consecutive NaNs will be filled in this direction. - limit_area: {"inside", "outside"} or None: default: None - Consecutive NaNs will be filled with this restriction. - - - None: No fill restriction. - - "inside": Only fill NaNs surrounded by valid values (interpolate). - - "outside": Only fill NaNs outside valid values (extrapolate). 
- - limit_use_coordinate : bool or Hashable, default: False - Specifies which index to use for the ``limit`` distance. - - - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). - - True: the IndexVariable `dim` is used. - - String: specifies the name of a coordinate variable to use as the index. - max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta \ or None, default: None Maximum size of gap, a continuous sequence of NaNs, that will be filled. @@ -6697,8 +6660,8 @@ def interpolate_na( - a :py:class:`pandas.Timedelta` object - a :py:class:`datetime.timedelta` object - Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer - index is created. Gap length is defined as the difference + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled + dimensions has not been implemented yet. Gap length is defined as the difference between coordinate values at the first data point after a gap and the last value before a gap. For gaps at the beginning (end), gap length is defined as the difference between coordinate values at the first (last) valid data point and the first (last) NaN. @@ -6710,10 +6673,6 @@ def interpolate_na( * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively - keep_attrs : bool or None, default: None - If True, the dataarray's attributes (`attrs`) will be copied from - the original object to the new one. If False, the new - object will be returned without attributes. **kwargs : dict, optional parameters passed verbatim to the underlying interpolation function @@ -6732,50 +6691,49 @@ def interpolate_na( numpy.interp scipy.interpolate - Notes - ----- - ``Limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. 
- Examples -------- >>> ds = xr.Dataset( ... { - ... "A": ("x", [np.nan, 2, np.nan, np.nan, 5, np.nan, 0]), - ... "B": ("x", [np.nan, 2, np.nan, np.nan, 5, 6, np.nan]), + ... "A": ("x", [np.nan, 2, 3, np.nan, 0]), + ... "B": ("x", [3, 4, np.nan, 1, 7]), + ... "C": ("x", [np.nan, np.nan, np.nan, 5, 0]), + ... "D": ("x", [np.nan, 3, np.nan, -1, 4]), ... }, - ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, + ... coords={"x": [0, 1, 2, 3, 4]}, ... ) >>> ds - - Dimensions: (x: 7) + Size: 200B + Dimensions: (x: 5) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 + * x (x) int64 40B 0 1 2 3 4 Data variables: - A (x) float64 nan 2.0 nan nan 5.0 nan 0.0 - B (x) float64 nan 2.0 nan nan 5.0 6.0 nan - >>> ds.interpolate_na( - ... dim="x", - ... method="linear", - ... limit_direction="both", - ... fill_value="extrapolate", - ... ) - - Dimensions: (x: 7) + A (x) float64 40B nan 2.0 3.0 nan 0.0 + B (x) float64 40B 3.0 4.0 nan 1.0 7.0 + C (x) float64 40B nan nan nan 5.0 0.0 + D (x) float64 40B nan 3.0 nan -1.0 4.0 + + >>> ds.interpolate_na(dim="x", method="linear") + Size: 200B + Dimensions: (x: 5) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 + * x (x) int64 40B 0 1 2 3 4 Data variables: - A (x) float64 1.0 2.0 3.0 4.0 5.0 2.5 0.0 - B (x) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 - >>> ds.interpolate_na( - ... dim="x", method="linear", limit=1, limit_direction="forward" - ... 
) - - Dimensions: (x: 7) + A (x) float64 40B nan 2.0 3.0 1.5 0.0 + B (x) float64 40B 3.0 4.0 2.5 1.0 7.0 + C (x) float64 40B nan nan nan 5.0 0.0 + D (x) float64 40B nan 3.0 1.0 -1.0 4.0 + + >>> ds.interpolate_na(dim="x", method="linear", fill_value="extrapolate") + Size: 200B + Dimensions: (x: 5) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 + * x (x) int64 40B 0 1 2 3 4 Data variables: - A (x) float64 nan 2.0 3.0 nan 5.0 2.5 0.0 - B (x) float64 nan 2.0 3.0 nan 5.0 6.0 nan + A (x) float64 40B 1.0 2.0 3.0 1.5 0.0 + B (x) float64 40B 3.0 4.0 2.5 1.0 7.0 + C (x) float64 40B 20.0 15.0 10.0 5.0 0.0 + D (x) float64 40B 5.0 3.0 1.0 -1.0 4.0 """ from xarray.core.missing import _apply_over_vars_with_dim, interp_na @@ -6920,6 +6878,173 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: new = _apply_over_vars_with_dim(bfill, self, dim=dim, limit=limit) return new + def fill_gaps( + self, + dim: Hashable, + use_coordinate: bool | Hashable = True, + limit: ( + None + | int + | float + | str + | pd.Timedelta + | np.timedelta64 + | datetime.timedelta + ) = None, + limit_direction: LimitDirectionOptions = "both", + limit_area: LimitAreaOptions | None = None, + max_gap: ( + None + | int + | float + | str + | pd.Timedelta + | np.timedelta64 + | datetime.timedelta + ) = None, + ) -> GapMask: + """Fill in gaps in the data using one of several filling methods. + Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + + Parameters + ---------- + dim : Hashable + Specifies the dimension along which to calculate gap sizes. + use_coordinate : bool or Hashable, default: True + Specifies which index to use when calculating gap sizes. + + - False: a consecutive integer index is created along ``dim`` (0, 1, 2, ...). + - True: the IndexVariable `dim` is used. + - String: specifies the name of a coordinate variable to use as the index. 
+ + limit : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum number or distance of consecutive NaNs to fill. + Use None for no limit. When interpolating along a datetime64 dimension + and ``use_coordinate=True``, ``limit`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``limit`` must be an int or a float. + If ``use_coordinate=True``, for ``limit_direction=forward`` distance is defined + as the difference between the coordinate at a NaN value and the coordinate of the next valid value + to the left (right for ``limit_direction=backward``). + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + For ``limit_direction=forward``, distances are ``[nan, nan, nan, 0, 1, 2, 0, 1, 2]``. + To only fill gaps less than a given length, + see ``max_gap``. + limit_direction : {"forward", "backward", "both"}, default: "both" + Consecutive NaNs will be filled in this direction. + limit_area : {"inside", "outside"} or None, default: None + Consecutive NaNs will be filled with this restriction. + + - None: No fill restriction. + - "inside": Only fill NaNs surrounded by valid values. + - "outside": Only fill NaNs outside valid values (extrapolate). + max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. 
When calculated along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + - a :py:class:`datetime.timedelta` object + + Otherwise, ``max_gap`` must be an int or a float. If ``use_coordinate=False``, a linear integer + index is created. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last valid value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) + Coordinates: + * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively. + keep_attrs : bool or None, default: None + If True, the dataarray's attributes (`attrs`) will be copied from + the original object to the new one. If False, the new + object will be returned without attributes. + + Returns + ------- + Gap Mask: GapMask + An object where all remaining gaps are masked. Unmasked values can be filled by calling any of the provided methods. + + See Also + -------- + Dataset.fillna + Dataset.ffill + Dataset.bfill + Dataset.interpolate_na + pandas.DataFrame.interpolate + + Notes + ----- + ``limit`` and ``max_gap`` have different effects on gaps: If ``limit`` is set, *some* values in a gap will be filled (up to the given distance from the boundaries). ``max_gap`` will prevent *any* filling for gaps larger than the given distance. + + Examples + -------- + >>> ds = xr.Dataset( + ... { + ... "A": ("x", [np.nan, 2, np.nan, np.nan, 5, np.nan, 0]), + ... "B": ("x", [np.nan, 2, np.nan, np.nan, 5, 6, np.nan]), + ... }, + ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, + ... 
) + >>> ds + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 nan nan 5.0 nan 0.0 + B (x) float64 56B nan 2.0 nan nan 5.0 6.0 nan + >>> ds.fill_gaps( + ... dim="x", limit=1, limit_direction="forward" + ... ).interpolate_na(dim="x") + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 3.0 nan 5.0 2.5 0.0 + B (x) float64 56B nan 2.0 3.0 nan 5.0 6.0 nan + >>> ds.fill_gaps( + ... dim="x", max_gap=2, limit_direction="forward" + ... ).ffill(dim="x") + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 nan nan 5.0 5.0 0.0 + B (x) float64 56B nan 2.0 nan nan 5.0 6.0 6.0 + >>> ds.fill_gaps( + ... dim="x", limit_area="inside" + ... ).fillna(9) + Size: 168B + Dimensions: (x: 7) + Coordinates: + * x (x) int64 56B 0 1 2 3 4 5 6 + Data variables: + A (x) float64 56B nan 2.0 9.0 9.0 5.0 9.0 0.0 + B (x) float64 56B nan 2.0 9.0 9.0 5.0 6.0 nan + """ + from xarray.core.missing import mask_gaps + + return mask_gaps(self, dim, use_coordinate=use_coordinate, limit=limit, limit_direction=limit_direction, limit_area=limit_area, max_gap=max_gap) def combine_first(self, other: Self) -> Self: """Combine two Datasets, default to data_vars of self. 
diff --git a/xarray/core/missing.py b/xarray/core/missing.py index a24ced8c251..5e348478193 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -35,36 +35,23 @@ from xarray.core.dataset import Dataset -class MaskedDataArray: - def __init__(self, da: DataArray, mask: np.ndarray): - self.da = da - self.mask = mask - - -def mask_gaps( - self, - dim: Hashable | None = None, - use_coordinate: bool | str = True, - limit: ( - int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None - ) = None, - limit_direction: LimitDirectionOptions = "forward", - limit_area: LimitAreaOptions | None = None, - max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, -): - """Mask continues gaps in the data, providing functionality to control gap length and offsets""" - - masks = _get_gap_masks( - self, - dim, - limit, - limit_direction, - limit_area, - max_gap, - use_coordinate, - ) - return masks # tbd - +_FILL_MISSING_DOCSTRING_TEMPLATE = """\ +Partly fill nan values in this object's data by applying `{name}` to all unmasked values. + +Parameters +---------- +keep_attrs : bool, default: None + If True, the attributes (``attrs``) will be copied from the original + object to the new one. If False, the new object will be returned + without attributes. If None uses the global default. +**kwargs : dict + Additional keyword arguments passed on to `{name}`. + +Returns +------- +filled : same type as caller + New object with `{name}` applied to all unmasked values. +""" def _get_gap_left_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False @@ -103,13 +90,15 @@ def _get_limit_fill_mask( limit, limit_direction, ): + #At the left boundary, distance to left is nan. 
+ #For nan, a<=b and ~(a>b) behave differently if limit_direction == "forward": - limit_mask = _get_gap_dist_to_left_edge(obj, dim, index) > limit + limit_mask = ~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit) elif limit_direction == "backward": - limit_mask = _get_gap_dist_to_right_edge(obj, dim, index) > limit + limit_mask = ~(_get_gap_dist_to_right_edge(obj, dim, index) <= limit) elif limit_direction == "both": - limit_mask = (_get_gap_dist_to_left_edge(obj, dim, index) > limit) & ( - _get_gap_dist_to_right_edge(obj, dim, index) > limit + limit_mask = (~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit)) & (~( + _get_gap_dist_to_right_edge(obj, dim, index) <= limit) ) else: raise ValueError( @@ -161,7 +150,7 @@ def _get_max_gap_mask( return nan_block_lengths > max_gap -def _get_gap_masks( +def _get_gap_mask( obj: Dataset | DataArray | Variable, dim: Hashable, limit=None, @@ -215,20 +204,24 @@ def _get_gap_masks( obj, dim, use_coordinate=max_gap_use_coordinate ) # index_max_gap = ones_like(obj) * index_max_gap - # Calculate fill masks - limit_mask = None + if not (need_limit_mask or need_area_mask or need_max_gap_mask): + return None + + # Calculate individual masks + masks=[] if need_limit_mask: - limit_mask = _get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction) + masks.append(_get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction)) - limit_area_mask = None if need_area_mask: - limit_area_mask = _get_limit_area_mask(obj, dim, index_limit, limit_area) + masks.append(_get_limit_area_mask(obj, dim, index_limit, limit_area)) - max_gap_mask = None if need_max_gap_mask: - max_gap_mask = _get_max_gap_mask(obj, dim, index_max_gap, max_gap) - return limit_mask, limit_area_mask, max_gap_mask - + masks.append(_get_max_gap_mask(obj, dim, index_max_gap, max_gap)) + #Combine masks + mask=masks[0] + for m in masks[1:]: + mask|=m + return mask class BaseInterpolator: """Generic interpolator class for normalizing interpolation methods""" @@ 
-499,41 +492,15 @@ def _is_time_index(index): return isinstance(index, (pd.DatetimeIndex, CFTimeIndex)) - -def interp_na( +def _interp_na_all( self, dim: Hashable | None = None, method: InterpOptions = "linear", use_coordinate: bool | str = True, - limit: ( - int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None - ) = None, - limit_direction: LimitDirectionOptions = "forward", - limit_area: LimitAreaOptions | None = None, - limit_use_coordinate: bool - | str = False, # backward compatibility + pandas (2.1.4) compatibility - max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None = None, keep_attrs: bool | None = None, **kwargs, ): - """Interpolate values according to different methods.""" - - # Preprocess arguments and do consistency checks - if dim is None: - raise NotImplementedError("dim is a required argument") - - masks = _get_gap_masks( - self, - dim, - limit, - limit_direction, - limit_area, - limit_use_coordinate, - max_gap, - use_coordinate, - ) - - # method + """Interpolate all nan values, without restrictions regarding the gap size.""" index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate) interp_class, kwargs = _get_interpolator(method, **kwargs) interpolator = partial(func_interpolate_na, interp_class, **kwargs) @@ -555,14 +522,106 @@ def interp_na( vectorize=True, keep_attrs=keep_attrs, ).transpose(*self.dims) + return arr + +class GapMask: + """An object that allows for flexible masking of gaps.""" + def __init__(self, content: DataArray | Dataset, mask: np.ndarray): + self.content = content + self.mask = mask + + def _fill_method(name: str, _fill_function: Callable | None = None): + def method(self, *args, _fill_function=_fill_function, **kwargs): + if _fill_function is None: + _fill_function=getattr(self.content, name) + filled=_fill_function(*args, **kwargs) + else: + filled=_fill_function(self.content, *args, **kwargs) + + if self.mask is not None: + filled=filled.where(~self.mask, 
other=self.content) + return filled + method.__name__ = name + method.__doc__ = _FILL_MISSING_DOCSTRING_TEMPLATE.format(name=name) + return method - for m in masks: - if m is not None: - arr = arr.where(m) + ffill=_fill_method('ffill') + bfill=_fill_method('bfill') + fillna=_fill_method('fillna') + interpolate_na=_fill_method('interpolate_na') +def mask_gaps( + self, + dim: Hashable | None = None, + use_coordinate: bool | str = True, + limit: ( + int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None + ) = None, + limit_direction: LimitDirectionOptions ="both", + limit_area: LimitAreaOptions | None = None, + max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, +) -> GapMask: + """Mask continuous gaps in the data, providing functionality to control gap length and offsets""" + + mask = _get_gap_mask( + self, + dim, + limit, + limit_direction, + limit_area, + use_coordinate, + max_gap, + use_coordinate, + ) + return GapMask(self, mask) + + + + +def interp_na( + self, + dim: Hashable | None = None, + method: InterpOptions = "linear", + use_coordinate: bool | str = True, + limit: ( + int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None + ) = None, + max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, + keep_attrs: bool | None = None, + **kwargs, +): + """Interpolate values according to different methods.""" + # Preprocess arguments and do consistency checks + if dim is None: + raise NotImplementedError("dim is a required argument") + + #This was the original behaviour of interp_na and is kept for backward compatibility + #Limit=None: Fill everything, including both boundaries + #Limit!=None: Do forward interpolation until limit + limit_use_coordinate=False + if limit is None: + limit_direction = "both" + else: + limit_direction = "forward" + limit_area = None + mask = _get_gap_mask( + self, + dim, + limit, + limit_direction, + limit_area, + limit_use_coordinate, + max_gap, + 
use_coordinate, + ) + + arr=_interp_na_all(self, dim, method, use_coordinate, keep_attrs, **kwargs) + if mask is not None: + arr = arr.where(~mask) return arr + def func_interpolate_na(interpolator, y, x, **kwargs): """helper function to apply interpolation along 1 dimension""" # reversed arguments are so that attrs are preserved from da, not index diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 1d06db31d3a..f75ef470b07 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -104,34 +104,33 @@ def make_interpolate_example_data(shape, frac_nan, seed=12345, non_uniform=False @pytest.mark.parametrize( "method", ["linear", "nearest", "zero", "slinear", "quadratic", "cubic"] ) +@pytest.mark.parametrize("dim", ["time", "x"]) +@pytest.mark.parametrize("shape", [(8, 8), (1, 20), (20, 1), (100, 100)]) +@pytest.mark.parametrize("frac_nan", [0, 0.5, 1]) @requires_scipy -def test_interpolate_pd_compat(method, fill_value) -> None: - shapes = [(8, 8), (1, 20), (20, 1), (100, 100)] - frac_nans = [0, 0.5, 1] +def test_interpolate_pd_compat(method, fill_value, dim, shape, frac_nan) -> None: - for shape, frac_nan in itertools.product(shapes, frac_nans): - da, df = make_interpolate_example_data(shape, frac_nan) + da, df = make_interpolate_example_data(shape, frac_nan) - for dim in ["time", "x"]: - actual = da.interpolate_na(method=method, dim=dim, fill_value=fill_value) - expected = df.interpolate( - method=method, - axis=da.get_axis_num(dim), - fill_value=fill_value, - ) - - if method == "linear": - # Note, Pandas does not take left/right fill_value into account - # for the numpy linear methods. 
- # see https://github.com/pandas-dev/pandas/issues/55144 - # This aligns the pandas output with the xarray output - fixed = expected.values.copy() - fixed[pd.isnull(actual.values)] = np.nan - fixed[actual.values == fill_value] = fill_value - else: - fixed = expected.values + actual = da.interpolate_na(method=method, dim=dim, fill_value=fill_value) + expected = df.interpolate( + method=method, + axis=da.get_axis_num(dim), + fill_value=fill_value, + limit_direction='both' + ) - np.testing.assert_allclose(actual.values, fixed) + if method == "linear": + # Note, Pandas does not take left/right fill_value into account + # for the numpy linear methods. + # see https://github.com/pandas-dev/pandas/issues/55144 + # This aligns the pandas output with the xarray output + fixed = expected.values.copy() + fixed[pd.isnull(actual.values)] = np.nan + fixed[actual.values == fill_value] = fill_value + else: + fixed = expected.values + np.testing.assert_allclose(actual.values, fixed) @requires_scipy @@ -213,14 +212,16 @@ def test_interpolate_pd_compat_limits(): ): da, df = make_interpolate_example_data(shape, frac_nan, non_uniform=True) for dim in ["time", "x"]: - actual = da.interpolate_na( - method=method, + actual = da.fill_gaps( dim=dim, limit=limit, limit_direction=limit_direction, limit_area=limit_area, + use_coordinate=False, + ).interpolate_na( + dim=dim, + method=method, use_coordinate=True, - limit_use_coordinate=False, fill_value="extrapolate", ) expected = df.interpolate( @@ -343,30 +344,30 @@ def test_interp1d_fastrack(method, vals): @requires_bottleneck def test_interpolate_limits(): n = np.nan - coord_deltas = pd.TimedeltaIndex(unit="H", data=np.arange(8) * 2) - coords = {"yt": ("y", pd.Timestamp("2000-01-01") + coord_deltas)} - da = xr.DataArray([n, n, 2, n, n, 5, n, n], dims=["y"], coords=coords) + times=pd.date_range("2000-01-01", periods=9, freq="2h") + coords = {"yt": ("y", times)} + da = xr.DataArray([n,n,3, n, n, 6, n, 8, n], dims=["y"], coords=coords) actual 
= da.interpolate_na(dim="y", limit=None, fill_value="extrapolate") - expected = da.copy(data=[n, n, 2, 3, 4, 5, 6, 7]) + # With no limit, everything should be interpolated. Introduced in xarray due to a bug (GH7665), but kept for backward compatibility. + expected = da.copy(data=[1, 2, 3, 4, 5, 6, 7, 8, 9]) assert_equal(actual, expected) - actual = da.interpolate_na(dim="y", limit=1, fill_value="extrapolate") - expected = da.copy(data=[n, n, 2, 3, n, 5, 6, n]) + actual = da.interpolate_na(dim="y", limit=None, max_gap=2, fill_value="extrapolate") + expected = da.copy(data=[1, 2, 3, n, n, 6, 7, 8, 9]) assert_equal(actual, expected) - actual = da.interpolate_na( - dim="y", - limit=pd.Timedelta("3H"), - limit_use_coordinate="yt", - fill_value="extrapolate", - ) - expected = da.copy(data=[n, n, 2, 3, n, 5, 6, n]) + actual = da.interpolate_na(dim="y", limit=1, fill_value="extrapolate") + expected = da.copy(data=[n, n, 3, 4, n, 6, 7, 8, 9]) assert_equal(actual, expected) + actual = da.interpolate_na(dim="y", limit=1, max_gap=2, fill_value="extrapolate") + expected = da.copy(data=[n, n, 3, n, n, 6, 7, 8, 9]) + assert_equal(actual, expected) def test_interpolate_double_coordinate(): - # Check if limit is using 'limit_use_coordinate' and max_gap is using 'use_coordinate' + # Check if max_gap is able to handle string coordinate names + # Limit is always referring to an index n = np.nan da = xr.DataArray( [[1, n, n, 4, n, 6, 7], [1, n, n, n, 5, n, n]], @@ -374,25 +375,22 @@ def test_interpolate_double_coordinate(): coords={"y1": ("y", np.arange(7)), "y2": ("y", np.arange(7) * 2)}, ) actual = da.interpolate_na( - "y", + dim="y", limit=1, max_gap=4, - limit_use_coordinate="y1", - use_coordinate="y2", - fill_value="extrapolate", - ) - expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) + use_coordinate="y1", + fill_value="extrapolate") + expected = da.copy(data=[[1, 2, n, 4, 5, 6, 7], [1, 2, n, n, 5, 6, n]]) assert_equal(actual, expected) actual = 
da.interpolate_na( "y", - limit=3, - max_gap=3, - limit_use_coordinate="y2", - use_coordinate="y1", + limit=2, + max_gap=4, + use_coordinate="y2", fill_value="extrapolate", ) - expected = da.copy(data=[[1, 2, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) + expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, 7]]) assert_equal(actual, expected) @@ -826,37 +824,37 @@ def test_get_limit_fill_mask(): actual = _get_limit_fill_mask( da, dim="y", index=index, limit=3, limit_direction="forward" ) - expected = da.copy(data=[[F, T, T, F, F, F, F, F, T], [F, F, F, T, T, T, T, T, F]]) + expected = da.copy(data=[[T, F, F, T, T, T, T, T, F], [T, T, T, F, F, F, F, F, T]]) assert_equal(actual, expected) actual = _get_limit_fill_mask( da, dim="y", index=index, limit=3, limit_direction="backward" ) - expected = da.copy(data=[[T, T, F, F, F, F, F, T, T], [F, F, T, T, T, T, T, F, F]]) + expected = da.copy(data=[[F, F, T, T, T, T, T, F, F], [T, T, F, F, F, F, F, T, T]]) assert_equal(actual, expected) actual = _get_limit_fill_mask( da, dim="y", index=index, limit=3, limit_direction="both" ) - expected = da.copy(data=[[T, T, T, F, F, F, F, T, T], [F, F, T, T, T, T, T, T, F]]) + expected = da.copy(data=[[F, F, F, T, T, T, T, F, F], [T, T, F, F, F, F, F, F, T]]) assert_equal(actual, expected) actual = _get_limit_fill_mask( da, dim="y", index=index, limit=1, limit_direction="forward" ) - expected = da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, F, T, T, F, T, F, F]]) + expected = da.copy(data=[[T, F, T, T, T, T, T, T, F], [T, T, T, F, F, T, F, T, T]]) assert_equal(actual, expected) actual = _get_limit_fill_mask( da, dim="y", index=index, limit=1, limit_direction="backward" ) - expected = da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, T, T, F, F, T, F, F]]) + expected = da.copy(data=[[T, F, T, T, T, T, T, T, F], [T, T, F, F, T, T, F, T, T]]) assert_equal(actual, expected) actual = _get_limit_fill_mask( da, dim="y", index=index, limit=1, limit_direction="both" ) - expected = 
da.copy(data=[[F, T, F, F, F, F, F, F, T], [F, F, T, T, T, F, T, F, F]]) + expected = da.copy(data=[[T, F, T, T, T, T, T, T, F], [T, T, F, F, F, T, F, T, T]]) assert_equal(actual, expected) @@ -876,11 +874,11 @@ def test_get_area_mask(): _get_limit_area_mask(da, dim="y", index=index, limit_area="cow") actual = _get_limit_area_mask(da, dim="y", index=index, limit_area="inside") - expected = da.copy(data=[[F, T, T, T, T, T, T, T, T], [F, F, F, T, T, T, T, F, F]]) + expected = da.copy(data=[[T, F, F, F, F, F, F, F, F], [T, T, T, F, F, F, F, T, T]]) assert_equal(actual, expected) actual = _get_limit_area_mask(da, dim="y", index=index, limit_area="outside") - expected = da.copy(data=[[T, T, F, F, T, F, F, F, T], [T, T, T, T, F, F, T, T, T]]) + expected = da.copy(data=[[F, F, T, T, F, T, T, T, F], [F, F, F, F, T, T, F, F, F]]) assert_equal(actual, expected) @@ -1026,13 +1024,13 @@ def test_interpolate_na_max_gap_2d(coords): coords=coords, ) - actual = da.interpolate_na("y", use_coordinate=use_coordinate, max_gap=2) + actual = da.interpolate_na("y", use_coordinate=use_coordinate, max_gap=2, fill_value='extrapolate') expected_y = da.copy( data=[ - [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, n], - [n, n, 3, n, n, 6, n, n, n, 10, n, n], - [n, n, 3, n, n, 6, n, n, n, 10, n, n], - [n, 2, 3, 4, 5, 6, n, n, n, 10, 11, n], + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], + [1, 2, 3, n, n, 6, n, n, n, 10, 11, 12], + [1, 2, 3, n, n, 6, n, n, n, 10, 11, 12], + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], ] ) assert_equal(actual, expected_y) @@ -1045,12 +1043,12 @@ def test_interpolate_na_max_gap_2d(coords): [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], [n, n, 3, n, n, 6, n, n, n, 10, n, n], [n, n, 3, n, n, 6, n, n, n, 10, n, n], - [n, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], ] ) assert_equal(actual, expected_y_extra) - actual = da.interpolate_na("x", use_coordinate=use_coordinate, max_gap=3) + actual = da.interpolate_na("x", use_coordinate=use_coordinate, max_gap=3, 
fill_value="extrapolate") expected_x = xr.DataArray( [ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], @@ -1063,13 +1061,12 @@ def test_interpolate_na_max_gap_2d(coords): ) assert_equal(actual, expected_x) - def test_interpolate_na_limit_2d(): n = np.nan - coord_deltas = pd.TimedeltaIndex(unit="H", data=np.arange(12) * 3) + times=pd.date_range("2000-01-01", periods=12, freq="3h") coords = { "x": np.arange(3) * 2, - "time": (pd.Timestamp("2000-01-01") + coord_deltas), + "time": (times), } da = xr.DataArray( [ @@ -1090,8 +1087,105 @@ def test_interpolate_na_limit_2d(): ) assert_equal(actual, expected) - actual = da.interpolate_na( - "time", limit=2, limit_direction="backward", fill_value="extrapolate" +@requires_scipy +def test_interpolators_complex_out_of_bounds(): + """Ensure complex nans are used for complex data""" + + xi = np.array([-1, 0, 1, 2, 5], dtype=np.float64) + yi = np.exp(1j * xi) + x = np.array([-2, 1, 6], dtype=np.float64) + + expected = np.array( + [np.nan + np.nan * 1j, np.exp(1j), np.nan + np.nan * 1j], dtype=yi.dtype + ) + + for method, interpolator in [ + ("linear", NumpyInterpolator), + ("linear", ScipyInterpolator), + ]: + f = interpolator(xi, yi, method=method) + actual = f(x) + assert_array_equal(actual, expected) + +####Masking Functionality +def test_fill_gaps_limit(): + n = np.nan + times=pd.date_range("2000-01-01", periods=8, freq="2h") + coords = {"yt": ("y", times)} + da = xr.DataArray([n, n, 2, n, n, 5, n, n], dims=["y"], coords=coords) + + actual = da.fill_gaps(dim='y', limit=None).interpolate_na(dim="y", fill_value="extrapolate") + expected = da.copy(data=[0, 1, 2, 3, 4, 5, 6, 7]) + assert_equal(actual, expected) + + actual = da.fill_gaps(dim='y', limit=1).interpolate_na(dim="y", fill_value="extrapolate") + expected = da.copy(data=[n, 1, 2, 3, 4, 5, 6, n]) + assert_equal(actual, expected) + + actual = da.fill_gaps( + dim="y", + limit=pd.Timedelta("3h"), + use_coordinate="yt", + ).interpolate_na(dim='y', fill_value="extrapolate") + expected = 
da.copy(data=[n, 1, 2, 3, 4, 5, 6, n]) + assert_equal(actual, expected) + + actual = da.fill_gaps( + dim="y", + limit=pd.Timedelta("3h"), + limit_direction="backward", + use_coordinate="yt", + ).interpolate_na(dim='y', fill_value="extrapolate") + expected = da.copy(data=[n, 1, 2, n, 4, 5, n, n]) + assert_equal(actual, expected) + +def test_mask_gap_limit_2d(): + n = np.nan + times=pd.date_range("2000-01-01", periods=12, freq="3h") + coords = { + "x": np.arange(3) * 2, + "time": (times), + } + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + coords=coords, + ) + + mask = da.fill_gaps('time', limit=1, use_coordinate=False) + actual=mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, 7, n, 9, 10, 11, 12], + [n, 2, 3, 4, 5, 6, 7, n, 9, 10, 11, n], + [1, 2, 3, 4, 5, 6, 7, n, 9, 10, 11, 12], + ] + ) + assert_equal(actual, expected) + actual=mask.ffill(dim="time") + expected = da.copy( + data=[ + [1, 2, 3, 4, 4, 6, 6, n, 6, 10, 11, 11], + [n, n, 3, 3, 3, 6, 6, n, 6, 10, 10, n], + [n, 2, 3, 4, 4, 6, 6, n, 6, 10, 11, 11], + ] + ) + assert_equal(actual, expected) + actual=mask.fillna(0) + expected = da.copy( + data=[ + [1, 2, 3, 4, 0, 6, 0, n, 0, 10, 11, 0], + [n, 0, 3, 0, 0, 6, 0, n, 0, 10, 0, n], + [0, 2, 3, 4, 0, 6, 0, n, 0, 10, 11, 0], + ] + ) + assert_equal(actual, expected) + + actual = da.fill_gaps('time', limit=2, use_coordinate=False, limit_direction='backward').interpolate_na( + "time", fill_value="extrapolate" ) expected = da.copy( data=[ @@ -1102,12 +1196,14 @@ def test_interpolate_na_limit_2d(): ) assert_equal(actual, expected) - actual = da.interpolate_na( + actual = da.fill_gaps( "time", - limit=pd.Timedelta("3H"), + limit=pd.Timedelta("3h"), limit_direction="backward", limit_area="inside", - limit_use_coordinate=True, + use_coordinate=True + ).interpolate_na( + 'time', fill_value="extrapolate", ) 
expected = da.copy( @@ -1118,12 +1214,14 @@ def test_interpolate_na_limit_2d(): ] ) - actual = da.interpolate_na( + actual = da.fill_gaps( "time", - limit=pd.Timedelta("3H"), + limit=pd.Timedelta("3h"), limit_direction="backward", limit_area="outside", - limit_use_coordinate=True, + use_coordinate=True + ).interpolate_na( + 'time', fill_value="extrapolate", ) expected = da.copy( @@ -1135,12 +1233,14 @@ def test_interpolate_na_limit_2d(): ) assert_equal(actual, expected) - actual = da.interpolate_na( + actual = da.fill_gaps( "time", limit=None, limit_direction="backward", limit_area="outside", - limit_use_coordinate=True, + use_coordinate=True + ).interpolate_na( + 'time', fill_value=8, ) expected = da.copy( @@ -1162,14 +1262,16 @@ def test_interpolate_na_limit_2d(): dims=["x", "y"], coords={"x": np.arange(4) * 2}, ) - actual = da.interpolate_na( - method="linear", + actual = da.fill_gaps( dim="x", limit=3, limit_direction="forward", limit_area=None, - limit_use_coordinate=True, + use_coordinate=True + ).interpolate_na( + 'x', fill_value="extrapolate", + method="linear", ) expected = da.copy( data=[ @@ -1181,23 +1283,79 @@ def test_interpolate_na_limit_2d(): ) assert_equal(actual, expected) +def test_mask_gap_max_gap_2d(): + n = np.nan + times=pd.date_range("2000-01-01", periods=12, freq="3h") + coords = { + "x": np.arange(3) * 2, + "time": (times), + } + da = xr.DataArray( + [ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [n, 2, 3, 4, n, 6, n, n, n, 10, 11, n], + ], + coords=coords, + ) -@requires_scipy -def test_interpolators_complex_out_of_bounds(): - """Ensure complex nans are used for complex data""" + mask = da.fill_gaps('time', max_gap=1, use_coordinate=False) + actual=mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + ] + ) + assert_equal(actual, expected) + 
mask = da.fill_gaps('time', max_gap=2, use_coordinate=False) + actual=mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], + [1, 2, 3, n, n, 6, n, n, n, 10, 11, 12], + [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], + ] + ) + assert_equal(actual, expected) - xi = np.array([-1, 0, 1, 2, 5], dtype=np.float64) - yi = np.exp(1j * xi) - x = np.array([-2, 1, 6], dtype=np.float64) + mask = da.fill_gaps('time', max_gap=pd.Timedelta("3h"), use_coordinate=True) + actual=mask.interpolate_na("time", fill_value="extrapolate") + expected = da.copy( + data=[ + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + [n, n, 3, n, n, 6, n, n, n, 10, n, n], + [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], + ] + ) + assert_equal(actual, expected) - expected = np.array( - [np.nan + np.nan * 1j, np.exp(1j), np.nan + np.nan * 1j], dtype=yi.dtype +def test_mask_double_coordinate(): + # Check if limit and max_gap are able to handle string coordinate names + n = np.nan + da = xr.DataArray( + [[1, n, n, 4, n, 6, 7], [1, n, n, n, 5, n, n]], + dims=["x", "y"], + coords={"y1": ("y", np.arange(7)), "y2": ("y", np.arange(7) * 2)}, ) + actual = da.fill_gaps( + "y", + limit=1, + max_gap=4, + use_coordinate="y1", + ).interpolate_na("y", fill_value="extrapolate") + expected = da.copy(data=[[1, 2, 3, 4, 5, 6, 7], [1, 2, n, 4, 5, 6, n]]) + assert_equal(actual, expected) - for method, interpolator in [ - ("linear", NumpyInterpolator), - ("linear", ScipyInterpolator), - ]: - f = interpolator(xi, yi, method=method) - actual = f(x) - assert_array_equal(actual, expected) + actual = da.fill_gaps( + "y", + limit=2, + max_gap=4, + use_coordinate="y2" + ).interpolate_na( + "y", + fill_value="extrapolate", + ) + expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) + assert_equal(actual, expected) \ No newline at end of file From 1ac5e9c51983e917311d93761782b68855349ef7 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss 
<42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 23 Aug 2024 17:01:20 +0200 Subject: [PATCH 07/23] Remove keep_attrs from docstring of filling functions --- xarray/core/missing.py | 70 ++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 5e348478193..b8b51ea72ee 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -40,10 +40,6 @@ Parameters ---------- -keep_attrs : bool, default: None - If True, the attributes (``attrs``) will be copied from the original - object to the new one. If False, the new object will be returned - without attributes. If None uses the global default. **kwargs : dict Additional keyword arguments passed on to `{name}`. @@ -53,6 +49,7 @@ New object with `{name}` applied to all unmasked values. """ + def _get_gap_left_edge( obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False ): @@ -90,15 +87,15 @@ def _get_limit_fill_mask( limit, limit_direction, ): - #At the left boundary, distance to left is nan. - #For nan, a<=b and ~(a>b) behave differently + # At the left boundary, distance to left is nan. 
+ # For nan, a<=b and ~(a>b) behave differently if limit_direction == "forward": limit_mask = ~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit) elif limit_direction == "backward": limit_mask = ~(_get_gap_dist_to_right_edge(obj, dim, index) <= limit) elif limit_direction == "both": - limit_mask = (~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit)) & (~( - _get_gap_dist_to_right_edge(obj, dim, index) <= limit) + limit_mask = (~(_get_gap_dist_to_left_edge(obj, dim, index) <= limit)) & ( + ~(_get_gap_dist_to_right_edge(obj, dim, index) <= limit) ) else: raise ValueError( @@ -169,7 +166,7 @@ def _get_gap_mask( limit = np.inf else: if limit_use_coordinate is False: - if not isinstance(limit, (Number, np.number)): + if not isinstance(limit, Number | np.number): raise TypeError( f"Expected integer or floating point limit since limit_use_coordinate=False. Received {type(limit).__name__}." ) @@ -185,7 +182,7 @@ def _get_gap_mask( max_gap = timedelta_to_numeric(max_gap) if not max_gap_use_coordinate: - if not isinstance(max_gap, (Number, np.number)): + if not isinstance(max_gap, Number | np.number): raise TypeError( f"Expected integer or floating point max_gap since use_coordinate=False. Received {type(max_gap).__name__}." 
) @@ -208,21 +205,24 @@ def _get_gap_mask( return None # Calculate individual masks - masks=[] + masks = [] if need_limit_mask: - masks.append(_get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction)) + masks.append( + _get_limit_fill_mask(obj, dim, index_limit, limit, limit_direction) + ) if need_area_mask: masks.append(_get_limit_area_mask(obj, dim, index_limit, limit_area)) if need_max_gap_mask: masks.append(_get_max_gap_mask(obj, dim, index_max_gap, max_gap)) - #Combine masks - mask=masks[0] + # Combine masks + mask = masks[0] for m in masks[1:]: - mask|=m + mask |= m return mask + class BaseInterpolator: """Generic interpolator class for normalizing interpolation methods""" @@ -490,7 +490,8 @@ def get_clean_interp_index( def _is_time_index(index): from xarray.coding.cftimeindex import CFTimeIndex - return isinstance(index, (pd.DatetimeIndex, CFTimeIndex)) + return isinstance(index, pd.DatetimeIndex | CFTimeIndex) + def _interp_na_all( self, @@ -524,31 +525,35 @@ def _interp_na_all( ).transpose(*self.dims) return arr + class GapMask: """An object that allows for flexible masking of gaps.""" + def __init__(self, content: DataArray | Dataset, mask: np.ndarray): self.content = content self.mask = mask - + def _fill_method(name: str, _fill_function: Callable | None = None): def method(self, *args, _fill_function=_fill_function, **kwargs): if _fill_function is None: - _fill_function=getattr(self.content, name) - filled=_fill_function(*args, **kwargs) + _fill_function = getattr(self.content, name) + filled = _fill_function(*args, **kwargs) else: - filled=_fill_function(self.content, *args, **kwargs) + filled = _fill_function(self.content, *args, **kwargs) if self.mask is not None: - filled=filled.where(~self.mask, other=self.content) + filled = filled.where(~self.mask, other=self.content) return filled + method.__name__ = name method.__doc__ = _FILL_MISSING_DOCSTRING_TEMPLATE.format(name=name) return method - ffill=_fill_method('ffill') - 
bfill=_fill_method('bfill') - fillna=_fill_method('fillna') - interpolate_na=_fill_method('interpolate_na') + ffill = _fill_method("ffill") + bfill = _fill_method("bfill") + fillna = _fill_method("fillna") + interpolate_na = _fill_method("interpolate_na") + def mask_gaps( self, @@ -557,7 +562,7 @@ def mask_gaps( limit: ( int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None ) = None, - limit_direction: LimitDirectionOptions ="both", + limit_direction: LimitDirectionOptions = "both", limit_area: LimitAreaOptions | None = None, max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, ) -> GapMask: @@ -576,8 +581,6 @@ def mask_gaps( return GapMask(self, mask) - - def interp_na( self, dim: Hashable | None = None, @@ -595,10 +598,10 @@ def interp_na( if dim is None: raise NotImplementedError("dim is a required argument") - #This was the original behaviour of interp_na and is kept for backward compatibility - #Limit=None: Fill everything, including both boundaries - #Limit!=None: Do forward interpolation until limit - limit_use_coordinate=False + # This was the original behaviour of interp_na and is kept for backward compatibility + # Limit=None: Fill everything, including both boundaries + # Limit!=None: Do forward interpolation until limit + limit_use_coordinate = False if limit is None: limit_direction = "both" else: @@ -615,13 +618,12 @@ def interp_na( use_coordinate, ) - arr=_interp_na_all(self, dim, method, use_coordinate, keep_attrs, **kwargs) + arr = _interp_na_all(self, dim, method, use_coordinate, keep_attrs, **kwargs) if mask is not None: arr = arr.where(~mask) return arr - def func_interpolate_na(interpolator, y, x, **kwargs): """helper function to apply interpolation along 1 dimension""" # reversed arguments are so that attrs are preserved from da, not index From 97b00a476844a49fa1130fb250d4b89e3ce6b944 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Fri, 23 Aug 2024 
18:11:22 +0200 Subject: [PATCH 08/23] Fix typos, undo empty spaces, remove temporarily introduced arguments --- xarray/core/dataarray.py | 91 ++++++++++++++++++---------------------- xarray/core/dataset.py | 60 +++++++++++++------------- xarray/core/missing.py | 2 +- 3 files changed, 71 insertions(+), 82 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 092a48cf2b6..d87a43ff477 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -93,9 +93,9 @@ from xarray.backends import ZarrStore from xarray.backends.api import T_NetcdfEngine, T_NetcdfTypes from xarray.core.groupby import DataArrayGroupBy + from xarray.core.missing import GapMask from xarray.core.resample import DataArrayResample from xarray.core.rolling import DataArrayCoarsen, DataArrayRolling - from xarray.core.missing import GapMask from xarray.core.types import ( CoarsenBoundaryOptions, DatetimeLike, @@ -3477,21 +3477,12 @@ def fillna(self, value: Any) -> Self: out = ops.fillna(self, value) return out - def interpolate_na( self, dim: Hashable, method: InterpOptions = "linear", + limit: int | None = None, use_coordinate: bool | Hashable = True, - limit: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, max_gap: ( None | int @@ -3515,25 +3506,25 @@ def interpolate_na( String indicating which method to use for interpolation: - 'linear': linear interpolation. Additional keyword - arguments are passed to :py:func:`numpy.interp` + arguments are passed to :py:func:`numpy.interp` - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial': - are passed to :py:func:`scipy.interpolate.interp1d`. If - ``method='polynomial'``, the ``order`` keyword argument must also be - provided. + are passed to :py:func:`scipy.interpolate.interp1d`. If + ``method='polynomial'``, the ``order`` keyword argument must also be + provided. 
- 'barycentric', 'krogh', 'pchip', 'spline', 'akima': use their - respective :py:class:`scipy.interpolate` classes. + respective :py:class:`scipy.interpolate` classes. + limit : int or None, default: None + Maximum number of consecutive NaNs to fill. Must be greater than 0 + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, + see ``max_gap``. use_coordinate : bool or str, default: True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if equally-spaced along ``dim``. If True, the IndexVariable `dim` is used. If ``use_coordinate`` is a string, it specifies the name of a coordinate variable to use as the index. - limit : int or None, default: None - Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. This filling is done regardless of the size of - the gap in the data. To only interpolate over gaps less than a given length, - see ``max_gap``. max_gap : int, float, str, pandas.Timedelta, numpy.timedelta64, datetime.timedelta, default: None Maximum size of gap, a continuous sequence of NaNs, that will be filled. Use None for no limit. When interpolating along a datetime64 dimension @@ -3554,7 +3545,7 @@ def interpolate_na( array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) Coordinates: - * x (x) int64 0 1 2 3 4 5 6 7 8 + * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively keep_attrs : bool or None, default: None @@ -3583,33 +3574,32 @@ def interpolate_na( Size: 40B array([nan, 2., 3., nan, 0.]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 40B 0 1 2 3 4 >>> da.interpolate_na(dim="x", method="linear") Size: 40B array([nan, 2. , 3. , 1.5, 0. ]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 40B 0 1 2 3 4 >>> da.interpolate_na(dim="x", method="linear", fill_value="extrapolate") Size: 40B array([1. 
, 2. , 3. , 1.5, 0. ]) Coordinates: - * x (x) int64 40B 0 1 2 3 4 + * x (x) int64 40B 0 1 2 3 4 """ from xarray.core.missing import interp_na return interp_na( - self, - dim=dim, - method=method, - limit=limit, - use_coordinate=use_coordinate, - max_gap=max_gap, - keep_attrs=keep_attrs, - **kwargs, - ) - + self, + dim=dim, + method=method, + limit=limit, + use_coordinate=use_coordinate, + max_gap=max_gap, + keep_attrs=keep_attrs, + **kwargs, + ) def ffill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values forward @@ -3778,7 +3768,7 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: from xarray.core.missing import bfill return bfill(self, dim, limit=limit) - + def fill_gaps( self, dim: Hashable, @@ -3873,10 +3863,6 @@ def fill_gaps( * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively - keep_attrs : bool or None, default: None - If True, the dataarray's attributes (`attrs`) will be copied from - the original object to the new one. If False, the new - object will be returned without attributes. Returns ------- @@ -3907,23 +3893,19 @@ def fill_gaps( array([nan, 2., nan, nan, 5., nan, 0.]) Coordinates: * x (x) int64 56B 0 1 2 3 4 5 6 - >>> da.fill_gaps( - ... dim="x", limit=1, limit_direction="forward" - ... ).interpolate_na(dim="x") + >>> da.fill_gaps(dim="x", limit=1, limit_direction="forward").interpolate_na( + ... dim="x" + ... ) Size: 56B array([nan, 2. , 3. , nan, 5. , 2.5, 0. ]) Coordinates: * x (x) int64 56B 0 1 2 3 4 5 6 - >>> da.fill_gaps( - ... dim="x", max_gap=2, limit_direction="forward" - ... ).ffill(dim="x") + >>> da.fill_gaps(dim="x", max_gap=2, limit_direction="forward").ffill(dim="x") Size: 56B array([nan, 2., nan, nan, 5., 5., 0.]) Coordinates: * x (x) int64 56B 0 1 2 3 4 5 6 - >>> da.fill_gaps( - ... dim="x", limit_area="inside" - ... 
).fillna(9) + >>> da.fill_gaps(dim="x", limit_area="inside").fillna(9) Size: 56B array([nan, 2., 9., 9., 5., 9., 0.]) Coordinates: @@ -3931,8 +3913,15 @@ def fill_gaps( """ from xarray.core.missing import mask_gaps - return mask_gaps(self, dim, use_coordinate=use_coordinate, limit=limit, limit_direction=limit_direction, limit_area=limit_area, max_gap=max_gap) - + return mask_gaps( + self, + dim, + use_coordinate=use_coordinate, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + max_gap=max_gap, + ) def combine_first(self, other: Self) -> Self: """Combine two DataArray objects, with union of coordinates. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f55869d8257..310ea3a3f34 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -140,6 +140,7 @@ from xarray.core.dataarray import DataArray from xarray.core.groupby import DatasetGroupBy from xarray.core.merge import CoercibleMapping, CoercibleValue, _MergeResult + from xarray.core.missing import GapMask from xarray.core.resample import DatasetResample from xarray.core.rolling import DatasetCoarsen, DatasetRolling from xarray.core.types import ( @@ -6594,19 +6595,8 @@ def interpolate_na( self, dim: Hashable, method: InterpOptions = "linear", + limit: int | None = None, use_coordinate: bool | Hashable = True, - limit: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, - limit_direction: LimitDirectionOptions = "forward", - limit_area: LimitAreaOptions | None = None, - limit_use_coordinate: bool | Hashable = False, max_gap: ( int | float @@ -6673,6 +6663,10 @@ def interpolate_na( * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + keep_attrs : bool or None, default: None + If True, the dataarray's attributes (`attrs`) will be copied from + the original object to the new one. If False, the new + object will be returned without attributes. 
**kwargs : dict, optional parameters passed verbatim to the underlying interpolation function @@ -6737,6 +6731,9 @@ def interpolate_na( """ from xarray.core.missing import _apply_over_vars_with_dim, interp_na + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + new = _apply_over_vars_with_dim( interp_na, self, @@ -6745,8 +6742,10 @@ def interpolate_na( limit=limit, use_coordinate=use_coordinate, max_gap=max_gap, + keep_attrs=keep_attrs, **kwargs, ) + new.attrs = self.attrs if keep_attrs else None return new def ffill(self, dim: Hashable, limit: int | None = None) -> Self: @@ -6972,10 +6971,6 @@ def fill_gaps( * x (x) int64 0 1 2 3 4 5 6 7 8 The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively - keep_attrs : bool or None, default: None - If True, the dataarray's attributes (`attrs`) will be copied from - the original object to the new one. If False, the new - object will be returned without attributes. Returns ------- @@ -6984,10 +6979,10 @@ def fill_gaps( See Also -------- - DataArray.fillna - DataArray.ffill - DataArray.bfill - DataArray.interpolate_na + Dataset.fillna + Dataset.ffill + Dataset.bfill + Dataset.interpolate_na pandas.DataFrame.interpolate Notes @@ -7011,9 +7006,9 @@ def fill_gaps( Data variables: A (x) float64 56B nan 2.0 nan nan 5.0 nan 0.0 B (x) float64 56B nan 2.0 nan nan 5.0 6.0 nan - >>> ds.fill_gaps( - ... dim="x", limit=1, limit_direction="forward" - ... ).interpolate_na(dim="x") + >>> ds.fill_gaps(dim="x", limit=1, limit_direction="forward").interpolate_na( + ... dim="x" + ... ) Size: 168B Dimensions: (x: 7) Coordinates: @@ -7021,9 +7016,7 @@ def fill_gaps( Data variables: A (x) float64 56B nan 2.0 3.0 nan 5.0 2.5 0.0 B (x) float64 56B nan 2.0 3.0 nan 5.0 6.0 nan - >>> ds.fill_gaps( - ... dim="x", max_gap=2, limit_direction="forward" - ... 
).ffill(dim="x") + >>> ds.fill_gaps(dim="x", max_gap=2, limit_direction="forward").ffill(dim="x") Size: 168B Dimensions: (x: 7) Coordinates: @@ -7031,9 +7024,7 @@ def fill_gaps( Data variables: A (x) float64 56B nan 2.0 nan nan 5.0 5.0 0.0 B (x) float64 56B nan 2.0 nan nan 5.0 6.0 6.0 - >>> ds.fill_gaps( - ... dim="x", limit_area="inside" - ... ).fillna(9) + >>> ds.fill_gaps(dim="x", limit_area="inside").fillna(9) Size: 168B Dimensions: (x: 7) Coordinates: @@ -7044,7 +7035,16 @@ def fill_gaps( """ from xarray.core.missing import mask_gaps - return mask_gaps(self, dim, use_coordinate=use_coordinate, limit=limit, limit_direction=limit_direction, limit_area=limit_area, max_gap=max_gap) + return mask_gaps( + self, + dim, + use_coordinate=use_coordinate, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + max_gap=max_gap, + ) + def combine_first(self, other: Self) -> Self: """Combine two Datasets, default to data_vars of self. diff --git a/xarray/core/missing.py b/xarray/core/missing.py index b8b51ea72ee..3cf7d303962 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -450,7 +450,7 @@ def get_clean_interp_index( from xarray.coding.cftimeindex import CFTimeIndex index = _get_raw_interp_index(arr, dim, use_coordinate) - # index.name is None for multiindexes + # TODO: index.name is None for multiindexes # set name for nice error messages below if isinstance(index, pd.MultiIndex): index.name = dim From 1626489d4ae08449880f6c3605ba43f0f05d9805 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 23 Aug 2024 16:22:12 +0000 Subject: [PATCH 09/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_missing.py | 101 +++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 47 deletions(-) diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index f75ef470b07..371bf38e3c6 
100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -117,7 +117,7 @@ def test_interpolate_pd_compat(method, fill_value, dim, shape, frac_nan) -> None method=method, axis=da.get_axis_num(dim), fill_value=fill_value, - limit_direction='both' + limit_direction="both", ) if method == "linear": @@ -344,12 +344,12 @@ def test_interp1d_fastrack(method, vals): @requires_bottleneck def test_interpolate_limits(): n = np.nan - times=pd.date_range("2000-01-01", periods=9, freq="2h") + times = pd.date_range("2000-01-01", periods=9, freq="2h") coords = {"yt": ("y", times)} - da = xr.DataArray([n,n,3, n, n, 6, n, 8, n], dims=["y"], coords=coords) + da = xr.DataArray([n, n, 3, n, n, 6, n, 8, n], dims=["y"], coords=coords) actual = da.interpolate_na(dim="y", limit=None, fill_value="extrapolate") - #With no limit, everything should be interpolated. Introduced in xarray due to a bug (GH7665), but kept for backward compatibility + # With no limit, everything should be interpolated. 
Introduced in xarray due to a bug (GH7665), but kept for backward compatibility expected = da.copy(data=[1, 2, 3, 4, 5, 6, 7, 8, 9]) assert_equal(actual, expected) @@ -365,6 +365,7 @@ def test_interpolate_limits(): expected = da.copy(data=[n, n, 3, n, n, 6, 7, 8, 9]) assert_equal(actual, expected) + def test_interpolate_double_coordinate(): # Check if max_gap is able to handle string coordinate names # Limit is always refering to an index @@ -375,11 +376,8 @@ def test_interpolate_double_coordinate(): coords={"y1": ("y", np.arange(7)), "y2": ("y", np.arange(7) * 2)}, ) actual = da.interpolate_na( - dim="y", - limit=1, - max_gap=4, - use_coordinate="y1", - fill_value="extrapolate") + dim="y", limit=1, max_gap=4, use_coordinate="y1", fill_value="extrapolate" + ) expected = da.copy(data=[[1, 2, n, 4, 5, 6, 7], [1, 2, n, n, 5, 6, n]]) assert_equal(actual, expected) @@ -1024,7 +1022,9 @@ def test_interpolate_na_max_gap_2d(coords): coords=coords, ) - actual = da.interpolate_na("y", use_coordinate=use_coordinate, max_gap=2, fill_value='extrapolate') + actual = da.interpolate_na( + "y", use_coordinate=use_coordinate, max_gap=2, fill_value="extrapolate" + ) expected_y = da.copy( data=[ [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], @@ -1048,7 +1048,9 @@ def test_interpolate_na_max_gap_2d(coords): ) assert_equal(actual, expected_y_extra) - actual = da.interpolate_na("x", use_coordinate=use_coordinate, max_gap=3, fill_value="extrapolate") + actual = da.interpolate_na( + "x", use_coordinate=use_coordinate, max_gap=3, fill_value="extrapolate" + ) expected_x = xr.DataArray( [ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, n], @@ -1061,9 +1063,10 @@ def test_interpolate_na_max_gap_2d(coords): ) assert_equal(actual, expected_x) + def test_interpolate_na_limit_2d(): n = np.nan - times=pd.date_range("2000-01-01", periods=12, freq="3h") + times = pd.date_range("2000-01-01", periods=12, freq="3h") coords = { "x": np.arange(3) * 2, "time": (times), @@ -1087,6 +1090,7 @@ def 
test_interpolate_na_limit_2d(): ) assert_equal(actual, expected) + @requires_scipy def test_interpolators_complex_out_of_bounds(): """Ensure complex nans are used for complex data""" @@ -1107,18 +1111,23 @@ def test_interpolators_complex_out_of_bounds(): actual = f(x) assert_array_equal(actual, expected) + ####Masking Functionality def test_fill_gaps_limit(): n = np.nan - times=pd.date_range("2000-01-01", periods=8, freq="2h") + times = pd.date_range("2000-01-01", periods=8, freq="2h") coords = {"yt": ("y", times)} da = xr.DataArray([n, n, 2, n, n, 5, n, n], dims=["y"], coords=coords) - actual = da.fill_gaps(dim='y', limit=None).interpolate_na(dim="y", fill_value="extrapolate") + actual = da.fill_gaps(dim="y", limit=None).interpolate_na( + dim="y", fill_value="extrapolate" + ) expected = da.copy(data=[0, 1, 2, 3, 4, 5, 6, 7]) assert_equal(actual, expected) - actual = da.fill_gaps(dim='y', limit=1).interpolate_na(dim="y", fill_value="extrapolate") + actual = da.fill_gaps(dim="y", limit=1).interpolate_na( + dim="y", fill_value="extrapolate" + ) expected = da.copy(data=[n, 1, 2, 3, 4, 5, 6, n]) assert_equal(actual, expected) @@ -1126,7 +1135,7 @@ def test_fill_gaps_limit(): dim="y", limit=pd.Timedelta("3h"), use_coordinate="yt", - ).interpolate_na(dim='y', fill_value="extrapolate") + ).interpolate_na(dim="y", fill_value="extrapolate") expected = da.copy(data=[n, 1, 2, 3, 4, 5, 6, n]) assert_equal(actual, expected) @@ -1135,13 +1144,14 @@ def test_fill_gaps_limit(): limit=pd.Timedelta("3h"), limit_direction="backward", use_coordinate="yt", - ).interpolate_na(dim='y', fill_value="extrapolate") + ).interpolate_na(dim="y", fill_value="extrapolate") expected = da.copy(data=[n, 1, 2, n, 4, 5, n, n]) assert_equal(actual, expected) + def test_mask_gap_limit_2d(): n = np.nan - times=pd.date_range("2000-01-01", periods=12, freq="3h") + times = pd.date_range("2000-01-01", periods=12, freq="3h") coords = { "x": np.arange(3) * 2, "time": (times), @@ -1155,8 +1165,8 @@ def 
test_mask_gap_limit_2d(): coords=coords, ) - mask = da.fill_gaps('time', limit=1, use_coordinate=False) - actual=mask.interpolate_na("time", fill_value="extrapolate") + mask = da.fill_gaps("time", limit=1, use_coordinate=False) + actual = mask.interpolate_na("time", fill_value="extrapolate") expected = da.copy( data=[ [1, 2, 3, 4, 5, 6, 7, n, 9, 10, 11, 12], @@ -1165,7 +1175,7 @@ def test_mask_gap_limit_2d(): ] ) assert_equal(actual, expected) - actual=mask.ffill(dim="time") + actual = mask.ffill(dim="time") expected = da.copy( data=[ [1, 2, 3, 4, 4, 6, 6, n, 6, 10, 11, 11], @@ -1174,7 +1184,7 @@ def test_mask_gap_limit_2d(): ] ) assert_equal(actual, expected) - actual=mask.fillna(0) + actual = mask.fillna(0) expected = da.copy( data=[ [1, 2, 3, 4, 0, 6, 0, n, 0, 10, 11, 0], @@ -1184,9 +1194,9 @@ def test_mask_gap_limit_2d(): ) assert_equal(actual, expected) - actual = da.fill_gaps('time', limit=2, use_coordinate=False, limit_direction='backward').interpolate_na( - "time", fill_value="extrapolate" - ) + actual = da.fill_gaps( + "time", limit=2, use_coordinate=False, limit_direction="backward" + ).interpolate_na("time", fill_value="extrapolate") expected = da.copy( data=[ [1, 2, 3, 4, 5, 6, n, 8, 9, 10, 11, n], @@ -1201,9 +1211,9 @@ def test_mask_gap_limit_2d(): limit=pd.Timedelta("3h"), limit_direction="backward", limit_area="inside", - use_coordinate=True + use_coordinate=True, ).interpolate_na( - 'time', + "time", fill_value="extrapolate", ) expected = da.copy( @@ -1219,9 +1229,9 @@ def test_mask_gap_limit_2d(): limit=pd.Timedelta("3h"), limit_direction="backward", limit_area="outside", - use_coordinate=True + use_coordinate=True, ).interpolate_na( - 'time', + "time", fill_value="extrapolate", ) expected = da.copy( @@ -1238,9 +1248,9 @@ def test_mask_gap_limit_2d(): limit=None, limit_direction="backward", limit_area="outside", - use_coordinate=True + use_coordinate=True, ).interpolate_na( - 'time', + "time", fill_value=8, ) expected = da.copy( @@ -1267,9 +1277,9 
@@ def test_mask_gap_limit_2d(): limit=3, limit_direction="forward", limit_area=None, - use_coordinate=True + use_coordinate=True, ).interpolate_na( - 'x', + "x", fill_value="extrapolate", method="linear", ) @@ -1283,9 +1293,10 @@ def test_mask_gap_limit_2d(): ) assert_equal(actual, expected) + def test_mask_gap_max_gap_2d(): n = np.nan - times=pd.date_range("2000-01-01", periods=12, freq="3h") + times = pd.date_range("2000-01-01", periods=12, freq="3h") coords = { "x": np.arange(3) * 2, "time": (times), @@ -1299,8 +1310,8 @@ def test_mask_gap_max_gap_2d(): coords=coords, ) - mask = da.fill_gaps('time', max_gap=1, use_coordinate=False) - actual=mask.interpolate_na("time", fill_value="extrapolate") + mask = da.fill_gaps("time", max_gap=1, use_coordinate=False) + actual = mask.interpolate_na("time", fill_value="extrapolate") expected = da.copy( data=[ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], @@ -1309,8 +1320,8 @@ def test_mask_gap_max_gap_2d(): ] ) assert_equal(actual, expected) - mask = da.fill_gaps('time', max_gap=2, use_coordinate=False) - actual=mask.interpolate_na("time", fill_value="extrapolate") + mask = da.fill_gaps("time", max_gap=2, use_coordinate=False) + actual = mask.interpolate_na("time", fill_value="extrapolate") expected = da.copy( data=[ [1, 2, 3, 4, 5, 6, n, n, n, 10, 11, 12], @@ -1320,8 +1331,8 @@ def test_mask_gap_max_gap_2d(): ) assert_equal(actual, expected) - mask = da.fill_gaps('time', max_gap=pd.Timedelta("3h"), use_coordinate=True) - actual=mask.interpolate_na("time", fill_value="extrapolate") + mask = da.fill_gaps("time", max_gap=pd.Timedelta("3h"), use_coordinate=True) + actual = mask.interpolate_na("time", fill_value="extrapolate") expected = da.copy( data=[ [1, 2, 3, 4, n, 6, n, n, n, 10, 11, 12], @@ -1331,6 +1342,7 @@ def test_mask_gap_max_gap_2d(): ) assert_equal(actual, expected) + def test_mask_double_coordinate(): # Check if limit and max_gap are able to handle string coordinate names n = np.nan @@ -1348,14 +1360,9 @@ def 
test_mask_double_coordinate(): expected = da.copy(data=[[1, 2, 3, 4, 5, 6, 7], [1, 2, n, 4, 5, 6, n]]) assert_equal(actual, expected) - actual = da.fill_gaps( - "y", - limit=2, - max_gap=4, - use_coordinate="y2" - ).interpolate_na( + actual = da.fill_gaps("y", limit=2, max_gap=4, use_coordinate="y2").interpolate_na( "y", fill_value="extrapolate", ) expected = da.copy(data=[[1, n, n, 4, 5, 6, 7], [1, n, n, n, 5, 6, n]]) - assert_equal(actual, expected) \ No newline at end of file + assert_equal(actual, expected) From b3d70d67ba00ee37145b11f8b562ac81d491dd8a Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sat, 24 Aug 2024 21:13:01 +0200 Subject: [PATCH 10/23] Add line break for readability --- xarray/core/dataarray.py | 13 +++++++++---- xarray/core/dataset.py | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d87a43ff477..fe5b71f5382 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3570,6 +3570,7 @@ def interpolate_na( >>> da = xr.DataArray( ... [np.nan, 2, 3, np.nan, 0], dims="x", coords={"x": [0, 1, 2, 3, 4]} ... ) + >>> da Size: 40B array([nan, 2., 3., nan, 0.]) @@ -3888,28 +3889,32 @@ def fill_gaps( ... dims="x", ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, ... ) + >>> da Size: 56B array([nan, 2., nan, nan, 5., nan, 0.]) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps(dim="x", limit=1, limit_direction="forward").interpolate_na( ... dim="x" ... ) Size: 56B array([nan, 2. , 3. , nan, 5. , 2.5, 0. 
]) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps(dim="x", max_gap=2, limit_direction="forward").ffill(dim="x") Size: 56B array([nan, 2., nan, nan, 5., 5., 0.]) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 + >>> da.fill_gaps(dim="x", limit_area="inside").fillna(9) Size: 56B array([nan, 2., 9., 9., 5., 9., 0.]) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 """ from xarray.core.missing import mask_gaps diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 310ea3a3f34..9c5c1e39daa 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6696,6 +6696,7 @@ def interpolate_na( ... }, ... coords={"x": [0, 1, 2, 3, 4]}, ... ) + >>> ds Size: 200B Dimensions: (x: 5) @@ -6998,37 +6999,41 @@ def fill_gaps( ... }, ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, ... ) + >>> ds Size: 168B Dimensions: (x: 7) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 Data variables: A (x) float64 56B nan 2.0 nan nan 5.0 nan 0.0 B (x) float64 56B nan 2.0 nan nan 5.0 6.0 nan + >>> ds.fill_gaps(dim="x", limit=1, limit_direction="forward").interpolate_na( ... dim="x" ... 
) Size: 168B Dimensions: (x: 7) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 Data variables: A (x) float64 56B nan 2.0 3.0 nan 5.0 2.5 0.0 B (x) float64 56B nan 2.0 3.0 nan 5.0 6.0 nan + >>> ds.fill_gaps(dim="x", max_gap=2, limit_direction="forward").ffill(dim="x") Size: 168B Dimensions: (x: 7) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 Data variables: A (x) float64 56B nan 2.0 nan nan 5.0 5.0 0.0 B (x) float64 56B nan 2.0 nan nan 5.0 6.0 6.0 + >>> ds.fill_gaps(dim="x", limit_area="inside").fillna(9) Size: 168B Dimensions: (x: 7) Coordinates: - * x (x) int64 56B 0 1 2 3 4 5 6 + * x (x) int64 56B 0 1 2 3 4 5 6 Data variables: A (x) float64 56B nan 2.0 9.0 9.0 5.0 9.0 0.0 B (x) float64 56B nan 2.0 9.0 9.0 5.0 6.0 nan From 6b4c0f72eb7d36a66aef09235d9f1485aa809a71 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sat, 24 Aug 2024 21:47:58 +0200 Subject: [PATCH 11/23] Enforce kwargs to be passed by name --- xarray/core/dataarray.py | 1 + xarray/core/dataset.py | 1 + 2 files changed, 2 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index fe5b71f5382..030a29f36c9 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3773,6 +3773,7 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: def fill_gaps( self, dim: Hashable, + *, use_coordinate: bool | Hashable = True, limit: ( None diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9c5c1e39daa..596b14f4a13 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6881,6 +6881,7 @@ def bfill(self, dim: Hashable, limit: int | None = None) -> Self: def fill_gaps( self, dim: Hashable, + *, use_coordinate: bool | Hashable = True, limit: ( None From 84fe728f10ecf2e8025c18a6bc492a5c3df3d042 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sat, 24 Aug 2024 22:16:23 +0200 
Subject: [PATCH 12/23] Keep_Attrs: Default to True --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 596b14f4a13..01596290389 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6733,7 +6733,7 @@ def interpolate_na( from xarray.core.missing import _apply_over_vars_with_dim, interp_na if keep_attrs is None: - keep_attrs = _get_keep_attrs(default=False) + keep_attrs = _get_keep_attrs(default=True) new = _apply_over_vars_with_dim( interp_na, From 3bbd6da8520f7b7df145b6ca01f6def16d4461c2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 24 Aug 2024 20:17:16 +0000 Subject: [PATCH 13/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 01596290389..8a5d4c0fcae 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6696,7 +6696,7 @@ def interpolate_na( ... }, ... coords={"x": [0, 1, 2, 3, 4]}, ... ) - + >>> ds Size: 200B Dimensions: (x: 5) @@ -7000,7 +7000,7 @@ def fill_gaps( ... }, ... coords={"x": [0, 1, 2, 3, 4, 5, 6]}, ... ) - + >>> ds Size: 168B Dimensions: (x: 7) From 3ec34bf1d24ec3cbfd4ea14ae414533b72b1f58b Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sun, 25 Aug 2024 16:06:41 +0200 Subject: [PATCH 14/23] Explicitly add fill functions in GapMask object - Allows for more flexibility, e.g. 
optional dim arguments - Better static typing --- xarray/core/missing.py | 132 ++++++++++++++++++++++++++++------- xarray/tests/test_missing.py | 19 ++++- 2 files changed, 124 insertions(+), 27 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 3cf7d303962..1c46facce1f 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -529,35 +529,117 @@ def _interp_na_all( class GapMask: """An object that allows for flexible masking of gaps.""" - def __init__(self, content: DataArray | Dataset, mask: np.ndarray): + def __init__(self, content: DataArray | Dataset, mask: np.ndarray, dim: Hashable): self.content = content self.mask = mask - - def _fill_method(name: str, _fill_function: Callable | None = None): - def method(self, *args, _fill_function=_fill_function, **kwargs): - if _fill_function is None: - _fill_function = getattr(self.content, name) - filled = _fill_function(*args, **kwargs) - else: - filled = _fill_function(self.content, *args, **kwargs) - - if self.mask is not None: - filled = filled.where(~self.mask, other=self.content) - return filled - - method.__name__ = name - method.__doc__ = _FILL_MISSING_DOCSTRING_TEMPLATE.format(name=name) - return method - - ffill = _fill_method("ffill") - bfill = _fill_method("bfill") - fillna = _fill_method("fillna") - interpolate_na = _fill_method("interpolate_na") - + self.dim=dim + + def _apply_mask(self, filled): + if self.mask is not None: + filled = filled.where(~self.mask, other=self.content) + return filled + + def ffill(self, dim: Hashable | None = None): + """Partly fill missing values in this object's data by applying ffill to all unmasked values. + + Parameters + ---------- + dim : Hashable or None, default None + Dimension along which to fill missing values. If None, the dimension used to create the mask is used. + + Returns + ------- + filled : same type as caller + New object with ffill applied to all unmasked values. 
+ + See Also + -------- + DataArray.ffill + Dataset.ffill + """ + if dim is None: + dim = self.dim + return self._apply_mask(self.content.ffill(dim)) + + def bfill(self, dim: Hashable | None = None): + """Partly fill missing values in this object's data by applying bfill to all unmasked values. + + Parameters + ---------- + dim : Hashable or None, default None + Dimension along which to fill missing values. If None, the dimension used to create the mask is used. + + Returns + ------- + filled : same type as caller + New object with bfill applied to all unmasked values. + + See Also + -------- + DataArray.bfill + Dataset.bfill + """ + if dim is None: + dim = self.dim + return self._apply_mask(self.content.bfill(dim)) + + def fillna(self, value): + """Partly fill missing values in this object's data by applying fillna to all unmasked values. + + Parameters + ---------- + value : scalar, ndarray or DataArray + Used to fill all unmasked values. If the + argument is a DataArray, it is first aligned with (reindexed to) + this array. + + Returns + ------- + filled : same type as caller + New object with fillna applied to all unmasked values. + + See Also + -------- + DataArray.fillna + Dataset.fillna + """ + return self._apply_mask(self.content.fillna(value)) + + def interpolate_na( + self, + dim: Hashable | None = None, + method: InterpOptions = "linear", + use_coordinate: bool | str = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ): + """Partly fill missing values in this object's data by applying interpolate_na to all unmasked values. + + Parameters + ---------- + See DataArray.interpolate_na and Dataset.interpolate_na for explanation of parameters. + + Returns + ------- + filled : same type as caller + New object with interpolate_na applied to all unmasked values. 
+ + See Also + -------- + DataArray.interpolate_na + Dataset.interpolate_na + """ + if dim is None: + dim = self.dim + return self._apply_mask( + self.content.interpolate_na( + dim=dim, method=method, use_coordinate=use_coordinate, keep_attrs=keep_attrs, **kwargs + ) + ) def mask_gaps( self, - dim: Hashable | None = None, + dim: Hashable, use_coordinate: bool | str = True, limit: ( int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None @@ -578,7 +660,7 @@ def mask_gaps( max_gap, use_coordinate, ) - return GapMask(self, mask) + return GapMask(self, mask, dim) def interp_na( diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 371bf38e3c6..f9488d57b2d 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -1272,13 +1272,14 @@ def test_mask_gap_limit_2d(): dims=["x", "y"], coords={"x": np.arange(4) * 2}, ) - actual = da.fill_gaps( + mask = da.fill_gaps( dim="x", limit=3, limit_direction="forward", limit_area=None, use_coordinate=True, - ).interpolate_na( + ) + actual=mask.interpolate_na( "x", fill_value="extrapolate", method="linear", @@ -1292,6 +1293,20 @@ def test_mask_gap_limit_2d(): ] ) assert_equal(actual, expected) + # Test: Dim argument from mask should be used + actual=mask.interpolate_na( + fill_value="extrapolate", + method="linear", + ) + expected = da.copy( + data=[ + [1, 1, n, n, 1, 1], + [n, 2, 2, n, 2, 2], + [n, 3, 3, 3, 3, n], + [n, n, 4, 4, 4, 4], + ] + ) + assert_equal(actual, expected) def test_mask_gap_max_gap_2d(): From a64809ad5559951220b3a57bd862c63480811dff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 25 Aug 2024 14:09:09 +0000 Subject: [PATCH 15/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/missing.py | 19 ++++++++++++------- xarray/tests/test_missing.py | 4 ++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git 
a/xarray/core/missing.py b/xarray/core/missing.py index 1c46facce1f..4bc9148db09 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -532,7 +532,7 @@ class GapMask: def __init__(self, content: DataArray | Dataset, mask: np.ndarray, dim: Hashable): self.content = content self.mask = mask - self.dim=dim + self.dim = dim def _apply_mask(self, filled): if self.mask is not None: @@ -560,7 +560,7 @@ def ffill(self, dim: Hashable | None = None): if dim is None: dim = self.dim return self._apply_mask(self.content.ffill(dim)) - + def bfill(self, dim: Hashable | None = None): """Partly fill missing values in this object's data by applying bfill to all unmasked values. @@ -582,7 +582,7 @@ def bfill(self, dim: Hashable | None = None): if dim is None: dim = self.dim return self._apply_mask(self.content.bfill(dim)) - + def fillna(self, value): """Partly fill missing values in this object's data by applying fillna to all unmasked values. @@ -592,7 +592,7 @@ def fillna(self, value): Used to fill all unmasked values. If the argument is a DataArray, it is first aligned with (reindexed to) this array. - + Returns ------- filled : same type as caller @@ -604,7 +604,7 @@ def fillna(self, value): Dataset.fillna """ return self._apply_mask(self.content.fillna(value)) - + def interpolate_na( self, dim: Hashable | None = None, @@ -623,7 +623,7 @@ def interpolate_na( ------- filled : same type as caller New object with interpolate_na applied to all unmasked values. 
- + See Also -------- DataArray.interpolate_na @@ -633,10 +633,15 @@ def interpolate_na( dim = self.dim return self._apply_mask( self.content.interpolate_na( - dim=dim, method=method, use_coordinate=use_coordinate, keep_attrs=keep_attrs, **kwargs + dim=dim, + method=method, + use_coordinate=use_coordinate, + keep_attrs=keep_attrs, + **kwargs, ) ) + def mask_gaps( self, dim: Hashable, diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index f9488d57b2d..d25e1279e0b 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -1279,7 +1279,7 @@ def test_mask_gap_limit_2d(): limit_area=None, use_coordinate=True, ) - actual=mask.interpolate_na( + actual = mask.interpolate_na( "x", fill_value="extrapolate", method="linear", @@ -1294,7 +1294,7 @@ def test_mask_gap_limit_2d(): ) assert_equal(actual, expected) # Test: Dim argument from mask should be used - actual=mask.interpolate_na( + actual = mask.interpolate_na( fill_value="extrapolate", method="linear", ) From 92c6b2a1a96cb3c88bf79e079a09c5867504e27e Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:28:09 +0200 Subject: [PATCH 16/23] Add type hints to most arguments, return types --- xarray/core/dataarray.py | 33 ++------- xarray/core/dataset.py | 33 ++------- xarray/core/missing.py | 148 ++++++++++++++++++++------------------- xarray/core/types.py | 2 + 4 files changed, 89 insertions(+), 127 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 030a29f36c9..cd4232e432f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -116,6 +116,7 @@ SideOptions, T_ChunkDimFreq, T_ChunksFreq, + T_GapLength, T_Xarray, ) from xarray.core.weighted import DataArrayWeighted @@ -3483,15 +3484,7 @@ def interpolate_na( method: InterpOptions = "linear", limit: int | None = None, use_coordinate: bool | Hashable = True, - max_gap: ( - None - | int - | float - | str - | pd.Timedelta - | 
np.timedelta64 - | datetime.timedelta - ) = None, + max_gap: T_GapLength | None = None, keep_attrs: bool | None = None, **kwargs: Any, ) -> Self: @@ -3775,27 +3768,11 @@ def fill_gaps( dim: Hashable, *, use_coordinate: bool | Hashable = True, - limit: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, + limit: T_GapLength | None = None, limit_direction: LimitDirectionOptions = "both", limit_area: LimitAreaOptions | None = None, - max_gap: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, - ) -> GapMask: + max_gap: T_GapLength | None = None, + ) -> GapMask[DataArray]: """Fill in gaps in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 8a5d4c0fcae..23da59afc3a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -167,6 +167,7 @@ SideOptions, T_ChunkDimFreq, T_DatasetPadConstantValues, + T_GapLength, T_Xarray, ) from xarray.core.weighted import DatasetWeighted @@ -6597,15 +6598,7 @@ def interpolate_na( method: InterpOptions = "linear", limit: int | None = None, use_coordinate: bool | Hashable = True, - max_gap: ( - int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - | None - ) = None, + max_gap: T_GapLength | None = None, keep_attrs: bool | None = None, **kwargs: Any, ) -> Self: @@ -6883,27 +6876,11 @@ def fill_gaps( dim: Hashable, *, use_coordinate: bool | Hashable = True, - limit: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, + limit: T_GapLength | None = None, limit_direction: LimitDirectionOptions = "both", limit_area: LimitAreaOptions | None = None, - max_gap: ( - None - | int - | float - | str - | pd.Timedelta - | np.timedelta64 - | datetime.timedelta - ) = None, - ) -> 
GapMask: + max_gap: T_GapLength | None = None, + ) -> GapMask[Dataset]: """Fill in gaps in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 4bc9148db09..438f00e7c6a 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -1,11 +1,10 @@ from __future__ import annotations -import datetime as dt import warnings from collections.abc import Callable, Hashable, Sequence from functools import partial from numbers import Number -from typing import TYPE_CHECKING, Any, get_args +from typing import TYPE_CHECKING, Any, Generic, get_args import numpy as np import pandas as pd @@ -24,6 +23,8 @@ InterpOptions, LimitAreaOptions, LimitDirectionOptions, + T_GapLength, + T_Xarray, ) from xarray.core.utils import OrderedSet, is_scalar from xarray.core.variable import Variable, broadcast_variables @@ -31,8 +32,7 @@ from xarray.namedarray.pycompat import is_chunked_array if TYPE_CHECKING: - from xarray.core.dataarray import DataArray - from xarray.core.dataset import Dataset + pass _FILL_MISSING_DOCSTRING_TEMPLATE = """\ @@ -51,8 +51,8 @@ def _get_gap_left_edge( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False -): + obj: T_Xarray, dim: Hashable, index: Variable, outside=False +) -> T_Xarray: left = index.where(~obj.isnull()).ffill(dim).transpose(*obj.dims) if outside: return left.fillna(index[0]) @@ -60,8 +60,8 @@ def _get_gap_left_edge( def _get_gap_right_edge( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, outside=False -): + obj: T_Xarray, dim: Hashable, index: Variable, outside=False +) -> T_Xarray: right = index.where(~obj.isnull()).bfill(dim).transpose(*obj.dims) if outside: return right.fillna(index[-1]) @@ -69,24 +69,24 @@ def _get_gap_right_edge( def _get_gap_dist_to_left_edge( - obj: Dataset | DataArray | Variable, dim: 
Hashable, index: Variable -): + obj: T_Xarray, dim: Hashable, index: Variable +) -> T_Xarray: return (index - _get_gap_left_edge(obj, dim, index)).transpose(*obj.dims) def _get_gap_dist_to_right_edge( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable -): + obj: T_Xarray, dim: Hashable, index: Variable +) -> T_Xarray: return (_get_gap_right_edge(obj, dim, index) - index).transpose(*obj.dims) def _get_limit_fill_mask( - obj: Dataset | DataArray | Variable, + obj: T_Xarray, dim: Hashable, index: Variable, - limit, - limit_direction, -): + limit: T_GapLength, + limit_direction: LimitDirectionOptions, +) -> T_Xarray: # At the left boundary, distance to left is nan. # For nan, a<=b and ~(a>b) behave differently if limit_direction == "forward": @@ -105,8 +105,8 @@ def _get_limit_fill_mask( def _get_limit_area_mask( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable, limit_area -): + obj: T_Xarray, dim: Hashable, index: Variable, limit_area +) -> T_Xarray: if limit_area == "inside": area_mask = ( _get_gap_left_edge(obj, dim, index).isnull() @@ -125,9 +125,7 @@ def _get_limit_area_mask( return area_mask -def _get_nan_block_lengths( - obj: Dataset | DataArray | Variable, dim: Hashable, index: Variable -): +def _get_nan_block_lengths(obj: T_Xarray, dim: Hashable, index: Variable) -> T_Xarray: """ Return an object where each NaN element in 'obj' is replaced by the length of the gap the element is in. 
@@ -138,25 +136,22 @@ def _get_nan_block_lengths( def _get_max_gap_mask( - obj: Dataset | DataArray | Variable, - dim: Hashable, - index: Variable, - max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta, -): + obj: T_Xarray, dim: Hashable, index: Variable, max_gap: T_GapLength +) -> T_Xarray: nan_block_lengths = _get_nan_block_lengths(obj, dim, index) return nan_block_lengths > max_gap def _get_gap_mask( - obj: Dataset | DataArray | Variable, + obj: T_Xarray, dim: Hashable, - limit=None, - limit_direction="both", - limit_area=None, + limit: T_GapLength | None = None, + limit_direction: LimitDirectionOptions = "both", + limit_area: LimitAreaOptions = None, limit_use_coordinate=False, - max_gap=None, + max_gap: T_GapLength = None, max_gap_use_coordinate=False, -): +) -> T_Xarray: # Input checking ##Limit if not is_scalar(limit): @@ -387,7 +382,9 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs): return ds -def _get_raw_interp_index(arr, dim: Hashable, use_coordinate: bool | Hashable = True): +def _get_raw_interp_index( + arr: T_Xarray, dim: Hashable, use_coordinate: bool | Hashable = True +) -> pd.Index: """Return index to use for x values in interpolation or curve fitting. In comparison to get_clean_interp_index, this function does not convert to numeric values.""" @@ -419,8 +416,11 @@ def _get_raw_interp_index(arr, dim: Hashable, use_coordinate: bool | Hashable = def get_clean_interp_index( - arr, dim: Hashable, use_coordinate: bool | Hashable = True, strict: bool = True -): + arr: T_Xarray, + dim: Hashable, + use_coordinate: bool | Hashable = True, + strict: bool = True, +) -> Variable: """Return index to use for x values in interpolation or curve fitting. 
Parameters @@ -487,22 +487,22 @@ def get_clean_interp_index( return index -def _is_time_index(index): +def _is_time_index(index) -> bool: from xarray.coding.cftimeindex import CFTimeIndex return isinstance(index, pd.DatetimeIndex | CFTimeIndex) def _interp_na_all( - self, - dim: Hashable | None = None, + obj: T_Xarray, + dim: Hashable, method: InterpOptions = "linear", use_coordinate: bool | str = True, keep_attrs: bool | None = None, **kwargs, -): +) -> T_Xarray: """Interpolate all nan values, without restrictions regarding the gap size.""" - index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate) + index = get_clean_interp_index(obj, dim, use_coordinate=use_coordinate) interp_class, kwargs = _get_interpolator(method, **kwargs) interpolator = partial(func_interpolate_na, interp_class, **kwargs) @@ -514,32 +514,37 @@ def _interp_na_all( warnings.filterwarnings("ignore", "invalid value", RuntimeWarning) arr = apply_ufunc( interpolator, - self, + obj, index.values, input_core_dims=[[dim], [dim]], output_core_dims=[[dim]], - output_dtypes=[self.dtype], + output_dtypes=[obj.dtype], dask="parallelized", vectorize=True, keep_attrs=keep_attrs, - ).transpose(*self.dims) + ).transpose(*obj.dims) return arr -class GapMask: +class GapMask(Generic[T_Xarray]): + content: T_Xarray + mask: np.ndarray + dim: Hashable + """An object that allows for flexible masking of gaps.""" - def __init__(self, content: DataArray | Dataset, mask: np.ndarray, dim: Hashable): + def __init__(self, content: T_Xarray, mask: np.ndarray, dim: Hashable) -> None: self.content = content self.mask = mask self.dim = dim + self.dim = dim - def _apply_mask(self, filled): + def _apply_mask(self, filled: T_Xarray) -> T_Xarray: if self.mask is not None: filled = filled.where(~self.mask, other=self.content) return filled - def ffill(self, dim: Hashable | None = None): + def ffill(self, dim: Hashable | None = None) -> T_Xarray: """Partly fill missing values in this object's data by applying ffill 
to all unmasked values. Parameters @@ -561,7 +566,7 @@ def ffill(self, dim: Hashable | None = None): dim = self.dim return self._apply_mask(self.content.ffill(dim)) - def bfill(self, dim: Hashable | None = None): + def bfill(self, dim: Hashable | None = None) -> T_Xarray: """Partly fill missing values in this object's data by applying bfill to all unmasked values. Parameters @@ -583,7 +588,7 @@ def bfill(self, dim: Hashable | None = None): dim = self.dim return self._apply_mask(self.content.bfill(dim)) - def fillna(self, value): + def fillna(self, value) -> T_Xarray: """Partly fill missing values in this object's data by applying fillna to all unmasked values. Parameters @@ -593,6 +598,7 @@ def fillna(self, value): argument is a DataArray, it is first aligned with (reindexed to) this array. + Returns ------- filled : same type as caller @@ -605,6 +611,7 @@ def fillna(self, value): """ return self._apply_mask(self.content.fillna(value)) + def interpolate_na( self, dim: Hashable | None = None, @@ -612,7 +619,7 @@ def interpolate_na( use_coordinate: bool | str = True, keep_attrs: bool | None = None, **kwargs: Any, - ): + ) -> T_Xarray: """Partly fill missing values in this object's data by applying interpolate_na to all unmasked values. Parameters @@ -624,6 +631,7 @@ def interpolate_na( filled : same type as caller New object with interpolate_na applied to all unmasked values. 
+ See Also -------- DataArray.interpolate_na @@ -638,25 +646,29 @@ def interpolate_na( use_coordinate=use_coordinate, keep_attrs=keep_attrs, **kwargs, + dim=dim, + method=method, + use_coordinate=use_coordinate, + keep_attrs=keep_attrs, + **kwargs, ) ) + def mask_gaps( - self, + obj: T_Xarray, dim: Hashable, use_coordinate: bool | str = True, - limit: ( - int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None - ) = None, + limit: T_GapLength | None = None, limit_direction: LimitDirectionOptions = "both", limit_area: LimitAreaOptions | None = None, - max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, -) -> GapMask: + max_gap: T_GapLength | None = None, +) -> GapMask[T_Xarray]: """Mask continuous gaps in the data, providing functionality to control gap length and offsets""" mask = _get_gap_mask( - self, + obj, dim, limit, limit_direction, @@ -665,26 +677,20 @@ def mask_gaps( max_gap, use_coordinate, ) - return GapMask(self, mask, dim) + return GapMask(obj, mask, dim) def interp_na( - self, - dim: Hashable | None = None, + obj: T_Xarray, + dim: Hashable, method: InterpOptions = "linear", use_coordinate: bool | str = True, - limit: ( - int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta | None - ) = None, - max_gap: int | float | str | pd.Timedelta | np.timedelta64 | dt.timedelta = None, + limit: T_GapLength | None = None, + max_gap: T_GapLength | None = None, keep_attrs: bool | None = None, **kwargs, -): +) -> T_Xarray: """Interpolate values according to different methods.""" - # Preprocess arguments and do consistency checks - if dim is None: - raise NotImplementedError("dim is a required argument") - # This was the original behaviour of interp_na and is kept for backward compatibility # Limit=None: Fill everything, including both boundaries # Limit!=None: Do forward interpolation until limit @@ -695,7 +701,7 @@ def interp_na( limit_direction = "forward" limit_area = None mask = _get_gap_mask( - self, + 
obj, dim, limit, limit_direction, @@ -705,7 +711,7 @@ def interp_na( use_coordinate, ) - arr = _interp_na_all(self, dim, method, use_coordinate, keep_attrs, **kwargs) + arr = _interp_na_all(obj, dim, method, use_coordinate, keep_attrs, **kwargs) if mask is not None: arr = arr.where(~mask) return arr diff --git a/xarray/core/types.py b/xarray/core/types.py index 68cc9e07149..029a96a66bd 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -100,6 +100,7 @@ DatetimeLike: TypeAlias = ( pd.Timestamp | datetime.datetime | np.datetime64 | CFTimeDatetime ) +TimedeltaLike: TypeAlias = pd.Timedelta | datetime.timedelta | np.timedelta64 class Alignable(Protocol): @@ -220,6 +221,7 @@ def copy( ] InterpolantOptions = Literal["barycentric", "krogh", "pchip", "spline", "akima"] InterpOptions = Union[Interp1dOptions, InterpolantOptions] +T_GapLength = Union[int, float, str, TimedeltaLike] LimitDirectionOptions = Literal["forward", "backward", "both"] LimitAreaOptions = Literal["inside", "outside"] From 5452df1df2edbc30677e4878410f0e52149f8c04 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 25 Aug 2024 15:30:49 +0000 Subject: [PATCH 17/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/missing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 438f00e7c6a..29f73b965f6 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -611,7 +611,6 @@ def fillna(self, value) -> T_Xarray: """ return self._apply_mask(self.content.fillna(value)) - def interpolate_na( self, dim: Hashable | None = None, @@ -655,7 +654,6 @@ def interpolate_na( ) - def mask_gaps( obj: T_Xarray, dim: Hashable, From 2f53449aa9b183ad56217e31b959d40c12b42e1e Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:46:44 +0200 Subject: [PATCH 
18/23] Fix accidental double pasting of arguments --- xarray/core/missing.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 29f73b965f6..1fdd5d2c7c6 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -645,11 +645,6 @@ def interpolate_na( use_coordinate=use_coordinate, keep_attrs=keep_attrs, **kwargs, - dim=dim, - method=method, - use_coordinate=use_coordinate, - keep_attrs=keep_attrs, - **kwargs, ) ) From 3696d634e09c4b491d7f99e8a1c7c106cc04c8a5 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sun, 25 Aug 2024 18:50:18 +0200 Subject: [PATCH 19/23] Fix more mypy errors --- xarray/core/missing.py | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 1fdd5d2c7c6..76681f2d487 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -84,7 +84,7 @@ def _get_limit_fill_mask( obj: T_Xarray, dim: Hashable, index: Variable, - limit: T_GapLength, + limit: int | float | np.number, limit_direction: LimitDirectionOptions, ) -> T_Xarray: # At the left boundary, distance to left is nan. 
@@ -136,7 +136,7 @@ def _get_nan_block_lengths(obj: T_Xarray, dim: Hashable, index: Variable) -> T_X def _get_max_gap_mask( - obj: T_Xarray, dim: Hashable, index: Variable, max_gap: T_GapLength + obj: T_Xarray, dim: Hashable, index: Variable, max_gap: int | float | np.number ) -> T_Xarray: nan_block_lengths = _get_nan_block_lengths(obj, dim, index) return nan_block_lengths > max_gap @@ -147,11 +147,11 @@ def _get_gap_mask( dim: Hashable, limit: T_GapLength | None = None, limit_direction: LimitDirectionOptions = "both", - limit_area: LimitAreaOptions = None, + limit_area: LimitAreaOptions | None = None, limit_use_coordinate=False, - max_gap: T_GapLength = None, + max_gap: T_GapLength | None = None, max_gap_use_coordinate=False, -) -> T_Xarray: +) -> T_Xarray | None: # Input checking ##Limit if not is_scalar(limit): @@ -169,22 +169,25 @@ def _get_gap_mask( limit = timedelta_to_numeric(limit) ## Max_gap - if max_gap is not None: - if not is_scalar(max_gap): - raise ValueError("max_gap must be a scalar.") - - if _is_time_index(_get_raw_interp_index(obj, dim, max_gap_use_coordinate)): - max_gap = timedelta_to_numeric(max_gap) + if not is_scalar(max_gap): + raise ValueError("max_gap must be a scalar.") + if max_gap is None: + max_gap = np.inf + else: if not max_gap_use_coordinate: if not isinstance(max_gap, Number | np.number): raise TypeError( f"Expected integer or floating point max_gap since use_coordinate=False. Received {type(max_gap).__name__}." ) + + if _is_time_index(_get_raw_interp_index(obj, dim, max_gap_use_coordinate)): + max_gap = timedelta_to_numeric(max_gap) + # Which masks are really needed? 
need_limit_mask = limit != np.inf or limit_direction != "both" need_area_mask = limit_area is not None - need_max_gap_mask = max_gap is not None + need_max_gap_mask = max_gap != np.inf # Calculate indexes if need_limit_mask or need_area_mask: index_limit = get_clean_interp_index( @@ -497,7 +500,7 @@ def _interp_na_all( obj: T_Xarray, dim: Hashable, method: InterpOptions = "linear", - use_coordinate: bool | str = True, + use_coordinate: bool | Hashable = True, keep_attrs: bool | None = None, **kwargs, ) -> T_Xarray: @@ -528,12 +531,12 @@ def _interp_na_all( class GapMask(Generic[T_Xarray]): content: T_Xarray - mask: np.ndarray + mask: T_Xarray | None dim: Hashable """An object that allows for flexible masking of gaps.""" - def __init__(self, content: T_Xarray, mask: np.ndarray, dim: Hashable) -> None: + def __init__(self, content: T_Xarray, mask: T_Xarray | None, dim: Hashable) -> None: self.content = content self.mask = mask self.dim = dim @@ -615,7 +618,7 @@ def interpolate_na( self, dim: Hashable | None = None, method: InterpOptions = "linear", - use_coordinate: bool | str = True, + use_coordinate: bool | Hashable = True, keep_attrs: bool | None = None, **kwargs: Any, ) -> T_Xarray: @@ -652,7 +655,7 @@ def interpolate_na( def mask_gaps( obj: T_Xarray, dim: Hashable, - use_coordinate: bool | str = True, + use_coordinate: bool | Hashable = True, limit: T_GapLength | None = None, limit_direction: LimitDirectionOptions = "both", limit_area: LimitAreaOptions | None = None, @@ -677,7 +680,7 @@ def interp_na( obj: T_Xarray, dim: Hashable, method: InterpOptions = "linear", - use_coordinate: bool | str = True, + use_coordinate: bool | Hashable = True, limit: T_GapLength | None = None, max_gap: T_GapLength | None = None, keep_attrs: bool | None = None, @@ -688,10 +691,7 @@ def interp_na( # Limit=None: Fill everything, including both boundaries # Limit!=None: Do forward interpolation until limit limit_use_coordinate = False - if limit is None: - limit_direction = "both" - 
else: - limit_direction = "forward" + limit_direction: LimitDirectionOptions = "both" if limit is None else "forward" limit_area = None mask = _get_gap_mask( obj, From d5d56ae3dd88e625b3ce1ac1d8941401f37aefe4 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Sun, 25 Aug 2024 19:07:09 +0200 Subject: [PATCH 20/23] Bottleneck is required for limit functionality --- xarray/core/dataarray.py | 2 ++ xarray/core/dataset.py | 2 ++ xarray/tests/test_missing.py | 13 +++++++++++++ 3 files changed, 17 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index cd4232e432f..ef58616b426 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3776,6 +3776,8 @@ def fill_gaps( """Fill in gaps in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. + *Requires bottleneck.* + Parameters ---------- dim : Hashable diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 23da59afc3a..6ff6026243e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6884,6 +6884,8 @@ def fill_gaps( """Fill in gaps in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. 
+ *Requires bottleneck.* + Parameters ---------- dim : Hashable diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index d25e1279e0b..8135e9b65c2 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -366,6 +366,7 @@ def test_interpolate_limits(): assert_equal(actual, expected) +@requires_bottleneck def test_interpolate_double_coordinate(): # Check if max_gap is able to handle string coordinate names # Limit is always refering to an index @@ -633,6 +634,7 @@ def test_bfill_dataset(ds): ds.ffill(dim="time") +@requires_bottleneck def test_get_gap_left_edge(): n = np.nan arr = [ @@ -664,6 +666,7 @@ def test_get_gap_left_edge(): ) +@requires_bottleneck def test_get_gap_right_edge(): n = np.nan arr = [ @@ -695,6 +698,7 @@ def test_get_gap_right_edge(): ) +@requires_bottleneck def test_get_gap_dist_to_left_edge(): n = np.nan arr = [ @@ -718,6 +722,7 @@ def test_get_gap_dist_to_left_edge(): expected = da.copy(data=[[n, 0, 3, 4, 5, 6, 8, 10, 0], [n, n, n, 0, 1, 2, 0, 2, 4]]) +@requires_bottleneck def test_get_gap_dist_to_right_edge(): n = np.nan arr = [ @@ -768,6 +773,7 @@ def test_get_nan_block_lengths(y, lengths_expected): assert_equal(actual, expected) +@requires_bottleneck def test_get_nan_block_lengths_2d(): n = np.nan da = xr.DataArray( @@ -804,6 +810,7 @@ def test_get_nan_block_lengths_2d(): assert_equal(actual, expected_y) +@requires_bottleneck def test_get_limit_fill_mask(): T = True F = False @@ -856,6 +863,7 @@ def test_get_limit_fill_mask(): assert_equal(actual, expected) +@requires_bottleneck def test_get_area_mask(): T = True F = False @@ -1064,6 +1072,7 @@ def test_interpolate_na_max_gap_2d(coords): assert_equal(actual, expected_x) +@requires_bottleneck def test_interpolate_na_limit_2d(): n = np.nan times = pd.date_range("2000-01-01", periods=12, freq="3h") @@ -1113,6 +1122,7 @@ def test_interpolators_complex_out_of_bounds(): ####Masking Functionality +@requires_bottleneck def test_fill_gaps_limit(): n = np.nan 
times = pd.date_range("2000-01-01", periods=8, freq="2h") @@ -1149,6 +1159,7 @@ def test_fill_gaps_limit(): assert_equal(actual, expected) +@requires_bottleneck def test_mask_gap_limit_2d(): n = np.nan times = pd.date_range("2000-01-01", periods=12, freq="3h") @@ -1309,6 +1320,7 @@ def test_mask_gap_limit_2d(): assert_equal(actual, expected) +@requires_bottleneck def test_mask_gap_max_gap_2d(): n = np.nan times = pd.date_range("2000-01-01", periods=12, freq="3h") @@ -1358,6 +1370,7 @@ def test_mask_gap_max_gap_2d(): assert_equal(actual, expected) +@requires_bottleneck def test_mask_double_coordinate(): # Check if limit and max_gap are able to handle string coordinate names n = np.nan From 7570b62d9d3b1200173ba9512a577a4f70bd577f Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Mon, 26 Aug 2024 10:21:43 +0200 Subject: [PATCH 21/23] Docs: Require numbagg or bottleneck for ffill/bfill/fill_gaps --- xarray/core/dataarray.py | 6 +++--- xarray/core/dataset.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index ef58616b426..43552469fcb 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3598,7 +3598,7 @@ def interpolate_na( def ffill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values forward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -3682,7 +3682,7 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: def bfill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values backward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -3776,7 +3776,7 @@ def fill_gaps( """Fill in gaps in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. 
- *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6ff6026243e..38b8351f91c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6745,7 +6745,7 @@ def interpolate_na( def ffill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values forward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -6809,7 +6809,7 @@ def ffill(self, dim: Hashable, limit: int | None = None) -> Self: def bfill(self, dim: Hashable, limit: int | None = None) -> Self: """Fill NaN values by propagating values backward - *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- @@ -6881,10 +6881,10 @@ def fill_gaps( limit_area: LimitAreaOptions | None = None, max_gap: T_GapLength | None = None, ) -> GapMask[Dataset]: - """Fill in gaps in the data using one of several filling methods. + """Fill in gaps (consecutive missing values) in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. 
- *Requires bottleneck.* + *Requires numbagg or bottleneck.* Parameters ---------- From f19f626795512711f468a2fb898b12842491fbe9 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Mon, 26 Aug 2024 14:14:32 +0200 Subject: [PATCH 22/23] Rework index conversion to have consistent typing --- xarray/core/dataarray.py | 2 +- xarray/core/missing.py | 36 +++++++++++++++++----------------- xarray/tests/test_dataarray.py | 8 +++++--- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 43552469fcb..d3757df9bf1 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3773,7 +3773,7 @@ def fill_gaps( limit_area: LimitAreaOptions | None = None, max_gap: T_GapLength | None = None, ) -> GapMask[DataArray]: - """Fill in gaps in the data using one of several filling methods. + """Fill in gaps (consecutive missing values) in the data using one of several filling methods. Allows for fine control on how far to extend the valid data into the gaps and the maximum size of the gaps to fill. *Requires numbagg or bottleneck.* diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 76681f2d487..c361ddb0791 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -470,24 +470,24 @@ def get_clean_interp_index( if isinstance(index, CFTimeIndex | pd.DatetimeIndex): offset = type(index[0])(1970, 1, 1) if isinstance(index, CFTimeIndex): - index = index.values - index = Variable( - data=datetime_to_numeric(index, offset=offset, datetime_unit="ns"), - dims=(dim,), - ) - - # raise if index cannot be cast to a float (e.g. MultiIndex) - try: - index = index.values.astype(np.float64) - except (TypeError, ValueError): - # pandas raises a TypeError - # xarray/numpy raise a ValueError - raise TypeError( - f"Index {index.name!r} must be castable to float64 to support " - f"interpolation or curve fitting, got {type(index).__name__}." 
- ) - index = Variable([dim], index) - return index + values = datetime_to_numeric( + index.values, offset=offset, datetime_unit="ns" + ) + else: + values = datetime_to_numeric(index, offset=offset, datetime_unit="ns") + else: # if numeric or standard calendar index: try to cast to float + try: + values = index.values.astype(np.float64) + # raise if index cannot be cast to a float (e.g. MultiIndex) + except (TypeError, ValueError): + # pandas raises a TypeError + # xarray/numpy raise a ValueError + raise TypeError( + f"Index {index.name!r} must be castable to float64 to support " + f"interpolation or curve fitting, got {type(index).__name__}." + ) + var = Variable([dim], values) + return var def _is_time_index(index) -> bool: diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 9feab73d3d1..da9d4b2bbd8 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4250,11 +4250,13 @@ def test_rank(self) -> None: def test_polyfit(self, use_dask, use_datetime) -> None: if use_dask and not has_dask: pytest.skip("requires dask") - xcoord = xr.DataArray( + da_times = xr.DataArray( pd.date_range("1970-01-01", freq="D", periods=10), dims=("x",), name="x" ) - x = xr.core.missing.get_clean_interp_index(xcoord, "x") - if not use_datetime: + x = xr.core.missing.get_clean_interp_index(da_times, "x").values + if use_datetime: + xcoord = da_times.values + else: xcoord = x da_raw = DataArray( From b5025ffc00710217fea55f0a00ed3335c22554a0 Mon Sep 17 00:00:00 2001 From: Paul Ockenfuss <42680748+Ockenfuss@users.noreply.github.com> Date: Wed, 2 Oct 2024 17:30:02 +0200 Subject: [PATCH 23/23] Add new method to api.rst --- doc/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 6ed8d513934..3ff8b8bea35 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -167,6 +167,7 @@ Missing value handling Dataset.fillna Dataset.ffill Dataset.bfill + Dataset.fill_gaps Dataset.interpolate_na Dataset.where 
Dataset.isin @@ -357,6 +358,7 @@ Missing value handling DataArray.fillna DataArray.ffill DataArray.bfill + DataArray.fill_gaps DataArray.interpolate_na DataArray.where DataArray.isin