From 81944eeff58df11f49f68226f5683ff7c74b6ecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20H=C3=B6rsch?= Date: Wed, 3 Apr 2024 23:58:06 +0200 Subject: [PATCH] feat(core): Add add_zeros_like function (#51) * feat(core): Add add_zeros_like function * Update CHANGELOG --- CHANGELOG.rst | 4 +++ requirements.txt | 1 - src/pandas_indexing/accessors.py | 11 ++++++ src/pandas_indexing/core.py | 61 ++++++++++++++++++++++++++++++++ tests/test_accessors.py | 13 +++++++ tests/test_core.py | 42 ++++++++++++++++++++++ 6 files changed, 131 insertions(+), 1 deletion(-) delete mode 100644 requirements.txt diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f23ab6e..2c13440 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,6 +3,10 @@ Changelog ========= +v0.4.2 (2024-04-03) +------------------------------------------------------------ +* Add :func:`~core.add_zeros_like` for adding explicit `levels` as 0 values :pull:`51` + v0.4.1 (2023-03-20) ------------------------------------------------------------ * Add :func:`~core.antijoin` for performing anti-joins :pull:`48` diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9c558e3..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -. diff --git a/src/pandas_indexing/accessors.py b/src/pandas_indexing/accessors.py index d186b76..884d6ff 100644 --- a/src/pandas_indexing/accessors.py +++ b/src/pandas_indexing/accessors.py @@ -19,6 +19,7 @@ from . 
import arithmetics from .core import ( + add_zeros_like, aggregatelevel, antijoin, assignlevel, @@ -208,6 +209,16 @@ def aggregate( self._obj, agg_func=agg_func, axis=axis, dropna=dropna, mode=mode, **levels ) + @doc(add_zeros_like, data="") + def add_zeros_like( + self, + reference: Union[MultiIndex, DataFrame, Series], + /, + derive: Optional[Dict[str, MultiIndex]] = None, + **levels: Sequence[str], + ): + return add_zeros_like(self._obj, reference=reference, derive=derive, **levels) + def _create_forward_binop(op): def forward_binop( diff --git a/src/pandas_indexing/core.py b/src/pandas_indexing/core.py index 70f3be2..dca511e 100644 --- a/src/pandas_indexing/core.py +++ b/src/pandas_indexing/core.py @@ -1075,3 +1075,64 @@ def has_any_label(index: MultiIndex, level: str, labels: Sequence[Any]): raise ValueError( f'mode must be "replace", "append" or "return", but is "{mode}"' ) + + +@doc( + data=""" + data : Data + Series or DataFrame to extend with zeros\ + """ +) +def add_zeros_like( + data: T, + reference: Union[MultiIndex, DataFrame, Series], + *, + derive: Optional[Dict[str, MultiIndex]] = None, + **levels: Sequence[str], +) -> T: + """Add explicit `levels` to `data` as 0 values. + + Remaining levels in `data` not found in `levels` or `derive` are taken from + `reference` (or its index). 
+ + Parameters + ----------\ + {data} + reference : MultiIndex, DataFrame or Series + expected level labels (like model, scenario combinations); a DataFrame or Series contributes its index + derive : dict, optional + derive labels in a level from a multiindex with allowed combinations + **levels : [str] + which labels should be added to `data` + + Returns + ------- + DataFrame + unsorted data with additional zero data + """ + + if any(len(labels) == 0 for labels in levels.values()): + return data + + if isinstance(reference, (Series, DataFrame)): + reference = reference.index + + if derive is None: + derive = {} + + target_levels = data.index.names + index = reference.pix.unique( + target_levels.difference(levels.keys()).difference(derive.keys()) + ) + + zero_index = concat( + reduce( + lambda ind, d: ind.join(d, how="left"), + derive.values(), + index.pix.assign(**dict(zip(levels.keys(), labels))), + ).reorder_levels(target_levels) + for labels in product(*levels.values()) + ) + zero_index = antijoin(zero_index, data.index) + + return concat([data, pd.DataFrame(0, index=zero_index, columns=data.columns)]) diff --git a/tests/test_accessors.py b/tests/test_accessors.py index 28c1e8e..0c9e98d 100644 --- a/tests/test_accessors.py +++ b/tests/test_accessors.py @@ -9,6 +9,8 @@ from pandas import DataFrame, Index, MultiIndex from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal +import pandas_indexing # noqa: F401 + def test_assign_index(midx: MultiIndex): """ @@ -138,3 +140,14 @@ def test_aggregate(mdf): MultiIndex.from_tuples([("bar", 3), ("foo", "new")], names=["str", "num"]), ), ) + + +def test_add_zeros_like(mdf): + reference = MultiIndex.from_arrays( + [["foo", "foo", "bar", "baz"], [1, 2, 3, 4], ["a", "b", "c", "d"]], + names=["str", "num", "new"], + ) + assert_frame_equal( + mdf.pix.add_zeros_like(reference), + mdf.reindex(reference.droplevel("new"), fill_value=0), + ) diff --git a/tests/test_core.py b/tests/test_core.py index 4050fce..6b3cd82 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -13,6 +13,7 
@@ from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal from pandas_indexing.core import ( + add_zeros_like, aggregatelevel, antijoin, assignlevel, @@ -612,3 +613,44 @@ def test_aggregatelevel(mdf): with pytest.raises(ValueError): aggregatelevel(mdf, num=dict(new=[1, 2]), mode="bla") + + +def test_add_zeros_like(mdf): + reference = MultiIndex.from_arrays( + [["foo", "foo", "bar", "baz"], [1, 2, 3, 4], ["a", "b", "c", "d"]], + names=["str", "num", "new"], + ) + assert_frame_equal( + add_zeros_like(mdf, reference), + mdf.reindex(reference.droplevel("new"), fill_value=0), + ) + + assert_frame_equal( + add_zeros_like(mdf, Series(0, reference)), + mdf.reindex(reference.droplevel("new"), fill_value=0), + ) + + assert_frame_equal(add_zeros_like(mdf, reference, blub=[]), mdf) + + missing = MultiIndex.from_arrays( + [["bar", "baz", "foo", "baz"], [2, 2, 3, 3]], names=["str", "num"] + ) + assert_frame_equal( + add_zeros_like(mdf, reference, num=[2, 3]), + mdf.reindex(mdf.index.append(missing), fill_value=0), + ) + + def add_first(df): + index = df if isinstance(df, Index) else df.index + return assignlevel(df, first=projectlevel(index, "str").str[:1]) + + mdf_w_first = add_first(mdf) + assert_frame_equal( + add_zeros_like( + mdf_w_first, + reference, + num=[2, 3], + derive=dict(first=add_first(Index(["foo", "bar", "baz"], name="str"))), + ), + mdf_w_first.reindex(mdf_w_first.index.append(add_first(missing)), fill_value=0), + )