Skip to content

Commit

Permalink
fix(core): Add regex and keep to extractlevel (#53)
Browse files Browse the repository at this point in the history
* fix(core): Add regex and keep to extractlevel

* Update CHANGELOG

* Fix tests and ignore unnamed capture groups
  • Loading branch information
coroa authored Apr 9, 2024
1 parent 7e5ea57 commit 104e756
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 37 deletions.
14 changes: 13 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,19 @@
Changelog
=========

* Update :func:`~core.projectlevel` to raise ``KeyError`` for wrong level names.
v0.5.0 (2024-04-09)
------------------------------------------------------------
* **BREAKING**: Change :func:`~core.extractlevel` to drop split levels by default and
accordingly rename the governing argument from ``drop=False`` to ``keep=False``
:pull:`53`.
* Add ``regex=True`` argument to :func:`~core.extractlevel` to interpret templates as
hand-written regular expressions with named capture groups, e.g.
``df.pix.extract(variable=r"Emissions\|(?P<gas>.*?)(?:\|(?P<sector>.*?))?",
regex=True)`` will also split ``Emissions|CO2`` to ``gas = "CO2"`` and
``sector = NaN``, while ``df.pix.extract(variable="Emissions|{gas}|{sector}")`` would
have dropped it.
* Update :func:`~core.projectlevel` to raise ``KeyError`` for wrong level names
:pull:`52`.

v0.4.2 (2024-04-03)
------------------------------------------------------------
Expand Down
27 changes: 25 additions & 2 deletions src/pandas_indexing/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
>>> df.pix.multiply(other, how="left")
"""

import warnings
from typing import Any, Callable, Dict, Literal, Mapping, Optional, Sequence, Union

import pandas as pd
Expand Down Expand Up @@ -71,9 +72,31 @@ def assign(

@doc(extractlevel, index_or_data="")
def extract(
    self,
    template: Optional[str] = None,
    *,
    keep: bool = False,
    dropna: bool = True,
    regex: bool = False,
    axis: Axis = 0,
    drop: Optional[bool] = None,
    **templates: str,
) -> Union[DataFrame, Series, Index]:
    # NOTE(review): deliberately no docstring here; the ``doc`` decorator
    # presumably builds it from ``extractlevel`` -- confirm before adding one.

    # Backwards compatibility: ``drop`` is the deprecated inverse of ``keep``.
    if drop is not None:
        # stacklevel=2 attributes the warning to the caller of ``extract``
        # rather than to this accessor module.
        warnings.warn(
            "Argument `drop` is deprecated (use `keep` instead)",
            DeprecationWarning,
            stacklevel=2,
        )
        keep = not drop

    # Delegate to the core implementation on the wrapped object.
    return extractlevel(
        self._obj,
        template,
        keep=keep,
        dropna=dropna,
        regex=regex,
        axis=axis,
        **templates,
    )

@doc(formatlevel, index_or_data="")
def format(
Expand Down
88 changes: 68 additions & 20 deletions src/pandas_indexing/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import re
import warnings
from functools import reduce
from itertools import chain, product
from operator import and_, or_
Expand Down Expand Up @@ -671,7 +672,11 @@ def antijoin(


def _extractlevel(
index: Index, template: Optional[str] = None, drop: bool = False, **templates: str
index: Index,
template: Optional[str] = None,
keep: bool = False,
regex: bool = False,
**templates: str,
) -> Tuple[Index, List[str]]:
index = ensure_multiindex(index)
all_identifiers = set()
Expand All @@ -682,27 +687,33 @@ def _extractlevel(
templates[index.names[0]] = template

for dim, template in templates.items():
identifiers = re.findall(r"\{([a-zA-Z_]+)\}", template)
all_identifiers.update(identifiers)
if dim not in index.names:
raise ValueError(f"{dim} not a dimension of index: {index.names}")

levelnum = index.names.index(dim)
labels = index.levels[levelnum]
codes = index.codes[levelnum]

regex_pattern = reduce(
lambda s, ident: s.replace(rf"\{{{ident}\}}", rf"(?P<{ident}>.*?)"),
identifiers,
re.escape(template),
)
components = labels.str.extract(f"^{regex_pattern}$", expand=True)
if regex:
regex_pattern = re.compile(f"^{template}$")
identifiers = list(regex_pattern.groupindex)
else:
identifiers = re.findall(r"\{([a-zA-Z_]+)\}", template)
regex_pattern = reduce(
lambda s, ident: s.replace(rf"\{{{ident}\}}", rf"(?P<{ident}>.*?)"),
identifiers,
re.escape(template),
)
regex_pattern = re.compile(f"^{regex_pattern}$")

components = labels.str.extract(regex_pattern, expand=True)

all_identifiers.update(identifiers)
index = assignlevel(
index, **{ident: components[ident].values[codes] for ident in identifiers}
)

if drop:
if not keep:
index = index.droplevel(list(set(templates) - all_identifiers))

return index, list(all_identifiers)
Expand All @@ -718,8 +729,10 @@ def extractlevel(
index_or_data: T,
template: Optional[str] = None,
*,
drop: bool = False,
keep: bool = False,
dropna: bool = True,
regex: bool = False,
drop: Optional[bool] = None,
axis: Axis = 0,
**templates: str,
) -> T:
Expand All @@ -736,12 +749,17 @@ def extractlevel(
{index_or_data}
template : str, optional
Extraction template for a single level
drop : bool, default False
keep : bool, default False
Whether to keep the split dimension
dropna : bool, default True
Whether to drop entries for which all extracted levels are missing (no match)
regex : bool, default False
Whether templates are given as regular expressions
(regexes must use named captures)
axis : {{0, 1, "index", "columns"}}, default 0
Axis of DataFrame to extract from
drop : bool, optional
Deprecated argument, use keep instead
**templates : str
Templates for splitting one or multiple levels
Expand All @@ -759,9 +777,12 @@ def extractlevel(
Examples
--------
>>> s = Series(
... range(3),
... range(4),
... MultiIndex.from_arrays(
... [["SE|Elec|Bio", "SE|Elec|Coal", "PE|Coal"], ["GWh", "GWh", "EJ"]],
... [
... ["SE|Elec|Bio", "SE|Elec|Coal", "PE|Coal", "SE|Elec"],
... ["GWh", "GWh", "EJ", "GWh"],
... ],
... names=["variable", "unit"],
... ),
... )
Expand All @@ -770,22 +791,37 @@ def extractlevel(
SE|Elec|Bio GWh 0
SE|Elec|Coal GWh 1
PE|Coal EJ 2
SE|Elec GWh 3
dtype: int64
>>> extractlevel(s, variable="SE|{{type}}|{{fuel}}")
>>> extractlevel(s, variable="SE|{{type}}|{{fuel}}", keep=True)
variable unit type fuel
SE|Elec|Bio GWh Elec Bio 0
SE|Elec|Coal GWh Elec Coal 1
dtype: int64
>>> extractlevel(s, variable="SE|{{type}}|{{fuel}}", dropna=False)
>>> extractlevel(s, variable="SE|{{type}}|{{fuel}}")
unit type fuel
GWh Elec Bio 0
GWh Elec Coal 1
dtype: int64
>>> extractlevel(s, variable="SE|{{type}}|{{fuel}}", keep=True, dropna=False)
variable unit type fuel
SE|Elec|Bio GWh Elec Bio 0
SE|Elec|Coal GWh Elec Coal 1
PE|Coal EJ NaN NaN 2
SE|Elec GWh NaN NaN 3
dtype: int64
>>> extractlevel(s, variable=r"SE\\|(?P<type>.*?)(?:\\|(?P<fuel>.*?))?", regex=True)
unit type fuel
GWh Elec Bio 0
GWh Elec Coal 1
GWh Elec NaN 3
dtype: int64
>>> s = Series(range(3), ["SE|Elec|Bio", "SE|Elec|Coal", "PE|Coal"])
>>> extractlevel(s, "SE|{{type}}|{{fuel}}", drop=True)
>>> extractlevel(s, "SE|{{type}}|{{fuel}}")
type fuel
Elec Bio 0
Coal 1
Expand All @@ -794,19 +830,31 @@ def extractlevel(
See also
--------
formatlevel
.. versionchanged:: 0.5.0
*drop* replaced by *keep* and default changed to not keep.
*regex* added.
"""
if drop is not None:
warnings.warn(
"Argument `drop` is deprecated (use `keep` instead)", DeprecationWarning
)
keep = not drop

if isinstance(index_or_data, Index):
index_or_data, identifiers = _extractlevel(
index_or_data, template, drop, **templates
index_or_data, template, keep=keep, regex=regex, **templates
)
else:
index, identifiers = _extractlevel(
get_axis(index_or_data, axis), template, drop, **templates
get_axis(index_or_data, axis), template, keep=keep, regex=regex, **templates
)
index_or_data = index_or_data.set_axis(index, axis=axis)

if dropna:
index_or_data = dropnalevel(index_or_data, subset=identifiers, axis=axis)
index_or_data = dropnalevel(
index_or_data, subset=identifiers, how="all", axis=axis
)

return index_or_data

Expand Down
24 changes: 24 additions & 0 deletions tests/test_accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,30 @@ def test_aggregate(mdf):
)


def test_extract():
    """Exercise ``pix.extract`` with its defaults and with the deprecated ``drop``."""
    source = MultiIndex.from_arrays(
        [["e|foo", "e|bar", "bar"], [1, 2, 3]], names=["var", "num"]
    )
    pattern = "{e}|{typ}"

    # Default call: the split level ``var`` is gone and the non-matching
    # entry ("bar", 3) has been removed.
    without_var = MultiIndex.from_arrays(
        [[1, 2], ["e", "e"], ["foo", "bar"]],
        names=["num", "e", "typ"],
    )
    assert_index_equal(source.pix.extract(var=pattern), without_var)

    # drop=False: deprecated spelling; must warn and retain the ``var`` level.
    with_var = MultiIndex.from_arrays(
        [["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]],
        names=["var", "num", "e", "typ"],
    )
    with pytest.warns(DeprecationWarning):
        assert_index_equal(source.pix.extract(var=pattern, drop=False), with_var)


def test_add_zeros_like(mdf):
reference = MultiIndex.from_arrays(
[["foo", "foo", "bar", "baz"], [1, 2, 3, 4], ["a", "b", "c", "d"]],
Expand Down
42 changes: 28 additions & 14 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,8 @@ def test_extractlevel(mdf, mseries, midx):
mseries = mseries.set_axis(midx)

expected_idx = MultiIndex.from_arrays(
[["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]],
names=["var", "num", "e", "typ"],
[[1, 2], ["e", "e"], ["foo", "bar"]],
names=["num", "e", "typ"],
)

assert_index_equal(extractlevel(midx, var="{e}|{typ}"), expected_idx)
Expand All @@ -248,32 +248,27 @@ def test_extractlevel_options(mdf):
)
mdf_t = mdf.T.set_axis(midx, axis=1)

# drop=True
# keep=True
assert_index_equal(
extractlevel(midx, var="{e}|{typ}", drop=True),
extractlevel(midx, var="{e}|{typ}", keep=True),
MultiIndex.from_arrays(
[[1, 2], ["e", "e"], ["foo", "bar"]],
names=["num", "e", "typ"],
[["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]],
names=["var", "num", "e", "typ"],
),
)

# dropna=False
assert_index_equal(
extractlevel(midx, var="{e}|{typ}", dropna=False),
MultiIndex.from_arrays(
[
["e|foo", "e|bar", "bar"],
[1, 2, 3],
["e", "e", nan],
["foo", "bar", nan],
],
names=["var", "num", "e", "typ"],
[[1, 2, 3], ["e", "e", nan], ["foo", "bar", nan]],
names=["num", "e", "typ"],
),
)

# axis=1
assert_frame_equal(
extractlevel(mdf_t, var="{e}|{typ}", drop=True, axis=1),
extractlevel(mdf_t, var="{e}|{typ}", axis=1),
mdf_t.iloc[:, [0, 1]].set_axis(
MultiIndex.from_arrays(
[[1, 2], ["e", "e"], ["foo", "bar"]],
Expand All @@ -283,6 +278,25 @@ def test_extractlevel_options(mdf):
),
)

# regex
assert_index_equal(
extractlevel(midx, var=r"((?P<e>.*?)\|)?(?P<typ>.*?)", regex=True),
MultiIndex.from_arrays(
[[1, 2, 3], ["e", "e", nan], ["foo", "bar", "bar"]],
names=["num", "e", "typ"],
),
)

# drop=False (deprecated, equivalent to keep=True)
with pytest.warns(DeprecationWarning):
assert_index_equal(
extractlevel(midx, var="{e}|{typ}", drop=False),
MultiIndex.from_arrays(
[["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]],
names=["var", "num", "e", "typ"],
),
)

with pytest.raises(ValueError):
# mdf does not have the var level
extractlevel(mdf, var="{e}|{typ}")
Expand Down

0 comments on commit 104e756

Please sign in to comment.