fix(core): Add regex and keep to extractlevel (#53)

* fix(core): Add regex and keep to extractlevel
* Update CHANGELOG
* Fix tests and ignore unnamed capture groups
coroa · Apr 9, 2024 · 104e756 · 104e756
1 parent 7e5ea57
commit 104e756
Show file tree

Hide file tree

Showing 5 changed files with 158 additions and 37 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -3,7 +3,19 @@
 Changelog
 =========
 
-* Update :func:`~core.projectlevel` to raise ``KeyError`` for wrong level names.
+v0.5.0 (2024-04-09)
+------------------------------------------------------------
+* **BREAKING**: Change :func:`~core.extractlevel` to drop split levels by default and
+  accordingly rename the governing argument from ``drop=False`` to ``keep=False``
+  :pull:`53`.
+* Add ``regex=True`` argument to :func:`~core.extractlevel` to use templates as
+  manual extraction regex, e.g.
+  ``df.pix.extract(variable=r"Emissions\|(?P<gas>.*?)(?:\|(?P<sector>.*?))?",
+  regex=True)`` will also split ``Emissions|CO2`` to ``gas = "CO2"`` and
+  ``sector = NaN``, while ``df.pix.extract(variable="Emissions|{gas}|{sector}")`` would
+  have dropped it.
+* Update :func:`~core.projectlevel` to raise ``KeyError`` for wrong level names
+  :pull:`52`.
 
 v0.4.2 (2024-04-03)
 ------------------------------------------------------------

diff --git a/src/pandas_indexing/accessors.py b/src/pandas_indexing/accessors.py
@@ -10,6 +10,7 @@
 >>> df.pix.multiply(other, how="left")
 """
 
+import warnings
 from typing import Any, Callable, Dict, Literal, Mapping, Optional, Sequence, Union
 
 import pandas as pd
@@ -71,9 +72,31 @@ def assign(
 
     @doc(extractlevel, index_or_data="")
     def extract(
-        self, template: Optional[str] = None, *, axis: Axis = 0, **templates: str
+        self,
+        template: Optional[str] = None,
+        *,
+        keep: bool = False,
+        dropna: bool = True,
+        regex: bool = False,
+        axis: Axis = 0,
+        drop: Optional[bool] = None,
+        **templates: str,
     ) -> Union[DataFrame, Series, Index]:
-        return extractlevel(self._obj, template, axis=axis, **templates)
+        if drop is not None:
+            warnings.warn(
+                "Argument `drop` is deprecated (use `keep` instead)", DeprecationWarning
+            )
+            keep = not drop
+
+        return extractlevel(
+            self._obj,
+            template,
+            keep=keep,
+            dropna=dropna,
+            regex=regex,
+            axis=axis,
+            **templates,
+        )
 
     @doc(formatlevel, index_or_data="")
     def format(

diff --git a/src/pandas_indexing/core.py b/src/pandas_indexing/core.py
@@ -3,6 +3,7 @@
 """
 
 import re
+import warnings
 from functools import reduce
 from itertools import chain, product
 from operator import and_, or_
@@ -671,7 +672,11 @@ def antijoin(
 
 
 def _extractlevel(
-    index: Index, template: Optional[str] = None, drop: bool = False, **templates: str
+    index: Index,
+    template: Optional[str] = None,
+    keep: bool = False,
+    regex: bool = False,
+    **templates: str,
 ) -> Tuple[Index, List[str]]:
     index = ensure_multiindex(index)
     all_identifiers = set()
@@ -682,27 +687,33 @@ def _extractlevel(
         templates[index.names[0]] = template
 
     for dim, template in templates.items():
-        identifiers = re.findall(r"\{([a-zA-Z_]+)\}", template)
-        all_identifiers.update(identifiers)
         if dim not in index.names:
             raise ValueError(f"{dim} not a dimension of index: {index.names}")
 
         levelnum = index.names.index(dim)
         labels = index.levels[levelnum]
         codes = index.codes[levelnum]
 
-        regex_pattern = reduce(
-            lambda s, ident: s.replace(rf"\{{{ident}\}}", rf"(?P<{ident}>.*?)"),
-            identifiers,
-            re.escape(template),
-        )
-        components = labels.str.extract(f"^{regex_pattern}$", expand=True)
+        if regex:
+            regex_pattern = re.compile(f"^{template}$")
+            identifiers = list(regex_pattern.groupindex)
+        else:
+            identifiers = re.findall(r"\{([a-zA-Z_]+)\}", template)
+            regex_pattern = reduce(
+                lambda s, ident: s.replace(rf"\{{{ident}\}}", rf"(?P<{ident}>.*?)"),
+                identifiers,
+                re.escape(template),
+            )
+            regex_pattern = re.compile(f"^{regex_pattern}$")
+
+        components = labels.str.extract(regex_pattern, expand=True)
 
+        all_identifiers.update(identifiers)
         index = assignlevel(
             index, **{ident: components[ident].values[codes] for ident in identifiers}
         )
 
-    if drop:
+    if not keep:
         index = index.droplevel(list(set(templates) - all_identifiers))
 
     return index, list(all_identifiers)
@@ -718,8 +729,10 @@ def extractlevel(
     index_or_data: T,
     template: Optional[str] = None,
     *,
-    drop: bool = False,
+    keep: bool = False,
     dropna: bool = True,
+    regex: bool = False,
+    drop: Optional[bool] = None,
     axis: Axis = 0,
     **templates: str,
 ) -> T:
@@ -736,12 +749,17 @@ def extractlevel(
     {index_or_data}
     template : str, optional
         Extraction template for a single level
-    drop : bool, default False
+    keep : bool, default False
         Whether to keep the split dimension
     dropna : bool, default True
         Whether to drop the non-matching levels
+    regex : bool, default False
+        Whether templates are given as regular expressions
+        (regexes must use named captures)
     axis : {{0, 1, "index", "columns"}}, default 0
         Axis of DataFrame to extract from
+    drop : bool, optional
+        Deprecated argument, use keep instead
     **templates : str
         Templates for splitting one or multiple levels
 
@@ -759,9 +777,12 @@ def extractlevel(
     Examples
     --------
     >>> s = Series(
-    ...     range(3),
+    ...     range(4),
     ...     MultiIndex.from_arrays(
-    ...         [["SE|Elec|Bio", "SE|Elec|Coal", "PE|Coal"], ["GWh", "GWh", "EJ"]],
+    ...         [
+    ...             ["SE|Elec|Bio", "SE|Elec|Coal", "PE|Coal", "SE|Elec"],
+    ...             ["GWh", "GWh", "EJ", "GWh"],
+    ...         ],
     ...         names=["variable", "unit"],
     ...     ),
     ... )
@@ -770,22 +791,37 @@ def extractlevel(
     SE|Elec|Bio   GWh     0
     SE|Elec|Coal  GWh     1
     PE|Coal       EJ      2
+    SE|Elec       GWh     3
     dtype: int64
-    >>> extractlevel(s, variable="SE|{{type}}|{{fuel}}")
+    >>> extractlevel(s, variable="SE|{{type}}|{{fuel}}", keep=True)
     variable      unit  type  fuel
     SE|Elec|Bio   GWh   Elec  Bio     0
     SE|Elec|Coal  GWh   Elec  Coal    1
     dtype: int64
 
-    >>> extractlevel(s, variable="SE|{{type}}|{{fuel}}", dropna=False)
+    >>> extractlevel(s, variable="SE|{{type}}|{{fuel}}")
+    unit  type  fuel
+    GWh   Elec  Bio     0
+    GWh   Elec  Coal    1
+    dtype: int64
+
+    >>> extractlevel(s, variable="SE|{{type}}|{{fuel}}", keep=True, dropna=False)
     variable      unit  type  fuel
     SE|Elec|Bio   GWh   Elec  Bio     0
     SE|Elec|Coal  GWh   Elec  Coal    1
     PE|Coal       EJ    NaN   NaN     2
+    SE|Elec       GWh   NaN   NaN     3
+    dtype: int64
+
+    >>> extractlevel(s, variable=r"SE\\|(?P<type>.*?)(?:\\|(?P<fuel>.*?))?", regex=True)
+    unit  type  fuel
+    GWh   Elec  Bio     0
+    GWh   Elec  Coal    1
+    GWh   Elec  NaN     3
     dtype: int64
 
     >>> s = Series(range(3), ["SE|Elec|Bio", "SE|Elec|Coal", "PE|Coal"])
-    >>> extractlevel(s, "SE|{{type}}|{{fuel}}", drop=True)
+    >>> extractlevel(s, "SE|{{type}}|{{fuel}}")
     type  fuel
     Elec  Bio     0
           Coal    1
@@ -794,19 +830,31 @@ def extractlevel(
     See also
     --------
     formatlevel
+
+    .. versionchanged:: 0.5.0
+        *drop* replaced by *keep* and default changed to not keep.
+        *regex* added.
     """
+    if drop is not None:
+        warnings.warn(
+            "Argument `drop` is deprecated (use `keep` instead)", DeprecationWarning
+        )
+        keep = not drop
+
     if isinstance(index_or_data, Index):
         index_or_data, identifiers = _extractlevel(
-            index_or_data, template, drop, **templates
+            index_or_data, template, keep=keep, regex=regex, **templates
         )
     else:
         index, identifiers = _extractlevel(
-            get_axis(index_or_data, axis), template, drop, **templates
+            get_axis(index_or_data, axis), template, keep=keep, regex=regex, **templates
         )
         index_or_data = index_or_data.set_axis(index, axis=axis)
 
     if dropna:
-        index_or_data = dropnalevel(index_or_data, subset=identifiers, axis=axis)
+        index_or_data = dropnalevel(
+            index_or_data, subset=identifiers, how="all", axis=axis
+        )
 
     return index_or_data
 

diff --git a/tests/test_accessors.py b/tests/test_accessors.py
@@ -142,6 +142,30 @@ def test_aggregate(mdf):
     )
 
 
+def test_extract():
+    midx = MultiIndex.from_arrays(
+        [["e|foo", "e|bar", "bar"], [1, 2, 3]], names=["var", "num"]
+    )
+
+    assert_index_equal(
+        midx.pix.extract(var="{e}|{typ}"),
+        MultiIndex.from_arrays(
+            [[1, 2], ["e", "e"], ["foo", "bar"]],
+            names=["num", "e", "typ"],
+        ),
+    )
+
+    # drop=False
+    with pytest.warns(DeprecationWarning):
+        assert_index_equal(
+            midx.pix.extract(var="{e}|{typ}", drop=False),
+            MultiIndex.from_arrays(
+                [["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]],
+                names=["var", "num", "e", "typ"],
+            ),
+        )
+
+
 def test_add_zeros_like(mdf):
     reference = MultiIndex.from_arrays(
         [["foo", "foo", "bar", "baz"], [1, 2, 3, 4], ["a", "b", "c", "d"]],

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -226,8 +226,8 @@ def test_extractlevel(mdf, mseries, midx):
     mseries = mseries.set_axis(midx)
 
     expected_idx = MultiIndex.from_arrays(
-        [["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]],
-        names=["var", "num", "e", "typ"],
+        [[1, 2], ["e", "e"], ["foo", "bar"]],
+        names=["num", "e", "typ"],
     )
 
     assert_index_equal(extractlevel(midx, var="{e}|{typ}"), expected_idx)
@@ -248,32 +248,27 @@ def test_extractlevel_options(mdf):
     )
     mdf_t = mdf.T.set_axis(midx, axis=1)
 
-    # drop=True
+    # keep=True
     assert_index_equal(
-        extractlevel(midx, var="{e}|{typ}", drop=True),
+        extractlevel(midx, var="{e}|{typ}", keep=True),
         MultiIndex.from_arrays(
-            [[1, 2], ["e", "e"], ["foo", "bar"]],
-            names=["num", "e", "typ"],
+            [["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]],
+            names=["var", "num", "e", "typ"],
         ),
     )
 
     # dropna=False
     assert_index_equal(
         extractlevel(midx, var="{e}|{typ}", dropna=False),
         MultiIndex.from_arrays(
-            [
-                ["e|foo", "e|bar", "bar"],
-                [1, 2, 3],
-                ["e", "e", nan],
-                ["foo", "bar", nan],
-            ],
-            names=["var", "num", "e", "typ"],
+            [[1, 2, 3], ["e", "e", nan], ["foo", "bar", nan]],
+            names=["num", "e", "typ"],
         ),
     )
 
     # axis=1
     assert_frame_equal(
-        extractlevel(mdf_t, var="{e}|{typ}", drop=True, axis=1),
+        extractlevel(mdf_t, var="{e}|{typ}", axis=1),
         mdf_t.iloc[:, [0, 1]].set_axis(
             MultiIndex.from_arrays(
                 [[1, 2], ["e", "e"], ["foo", "bar"]],
@@ -283,6 +278,25 @@ def test_extractlevel_options(mdf):
         ),
     )
 
+    # regex
+    assert_index_equal(
+        extractlevel(midx, var=r"((?P<e>.*?)\|)?(?P<typ>.*?)", regex=True),
+        MultiIndex.from_arrays(
+            [[1, 2, 3], ["e", "e", nan], ["foo", "bar", "bar"]],
+            names=["num", "e", "typ"],
+        ),
+    )
+
+    # drop=False (deprecated alias for keep=True)
+    with pytest.warns(DeprecationWarning):
+        assert_index_equal(
+            extractlevel(midx, var="{e}|{typ}", drop=False),
+            MultiIndex.from_arrays(
+                [["e|foo", "e|bar"], [1, 2], ["e", "e"], ["foo", "bar"]],
+                names=["var", "num", "e", "typ"],
+            ),
+        )
+
     with pytest.raises(ValueError):
         # mdf does not have the var level
         extractlevel(mdf, var="{e}|{typ}")