diff --git a/docs/source/conf.py b/docs/source/conf.py index c49f083..308422f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -55,6 +55,9 @@ "member-order": "bysource", } +autodoc_class_signature = "separated" +autodoc_default_options = {"exclude-members": "__new__"} + autosectionlabel_prefix_document = True toc_object_entries_show_parents = "all" diff --git a/docs/source/examples.md b/docs/source/examples.md index 50790ac..37be5b4 100644 --- a/docs/source/examples.md +++ b/docs/source/examples.md @@ -11,7 +11,6 @@ Some examples how to use pydiverse.transform: * [Best practices / beware the flatfile & embrace working with entities](/examples/best_practices_entities) ```{toctree} -/quickstart /examples/joining /examples/aggregations /examples/window_functions diff --git a/docs/source/examples/aggregations.md b/docs/source/examples/aggregations.md index c7d5104..5d4185e 100644 --- a/docs/source/examples/aggregations.md +++ b/docs/source/examples/aggregations.md @@ -8,8 +8,8 @@ from pydiverse.transform.extended import * tbl1 = pdt.Table(dict(a=[1, 1, 2], b=[4, 5, 6])) -tbl1 >> summarize(sum_a=sum(a), sum_b=sum(b)) >> show() -tbl1 >> group_by(tbl1.a) >> summarize(sum_b=sum(b)) >> show() +tbl1 >> summarize(sum_a=a.sum(), sum_b=b.sum()) >> show() +tbl1 >> group_by(tbl1.a) >> summarize(sum_b=b.sum()) >> show() ``` Typical aggregation functions are `sum()`, `mean()`, `count()`, `min()`, `max()`, `any()`, and `all()`. diff --git a/docs/source/reference/api.rst b/docs/source/reference/api.rst index 74e37de..a3ade8d 100644 --- a/docs/source/reference/api.rst +++ b/docs/source/reference/api.rst @@ -8,10 +8,27 @@ API verbs operators/index targets + types + + +.. currentmodule:: pydiverse.transform Table ----- -.. currentmodule:: pydiverse.transform .. autoclass:: Table - :noindex: + +ColExpr +------- + +.. autoclass:: ColExpr + :members: dtype + :exclude-members: __new__, __init__ + +Col +--- + +.. 
autoclass:: Col + :no-index: + :members: export + :exclude-members: __new__, __init__ diff --git a/docs/source/reference/operators/aggregation.rst b/docs/source/reference/operators/aggregation.rst index f79e182..dfd9e23 100644 --- a/docs/source/reference/operators/aggregation.rst +++ b/docs/source/reference/operators/aggregation.rst @@ -2,6 +2,16 @@ Aggregation =========== +Aggregation functions take a ``partition_by`` and ``filter`` keyword argument. The +``partition_by`` argument can only be given when used within ``mutate``. If a +``partition_by`` argument is given and there is a surrounding ``group_by`` / +``ungroup``, the ``group_by`` is ignored and the value of ``partition_by`` is used. + +.. warning:: + The ``filter`` argument works similar to ``Expr.filter`` in polars. But in contrast + to polars, if all values in a group are ``null`` or the group becomes empty after + filtering, the value of every aggregation function for that group is ``null``, too. + .. currentmodule:: pydiverse.transform.ColExpr .. autosummary:: :toctree: _generated/ diff --git a/docs/source/reference/operators/arithmetic.rst b/docs/source/reference/operators/arithmetic.rst index 9e965e6..ade1eb9 100644 --- a/docs/source/reference/operators/arithmetic.rst +++ b/docs/source/reference/operators/arithmetic.rst @@ -10,6 +10,7 @@ Arithmetic __add__ __floordiv__ + __mod__ __mul__ __neg__ __pos__ diff --git a/docs/source/reference/operators/conditional_logic.rst b/docs/source/reference/operators/conditional_logic.rst new file mode 100644 index 0000000..86dbdf2 --- /dev/null +++ b/docs/source/reference/operators/conditional_logic.rst @@ -0,0 +1,14 @@ +================= +Conditional Logic +================= + +.. currentmodule:: pydiverse.transform + +.. 
autosummary:: + :toctree: _generated/ + :template: autosummary/short_title.rst + :nosignatures: + + when + coalesce + ColExpr.map diff --git a/docs/source/reference/operators/index.rst b/docs/source/reference/operators/index.rst index 427411c..dba9af8 100644 --- a/docs/source/reference/operators/index.rst +++ b/docs/source/reference/operators/index.rst @@ -16,26 +16,109 @@ Column Operations window sorting_markers horizontal_aggregation + conditional_logic + type_conversion -.. currentmodule:: pydiverse.transform +Expression methods +------------------ -.. autoclass:: ColExpr - :no-index: - :members: dtype +.. currentmodule:: pydiverse.transform.ColExpr .. autosummary:: - :toctree: _generated/ - :template: autosummary/short_title.rst - :nosignatures: + :nosignatures: + + __add__ + __and__ + __eq__ + __floordiv__ + __ge__ + __gt__ + __invert__ + __le__ + __lt__ + __mod__ + __mul__ + __ne__ + __neg__ + __or__ + __pos__ + __pow__ + __sub__ + __truediv__ + __xor__ + abs + all + any + ascending + cast + ceil + count + dense_rank + descending + dt.day + dt.day_of_week + dt.day_of_year + dt.hour + dt.microsecond + dt.millisecond + dt.minute + dt.month + dt.second + dt.year + dur.days + dur.hours + dur.microseconds + dur.milliseconds + dur.minutes + dur.seconds + exp + fill_null + floor + is_in + is_inf + is_nan + is_not_inf + is_not_nan + is_not_null + is_null + log + map + max + mean + min + nulls_first + nulls_last + rank + round + shift + str.contains + str.ends_with + str.len + str.lower + str.replace_all + str.slice + str.starts_with + str.strip + str.to_date + str.to_datetime + str.upper + sum - lit - when +Global functions +---------------- + +.. currentmodule:: pydiverse.transform .. 
autosummary:: - :toctree: _generated/ - :template: autosummary/short_title.rst - :nosignatures: + :nosignatures: - ColExpr.cast - ColExpr.map + coalesce + count + dense_rank + lit + max + min + rank + row_number + when diff --git a/docs/source/reference/operators/type_conversion.rst b/docs/source/reference/operators/type_conversion.rst new file mode 100644 index 0000000..0d18bfc --- /dev/null +++ b/docs/source/reference/operators/type_conversion.rst @@ -0,0 +1,13 @@ +=============== +Type Conversion +=============== + +.. currentmodule:: pydiverse.transform + +.. autosummary:: + :toctree: _generated/ + :template: autosummary/short_title.rst + :nosignatures: + + lit + ColExpr.cast diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst new file mode 100644 index 0000000..97b69fc --- /dev/null +++ b/docs/source/reference/types.rst @@ -0,0 +1,28 @@ +===== +Types +===== + +.. currentmodule:: pydiverse.transform +.. autosummary:: + :toctree: _generated/ + :nosignatures: + :template: autosummary/short_title.rst + + Dtype + Bool + Date + Datetime + Decimal + Float + Float32 + Float64 + Int + Int8 + Int16 + Int32 + Int64 + String + Uint8 + Uint16 + Uint32 + Uint64 diff --git a/docs/source/reference/verbs.rst b/docs/source/reference/verbs.rst index a719298..9368bf9 100644 --- a/docs/source/reference/verbs.rst +++ b/docs/source/reference/verbs.rst @@ -17,6 +17,7 @@ Verbs filter full_join group_by + inner_join join left_join mutate diff --git a/generate_col_ops.py b/generate_col_ops.py index cfaa8f4..b516e20 100644 --- a/generate_col_ops.py +++ b/generate_col_ops.py @@ -15,6 +15,7 @@ COL_EXPR_PATH = "./src/pydiverse/transform/_internal/tree/col_expr.py" FNS_PATH = "./src/pydiverse/transform/_internal/pipe/functions.py" +API_DOCS_PATH = "./docs/source/reference/operators/index.rst" NAMESPACES = ["str", "dt", "dur"] @@ -78,7 +79,8 @@ def generate_fn_decl( } annotated_kwargs = "".join( - f", {kwarg}: {context_kwarg_annotation[kwarg]} | None = None" + f", 
{kwarg.name}: {context_kwarg_annotation[kwarg.name]}" + + f"{'' if kwarg.required else ' | None = None'}" for kwarg in op.context_kwargs ) @@ -116,7 +118,7 @@ def generate_fn_body( args = add_vararg_star(args) if op.context_kwargs is not None: - kwargs = "".join(f", {kwarg}={kwarg}" for kwarg in op.context_kwargs) + kwargs = "".join(f", {kwarg.name}={kwarg.name}" for kwarg in op.context_kwargs) else: kwargs = "" @@ -246,3 +248,55 @@ def indent(s: str, by: int) -> str: file.truncate() os.system(f"ruff format {FNS_PATH}") + +with open(API_DOCS_PATH, "r+") as file: + new_file_contents = "" + + for line in file: + new_file_contents += line + if line.startswith("Expression methods"): + new_file_contents += ( + "------------------\n\n" + ".. currentmodule:: pydiverse.transform.ColExpr\n\n" + ".. autosummary::\n" + " :nosignatures:\n\n " + ) + + new_file_contents += "\n ".join( + sorted( + [ + op.name + for op in ops.__dict__.values() + if isinstance(op, Operator) and op.generate_expr_method + ] + + ["rank", "dense_rank", "map", "cast"] + ) + ) + + new_file_contents += ( + "\n\nGlobal functions\n" + "----------------\n\n" + ".. currentmodule:: pydiverse.transform\n\n" + ".. 
autosummary::\n" + " :nosignatures:\n\n " + ) + + new_file_contents += ( + "\n ".join( + sorted( + [ + op.name + for op in ops.__dict__.values() + if isinstance(op, Operator) and not op.generate_expr_method + ] + + ["when", "lit"] + ) + ) + + "\n" + ) + + break + + file.seek(0) + file.write(new_file_contents) + file.truncate() diff --git a/src/pydiverse/transform/__init__.py b/src/pydiverse/transform/__init__.py index 0a76829..1c565cd 100644 --- a/src/pydiverse/transform/__init__.py +++ b/src/pydiverse/transform/__init__.py @@ -2,11 +2,10 @@ from ._internal.pipe.pipeable import verb from ._internal.pipe.table import Table -from ._internal.tree.col_expr import ColExpr +from ._internal.tree.col_expr import Col, ColExpr from .extended import * from .extended import __all__ as __extended from .types import * from .types import __all__ as __types -__all__ = ["Table", "ColExpr", "verb"] -# __all__ += __extended + __types +__all__ = ["Table", "ColExpr", "Col", "verb"] + __extended + __types diff --git a/src/pydiverse/transform/_internal/backend/polars.py b/src/pydiverse/transform/_internal/backend/polars.py index 73755dc..fc24095 100644 --- a/src/pydiverse/transform/_internal/backend/polars.py +++ b/src/pydiverse/transform/_internal/backend/polars.py @@ -71,14 +71,8 @@ def export( lf.name = nd.name return lf - raise AssertionError - - @staticmethod - def export_col(expr: ColExpr, target: Target) -> pl.Series: - if isinstance(target, Polars): - ... elif isinstance(target, Pandas): - ... 
+ return lf.collect().to_pandas(use_pyarrow_extension_array=True) raise AssertionError diff --git a/src/pydiverse/transform/_internal/backend/table_impl.py b/src/pydiverse/transform/_internal/backend/table_impl.py index fc5cdc9..be4e944 100644 --- a/src/pydiverse/transform/_internal/backend/table_impl.py +++ b/src/pydiverse/transform/_internal/backend/table_impl.py @@ -14,7 +14,7 @@ from pydiverse.transform._internal.ops import ops from pydiverse.transform._internal.ops.op import Ftype from pydiverse.transform._internal.tree.ast import AstNode -from pydiverse.transform._internal.tree.col_expr import Col, ColExpr +from pydiverse.transform._internal.tree.col_expr import Col from pydiverse.transform._internal.tree.types import Dtype try: @@ -75,7 +75,7 @@ def from_resource( if hasattr(resource, "name"): name = resource.name else: - name = "?" + name = "" if backend is None or isinstance(backend, Polars): from pydiverse.transform._internal.backend.polars import PolarsImpl @@ -120,9 +120,6 @@ def export( schema_overrides: dict[Col, Any], ) -> Any: ... - @classmethod - def export_col(cls, expr: ColExpr, target: Target): ... - @classmethod def get_impl(cls, op: Operator, sig: Sequence[Dtype]) -> Any: if (impl := cls.impl_store.get_impl(op, sig)) is not None: @@ -140,6 +137,15 @@ def get_impl(cls, op: Operator, sig: Sequence[Dtype]) -> Any: ) from err +def get_backend(nd: AstNode) -> type[TableImpl]: + from pydiverse.transform._internal.tree.verbs import Verb + + if isinstance(nd, Verb): + return get_backend(nd.child) + assert isinstance(nd, TableImpl) and nd is not TableImpl + return nd.__class__ + + with TableImpl.impl_store.impl_manager as impl: @impl(ops.add) diff --git a/src/pydiverse/transform/_internal/backend/targets.py b/src/pydiverse/transform/_internal/backend/targets.py index 7e1db3e..bf66519 100644 --- a/src/pydiverse/transform/_internal/backend/targets.py +++ b/src/pydiverse/transform/_internal/backend/targets.py @@ -11,7 +11,7 @@ class Target: ... 
class Polars(Target): - def __init__(self, *, lazy: bool = True) -> None: + def __init__(self, *, lazy: bool = False) -> None: self.lazy = lazy diff --git a/src/pydiverse/transform/_internal/ops/op.py b/src/pydiverse/transform/_internal/ops/op.py index 0030903..2013835 100644 --- a/src/pydiverse/transform/_internal/ops/op.py +++ b/src/pydiverse/transform/_internal/ops/op.py @@ -1,5 +1,6 @@ from __future__ import annotations +import dataclasses import enum from collections.abc import Sequence from typing import Any @@ -14,6 +15,12 @@ class Ftype(enum.IntEnum): WINDOW = 3 +@dataclasses.dataclass(slots=True) +class ContextKwarg: + name: str + required: bool + + class Operator: __slots__ = ( "name", @@ -31,7 +38,7 @@ class Operator: trie: SignatureTrie signatures: list[Signature] ftype: Ftype - context_kwargs: list[str] + context_kwargs: list[ContextKwarg] param_names: list[str] default_values: list[str] | None generate_expr_method: bool @@ -42,7 +49,7 @@ def __init__( name: str, *signatures: Signature, ftype: Ftype = Ftype.ELEMENT_WISE, - context_kwargs: list[str] | None = None, + context_kwargs: list[ContextKwarg] | None = None, param_names: list[str] | None = None, default_values: list[Any] | None = None, generate_expr_method: bool = True, diff --git a/src/pydiverse/transform/_internal/ops/ops/aggregation.py b/src/pydiverse/transform/_internal/ops/ops/aggregation.py index c7b53c0..4688aee 100644 --- a/src/pydiverse/transform/_internal/ops/ops/aggregation.py +++ b/src/pydiverse/transform/_internal/ops/ops/aggregation.py @@ -1,6 +1,6 @@ from __future__ import annotations -from pydiverse.transform._internal.ops.op import Ftype, Operator +from pydiverse.transform._internal.ops.op import ContextKwarg, Ftype, Operator from pydiverse.transform._internal.ops.signature import Signature from pydiverse.transform._internal.tree.types import ( COMPARABLE, @@ -25,7 +25,10 @@ def __init__( name, *signatures, ftype=Ftype.AGGREGATE, - context_kwargs=["partition_by", "filter"], + 
context_kwargs=[ + ContextKwarg("partition_by", False), + ContextKwarg("filter", False), + ], generate_expr_method=generate_expr_method, doc=doc, ) diff --git a/src/pydiverse/transform/_internal/ops/ops/arithmetic.py b/src/pydiverse/transform/_internal/ops/ops/arithmetic.py index 8c5ec74..e044af8 100644 --- a/src/pydiverse/transform/_internal/ops/ops/arithmetic.py +++ b/src/pydiverse/transform/_internal/ops/ops/arithmetic.py @@ -40,4 +40,83 @@ Signature(Decimal(), Decimal(), return_type=Decimal()), ) -floordiv = Operator("__floordiv__", Signature(Int(), Int(), return_type=Int())) +floordiv = Operator( + "__floordiv__", + Signature(Int(), Int(), return_type=Int()), + doc=""" +Integer division. + +Warning +------- +The behavior of this operator differs from polars and python. Polars and python +always round towards negative infinity, whereas pydiverse.transform always +rounds towards zero, regardless of the sign. This behavior matches the one of C, +C++ and all currently supported SQL backends. + +See also +-------- +__mod__ + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [65, -65, 65, -65], +... "b": [7, 7, -7, -7], +... } +... ) +>>> t >> mutate(r=t.a // t.b) >> export(Polars()) +shape: (4, 3) +┌─────┬─────┬─────┐ +│ a ┆ b ┆ r │ +│ --- ┆ --- ┆ --- │ +│ i64 ┆ i64 ┆ i64 │ +╞═════╪═════╪═════╡ +│ 65 ┆ 7 ┆ 9 │ +│ -65 ┆ 7 ┆ -9 │ +│ 65 ┆ -7 ┆ -9 │ +│ -65 ┆ -7 ┆ 9 │ +└─────┴─────┴─────┘ +""", +) + +mod = Operator( + "__mod__", + Signature(Int(), Int(), return_type=Int()), + doc=""" +Computes the remainder of integer division. + +Warning +------- +This operator behaves differently than in polars. There are at least two +conventions how `%` and :doc:`// ` +should behave for negative inputs. We follow the one that C, C++ and all +currently supported SQL backends follow. This means that the output has the same +sign as the left hand side of the input, regardless of the right hand side. 
+
+See also
+--------
+__floordiv__
+
+Examples
+--------
+>>> t = pdt.Table(
+...     {
+...         "a": [65, -65, 65, -65],
+...         "b": [7, 7, -7, -7],
+...     }
+... )
+>>> t >> mutate(r=t.a % t.b) >> export(Polars())
+shape: (4, 3)
+┌─────┬─────┬─────┐
+│ a   ┆ b   ┆ r   │
+│ --- ┆ --- ┆ --- │
+│ i64 ┆ i64 ┆ i64 │
+╞═════╪═════╪═════╡
+│ 65  ┆ 7   ┆ 2   │
+│ -65 ┆ 7   ┆ -2  │
+│ 65  ┆ -7  ┆ 2   │
+│ -65 ┆ -7  ┆ -2  │
+└─────┴─────┴─────┘
+""",
+)
diff --git a/src/pydiverse/transform/_internal/ops/ops/markers.py b/src/pydiverse/transform/_internal/ops/ops/markers.py
index 3a94478..d6dbb82 100644
--- a/src/pydiverse/transform/_internal/ops/ops/markers.py
+++ b/src/pydiverse/transform/_internal/ops/ops/markers.py
@@ -7,13 +7,55 @@ class Marker(Operator):
     def __init__(self, name: str, doc: str = ""):
-        super().__init__(name, Signature(D, return_type=D), doc=doc)
-
-
-nulls_first = Marker("nulls_first")
-
-nulls_last = Marker("nulls_last")
-
-ascending = Marker("ascending")
-
-descending = Marker("descending")
+        super().__init__(
+            name,
+            Signature(D, return_type=D),
+            doc=doc
+            + """
+Can only be used in expressions given to the `arrange` verb or as an
+`arrange` keyword argument.
+""",
+        )
+
+
+nulls_first = Marker(
+    "nulls_first",
+    doc="""
+Specifies that nulls are placed at the beginning of the ordering.
+
+This does not mean that nulls are considered to be `less` than any other
+element. I.e. if both `nulls_first` and `descending` are given, nulls will still
+be placed at the beginning.
+
+If neither `nulls_first` nor `nulls_last` is specified, the position of nulls is
+backend-dependent.
+""",
+)
+
+nulls_last = Marker(
+    "nulls_last",
+    doc="""
+Specifies that nulls are placed at the end of the ordering.
+
+This does not mean that nulls are considered to be `greater` than any other
+element. I.e. if both `nulls_last` and `descending` are given, nulls will still
+be placed at the end.
+
+If neither `nulls_first` nor `nulls_last` is specified, the position of nulls is
+backend-dependent.
+""", +) + +ascending = Marker( + "ascending", + doc=""" +The default ordering. +""", +) + +descending = Marker( + "descending", + doc=""" +Reverses the default ordering. +""", +) diff --git a/src/pydiverse/transform/_internal/ops/ops/numeric.py b/src/pydiverse/transform/_internal/ops/ops/numeric.py index bc7662a..6260323 100644 --- a/src/pydiverse/transform/_internal/ops/ops/numeric.py +++ b/src/pydiverse/transform/_internal/ops/ops/numeric.py @@ -4,9 +4,6 @@ from pydiverse.transform._internal.ops.signature import Signature from pydiverse.transform._internal.tree.types import NUMERIC, Bool, Decimal, Float, Int -mod = Operator("__mod__", Signature(Int(), Int(), return_type=Int())) - - pow = Operator( "__pow__", Signature(Int(), Int(), return_type=Float()), diff --git a/src/pydiverse/transform/_internal/ops/ops/window.py b/src/pydiverse/transform/_internal/ops/ops/window.py index cff39e6..1965b3f 100644 --- a/src/pydiverse/transform/_internal/ops/ops/window.py +++ b/src/pydiverse/transform/_internal/ops/ops/window.py @@ -2,7 +2,7 @@ from typing import Any -from pydiverse.transform._internal.ops.op import Ftype, Operator +from pydiverse.transform._internal.ops.op import ContextKwarg, Ftype, Operator from pydiverse.transform._internal.ops.signature import Signature from pydiverse.transform._internal.tree.types import D, Int, Tvar @@ -21,7 +21,10 @@ def __init__( name, *signatures, ftype=Ftype.WINDOW, - context_kwargs=["partition_by", "arrange"], + context_kwargs=[ + ContextKwarg("partition_by", False), + ContextKwarg("arrange", True), + ], param_names=param_names, default_values=default_values, generate_expr_method=generate_expr_method, @@ -39,6 +42,33 @@ def __init__( row_number = Window("row_number", Signature(return_type=Int())) -rank = Window("rank", Signature(return_type=Int())) +rank = Window( + "rank", + Signature(return_type=Int()), + doc=""" +The number of strictly smaller elements in the column plus one. + +This is the same as ``rank("min")`` in polars. 
+ +Examples +-------- +>>> t = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}) +>>> t >> mutate(b=pdt.rank(arrange=t.a)) >> export(Polars(lazy=False)) +shape: (7, 2) +┌─────┬─────┐ +│ a ┆ b │ +│ --- ┆ --- │ +│ i64 ┆ i64 │ +╞═════╪═════╡ +│ 3 ┆ 3 │ +│ 1 ┆ 1 │ +│ 4 ┆ 4 │ +│ 1 ┆ 1 │ +│ 5 ┆ 6 │ +│ 9 ┆ 7 │ +│ 4 ┆ 4 │ +└─────┴─────┘ +""", +) dense_rank = Window("dense_rank", Signature(return_type=Int())) diff --git a/src/pydiverse/transform/_internal/pipe/functions.py b/src/pydiverse/transform/_internal/pipe/functions.py index 2e2428b..77f294e 100644 --- a/src/pydiverse/transform/_internal/pipe/functions.py +++ b/src/pydiverse/transform/_internal/pipe/functions.py @@ -99,7 +99,7 @@ def count( def dense_rank( *, partition_by: Col | ColName | Iterable[Col | ColName] | None = None, - arrange: ColExpr | Iterable[ColExpr] | None = None, + arrange: ColExpr | Iterable[ColExpr], ) -> ColExpr[Int]: """""" @@ -169,9 +169,32 @@ def min(arg: ColExpr, *args: ColExpr) -> ColExpr: def rank( *, partition_by: Col | ColName | Iterable[Col | ColName] | None = None, - arrange: ColExpr | Iterable[ColExpr] | None = None, + arrange: ColExpr | Iterable[ColExpr], ) -> ColExpr[Int]: - """""" + """ + The number of strictly smaller elements in the column plus one. + + This is the same as ``rank("min")`` in polars. 
+ + Examples + -------- + >>> t = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}) + >>> t >> mutate(b=pdt.rank(arrange=t.a)) >> export(Polars(lazy=False)) + shape: (7, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + │ 1 ┆ 1 │ + │ 4 ┆ 4 │ + │ 1 ┆ 1 │ + │ 5 ┆ 6 │ + │ 9 ┆ 7 │ + │ 4 ┆ 4 │ + └─────┴─────┘ + """ return ColFn(ops.rank, partition_by=partition_by, arrange=arrange) @@ -179,7 +202,7 @@ def rank( def row_number( *, partition_by: Col | ColName | Iterable[Col | ColName] | None = None, - arrange: ColExpr | Iterable[ColExpr] | None = None, + arrange: ColExpr | Iterable[ColExpr], ) -> ColExpr[Int]: """""" diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index cb19317..ce1f6ab 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -4,18 +4,13 @@ import uuid from typing import Any, Literal, overload -import pandas as pd -import polars as pl - from pydiverse.transform._internal import errors -from pydiverse.transform._internal.backend.table_impl import TableImpl -from pydiverse.transform._internal.backend.targets import Pandas, Polars, Target +from pydiverse.transform._internal.backend.table_impl import TableImpl, get_backend +from pydiverse.transform._internal.backend.targets import Polars, Target from pydiverse.transform._internal.errors import FunctionTypeError from pydiverse.transform._internal.ops.op import Ftype -from pydiverse.transform._internal.pipe.c import C from pydiverse.transform._internal.pipe.pipeable import Pipeable, verb from pydiverse.transform._internal.pipe.table import Table -from pydiverse.transform._internal.tree.ast import AstNode from pydiverse.transform._internal.tree.col_expr import ( Col, ColExpr, @@ -37,7 +32,6 @@ SliceHead, Summarize, Ungroup, - Verb, ) __all__ = [ @@ -69,6 +63,53 @@ def alias(new_name: str | None = None) -> Pipeable: ... 
@verb def alias(table: Table, new_name: str | None = None) -> Pipeable: + """ + Changes the name of the current table and resets column references. + + That column references are reset means that the resulting table is not + considered to be derived from the input table, i.e. one cannot use columns + from the input table in subsequent operations on the result table. However, + the reset of column references is necessary before performing a self-join. + + :param new_name: + The new name assigned to the table. If this is ``None``, the table + retains its previous name. + + Examples + -------- + + A self join without applying ``alias`` before raises an exception: + + >>> t = pdt.Table({"a": [4, 2, 1, 4], "b": ["l", "g", "uu", "-- r"]}) + >>> t >> join(t, t.a == t.a, how="inner", suffix="_right") + ValueError: table `` occurs twice in the table tree. + hint: To join two tables derived from a common table, apply `>> alias()` to one of + them before the join. + + By applying ``alias`` to the right table and storing the result in a new variable, + the self join succeeds. + + >>> ( + ... t + ... >> join(s := t >> alias(), t.a == s.a, how="inner", suffix="_right") + ... >> export(Polars()) + ... ) + shape: (6, 4) + ┌─────┬────────┬─────────┬─────────┐ + │ a ┆ b ┆ a_right ┆ b_right │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════════╪═════════╡ + │ 4 ┆ l ┆ 4 ┆ l │ + │ 4 ┆ -- r ┆ 4 ┆ l │ + │ 2 ┆ g ┆ 2 ┆ g │ + │ 1 ┆ uu ┆ 1 ┆ uu │ + │ 4 ┆ l ┆ 4 ┆ -- r │ + │ 4 ┆ -- r ┆ 4 ┆ -- r │ + └─────┴────────┴─────────┴─────────┘ + """ + if new_name is None: new_name = table._ast.name new = copy.copy(table) @@ -108,6 +149,46 @@ def collect(target: Target | None = None) -> Pipeable: ... @verb def collect(table: Table, target: Target | None = None) -> Pipeable: + """ + Execute all accumulated operations and write the result to a new Table. + + This verb is only for polars-backed tables. 
All operations lazily stored in the
+    table are executed and a new table containing the result is returned. The returned
+    table always stores the data in a polars LazyFrame. One can choose whether the
+    following operations on the table are executed via polars or DuckDB on the
+    LazyFrame (see also :doc:`/examples/duckdb_polars_parquet`).
+
+    :param target:
+        The execution engine to be used from here on. Can be either ``Polars`` or
+        ``DuckDb``.
+
+    Examples
+    --------
+    Here, ``collect`` does not change anything in the result, but the ``mutate`` is
+    executed on the DataFrame when ``collect`` is called, whereas the ``arrange`` is
+    only executed when ``export`` is called. Without ``collect``, the ``mutate`` would
+    only have been executed with the ``export``, too.
+
+    >>> t = pdt.Table({"a": [4, 2, 1, 4], "b": ["l", "g", "uu", "-- r"]})
+    >>> (
+    ...     t
+    ...     >> mutate(z=t.a + t.b.str.len())
+    ...     >> collect()
+    ...     >> arrange(C.z, t.a)
+    ...     >> export(Polars())
+    ... )
+    shape: (4, 3)
+    ┌─────┬────────┬─────┐
+    │ a   ┆ b      ┆ z   │
+    │ --- ┆ ---    ┆ --- │
+    │ i64 ┆ str    ┆ i64 │
+    ╞═════╪════════╪═════╡
+    │ 1   ┆ uu     ┆ 3   │
+    │ 2   ┆ g      ┆ 3   │
+    │ 4   ┆ l      ┆ 5   │
+    │ 4   ┆ -- r   ┆ 10  │
+    └─────┴────────┴─────┘
+    """
     errors.check_arg_type(Target | None, "collect", "target", target)
 
     df = table >> select(*table._cache.all_cols.values()) >> export(Polars(lazy=False))
@@ -137,93 +218,58 @@ def export(target: Target, *, schema_overrides: dict | None = None) -> Pipeable:
 
 @verb
 def export(
-    data: Table | ColExpr,
-    target: Target,
-    *,
-    schema_overrides: dict[Col, Any] | None = None,
+    data: Table, target: Target, *, schema_overrides: dict[Col, Any] | None = None
 ) -> Pipeable:
     """Convert a pydiverse.transform Table to a data frame.
 
-    Parameters
-    ----------
-    target
-        Can currently be either a `Polars` or `Pandas` object. For polars, one can
-        specify whether a DataFrame or LazyFrame is returned via the `lazy` kwarg.
- If `lazy=True`, no actual computations are performed, they just get stored in + :param target: + Can currently be either a ``Polars`` or ``Pandas`` object. For polars, one can + specify whether a DataFrame or LazyFrame is returned via the ``lazy`` kwarg. + If ``lazy=True``, no actual computations are performed, they just get stored in the LazyFrame. - schema_overrides + :param schema_overrides: A dictionary of columns to backend-specific data types. This controls which data types are used when writing to the backend. Because the data types are not constrained by pydiverse.transform's type system, this may sometimes be - preferred over a `cast`. - - Returns - ------- - A polars or pandas DataFrame / LazyFrame or a series. - - - Note - ---- - This verb can be used with a Table or column expression on the left. When applied to - a column expression, all tables containing columns in the expression tree are - gathered and the expression is exported with respect to the most recent table. This - means that there has to be one column in the expression tree, whose table is a - common ancestor of all others. If this is not the case, the export fails. Anonymous - columns (`C`-columns) can be used in the expression and are interpreted with respect - to this common ancestor table. - """ - if isinstance(data, ColExpr): - # Find the common ancestor of all AST nodes of columns appearing in the - # expression. - nodes: dict[AstNode, int] = {} - for col in data.iter_subtree(): - if isinstance(col, Col): - nodes[col._ast] = 0 - - for nd in list(nodes.keys()): - gen = nd.iter_subtree_preorder() - try: - descendant = next(gen) - while True: - if descendant != nd and descendant in nodes: - nodes[descendant] += 1 - descendant = gen.send(True) - else: - descendant = next(gen) - except StopIteration: - ... 
- - indeg0 = (nd for nd, cnt in nodes.items() if cnt == 0) - root = next(indeg0) - try: - next(indeg0) - # The AST nodes have a common ancestor if and only if there is exactly one - # node that has not been reached by another one. - raise ValueError( - "cannot export a column expression containing columns without a common " - "ancestor" - ) - except StopIteration: - ... - - uid = uuid.uuid1() - table = from_ast(root) - df = ( - table - >> _mutate([str(uid)], [data], [uid]) - >> select(C[str(uid)]) - >> export(target) - ) - - if isinstance(target, Polars): - if isinstance(df, pl.LazyFrame): - df = df.collect() - return df.get_column(str(uid)) - elif isinstance(target, Pandas): - assert isinstance(df, pd.DataFrame) - return pd.Series(df[str(uid)]) - raise AssertionError + preferred over a cast. + + :return: + A polars or pandas DataFrame / LazyFrame. + + Examples + -------- + + >>> t1 = pdt.Table( + ... { + ... "a": [3, 1, 4, 1, 5, 9], + ... "b": [2.465, 0.22, -4.477, 10.8, -81.2, 0.0], + ... "c": ["a,", "bcbc", "pydiverse", "transform", "' '", "-22"], + ... } + ... ) + >>> t1 >> export(Pandas()) + a b c + 0 3 2.465 a, + 1 1 0.22 bcbc + 2 4 -4.477 pydiverse + 3 1 10.8 transform + 4 5 -81.2 ' ' + 5 9 0.0 -22 + >>> t1 >> export(Polars()) + shape: (6, 3) + ┌─────┬────────┬───────────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪════════╪═══════════╡ + │ 3 ┆ 2.465 ┆ a, │ + │ 1 ┆ 0.22 ┆ bcbc │ + │ 4 ┆ -4.477 ┆ pydiverse │ + │ 1 ┆ 10.8 ┆ transform │ + │ 5 ┆ -81.2 ┆ ' ' │ + │ 9 ┆ 0.0 ┆ -22 │ + └─────┴────────┴───────────┘ + """ # TODO: allow stuff like pdt.Int(): pl.Uint32() in schema_overrides and resolve that # to columns @@ -240,6 +286,13 @@ def build_query() -> Pipeable: ... @verb def build_query(table: Table) -> Pipeable: + """ + Compiles the operations accumulated on the current table to a SQL query. + + :returns: + The SQL query as a string. 
+ """ + table = table >> alias() SourceBackend: type[TableImpl] = get_backend(table._ast) return SourceBackend.build_query(table._ast, table._cache.select) @@ -251,6 +304,10 @@ def show_query() -> Pipeable: ... @verb def show_query(table: Table) -> Pipeable: + """ + Prints the compiled SQL query to stdout. + """ + if query := table >> build_query(): print(query) else: @@ -265,6 +322,29 @@ def select(*cols: Col | ColName) -> Pipeable: ... @verb def select(table: Table, *cols: Col | ColName) -> Pipeable: + """ + Selects a subset of columns. + + :param cols: + The columns to be included in the resulting table. + + Examples + -------- + >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) + >>> t >> select(t.a) >> export(Polars()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 2 │ + │ 6 │ + │ 4 │ + └─────┘ + """ + errors.check_vararg_type(Col | ColName, "select", *cols) new = copy.copy(table) @@ -282,11 +362,32 @@ def drop(*cols: Col | ColName) -> Pipeable: ... @verb def drop(table: Table, *cols: Col | ColName) -> Pipeable: + """ + Removes a subset of the columns. + + :param cols: + The columns to be removed. + + Examples + -------- + >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) + >>> t >> drop(t.a) >> export(Polars()) + shape: (4, 1) + ┌───────┐ + │ b │ + │ --- │ + │ str │ + ╞═══════╡ + │ lll │ + │ g │ + │ u0 │ + │ __**_ │ + └───────┘ + """ errors.check_vararg_type(Col | ColName, "drop", *cols) dropped_uuids = {preprocess_arg(col, table)._uuid for col in cols} - return select( - table, + return table >> select( *(col for col in table._cache.select if col._uuid not in dropped_uuids), ) @@ -297,6 +398,66 @@ def rename(name_map: dict[str, str]) -> Pipeable: ... @verb def rename(table: Table, name_map: dict[str, str]) -> Pipeable: + """ + Renames columns. + + :param name_map: + A dictionary assigning some columns (given by their name) new names. 
+ + Examples + -------- + Renaming one column: + + >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) + >>> t >> rename({"a": "h"}) >> export(Polars()) + shape: (4, 2) + ┌─────┬───────┐ + │ h ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═══════╡ + │ 3 ┆ lll │ + │ 2 ┆ g │ + │ 6 ┆ u0 │ + │ 4 ┆ __**_ │ + └─────┴───────┘ + + Here is a more subtle example: As long as there are no two equal column names in the + result table, one can give names to columns that already exist in the table. In the + following example, the names of columns *a* and *b* are swapped. + + >>> s = t >> rename({"a": "b", "b": "a"}) >> show() + Table , backend: PolarsImpl + shape: (4, 2) + ┌─────┬───────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═══════╡ + │ 3 ┆ lll │ + │ 2 ┆ g │ + │ 6 ┆ u0 │ + │ 4 ┆ __**_ │ + └─────┴───────┘ + + When using the column ``t.a`` in an expression derived from *s* now, it + still refers to the same column, which now has the name *b*. The anonymous + column ``C.a``, however, refers to the column with name *a* in the *current* + table. + + >>> s >> mutate(u=t.a, v=C.a) >> export(Polars()) + shape: (4, 4) + ┌─────┬───────┬─────┬───────┐ + │ b ┆ a ┆ u ┆ v │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═══════╪═════╪═══════╡ + │ 3 ┆ lll ┆ 3 ┆ lll │ + │ 2 ┆ g ┆ 2 ┆ g │ + │ 6 ┆ u0 ┆ 6 ┆ u0 │ + │ 4 ┆ __**_ ┆ 4 ┆ __**_ │ + └─────┴───────┴─────┴───────┘ + """ errors.check_arg_type(dict, "rename", "name_map", name_map) if len(name_map) == 0: return table @@ -323,23 +484,40 @@ def mutate(**kwargs: ColExpr) -> Pipeable: ... @verb def mutate(table: Table, **kwargs: ColExpr) -> Pipeable: + """ + Adds new columns to the table. + + :param kwargs: + Each key is the name of a new column, and its value is the column + expression defining the new column. + + Examples + -------- + >>> t1 = pdt.Table( + ... dict(a=[3, 1, 4, 1, 5, 9], b=[2.465, 0.22, -4.477, 10.8, -81.2, 0.0]) + ... 
) + >>> t1 >> mutate(u=t1.a * t1.b) >> export(Polars()) + shape: (6, 3) + ┌─────┬────────┬─────────┐ + │ a ┆ b ┆ u │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪════════╪═════════╡ + │ 3 ┆ 2.465 ┆ 7.395 │ + │ 1 ┆ 0.22 ┆ 0.22 │ + │ 4 ┆ -4.477 ┆ -17.908 │ + │ 1 ┆ 10.8 ┆ 10.8 │ + │ 5 ┆ -81.2 ┆ -406.0 │ + │ 9 ┆ 0.0 ┆ 0.0 │ + └─────┴────────┴─────────┘ + """ + if len(kwargs) == 0: return table - return table >> _mutate(*map(list, zip(*kwargs.items(), strict=True))) - -@verb -def _mutate( - table: Table, - names: list[str], - values: list[ColExpr], - uuids: list[uuid.UUID] | None = None, -) -> Pipeable: + names, values = map(list, zip(*kwargs.items(), strict=True)) + uuids = [uuid.uuid1() for _ in names] new = copy.copy(table) - if uuids is None: - uuids = [uuid.uuid1() for _ in names] - else: - assert len(uuids) == len(names) new._ast = Mutate( table._ast, @@ -360,6 +538,27 @@ def filter(*predicates: ColExpr[Bool]) -> Pipeable: ... @verb def filter(table: Table, *predicates: ColExpr[Bool]) -> Pipeable: + """ + Selects a subset of rows based on some condition. + + :param predicates: + Column expressions of boolean type to filter by. Only rows where all expressions + are true are included in the result. + + Examples + -------- + >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) + >>> t >> filter(t.a <= 4, ~t.b.str.contains("_")) >> export(Polars()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 3 ┆ lll │ + │ 2 ┆ g │ + └─────┴─────┘ + """ if len(predicates) == 0: return table @@ -397,6 +596,53 @@ def arrange(*order_by: ColExpr) -> Pipeable: ... @verb def arrange(table: Table, *order_by: ColExpr) -> Pipeable: + """ + Sorts the rows of the table. + + :param order_by: + Column expressions to sort by. The order of the expressions determines + the priority. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "r": [2, 7, 3, 2, 6, None, 4], + ... 
"s": ["l", "o", "a", "c", "s", "---", "3"], + ... "p": [0.655, -4.33, None, 143.6, 0.0, 1.0, 4.5], + ... } + ... ) + >>> t >> arrange(t.r.nulls_first(), t.p) >> export(Polars()) + shape: (7, 3) + ┌──────┬─────┬───────┐ + │ r ┆ s ┆ p │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞══════╪═════╪═══════╡ + │ null ┆ --- ┆ 1.0 │ + │ 2 ┆ l ┆ 0.655 │ + │ 2 ┆ c ┆ 143.6 │ + │ 3 ┆ a ┆ null │ + │ 4 ┆ 3 ┆ 4.5 │ + │ 6 ┆ s ┆ 0.0 │ + │ 7 ┆ o ┆ -4.33 │ + └──────┴─────┴───────┘ + >>> t >> arrange(t.p.nulls_last().descending(), t.s) >> export(Polars()) + shape: (7, 3) + ┌──────┬─────┬───────┐ + │ r ┆ s ┆ p │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞══════╪═════╪═══════╡ + │ 2 ┆ c ┆ 143.6 │ + │ 4 ┆ 3 ┆ 4.5 │ + │ null ┆ --- ┆ 1.0 │ + │ 2 ┆ l ┆ 0.655 │ + │ 6 ┆ s ┆ 0.0 │ + │ 7 ┆ o ┆ -4.33 │ + │ 3 ┆ a ┆ null │ + └──────┴─────┴───────┘ + """ if len(order_by) == 0: return table @@ -418,6 +664,24 @@ def group_by(table: Table, *cols: Col | ColName, add=False) -> Pipeable: ... @verb def group_by(table: Table, *cols: Col | ColName, add=False) -> Pipeable: + """ + Add a grouping state to the table. + + :param cols: + The columns to group by. + + :param add: + If ``add=True``, the given columns are added to the set of columns the table is + currently grouped by. If ``add=False``, the current grouping state is replaced + by the given columns. + + This verb does not modify the table itself, but only adds a grouping in the + background. The number of rows is only reduced when + :doc:`summarize ` + is called. The + :doc:`ungroup ` + verb can be used to clear the grouping state. + """ if len(cols) == 0: return table @@ -443,6 +707,46 @@ def ungroup() -> Pipeable: ... @verb def ungroup(table: Table) -> Pipeable: + """ + Clear the grouping state of the table. + + Examples + -------- + In the following example, ``group_by`` and ``ungroup`` are used to specify that each + aggregation function between them uses the column ``t.c`` for grouping. + + >>> t = pdt.Table( + ... { + ... 
"a": [1.2, 5.077, -2.29, -0.0, 3.0, -7.7], + ... "b": ["a ", "transform", "pipedag", "cdegh", " -ade ", " pq"], + ... "c": [True, False, None, None, True, True], + ... "d": [4, 4, 2, 0, 1, 0], + ... } + ... ) + >>> ( + ... t + ... >> group_by(t.c) + ... >> mutate( + ... u=t.b.str.len().max() + t.a.min(), + ... v=t.d.mean(filter=t.a >= 0), + ... ) + ... >> ungroup() + ... >> export(Polars()) + ... ) + shape: (6, 6) + ┌───────┬───────────┬───────┬─────┬────────┬─────┐ + │ a ┆ b ┆ c ┆ d ┆ u ┆ v │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ str ┆ bool ┆ i64 ┆ f64 ┆ f64 │ + ╞═══════╪═══════════╪═══════╪═════╪════════╪═════╡ + │ 1.2 ┆ a ┆ true ┆ 4 ┆ -0.7 ┆ 2.5 │ + │ 5.077 ┆ transform ┆ false ┆ 4 ┆ 14.077 ┆ 4.0 │ + │ -2.29 ┆ pipedag ┆ null ┆ 2 ┆ 4.71 ┆ 0.0 │ + │ -0.0 ┆ cdegh ┆ null ┆ 0 ┆ 4.71 ┆ 0.0 │ + │ 3.0 ┆ -ade ┆ true ┆ 1 ┆ -0.7 ┆ 2.5 │ + │ -7.7 ┆ pq ┆ true ┆ 0 ┆ -0.7 ┆ 2.5 │ + └───────┴───────────┴───────┴─────┴────────┴─────┘ + """ new = copy.copy(table) new._ast = Ungroup(table._ast) new._cache = copy.copy(table._cache) @@ -456,23 +760,55 @@ def summarize(**kwargs: ColExpr) -> Pipeable: ... @verb def summarize(table: Table, **kwargs: ColExpr) -> Pipeable: - return table >> _summarize(*map(list, zip(*kwargs.items(), strict=True))) - - -@verb -def _summarize( - table: Table, - names: list[str], - values: list[ColExpr], - uuids: list[uuid.UUID] | None = None, -) -> Pipeable: + """ + Computes aggregates over groups of rows. + + :param kwargs: + Each key is the name of a new column, and its value is the column + expression defining the new column. The column expression may not contain + columns that are neither part of the grouping columns nor wrapped in an + aggregation function. + + + In contrast to :doc:`pydiverse.transform.mutate`, this verb in general reduces the + number of rows and only keeps the grouping columns and the new columns defined in + the kwargs. One row for each unique combination of values in the grouping columns + is created. 
+ + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [1.2, 5.077, -2.29, -0.0, 3.0, -7.7], + ... "b": ["a ", "transform", "pipedag", "cdegh", " -ade ", " pq"], + ... "c": [True, False, None, None, True, True], + ... } + ... ) + >>> ( + ... t + ... >> group_by(t.c) + ... >> summarize( + ... u=t.b.str.len().mean(), + ... v=t.a.sum(filter=t.a >= 0), + ... ) + ... >> export(Polars()) + ... ) + shape: (3, 3) + ┌───────┬──────────┬───────┐ + │ c ┆ u ┆ v │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ f64 ┆ f64 │ + ╞═══════╪══════════╪═══════╡ + │ true ┆ 4.666667 ┆ 4.2 │ + │ null ┆ 6.0 ┆ -0.0 │ + │ false ┆ 9.0 ┆ 5.077 │ + └───────┴──────────┴───────┘ + """ + names, values = map(list, zip(*kwargs.items(), strict=True)) + uuids = [uuid.uuid1() for _ in names] new = copy.copy(table) - if uuids is None: - uuids = [uuid.uuid1() for _ in names] - else: - assert len(uuids) == len(names) - new._ast = Summarize( table._ast, names, @@ -519,6 +855,37 @@ def slice_head(n: int, *, offset: int = 0) -> Pipeable: ... @verb def slice_head(table: Table, n: int, *, offset: int = 0) -> Pipeable: + """ + Selects a subset of rows based on their index. + + :param n: + The number of rows to select. + + :param offset: + The index of the first row (0-based) that is included in the selection. + + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [65, 5, 312, -55, 0], + ... "b": ["l", "r", "srq", "---", " "], + ... } + ... ) + >>> t >> slice_head(3, offset=1) >> export(Polars()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 5 ┆ r │ + │ 312 ┆ srq │ + │ -55 ┆ --- │ + └─────┴─────┘ + """ + errors.check_arg_type(int, "slice_head", "n", n) errors.check_arg_type(int, "slice_head", "offset", offset) @@ -554,6 +921,79 @@ def join( validate: Literal["1:1", "1:m", "m:1", "m:m"] = "m:m", suffix: str | None = None, ) -> Pipeable: + """ + Joins two tables on a boolean expression. 
+ + The left table in the join comes through the pipe `>>` operator from the + left. + + :param right: + The right table to join with. + + :param on: + The join condition. See the note below for more information on which expressions + are allowed here. + + :param how: + The join type. + + :param validate: + Only relevant for polars. When set to ``"m:m"``, this does nothing. If set to + ``"1:m"``, it is checked whether each right row matches at most one left row. In + case this does not hold, an error is raised. Symmetrically, if set to ``"m:1"`` + it is checked whether each left row matches at most one right row. If set to + ``"1:1"`` both ``"1:m"`` and ``"m:1"`` are checked. + + :param suffix: + A string that is appended to all column names from the right table. If no suffix + is specified and there are no column name collisions, columns will retain their + original name. If there are name collisions, the name of the right table is + appended to all columns of the right table. If this still does not resolve all + name collisions, additionally an integer is appended to the column names of the + right table. + + + Note + ---- + Not all backends can handle arbitrary boolean expressions in ``on`` with every join + type. + + :polars: + For everything except conjunctions of equalities, it depends on whether polars + ``join_asof`` can handle the join condition. + :postgres: + For full joins, the join condition must be hashable or mergeable. See the + `postgres documentation `_ + for more details. + + Tip + --- + Two tables cannot be joined if one is derived from the other. In particular, before + a self-join, the ``alias`` verb has to be applied to one table. 
+ + Examples + -------- + >>> t1 = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}, name="t1") + >>> t2 = pdt.Table({"a": [4, 4, 1, 7], "b": ["f", "g", "h", "i"]}, name="t2") + >>> t1 >> join(t2, t1.a == t2.a, how="left") >> export(Polars()) + shape: (9, 3) + ┌─────┬──────┬──────┐ + │ a ┆ a_t2 ┆ b_t2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪══════╪══════╡ + │ 3 ┆ null ┆ null │ + │ 1 ┆ 1 ┆ h │ + │ 4 ┆ 4 ┆ f │ + │ 4 ┆ 4 ┆ g │ + │ 1 ┆ 1 ┆ h │ + │ 5 ┆ null ┆ null │ + │ 9 ┆ null ┆ null │ + │ 4 ┆ 4 ┆ f │ + │ 4 ┆ 4 ┆ g │ + └─────┴──────┴──────┘ + """ + errors.check_arg_type(Table, "join", "right", right) errors.check_arg_type(str | None, "join", "suffix", suffix) errors.check_literal_type(["inner", "left", "full"], "join", "how", how) @@ -636,6 +1076,10 @@ def inner_join( validate: Literal["1:1", "1:m", "m:1", "m:m"] = "m:m", suffix: str | None = None, ) -> Pipeable: + """ + Alias for the :doc:`pydiverse.transform.join` verb with ``how="inner"``. + """ + return left >> join(right, on, "inner", validate=validate, suffix=suffix) @@ -658,6 +1102,10 @@ def left_join( validate: Literal["1:1", "1:m", "m:1", "m:m"] = "m:m", suffix: str | None = None, ) -> Pipeable: + """ + Alias for the :doc:`pydiverse.transform.join` verb with ``how="left"``. + """ + return left >> join(right, on, "left", validate=validate, suffix=suffix) @@ -680,6 +1128,10 @@ def full_join( validate: Literal["1:1", "1:m", "m:1", "m:m"] = "m:m", suffix: str | None = None, ) -> Pipeable: + """ + Alias for the :doc:`pydiverse.transform.join` verb with ``how="full"``. + """ + return left >> join(right, on, "full", validate=validate, suffix=suffix) @@ -689,6 +1141,9 @@ def show() -> Pipeable: ... @verb def show(table: Table) -> Pipeable: + """ + Prints the table to stdout. 
+ """ print(table) return table @@ -740,26 +1195,3 @@ def preprocess_arg(arg: Any, table: Table, *, update_partition_by: bool = True) expr.context_kwargs["partition_by"] = table._cache.partition_by return resolve_C_columns(arg, table) - - -def get_backend(nd: AstNode) -> type[TableImpl]: - if isinstance(nd, Verb): - return get_backend(nd.child) - assert isinstance(nd, TableImpl) and nd is not TableImpl - return nd.__class__ - - -def from_ast(root: AstNode) -> Table: - if isinstance(root, TableImpl): - return Table(root) - - assert isinstance(root, Verb) - child = from_ast(root.child) - if isinstance(root, Alias): - return child >> alias(root.name) - - child._cache.update( - root, from_ast(root.right)._cache if isinstance(root, Join) else None - ) - child._ast = root - return child diff --git a/src/pydiverse/transform/_internal/tree/col_expr.py b/src/pydiverse/transform/_internal/tree/col_expr.py index 226096d..42fecb7 100644 --- a/src/pydiverse/transform/_internal/tree/col_expr.py +++ b/src/pydiverse/transform/_internal/tree/col_expr.py @@ -12,6 +12,10 @@ from typing import Any, Generic, TypeVar, overload from uuid import UUID +import pandas as pd +import polars as pl + +from pydiverse.transform._internal.backend.targets import Pandas, Polars, Target from pydiverse.transform._internal.errors import FunctionTypeError from pydiverse.transform._internal.ops import ops, signature from pydiverse.transform._internal.ops.op import Ftype, Operator @@ -136,6 +140,24 @@ def __rshift__(self, rhs): ) return rhs(self) + def rank( + self: ColExpr, + partition_by: Col | ColName | Iterable[Col | ColName] | None = None, + ) -> ColExpr[Int]: + """ + Alias for :doc:`/reference/operators/_generated/pydiverse.transform.rank`. 
+ """ + return ColFn(ops.rank, partition_by=partition_by, arrange=self) + + def dense_rank( + self: ColExpr, + partition_by: Col | ColName | Iterable[Col | ColName] | None = None, + ) -> ColExpr[Int]: + """ + Alias for :doc:`/reference/operators/_generated/pydiverse.transform.dense_rank`. + """ + return ColFn(ops.dense_rank, partition_by=partition_by, arrange=self) + # --- generated code starts here, do not delete this comment --- @overload @@ -217,7 +239,12 @@ def any( return ColFn(ops.any, self, partition_by=partition_by, filter=filter) def ascending(self: ColExpr) -> ColExpr: - """""" + """ + The default ordering. + + Can only be used in expressions given to the `arrange` verb or as as an + `arrange` keyword argument. + """ return ColFn(ops.ascending, self) @@ -280,7 +307,12 @@ def count( return ColFn(ops.count, self, partition_by=partition_by, filter=filter) def descending(self: ColExpr) -> ColExpr: - """""" + """ + Reverses the default ordering. + + Can only be used in expressions given to the `arrange` verb or as as an + `arrange` keyword argument. + """ return ColFn(ops.descending, self) @@ -311,12 +343,80 @@ def floor(self: ColExpr) -> ColExpr: return ColFn(ops.floor, self) def __floordiv__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: - """""" + """ + Integer division. + + Warning + ------- + The behavior of this operator differs from polars and python. Polars and python + always round towards negative infinity, whereas pydiverse.transform always + rounds towards zero, regardless of the sign. This behavior matches the one of C, + C++ and all currently supported SQL backends. + + See also + -------- + __mod__ + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [65, -65, 65, -65], + ... "b": [7, 7, -7, -7], + ... } + ... 
) + >>> t >> mutate(r=t.a // t.b) >> export(Polars()) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ r │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 65 ┆ 7 ┆ 9 │ + │ -65 ┆ 7 ┆ -9 │ + │ 65 ┆ -7 ┆ -9 │ + │ -65 ┆ -7 ┆ 9 │ + └─────┴─────┴─────┘ + """ return ColFn(ops.floordiv, self, rhs) def __rfloordiv__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: - """""" + """ + Integer division. + + Warning + ------- + The behavior of this operator differs from polars and python. Polars and python + always round towards negative infinity, whereas pydiverse.transform always + rounds towards zero, regardless of the sign. This behavior matches the one of C, + C++ and all currently supported SQL backends. + + See also + -------- + __mod__ + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [65, -65, 65, -65], + ... "b": [7, 7, -7, -7], + ... } + ... ) + >>> t >> mutate(r=t.a // t.b) >> export(Polars()) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ r │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 65 ┆ 7 ┆ 9 │ + │ -65 ┆ 7 ┆ -9 │ + │ 65 ┆ -7 ┆ -9 │ + │ -65 ┆ -7 ┆ 9 │ + └─────┴─────┴─────┘ + """ return ColFn(ops.floordiv, rhs, self) @@ -605,12 +705,82 @@ def min( return ColFn(ops.min, self, partition_by=partition_by, filter=filter) def __mod__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: - """""" + """ + Computes the remainder of integer division. + + Warning + ------- + This operator behaves differently than in polars. There are at least two + conventions how `%` and :doc:`// ` + should behave for negative inputs. We follow the one that C, C++ and all + currently supported SQL backends follow. This means that the output has the same + sign as the left hand side of the input, regardless of the right hand side. + + See also + -------- + __floordiv__ + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [65, -65, 65, -65], + ... "b": [7, 7, -7, -7], + ... } + ... 
)
+        >>> t >> mutate(r=t.a % t.b) >> export(Polars())
+        shape: (4, 3)
+        ┌─────┬─────┬─────┐
+        │ a   ┆ b   ┆ r   │
+        │ --- ┆ --- ┆ --- │
+        │ i64 ┆ i64 ┆ i64 │
+        ╞═════╪═════╪═════╡
+        │ 65  ┆ 7   ┆ 2   │
+        │ -65 ┆ 7   ┆ -2  │
+        │ 65  ┆ -7  ┆ 2   │
+        │ -65 ┆ -7  ┆ -2  │
+        └─────┴─────┴─────┘
+        """
         return ColFn(ops.mod, self, rhs)
 
     def __rmod__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]:
-        """"""
+        """
+        Computes the remainder of integer division.
+
+        Warning
+        -------
+        This operator behaves differently than in polars. There are at least two
+        conventions how `%` and :doc:`// `
+        should behave for negative inputs. We follow the one that C, C++ and all
+        currently supported SQL backends follow. This means that the output has the same
+        sign as the left hand side of the input, regardless of the right hand side.
+
+        See also
+        --------
+        __floordiv__
+
+        Examples
+        --------
+        >>> t = pdt.Table(
+        ...     {
+        ...         "a": [65, -65, 65, -65],
+        ...         "b": [7, 7, -7, -7],
+        ...     }
+        ... )
+        >>> t >> mutate(r=t.a % t.b) >> export(Polars())
+        shape: (4, 3)
+        ┌─────┬─────┬─────┐
+        │ a   ┆ b   ┆ r   │
+        │ --- ┆ --- ┆ --- │
+        │ i64 ┆ i64 ┆ i64 │
+        ╞═════╪═════╪═════╡
+        │ 65  ┆ 7   ┆ 2   │
+        │ -65 ┆ 7   ┆ -2  │
+        │ 65  ┆ -7  ┆ 2   │
+        │ -65 ┆ -7  ┆ -2  │
+        └─────┴─────┴─────┘
+        """
         return ColFn(ops.mod, rhs, self)
 
 
@@ -662,12 +832,36 @@ def __ne__(self: ColExpr, rhs: ColExpr) -> ColExpr[Bool]:
         return ColFn(ops.not_equal, self, rhs)
 
     def nulls_first(self: ColExpr) -> ColExpr:
-        """"""
+        """
+        Specifies that nulls are placed at the beginning of the ordering.
+
+        This does not mean that nulls are considered to be `less` than any other
+        element. I.e. if both `nulls_first` and `descending` are given, nulls will still
+        be placed at the beginning.
+
+        If neither `nulls_first` nor `nulls_last` is specified, the position of nulls is
+        backend-dependent.
+
+        Can only be used in expressions given to the `arrange` verb or as an
+        `arrange` keyword argument. 
+ """ return ColFn(ops.nulls_first, self) def nulls_last(self: ColExpr) -> ColExpr: - """""" + """ + Specifies that nulls are placed at the end of the ordering. + + This does not mean that nulls are considered to be `greater` than any other + element. I.e. if both `nulls_last` and `descending` are given, nulls will still + be placed at the end. + + If neither `nulls_first` nor `nulls_last` is specified, the position of nulls is + backend-dependent. + + Can only be used in expressions given to the `arrange` verb or as as an + `arrange` keyword argument. + """ return ColFn(ops.nulls_last, self) @@ -733,7 +927,7 @@ def shift( fill_value: ColExpr = None, *, partition_by: Col | ColName | Iterable[Col | ColName] | None = None, - arrange: ColExpr | Iterable[ColExpr] | None = None, + arrange: ColExpr | Iterable[ColExpr], ) -> ColExpr: """""" @@ -1087,6 +1281,54 @@ def __str__(self) -> str: def __hash__(self) -> int: return hash(self._uuid) + def export(self, target: Target) -> Any: + """ + Exports a column to a polars or pandas Series. + + :param target: + The data frame library to export to. Can be a ``Polars`` or ``Pandas`` + object. The ``lazy`` kwarg for polars is ignored. + + :return: + A polars or pandas Series. 
+ + Examples + -------- + >>> t1 = pdt.Table({"h": [2.465, 0.22, -4.477, 10.8, -81.2, 0.0]}) + >>> t1.h.export(Polars()) + shape: (6,) + Series: 'h' [f64] + [ + 2.465 + 0.22 + -4.477 + 10.8 + -81.2 + 0.0 + ] + >>> t1.h.export(Pandas()) + 0 2.465 + 1 0.22 + 2 -4.477 + 3 10.8 + 4 -81.2 + 5 0.0 + Name: h, dtype: double[pyarrow] + """ + + from pydiverse.transform._internal.backend.table_impl import get_backend + from pydiverse.transform._internal.tree.verbs import Select + + ast = Select(self._ast, [self]) + df = get_backend(self._ast).export(ast, target, [self], {}) + if isinstance(target, Polars): + if isinstance(df, pl.LazyFrame): + df = df.collect() + return df.get_column(self.name) + else: + assert isinstance(target, Pandas) + return pd.Series(df[self.name]) + class ColName(ColExpr): def __init__( diff --git a/src/pydiverse/transform/_internal/tree/types.py b/src/pydiverse/transform/_internal/tree/types.py index 7f6d6d7..ba94124 100644 --- a/src/pydiverse/transform/_internal/tree/types.py +++ b/src/pydiverse/transform/_internal/tree/types.py @@ -2,10 +2,13 @@ import datetime import inspect -import warnings class Dtype: + """ + Base class for all data types. + """ + __slots__ = ("const",) def __init__(self, *, const: bool = False): @@ -16,14 +19,6 @@ def __eq__(self, rhs: Dtype | type[Dtype] | None) -> bool: return False if inspect.isclass(rhs) and issubclass(rhs, Dtype): rhs = rhs() - if is_supertype(rhs.without_const()) and not is_subtype(rhs.without_const()): - warnings.warn( - f"comparing to the supertype `{rhs}` for equality may yield unexpected " - "results.\n" - "hint: `==` is very strict, e.g. Int16() == Int() is False. Use the " - "`<=` operator to also match on subtypes (so Int16() <= Int() would " - "return True)." 
- ) if isinstance(rhs, Dtype): return self.const == rhs.const and type(self) is type(rhs) @@ -46,9 +41,15 @@ def __repr__(self) -> str: return ("const " if self.const else "") + self.__class__.__name__ def with_const(self) -> Dtype: + """ + Adds a `const` modifier from the data type. + """ return type(self)(const=True) def without_const(self) -> Dtype: + """ + Removes a `const` modifier from the data type (if present). + """ return type(self)() def converts_to(self, target: Dtype) -> bool: diff --git a/src/pydiverse/transform/types.py b/src/pydiverse/transform/types.py index 4e2e7b5..8f2383c 100644 --- a/src/pydiverse/transform/types.py +++ b/src/pydiverse/transform/types.py @@ -5,6 +5,7 @@ Date, Datetime, Decimal, + Dtype, Float, Float32, Float64, @@ -21,6 +22,7 @@ ) __all__ = [ + "Dtype", "Bool", "Date", "Datetime", diff --git a/tests/test_core.py b/tests/test_core.py index f7b4d52..77d2653 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -2,20 +2,21 @@ import polars as pl import pytest +from polars.testing import assert_series_equal -from pydiverse.transform import C, Table -from pydiverse.transform._internal.pipe.pipeable import Pipeable, inverse_partial, verb -from pydiverse.transform._internal.pipe.verbs import mutate, select +import pydiverse.transform as pdt +from pydiverse.transform._internal.pipe.pipeable import Pipeable, inverse_partial +from pydiverse.transform.common import * @pytest.fixture def tbl1(): - return Table(pl.DataFrame({"col1": [0.1], "col2": [3.14]})) + return pdt.Table(pl.DataFrame({"col1": [0.1], "col2": [3.14]})) @pytest.fixture def tbl2(): - return Table(pl.DataFrame({"col1": [1], "col2": [2], "col3": [3]})) + return pdt.Table(pl.DataFrame({"col1": [1], "col2": [2], "col3": [3]})) class TestTable: @@ -59,6 +60,14 @@ def test_contains(self, tbl1, tbl2): assert all(col in tbl1 for col in tbl1) assert all(col in tbl2 for col in tbl2) + def test_dict_constructor(self): + t = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}) + 
assert_series_equal( + (t >> mutate(r=pdt.rank(arrange=t.a))).r.export(Polars()), + pl.Series([3, 1, 4, 1, 6, 7, 4]), + check_names=False, + ) + class TestDispatchers: def test_inverse_partial(self): diff --git a/tests/test_polars_table.py b/tests/test_polars_table.py index 6f76132..fb0a709 100644 --- a/tests/test_polars_table.py +++ b/tests/test_polars_table.py @@ -580,20 +580,21 @@ def test_duckdb_execution(self, tbl2, tbl3): check_row_order=False, ) - def test_col_export(self, tbl1, tbl2): - assert_equal(df1.get_column("col1"), tbl1.col1 >> export(Polars())) + def test_col_export(self, tbl1: pdt.Table, tbl2: pdt.Table): + assert_equal(df1.get_column("col1"), tbl1.col1.export(Polars())) t = tbl2 >> mutate(u=tbl2.col1 * tbl2.col2, v=-tbl2.col3) t_ex: pl.DataFrame = t >> export(Polars(lazy=False)) assert_equal( (t_ex["u"] + t_ex["col2"]).exp() - t_ex["v"], - ((t.u + C.col2).exp() - t.v) >> export(Polars()), + (t >> mutate(h=(t.u + C.col2).exp() - t.v)).h.export(Polars()), ) e = t >> inner_join( tbl1, tbl1.col1.cast(pdt.Float64()) <= tbl2.col1 + tbl2.col3 ) e_ex = e >> export(Polars(lazy=False)) assert_equal( - e_ex["col2"] + e_ex["col1_df1"], (e.col2 + tbl1.col1) >> export(Polars()) + e_ex["col2"] + e_ex["col1_df1"], + (e >> mutate(j=e.col2 + tbl1.col1)).j.export(Polars()), )