Add type stubs for pylibcudf (#17258)
Having looked at a bunch of the automation options, I just did it by hand.

A followup will add some automation to add docstrings (so we can see those via LSP integration in editors) and do some simple validation.

- Closes #15190

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Murray (https://github.com/Matt711)

URL: #17258
wence- authored Nov 12, 2024
1 parent ccfc95a commit 7682edb
Showing 206 changed files with 2,863 additions and 228 deletions.
73 changes: 66 additions & 7 deletions docs/cudf/source/conf.py
@@ -26,16 +26,18 @@
import tempfile
import warnings
import xml.etree.ElementTree as ET
from enum import IntEnum
from typing import Any

import cudf
from docutils.nodes import Text
from packaging.version import Version
from sphinx.addnodes import pending_xref
from sphinx.highlighting import lexers
from sphinx.ext import intersphinx
from pygments.lexer import RegexLexer
from pygments.token import Text as PText

import cudf
from sphinx.addnodes import pending_xref
from sphinx.ext import intersphinx
from sphinx.ext.autodoc import ClassDocumenter, bool_option
from sphinx.highlighting import lexers


class PseudoLexer(RegexLexer):
@@ -342,7 +344,10 @@ def clean_all_xml_files(path):
"cudf.Series": ("cudf.core.series.Series", "cudf.Series"),
"cudf.Index": ("cudf.core.index.Index", "cudf.Index"),
"cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"),
"DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"),
"DeviceBuffer": (
"rmm.pylibrmm.device_buffer.DeviceBuffer",
"rmm.DeviceBuffer",
),
}


@@ -373,7 +378,14 @@ def _generate_namespaces(namespaces):
_all_namespaces = _generate_namespaces(
{
# Note that io::datasource is actually a nested class
"cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"},
"cudf": {
"io",
"io::datasource",
"strings",
"ast",
"ast::expression",
"io::text",
},
"numeric": {},
"nvtext": {},
}
@@ -642,13 +654,60 @@ def linkcode_resolve(domain, info) -> str | None:
f"branch-{version}/python/cudf/cudf/{fn}{linespec}"
)


# Needed to avoid a build warning for the PandasCompat extension
suppress_warnings = ["myst.domains"]


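# Custom autodoc documenter that renders pylibcudf IntEnum classes as
# attributes and lists their members (registered in ``setup`` below).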
class PLCIntEnumDocumenter(ClassDocumenter):
objtype = "enum"
directivetype = "attribute"
priority = 10 + ClassDocumenter.priority

option_spec = dict(ClassDocumenter.option_spec)

@classmethod
def can_document_member(
cls, member: Any, membername: str, isattr: bool, parent: Any
) -> bool:
try:
return issubclass(
member, IntEnum
) and member.__module__.startswith("pylibcudf")
except TypeError:
return False

def add_directive_header(self, sig: str) -> None:
self.directivetype = "attribute"
super().add_directive_header(sig)

def add_content(self, more_content) -> None:
doc_as_attr = self.doc_as_attr
self.doc_as_attr = False
super().add_content(more_content)
self.doc_as_attr = doc_as_attr
source_name = self.get_sourcename()
enum_object: IntEnum = self.object

if self.object.__name__ != "Kind":
self.add_line(f"See also :cpp:enum:`cudf::{self.object.__name__}`.", source_name)
self.add_line("", source_name)
self.add_line("Enum members", source_name)
self.add_line("", source_name)

for the_member_name in enum_object.__members__: # type: ignore[attr-defined]
self.add_line(
f"* ``{the_member_name}``", source_name
)
self.add_line("", source_name)


def setup(app):
app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
app.add_js_file(
"https://docs.rapids.ai/assets/js/custom.js", loading_method="defer"
)
app.connect("doctree-read", resolve_aliases)
app.connect("missing-reference", on_missing_reference)
app.setup_extension("sphinx.ext.autodoc")
app.add_autodocumenter(PLCIntEnumDocumenter)
73 changes: 72 additions & 1 deletion docs/cudf/source/developer_guide/pylibcudf.md
@@ -15,7 +15,8 @@ To satisfy the goals of pylibcudf, we impose the following set of design principles:
- All typing in code should be written using Cython syntax, not PEP 484 Python typing syntax. Not only does this ensure compatibility with Cython < 3, but even with Cython 3 PEP 484 support remains incomplete as of this writing.
- All cudf code should interact only with pylibcudf, never with libcudf directly. This is not currently the case, but is the direction that the library is moving towards.
- Ideally, pylibcudf should depend on no RAPIDS component other than rmm, and should in general have minimal runtime dependencies.

- Type stubs are provided and maintained manually. When adding new
  functionality, ensure that the matching type stub is updated appropriately.

## Relationship to libcudf

@@ -249,3 +250,73 @@
In the event that libcudf provides multiple overloads for the same function with differing numbers of arguments, combine them into a single function in pylibcudf and set arguments not shared between overloads to `None`. If a user tries to pass in an unsupported argument for a specific overload type, you should raise `ValueError`.
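
A minimal sketch of that pattern (the function and argument names here
are hypothetical, not real pylibcudf APIs):

```python
# Suppose libcudf provided overloads ``frobnicate(input)`` and
# ``frobnicate(input, separator)``, with the separator form only valid
# for string inputs. The argument not shared between the overloads
# defaults to None and is rejected when the selected overload does not
# accept it.
def frobnicate(input: str | list[str], separator: str | None = None) -> list[str]:
    if isinstance(input, list):
        if separator is not None:
            # Unsupported argument for this overload type.
            raise ValueError("`separator` is not supported for list inputs")
        return input  # stand-in for the overload without a separator
    return input.split(separator)  # stand-in for the separator overload
```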

Finally, consider filing a libcudf issue if you think this inconsistency can be addressed on the libcudf side.

### Type stubs

Since static type checkers like `mypy` and `pyright` cannot parse
Cython code, we provide type stubs for the pylibcudf package. These
are currently maintained manually, alongside the matching pylibcudf
files.
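
For illustration (a hypothetical sketch, not a real pylibcudf module),
a `pylibcudf/foo.pyx` exposing `cpdef Column bar(Column input, size_type n)`
would be accompanied by a `foo.pyi` along these lines:

```python
# Hypothetical contents of pylibcudf/foo.pyi; ``foo`` and ``bar`` are
# made-up names, while Column is the real pylibcudf column type. Note
# that the libcudf size_type argument is annotated as a plain int.
from pylibcudf.column import Column

def bar(input: Column, n: int) -> Column: ...
```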

Every `pyx` file should have a matching `pyi` file that provides the
type stubs. Most functions can be exposed straightforwardly. Some
guiding principles:

- For typed integer arguments in libcudf, use `int` as a type
annotation.
- For functions which are annotated as a `list` in Cython but whose
  body does more detailed checking, try to encode the detailed
  information in the type.
- For Cython fused types there are two options:
1. If the fused type appears only once in the function signature,
use a `Union` type;
2. If the fused type appears more than once (or as both an input
and output type), use a `TypeVar` with
the variants in the fused type provided as constraints.


As an example, `pylibcudf.copying.split` is typed in Cython as:

```cython
ctypedef fused ColumnOrTable:
Table
Column
cpdef list split(ColumnOrTable input, list splits): ...
```

The fused type appears only once in the signature, and the `list`
arguments do not specify their element types. If we provide a `Column`
as input we receive a `list[Column]` as output, and if we provide a
`Table` we receive a `list[Table]`: the output type depends on the
input type.

In the type stub, we can encode this relationship with a `TypeVar`. We
can also type the `splits` argument to indicate that the split values
must be integers:

```python
ColumnOrTable = TypeVar("ColumnOrTable", Column, Table)

def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ...
```
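
With this stub, a type checker will infer `list[Table]` for
`split(table, [1, 2])` when `table` is a `Table`, and `list[Column]`
when a `Column` is passed instead.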

Conversely, `pylibcudf.copying.scatter` uses its fused type only once,
in its input, so the output type does not depend on it:

```cython
ctypedef fused TableOrListOfScalars:
Table
list
cpdef Table scatter(
TableOrListOfScalars source, Column scatter_map, Table target
)
```

In the type stub, we can use a normal union in this case:

```python
def scatter(
source: Table | list[Scalar], scatter_map: Column, target: Table
) -> Table: ...
```
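
Note that the bare Cython `list` variant is narrowed to `list[Scalar]`
in the stub, encoding the more detailed checking performed in the
function body, as described in the guiding principles above.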
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/labeling.pyx
@@ -17,8 +17,8 @@ def label_bins(Column input, Column left_edges, cbool left_inclusive,
plc_column = plc.labeling.label_bins(
input.to_pylibcudf(mode="read"),
left_edges.to_pylibcudf(mode="read"),
left_inclusive,
plc.labeling.Inclusive.YES if left_inclusive else plc.labeling.Inclusive.NO,
right_edges.to_pylibcudf(mode="read"),
right_inclusive
plc.labeling.Inclusive.YES if right_inclusive else plc.labeling.Inclusive.NO,
)
return Column.from_pylibcudf(plc_column)
24 changes: 12 additions & 12 deletions python/cudf/cudf/_lib/lists.pyx
@@ -4,7 +4,9 @@ from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool

from pylibcudf.libcudf.types cimport size_type
from pylibcudf.libcudf.types cimport (
nan_equality, null_equality, null_order, order, size_type
)

from cudf._lib.column cimport Column
from cudf._lib.utils cimport columns_from_pylibcudf_table
@@ -37,8 +39,8 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal):
return Column.from_pylibcudf(
plc.lists.distinct(
col.to_pylibcudf(mode="read"),
nulls_equal,
nans_all_equal,
null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL,
nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL,
)
)

@@ -48,12 +50,8 @@ def sort_lists(Column col, bool ascending, str na_position):
return Column.from_pylibcudf(
plc.lists.sort_lists(
col.to_pylibcudf(mode="read"),
ascending,
(
plc.types.NullOrder.BEFORE
if na_position == "first"
else plc.types.NullOrder.AFTER
),
order.ASCENDING if ascending else order.DESCENDING,
null_order.BEFORE if na_position == "first" else null_order.AFTER,
False,
)
)
@@ -95,7 +93,7 @@ def index_of_scalar(Column col, object py_search_key):
plc.lists.index_of(
col.to_pylibcudf(mode="read"),
<Scalar> py_search_key.device_value.c_value,
True,
plc.lists.DuplicateFindOption.FIND_FIRST,
)
)

@@ -106,7 +104,7 @@ def index_of_column(Column col, Column search_keys):
plc.lists.index_of(
col.to_pylibcudf(mode="read"),
search_keys.to_pylibcudf(mode="read"),
True,
plc.lists.DuplicateFindOption.FIND_FIRST,
)
)

@@ -127,7 +125,9 @@ def concatenate_list_elements(Column input_column, dropna=False):
return Column.from_pylibcudf(
plc.lists.concatenate_list_elements(
input_column.to_pylibcudf(mode="read"),
dropna,
plc.lists.ConcatenateNullPolicy.IGNORE
if dropna
else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW,
)
)

2 changes: 1 addition & 1 deletion python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -60,7 +60,7 @@ def to_polars(self) -> pl.DataFrame:
# To guarantee we produce correct names, we therefore
# serialise with names we control and rename with that map.
name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)}
table: pa.Table = plc.interop.to_arrow(
table = plc.interop.to_arrow(
self.table,
[plc.interop.ColumnMetadata(name=name) for name in name_map],
)
4 changes: 3 additions & 1 deletion python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
@@ -27,7 +27,9 @@

class TemporalFunction(Expr):
__slots__ = ("name", "options")
_COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = {
_COMPONENT_MAP: ClassVar[
dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent]
] = {
pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR,
pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH,
pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY,
2 changes: 1 addition & 1 deletion python/cudf_polars/cudf_polars/dsl/expressions/literal.py
@@ -58,7 +58,7 @@ def collect_agg(self, *, depth: int) -> AggInfo:
class LiteralColumn(Expr):
__slots__ = ("value",)
_non_child = ("dtype", "value")
value: pa.Array[Any, Any]
value: pa.Array[Any]

def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
self.dtype = dtype
2 changes: 1 addition & 1 deletion python/cudf_polars/cudf_polars/dsl/ir.py
@@ -517,7 +517,7 @@ def do_evaluate(
# Mask must have been applied.
return df
elif typ == "ndjson":
json_schema: list[tuple[str, str, list]] = [
json_schema: list[plc.io.json.NameAndType] = [
(name, typ, []) for name, typ in schema.items()
]
plc_tbl_w_meta = plc.io.json.read_json(