Add type stubs for pylibcudf (#17258)
Having looked at a bunch of the automation options, I just did it by hand.

A followup will add some automation to add docstrings (so we can see those via LSP integration in editors) and do some simple validation.

- Closes #15190

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Murray (https://github.com/Matt711)

URL: #17258
wence- authored Nov 12, 2024
1 parent ccfc95a commit 7682edb
Showing 206 changed files with 2,863 additions and 228 deletions.
73 changes: 66 additions & 7 deletions docs/cudf/source/conf.py
@@ -26,16 +26,18 @@
import tempfile
import warnings
import xml.etree.ElementTree as ET
from enum import IntEnum
from typing import Any

import cudf
from docutils.nodes import Text
from packaging.version import Version
from sphinx.addnodes import pending_xref
from sphinx.highlighting import lexers
from sphinx.ext import intersphinx
from pygments.lexer import RegexLexer
from pygments.token import Text as PText

import cudf
from sphinx.addnodes import pending_xref
from sphinx.ext import intersphinx
from sphinx.ext.autodoc import ClassDocumenter, bool_option
from sphinx.highlighting import lexers


class PseudoLexer(RegexLexer):
@@ -342,7 +344,10 @@ def clean_all_xml_files(path):
"cudf.Series": ("cudf.core.series.Series", "cudf.Series"),
"cudf.Index": ("cudf.core.index.Index", "cudf.Index"),
"cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"),
"DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"),
"DeviceBuffer": (
"rmm.pylibrmm.device_buffer.DeviceBuffer",
"rmm.DeviceBuffer",
),
}


@@ -373,7 +378,14 @@ def _generate_namespaces(namespaces):
_all_namespaces = _generate_namespaces(
{
# Note that io::datasource is actually a nested class
"cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"},
"cudf": {
"io",
"io::datasource",
"strings",
"ast",
"ast::expression",
"io::text",
},
"numeric": {},
"nvtext": {},
}
@@ -642,13 +654,60 @@ def linkcode_resolve(domain, info) -> str | None:
f"branch-{version}/python/cudf/cudf/{fn}{linespec}"
)


# Needed to avoid a build warning for the PandasCompat extension
suppress_warnings = ["myst.domains"]


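# Custom autodoc documenter that renders pylibcudf IntEnum classes as
# attributes and lists their members (registered in ``setup`` below).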
class PLCIntEnumDocumenter(ClassDocumenter):
objtype = "enum"
directivetype = "attribute"
priority = 10 + ClassDocumenter.priority

option_spec = dict(ClassDocumenter.option_spec)

@classmethod
def can_document_member(
cls, member: Any, membername: str, isattr: bool, parent: Any
) -> bool:
try:
return issubclass(
member, IntEnum
) and member.__module__.startswith("pylibcudf")
except TypeError:
return False

def add_directive_header(self, sig: str) -> None:
self.directivetype = "attribute"
super().add_directive_header(sig)

def add_content(self, more_content) -> None:
doc_as_attr = self.doc_as_attr
self.doc_as_attr = False
super().add_content(more_content)
self.doc_as_attr = doc_as_attr
source_name = self.get_sourcename()
enum_object: IntEnum = self.object

if self.object.__name__ != "Kind":
self.add_line(f"See also :cpp:enum:`cudf::{self.object.__name__}`.", source_name)
self.add_line("", source_name)
self.add_line("Enum members", source_name)
self.add_line("", source_name)

for the_member_name in enum_object.__members__: # type: ignore[attr-defined]
self.add_line(
f"* ``{the_member_name}``", source_name
)
self.add_line("", source_name)


def setup(app):
app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
app.add_js_file(
"https://docs.rapids.ai/assets/js/custom.js", loading_method="defer"
)
app.connect("doctree-read", resolve_aliases)
app.connect("missing-reference", on_missing_reference)
app.setup_extension("sphinx.ext.autodoc")
app.add_autodocumenter(PLCIntEnumDocumenter)
73 changes: 72 additions & 1 deletion docs/cudf/source/developer_guide/pylibcudf.md
@@ -15,7 +15,8 @@ To satisfy the goals of pylibcudf, we impose the following set of design principles:
- All typing in code should be written using Cython syntax, not PEP 484 Python typing syntax. Not only does this ensure compatibility with Cython < 3, but even with Cython 3 PEP 484 support remains incomplete as of this writing.
- All cudf code should interact only with pylibcudf, never with libcudf directly. This is not currently the case, but is the direction that the library is moving towards.
- Ideally, pylibcudf should depend on no RAPIDS component other than rmm, and should in general have minimal runtime dependencies.

- Type stubs are provided and maintained manually. When adding new
  functionality, ensure that the matching type stub is updated appropriately.

## Relationship to libcudf

@@ -249,3 +250,73 @@
In the event that libcudf provides multiple overloads for the same function with differing numbers of arguments, combine them into a single function in pylibcudf and set arguments not shared between overloads to `None`. If a user tries to pass in an unsupported argument for a specific overload type, you should raise `ValueError`.
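
A minimal sketch of that pattern (the function and argument names here
are hypothetical, not real pylibcudf APIs):

```python
# Suppose libcudf provided overloads ``frobnicate(input)`` and
# ``frobnicate(input, separator)``, with the separator form only valid
# for string inputs. The argument not shared between the overloads
# defaults to None and is rejected when the selected overload does not
# accept it.
def frobnicate(input: str | list[str], separator: str | None = None) -> list[str]:
    if isinstance(input, list):
        if separator is not None:
            # Unsupported argument for this overload type.
            raise ValueError("`separator` is not supported for list inputs")
        return input  # stand-in for the overload without a separator
    return input.split(separator)  # stand-in for the separator overload
```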

Finally, consider filing a libcudf issue if you think this inconsistency can be addressed on the libcudf side.

### Type stubs

Since static type checkers like `mypy` and `pyright` cannot parse
Cython code, we provide type stubs for the pylibcudf package. These
are currently maintained manually, alongside the matching pylibcudf
files.
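
For illustration (a hypothetical sketch, not a real pylibcudf module),
a `pylibcudf/foo.pyx` exposing `cpdef Column bar(Column input, size_type n)`
would be accompanied by a `foo.pyi` along these lines:

```python
# Hypothetical contents of pylibcudf/foo.pyi; ``foo`` and ``bar`` are
# made-up names, while Column is the real pylibcudf column type. Note
# that the libcudf size_type argument is annotated as a plain int.
from pylibcudf.column import Column

def bar(input: Column, n: int) -> Column: ...
```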

Every `pyx` file should have a matching `pyi` file that provides the
type stubs. Most functions can be exposed straightforwardly. Some
guiding principles:

- For typed integer arguments in libcudf, use `int` as a type
annotation.
- For functions which are annotated as a `list` in Cython but whose
  body does more detailed checking, try to encode the detailed
  information in the type.
- For Cython fused types there are two options:
1. If the fused type appears only once in the function signature,
use a `Union` type;
2. If the fused type appears more than once (or as both an input
and output type), use a `TypeVar` with
the variants in the fused type provided as constraints.


As an example, `pylibcudf.copying.split` is typed in Cython as:

```cython
ctypedef fused ColumnOrTable:
Table
Column
cpdef list split(ColumnOrTable input, list splits): ...
```

The fused type appears only once in the signature, and the `list`
arguments do not specify their element types. If we provide a `Column`
as input we receive a `list[Column]` as output, and if we provide a
`Table` we receive a `list[Table]`: the output type depends on the
input type.

In the type stub, we can encode this relationship with a `TypeVar`. We
can also type the `splits` argument to indicate that the split values
must be integers:

```python
ColumnOrTable = TypeVar("ColumnOrTable", Column, Table)

def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ...
```
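
With this stub, a type checker will infer `list[Table]` for
`split(table, [1, 2])` when `table` is a `Table`, and `list[Column]`
when a `Column` is passed instead.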

Conversely, `pylibcudf.copying.scatter` uses its fused type only once,
in its input, so the output type does not depend on it:

```cython
ctypedef fused TableOrListOfScalars:
Table
list
cpdef Table scatter(
TableOrListOfScalars source, Column scatter_map, Table target
)
```

In the type stub, we can use a normal union in this case:

```python
def scatter(
source: Table | list[Scalar], scatter_map: Column, target: Table
) -> Table: ...
```
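
Note that the bare Cython `list` variant is narrowed to `list[Scalar]`
in the stub, encoding the more detailed checking performed in the
function body, as described in the guiding principles above.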
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/labeling.pyx
@@ -17,8 +17,8 @@ def label_bins(Column input, Column left_edges, cbool left_inclusive,
plc_column = plc.labeling.label_bins(
input.to_pylibcudf(mode="read"),
left_edges.to_pylibcudf(mode="read"),
left_inclusive,
plc.labeling.Inclusive.YES if left_inclusive else plc.labeling.Inclusive.NO,
right_edges.to_pylibcudf(mode="read"),
right_inclusive
plc.labeling.Inclusive.YES if right_inclusive else plc.labeling.Inclusive.NO,
)
return Column.from_pylibcudf(plc_column)
24 changes: 12 additions & 12 deletions python/cudf/cudf/_lib/lists.pyx
@@ -4,7 +4,9 @@ from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool

from pylibcudf.libcudf.types cimport size_type
from pylibcudf.libcudf.types cimport (
nan_equality, null_equality, null_order, order, size_type
)

from cudf._lib.column cimport Column
from cudf._lib.utils cimport columns_from_pylibcudf_table
@@ -37,8 +39,8 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal):
return Column.from_pylibcudf(
plc.lists.distinct(
col.to_pylibcudf(mode="read"),
nulls_equal,
nans_all_equal,
null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL,
nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL,
)
)

@@ -48,12 +50,8 @@ def sort_lists(Column col, bool ascending, str na_position):
return Column.from_pylibcudf(
plc.lists.sort_lists(
col.to_pylibcudf(mode="read"),
ascending,
(
plc.types.NullOrder.BEFORE
if na_position == "first"
else plc.types.NullOrder.AFTER
),
order.ASCENDING if ascending else order.DESCENDING,
null_order.BEFORE if na_position == "first" else null_order.AFTER,
False,
)
)
@@ -95,7 +93,7 @@ def index_of_scalar(Column col, object py_search_key):
plc.lists.index_of(
col.to_pylibcudf(mode="read"),
<Scalar> py_search_key.device_value.c_value,
True,
plc.lists.DuplicateFindOption.FIND_FIRST,
)
)

@@ -106,7 +104,7 @@ def index_of_column(Column col, Column search_keys):
plc.lists.index_of(
col.to_pylibcudf(mode="read"),
search_keys.to_pylibcudf(mode="read"),
True,
plc.lists.DuplicateFindOption.FIND_FIRST,
)
)

@@ -127,7 +125,9 @@ def concatenate_list_elements(Column input_column, dropna=False):
return Column.from_pylibcudf(
plc.lists.concatenate_list_elements(
input_column.to_pylibcudf(mode="read"),
dropna,
plc.lists.ConcatenateNullPolicy.IGNORE
if dropna
else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW,
)
)

2 changes: 1 addition & 1 deletion python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -60,7 +60,7 @@ def to_polars(self) -> pl.DataFrame:
# To guarantee we produce correct names, we therefore
# serialise with names we control and rename with that map.
name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)}
table: pa.Table = plc.interop.to_arrow(
table = plc.interop.to_arrow(
self.table,
[plc.interop.ColumnMetadata(name=name) for name in name_map],
)
4 changes: 3 additions & 1 deletion python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
@@ -27,7 +27,9 @@

class TemporalFunction(Expr):
__slots__ = ("name", "options")
_COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = {
_COMPONENT_MAP: ClassVar[
dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent]
] = {
pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR,
pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH,
pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY,
2 changes: 1 addition & 1 deletion python/cudf_polars/cudf_polars/dsl/expressions/literal.py
@@ -58,7 +58,7 @@ def collect_agg(self, *, depth: int) -> AggInfo:
class LiteralColumn(Expr):
__slots__ = ("value",)
_non_child = ("dtype", "value")
value: pa.Array[Any, Any]
value: pa.Array[Any]

def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
self.dtype = dtype
2 changes: 1 addition & 1 deletion python/cudf_polars/cudf_polars/dsl/ir.py
@@ -517,7 +517,7 @@ def do_evaluate(
# Mask must have been applied.
return df
elif typ == "ndjson":
json_schema: list[tuple[str, str, list]] = [
json_schema: list[plc.io.json.NameAndType] = [
(name, typ, []) for name, typ in schema.items()
]
plc_tbl_w_meta = plc.io.json.read_json(