From d1d44209673f3b789bcbc8dc628231d4ace76bef Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:24:49 -0800 Subject: [PATCH] Move cudf._lib.search to cudf.core._internals (#17411) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17411 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/search.pyx | 68 ---------------------- python/cudf/cudf/core/_internals/search.py | 56 ++++++++++++++++++ python/cudf/cudf/core/column/column.py | 23 +++++++- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/numerical.py | 4 +- python/cudf/cudf/core/column/string.py | 10 +--- python/cudf/cudf/core/frame.py | 8 ++- python/cudf/cudf/core/index.py | 2 +- 10 files changed, 87 insertions(+), 88 deletions(-) delete mode 100644 python/cudf/cudf/_lib/search.pyx create mode 100644 python/cudf/cudf/core/_internals/search.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 8a521f19350..1c2b24d2391 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -36,7 +36,6 @@ set(cython_sources rolling.pyx round.pyx scalar.pyx - search.pyx sort.pyx stream_compaction.pyx string_casting.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 27bb486f55b..13d05033c11 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -22,7 +22,6 @@ reshape, rolling, round, - search, sort, stream_compaction, string_casting, diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx deleted file mode 100644 index 8108361052b..00000000000 --- a/python/cudf/cudf/_lib/search.pyx +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf - - -@acquire_spill_lock() -def search_sorted( - list source, list values, side, ascending=True, na_position="last" -): - """Find indices where elements should be inserted to maintain order - - Parameters - ---------- - source : list of columns - List of columns to search in - values : List of columns - List of value columns to search for - side : str {'left', 'right'} optional - If 'left', the index of the first suitable location is given. - If 'right', return the last such index - """ - # Note: We are ignoring index columns here - column_order = [ - pylibcudf.types.Order.ASCENDING - if ascending - else pylibcudf.types.Order.DESCENDING - ] * len(source) - null_precedence = [ - pylibcudf.types.NullOrder.AFTER - if na_position == "last" - else pylibcudf.types.NullOrder.BEFORE - ] * len(source) - - func = getattr( - pylibcudf.search, - "lower_bound" if side == "left" else "upper_bound", - ) - return Column.from_pylibcudf( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - column_order, - null_precedence, - ) - ) - - -@acquire_spill_lock() -def contains(Column haystack, Column needles): - """Check whether column contains multiple values - - Parameters - ---------- - column : NumericalColumn - Column to search in - needles : - A column of values to search for - """ - return Column.from_pylibcudf( - pylibcudf.search.contains( - haystack.to_pylibcudf(mode="read"), - needles.to_pylibcudf(mode="read"), - ) - ) diff --git a/python/cudf/cudf/core/_internals/search.py b/python/cudf/cudf/core/_internals/search.py new file mode 100644 index 00000000000..a0ffe078de9 --- /dev/null +++ b/python/cudf/cudf/core/_internals/search.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal + +import pylibcudf as plc + +from cudf._lib.column import Column +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + + +@acquire_spill_lock() +def search_sorted( + source: list[ColumnBase], + values: list[ColumnBase], + side: Literal["left", "right"], + ascending: bool = True, + na_position: Literal["first", "last"] = "last", +) -> ColumnBase: + """Find indices where elements should be inserted to maintain order + + Parameters + ---------- + source : list of columns + List of columns to search in + values : List of columns + List of value columns to search for + side : str {'left', 'right'} optional + If 'left', the index of the first suitable location is given. + If 'right', return the last such index + """ + # Note: We are ignoring index columns here + column_order = [ + plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING + ] * len(source) + null_precedence = [ + plc.types.NullOrder.AFTER + if na_position == "last" + else plc.types.NullOrder.BEFORE + ] * len(source) + + func = getattr( + plc.search, + "lower_bound" if side == "left" else "upper_bound", + ) + return Column.from_pylibcudf( + func( + plc.Table([col.to_pylibcudf(mode="read") for col in source]), + plc.Table([col.to_pylibcudf(mode="read") for col in values]), + column_order, + null_precedence, + ) + ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f6eaea4b783..53946be1c49 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -757,7 +757,7 @@ def indices_of( raise ValueError("value must be a scalar") else: value = as_column(value, dtype=self.dtype, length=1) - mask = libcudf.search.contains(value, self) + mask = value.contains(self) return apply_boolean_mask( [as_column(range(0, len(self)), dtype=size_type_dtype)], mask )[0] @@ -914,7 +914,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: # self.isin(other) asks "which values of self are in other" # contains(haystack, needles) asks "which needles are in haystack" # hence this argument ordering. - result = libcudf.search.contains(rhs, self) + result = rhs.contains(self) if self.null_count > 0: # If one of the needles is null, then the result contains # nulls, these nulls should be replaced by whether or not the @@ -956,6 +956,23 @@ def is_monotonic_decreasing(self) -> bool: [self], [False], None ) + def contains(self, other: ColumnBase) -> ColumnBase: + """ + Check whether column contains multiple values. + + Parameters + ---------- + other : Column + A column of values to search for + """ + with acquire_spill_lock(): + return Column.from_pylibcudf( + plc.search.contains( + self.to_pylibcudf(mode="read"), + other.to_pylibcudf(mode="read"), + ) + ) + def sort_values( self: Self, ascending: bool = True, @@ -1190,7 +1207,7 @@ def searchsorted( raise ValueError( "Column searchsorted expects values to be column of same dtype" ) - return libcudf.search.search_sorted( + return cudf.core._internals.search.search_sorted( # type: ignore[return-value] [self], [value], side=side, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 16124cf0a7d..b40ea4eedd3 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -18,9 +18,9 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.search import search_sorted from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals import unary +from cudf.core._internals.search import search_sorted from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, get_compatible_timezone, diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 36d1bdb45b6..a7538c1c947 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -111,8 +111,8 @@ def __contains__(self, item: ScalarLike) -> bool: except (TypeError, ValueError): return False # TODO: Use `scalar`-based `contains` wrapper - return libcudf.search.contains( - self, column.as_column([search_item], dtype=self.dtype) + return self.contains( + column.as_column([search_item], dtype=self.dtype) ).any() def indices_of(self, value: ScalarLike) -> NumericalColumn: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 3d70b01b7e4..a9ab2d373fd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5857,14 +5857,8 @@ def sum( return result_col def __contains__(self, item: ScalarLike) -> bool: - if is_scalar(item): - return True in libcudf.search.contains( - self, column.as_column([item], dtype=self.dtype) - ) - else: - return True in libcudf.search.contains( - self, column.as_column(item, dtype=self.dtype) - ) + other = [item] if is_scalar(item) else item + return self.contains(column.as_column(other, dtype=self.dtype)).any() def as_numerical_column( self, dtype: Dtype diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 30868924bcd..838fde260df 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -8,8 +8,6 @@ from collections import abc from typing import TYPE_CHECKING, Any, Literal -# TODO: The `numpy` import is needed for typing purposes during doc builds -# only, need to figure out why the `np` alias is insufficient then remove. import cupy import numpy import numpy as np @@ -19,9 +17,13 @@ import pylibcudf as plc import cudf + +# TODO: The `numpy` import is needed for typing purposes during doc builds +# only, need to figure out why the `np` alias is insufficient then remove. from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals.search import search_sorted from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -1302,7 +1304,7 @@ def searchsorted( for val, common_dtype in zip(values, common_dtype_list) ] - outcol = libcudf.search.search_sorted( + outcol = search_sorted( sources, values, side, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1b90e9f9df0..0a2b15a16b9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.filling import sequence -from cudf._lib.search import search_sorted from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( @@ -32,6 +31,7 @@ ) from cudf.core._base_index import BaseIndex, _return_get_indexer_result from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals.search import search_sorted from cudf.core.column import ( CategoricalColumn, ColumnBase,