From 7d80ee40fdd0f07589b7cff12a207e991eec92fc Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 24 May 2024 16:19:01 -0700 Subject: [PATCH 01/10] Migrate quantile.pxd to pylibcudf --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../api_docs/pylibcudf/quantiles.rst | 6 + .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + python/cudf/cudf/_lib/pylibcudf/quantiles.pxd | 23 +++ python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 145 ++++++++++++++++++ python/cudf/cudf/_lib/pylibcudf/types.pxd | 8 + python/cudf/cudf/_lib/quantiles.pyx | 105 ++++--------- .../cudf/cudf/core/column/numerical_base.py | 3 + python/cudf/cudf/core/frame.py | 2 + .../cudf/cudf/pylibcudf_tests/common/utils.py | 6 + python/cudf/cudf/pylibcudf_tests/conftest.py | 29 ++++ .../cudf/pylibcudf_tests/test_quantiles.py | 141 +++++++++++++++++ 14 files changed, 395 insertions(+), 79 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/quantiles.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/quantiles.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_quantiles.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 8cad95f61ae..57f487c6921 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -19,6 +19,7 @@ This page provides API documentation for pylibcudf. join lists merge + quantiles reduce rolling scalar diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst new file mode 100644 index 00000000000..3417c1ff59d --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst @@ -0,0 +1,6 @@ +========= +quantiles +========= + +.. automodule:: cudf._lib.pylibcudf.quantiles + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index efc978fc6d0..8de56dd6f26 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -25,6 +25,7 @@ set(cython_sources join.pyx lists.pyx merge.pyx + quantiles.pyx reduce.pyx replace.pyx rolling.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 5adefa5fd93..4ca60be4cac 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -11,6 +11,7 @@ from . cimport ( join, lists, merge, + quantiles, reduce, replace, rolling, @@ -44,6 +45,7 @@ __all__ = [ "join", "lists", "merge", + "quantiles", "reduce", "replace", "rolling", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 89f874f5fa5..4e4ebe68167 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -11,6 +11,7 @@ join, lists, merge, + quantiles, reduce, replace, rolling, @@ -44,6 +45,7 @@ "join", "lists", "merge", + "quantiles", "reduce", "replace", "rolling", diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd new file mode 100644 index 00000000000..aae96c3f0e0 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from .column cimport Column +from .table cimport Table +from .types cimport interpolation, sorted + + +cpdef Column quantile( + Column input, + const double[:] q, + interpolation interp = *, + Column ordered_indices = *, + bint exact = * +) + +cpdef Table quantiles( + Table input, + const double[:] q, + interpolation interp = *, + sorted is_input_sorted = *, + list column_order = *, + list null_precedence = *, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx new file mode 100644 index 00000000000..6584c9fc9de --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx @@ -0,0 +1,145 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view +from cudf._lib.pylibcudf.libcudf.quantiles cimport ( + quantile as cpp_quantile, + quantiles as cpp_quantiles, +) +from cudf._lib.pylibcudf.libcudf.table.table cimport table +from cudf._lib.pylibcudf.libcudf.types cimport order, sorted + +from .column cimport Column +from .table cimport Table +from .types cimport interpolation + + +cpdef Column quantile( + Column input, + const double[:] q, + interpolation interp = interpolation.LINEAR, + Column ordered_indices = None, + bool exact=True +): + """Computes quantiles with interpolation. + + Computes the specified quantiles by interpolating values between which they lie, + using the interpolation strategy specified in interp. + + Parameters + ---------- + q: array-like that implements buffer-protocol + The quantiles to calculate in range [0,1] + interp: Interpolation, default Interpolation.LINEAR + The strategy used to select between values adjacent to a specified quantile. + ordered_indices: Column, default empty column + The column containing the sorted order of input. + + If empty, all input values are used in existing order. + Indices must be in range [0, input.size()), but are not required to be unique. + Values not indexed by this column will be ignored. + exact: bool, default True + Returns doubles if True. Otherwise, returns same type as input + + Returns + ------- + Column + A Column containing specified quantiles, with nulls for indeterminable values + """ + cdef: + unique_ptr[column] c_result + vector[double] q_vec + column_view ordered_indices_view + + if ordered_indices is None: + ordered_indices_view = column_view() + else: + ordered_indices_view = ordered_indices.view() + + # Copy from memoryview into vector + if len(q) > 0: + q_vec.assign(&q[0], &q[0] + len(q)) + + with nogil: + c_result = move( + cpp_quantile( + input.view(), + q_vec, + interp, + ordered_indices_view, + exact, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Table quantiles( + Table input, + const double[:] q, + interpolation interp = interpolation.NEAREST, + sorted is_input_sorted = sorted.NO, + # cython-lint complains that this a dangerous default value but + # we don't modify these parameters, and so should be good to go + list column_order = [], # no-cython-lint + list null_precedence = [], # no-cython-lint +): + """Computes row quantiles with interpolation. + + Computes the specified quantiles by retrieving the row corresponding to the + specified quantiles. In the event a quantile lies in between rows, the specified + interpolation strategy is used to pick between the rows. + + Parameters + ---------- + q: array-like that implements buffer-protocol + The quantiles to calculate in range [0,1] + interp: Interpolation, default Interpolation.LINEAR + The strategy used to select between values adjacent to a specified quantile. + + Must be a non-arithmetic interpolation strategy + (i.e. one of + {`Interpolation.HIGHER`, `Interpolation.LOWER`, `Interpolation.NEAREST`}) + is_input_sorted: Sorted, default Sorted.NO + Whether the input table has been pre-sorted or not. + column_order: list, default [] + A list of `Order` enums, indicating the desired sort order for each column. + + Ignored if `is_input_sorted` is `Sorted.YES` + null_precedence: list, default [] + A list of `NullOrder` enums, indicating how nulls should be sorted. + + Ignored if `is_input_sorted` is `Sorted.YES` + + Returns + ------- + Column + A Column containing specified quantiles, with nulls for indeterminable values + """ + cdef: + unique_ptr[table] c_result + vector[double] q_vec + vector[order] column_order_vec = column_order + vector[null_order] null_precedence_vec = null_precedence + + # Copy from memoryview into vector + q_vec.assign(&q[0], &q[0] + len(q)) + + with nogil: + c_result = move( + cpp_quantiles( + input.view(), + q_vec, + interp, + is_input_sorted, + column_order_vec, + null_precedence_vec, + ) + ) + + return Table.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index e54a259819e..2e0eaa02d7f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -17,6 +17,14 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( type_id, ) +from cudf._lib.pylibcudf.libcudf.types import \ + interpolation as Interpolation # no-cython-lint +from cudf._lib.pylibcudf.libcudf.types import \ + null_order as NullOrder # no-cython-lint +from cudf._lib.pylibcudf.libcudf.types import \ + sorted as Sorted # no-cython-lint +from cudf._lib.pylibcudf.libcudf.types import order as Order # no-cython-lint + cdef class DataType: cdef data_type c_obj diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 3d20454a7ce..bd250f2ab45 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -3,122 +3,69 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector from cudf._lib.column cimport Column from cudf._lib.types cimport ( underlying_type_t_interpolation, - underlying_type_t_null_order, - underlying_type_t_order, underlying_type_t_sorted, ) from cudf._lib.types import Interpolation -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.quantiles cimport ( - quantile as cpp_quantile, - quantiles as cpp_quantile_table, -) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport ( - interpolation, - null_order, - order, - sorted, -) -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns +from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted +from cudf._lib.utils cimport columns_from_pylibcudf_table + +import cudf._lib.pylibcudf as plc @acquire_spill_lock() def quantile( Column input, - object q, + double[:] q, str interp, Column ordered_indices, bool exact, - ): - cdef column_view c_input = input.view() - cdef column_view c_ordered_indices = ( - column_view() if ordered_indices is None - else ordered_indices.view() - ) cdef interpolation c_interp = ( Interpolation[interp.upper()] ) - cdef bool c_exact = exact - cdef vector[double] c_q - c_q.reserve(len(q)) - - for value in q: - c_q.push_back(value) - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_quantile( - c_input, - c_q, - c_interp, - c_ordered_indices, - c_exact, - ) + return Column.from_pylibcudf( + plc.quantiles.quantile( + input.to_pylibcudf(mode="read"), + q, + c_interp, + ordered_indices.to_pylibcudf(mode="read"), + exact ) - - return Column.from_unique_ptr(move(c_result)) + ) def quantile_table( list source_columns, - vector[double] q, + double[:] q, object interp, object is_input_sorted, list column_order, list null_precedence, ): - cdef table_view c_input = table_view_from_columns(source_columns) - cdef vector[double] c_q = q + cdef interpolation c_interp = ( interp ) cdef sorted c_is_input_sorted = ( is_input_sorted ) - cdef vector[order] c_column_order - cdef vector[null_order] c_null_precedence - - c_column_order.reserve(len(column_order)) - c_null_precedence.reserve(len(null_precedence)) - for value in column_order: - c_column_order.push_back( - ( value) + return columns_from_pylibcudf_table( + plc.quantiles.quantiles( + plc.Table([ + c.to_pylibcudf(mode="read") for c in source_columns + ]), + q, + c_interp, + c_is_input_sorted, + column_order, + null_precedence ) - - for value in null_precedence: - c_null_precedence.push_back( - ( value) - ) - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_quantile_table( - c_input, - c_q, - c_interp, - c_is_input_sorted, - c_column_order, - c_null_precedence, - ) - ) - - return columns_from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 541c32a2520..ce1f1cdf521 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -116,6 +116,9 @@ def quantile( indices = libcudf.sort.order_by( [self], [True], "first", stable=True ).slice(self.null_count, len(self)) + + q = np.asarray(q, dtype="float64") + result = libcudf.quantiles.quantile( self, q, interpolation, indices, exact ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7b561906afb..d8b1fc11bc7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -833,6 +833,8 @@ def _quantile_table( libcudf.types.NullOrder[key] for key in null_precedence ] + q = np.asarray(q, dtype="float64") + return self._from_columns_like_self( libcudf.quantiles.quantile_table( [*self._columns], diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 6636ab9e5f8..d5f08ca25e3 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -36,6 +36,12 @@ def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None: if isinstance(pa_array, pa.ChunkedArray): pa_array = pa_array.combine_chunks() + print(plc_pa) + print(pa_array) + + print(plc_pa.type) + print(pa_array.type) + assert plc_pa.equals(pa_array) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 6d8284fb3db..60b9cb553f7 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -7,6 +7,8 @@ import pyarrow as pa import pytest +import cudf._lib.pylibcudf as plc + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) from utils import DEFAULT_STRUCT_TESTING_TYPE @@ -29,3 +31,30 @@ ) def pa_type(request): return request.param + + +@pytest.fixture( + scope="session", + params=[ + pa.int64(), + pa.float64(), + pa.uint64(), + ], +) +def numeric_pa_type(request): + return request.param + + +@pytest.fixture( + scope="session", params=[opt.value for opt in plc.types.Interpolation] +) +def interp_opt(request): + return request.param + + +@pytest.fixture( + scope="session", + params=[opt.value for opt in plc.types.Sorted], +) +def sorted_opt(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py new file mode 100644 index 00000000000..4a77b0b9d42 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py @@ -0,0 +1,141 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import assert_column_eq, assert_table_eq + +import cudf._lib.pylibcudf as plc + +# Map pylibcudf interpolation options to pyarrow options +interp_mapping = { + plc.types.Interpolation.LINEAR: "linear", + plc.types.Interpolation.LOWER: "lower", + plc.types.Interpolation.HIGHER: "higher", + plc.types.Interpolation.MIDPOINT: "midpoint", + plc.types.Interpolation.NEAREST: "nearest", +} + + +@pytest.fixture(scope="module", params=[[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]) +def pa_col_data(request, numeric_pa_type): + return pa.array(request.param, type=numeric_pa_type) + + +@pytest.fixture(scope="module") +def plc_col_data(pa_col_data): + return plc.interop.from_arrow(pa_col_data) + + +@pytest.fixture( + scope="module", + params=[ + { + "arrays": [[1, 2, 3, 5, 4], [5.0, 6.0, 8.0, 7.0, 9.0]], + "schema": pa.schema( + [ + ("a", pa.int64()), + ("b", pa.int64()), + ] + ), + }, + { + "arrays": [ + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + [1, 2.0, 2.2, 2.3, 2.4, None, None, 3.5, 4.5, 5.5], + ], + "schema": pa.schema( + [ + ("a", pa.int64()), + ("b", pa.float64()), + ] + ), + }, + ], +) +def plc_tbl_data(request): + return plc.interop.from_arrow(pa.Table.from_arrays(**request.param)) + + +@pytest.mark.parametrize("q", [[0], [0.5], [0.1, 0.5, 0.7, 0.9]]) +@pytest.mark.parametrize("exact", [True, False]) +def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): + q = np.asarray(q, dtype="float64") + + ordered_indices = plc.interop.from_arrow( + pc.cast(pc.sort_indices(pa_col_data), pa.int32()) + ) + res = plc.quantiles.quantile( + plc_col_data, q, interp_opt, ordered_indices, exact + ) + + pa_interp_opt = interp_mapping[interp_opt] + + if exact: + pa_col_data = pc.cast(pa_col_data, pa.float64()) + + exp = pc.quantile(pa_col_data, q=q, interpolation=pa_interp_opt) + + if not exact: + exp = pc.cast(exp, pa_col_data.type, safe=False) + + assert_column_eq(res, exp) + + +@pytest.mark.parametrize( + "q", [[0.1], [0.2], [0.3], [0.4], [0.5], [0.1, 0.5, 0.7, 0.9]] +) +@pytest.mark.parametrize( + "column_order", [[plc.types.Order.ASCENDING, plc.types.Order.ASCENDING]] +) +@pytest.mark.parametrize( + "null_precedence", + [ + [plc.types.NullOrder.BEFORE, plc.types.NullOrder.BEFORE], + [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], + ], +) +def test_quantiles( + plc_tbl_data, interp_opt, q, sorted_opt, column_order, null_precedence +): + if interp_opt in { + plc.types.Interpolation.LINEAR, + plc.types.Interpolation.MIDPOINT, + }: + pytest.skip( + "interp cannot be an arithmetic interpolation strategy for quantiles" + ) + + q = np.asarray(q, dtype="float64") + + pa_tbl_data = plc.interop.to_arrow(plc_tbl_data) + # TODO: why are column names not preserved by pylibcudf + pa_tbl_data = pa_tbl_data.rename_columns(["a", "b"]) + + res = plc.quantiles.quantiles( + plc_tbl_data, q, interp_opt, sorted_opt, column_order, null_precedence + ) + + pa_interp_opt = interp_mapping[interp_opt] + + if sorted_opt == plc.types.Sorted.NO: + order_mapper = { + plc.types.Order.ASCENDING: "ascending", + plc.types.Order.DESCENDING: "descending", + } + pa_tbl_data = pa_tbl_data.sort_by( + [ + (name, order_mapper[order]) + for name, order in zip(pa_tbl_data.column_names, column_order) + ], + null_placement="at_start" + if null_precedence[0] == plc.types.NullOrder.BEFORE + else "at_end", + ) + row_idxs = pc.quantile( + np.arange(0, len(pa_tbl_data)), q=q, interpolation=pa_interp_opt + ) + exp = pa_tbl_data.take(row_idxs) + + assert_table_eq(res, exp) From 333f44cb70c88c414cf4c9da0998b2c211ba4a3a Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 28 May 2024 16:23:13 -0700 Subject: [PATCH 02/10] fix missing import --- python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx index 6584c9fc9de..ac90d7311fa 100644 --- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx @@ -12,7 +12,7 @@ from cudf._lib.pylibcudf.libcudf.quantiles cimport ( quantiles as cpp_quantiles, ) from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport order, sorted +from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, sorted from .column cimport Column from .table cimport Table From 3e99f980b9a37cc82e5c161fed590296440d2653 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 3 Jun 2024 23:57:36 +0000 Subject: [PATCH 03/10] address some comments --- cpp/src/quantiles/quantiles.cu | 4 +- cpp/tests/quantiles/quantiles_test.cpp | 9 ++- python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 7 +- python/cudf/cudf/_lib/pylibcudf/types.pxd | 8 --- python/cudf/cudf/_lib/pylibcudf/types.pyx | 8 +++ python/cudf/cudf/pylibcudf_tests/conftest.py | 4 +- .../cudf/pylibcudf_tests/test_quantiles.py | 68 ++++++++++++------- 7 files changed, 70 insertions(+), 38 deletions(-) diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index c0f536536ce..af3bda2e62e 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -34,6 +34,7 @@ #include #include +#include #include namespace cudf { @@ -78,7 +79,8 @@ std::unique_ptr quantiles(table_view const& input, CUDF_EXPECTS(interp == interpolation::HIGHER || interp == interpolation::LOWER || interp == interpolation::NEAREST, - "multi-column quantiles require a non-arithmetic interpolation strategy."); + "multi-column quantiles require a non-arithmetic interpolation strategy.", + std::invalid_argument); CUDF_EXPECTS(input.num_rows() > 0, "multi-column quantiles require at least one input row."); diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp index 5b7b6dd2718..b7faa20e8c1 100644 --- a/cpp/tests/quantiles/quantiles_test.cpp +++ b/cpp/tests/quantiles/quantiles_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,8 @@ #include #include +#include + template struct QuantilesTest : public cudf::test::BaseFixture {}; @@ -104,9 +106,10 @@ TYPED_TEST(QuantilesTest, TestMultiColumnArithmeticInterpolation) cudf::test::fixed_width_column_wrapper input_b({}); auto input = cudf::table_view({input_a}); - EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::LINEAR), cudf::logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::LINEAR), std::invalid_argument); - EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::MIDPOINT), cudf::logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::MIDPOINT), + std::invalid_argument); } TYPED_TEST(QuantilesTest, TestMultiColumnUnsorted) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx index ac90d7311fa..04c1ef2160b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx @@ -33,6 +33,8 @@ cpdef Column quantile( Parameters ---------- + input: Column + The Column to calculate quantiles on. q: array-like that implements buffer-protocol The quantiles to calculate in range [0,1] interp: Interpolation, default Interpolation.LINEAR @@ -97,6 +99,8 @@ cpdef Table quantiles( Parameters ---------- + input: Table + The Table to calculate row quantiles on. q: array-like that implements buffer-protocol The quantiles to calculate in range [0,1] interp: Interpolation, default Interpolation.LINEAR @@ -128,7 +132,8 @@ cpdef Table quantiles( vector[null_order] null_precedence_vec = null_precedence # Copy from memoryview into vector - q_vec.assign(&q[0], &q[0] + len(q)) + if len(q) > 0: + q_vec.assign(&q[0], &q[0] + len(q)) with nogil: c_result = move( diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index 2e0eaa02d7f..e54a259819e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -17,14 +17,6 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( type_id, ) -from cudf._lib.pylibcudf.libcudf.types import \ - interpolation as Interpolation # no-cython-lint -from cudf._lib.pylibcudf.libcudf.types import \ - null_order as NullOrder # no-cython-lint -from cudf._lib.pylibcudf.libcudf.types import \ - sorted as Sorted # no-cython-lint -from cudf._lib.pylibcudf.libcudf.types import order as Order # no-cython-lint - cdef class DataType: cdef data_type c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index a5248ad0a1f..821d599901f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -14,6 +14,14 @@ from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder # no-cyth from cudf._lib.pylibcudf.libcudf.types import order as Order # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import sorted as Sorted # no-cython-lint, isort:skip +from cudf._lib.pylibcudf.libcudf.types import \ + interpolation as Interpolation # no-cython-lint +from cudf._lib.pylibcudf.libcudf.types import \ + null_order as NullOrder # no-cython-lint +from cudf._lib.pylibcudf.libcudf.types import \ + sorted as Sorted # no-cython-lint +from cudf._lib.pylibcudf.libcudf.types import order as Order # no-cython-lint + cdef class DataType: """Indicator for the logical data type of an element in a column. diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 60b9cb553f7..f3c6584ef8c 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -46,7 +46,7 @@ def numeric_pa_type(request): @pytest.fixture( - scope="session", params=[opt.value for opt in plc.types.Interpolation] + scope="session", params=[opt for opt in plc.types.Interpolation] ) def interp_opt(request): return request.param @@ -54,7 +54,7 @@ def interp_opt(request): @pytest.fixture( scope="session", - params=[opt.value for opt in plc.types.Sorted], + params=[opt for opt in plc.types.Sorted], ) def sorted_opt(request): return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py index effe0eaa60d..dc2e0343e4a 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py +++ b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py @@ -58,7 +58,7 @@ def plc_tbl_data(request): return plc.interop.from_arrow(pa.Table.from_arrays(**request.param)) -@pytest.mark.parametrize("q", [[0], [0.5], [0.1, 0.5, 0.7, 0.9]]) +@pytest.mark.parametrize("q", [[], [0], [0.5], [0.1, 0.5, 0.7, 0.9]]) @pytest.mark.parametrize("exact", [True, False]) def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): q = np.asarray(q, dtype="float64") @@ -75,7 +75,11 @@ def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): if exact: pa_col_data = pc.cast(pa_col_data, pa.float64()) - exp = pc.quantile(pa_col_data, q=q, interpolation=pa_interp_opt) + if len(q) > 0: + # pyarrow quantile doesn't support empty q + exp = pc.quantile(pa_col_data, q=q, interpolation=pa_interp_opt) + else: + exp = pa.array([], type=pa.float64()) if not exact: exp = pc.cast(exp, pa_col_data.type, safe=False) @@ -84,7 +88,7 @@ def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): @pytest.mark.parametrize( - "q", [[0.1], [0.2], [0.3], [0.4], [0.5], [0.1, 0.5, 0.7, 0.9]] + "q", [[], [0.1], [0.2], [0.3], [0.4], [0.5], [0.1, 0.5, 0.7, 0.9]] ) @pytest.mark.parametrize( "column_order", [[plc.types.Order.ASCENDING, plc.types.Order.ASCENDING]] @@ -109,33 +113,51 @@ def test_quantiles( q = np.asarray(q, dtype="float64") - pa_tbl_data = plc.interop.to_arrow(plc_tbl_data) - # TODO: why are column names not preserved by pylibcudf pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"]) res = plc.quantiles.quantiles( plc_tbl_data, q, interp_opt, sorted_opt, column_order, null_precedence ) - pa_interp_opt = interp_mapping[interp_opt] + if len(q) > 0: + # pyarrow quantile doesn't support empty q + pa_interp_opt = interp_mapping[interp_opt] - if sorted_opt == plc.types.Sorted.NO: - order_mapper = { - plc.types.Order.ASCENDING: "ascending", - plc.types.Order.DESCENDING: "descending", - } - pa_tbl_data = pa_tbl_data.sort_by( - [ - (name, order_mapper[order]) - for name, order in zip(pa_tbl_data.column_names, column_order) - ], - null_placement="at_start" - if null_precedence[0] == plc.types.NullOrder.BEFORE - else "at_end", + if sorted_opt == plc.types.Sorted.NO: + order_mapper = { + plc.types.Order.ASCENDING: "ascending", + plc.types.Order.DESCENDING: "descending", + } + pa_tbl_data = pa_tbl_data.sort_by( + [ + (name, order_mapper[order]) + for name, order in zip( + pa_tbl_data.column_names, column_order + ) + ], + null_placement="at_start" + if null_precedence[0] == plc.types.NullOrder.BEFORE + else "at_end", + ) + row_idxs = pc.quantile( + np.arange(0, len(pa_tbl_data)), q=q, interpolation=pa_interp_opt + ) + exp = pa_tbl_data.take(row_idxs) + else: + exp = pa.Table.from_arrays( + [[] for _ in range(len(pa_tbl_data.schema))], + schema=pa_tbl_data.schema, ) - row_idxs = pc.quantile( - np.arange(0, len(pa_tbl_data)), q=q, interpolation=pa_interp_opt - ) - exp = pa_tbl_data.take(row_idxs) assert_table_eq(exp, res) + + +@pytest.mark.parametrize( + "invalid_interp", + [plc.types.Interpolation.LINEAR, plc.types.Interpolation.MIDPOINT], +) +def test_quantiles_invalid_interp(plc_tbl_data, invalid_interp): + with pytest.raises(ValueError): + plc.quantiles.quantiles( + plc_tbl_data, q=np.array([0.1]), interp=invalid_interp + ) From 616e369e96d2d32fc467d0c8bdb59645c5b4c027 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 4 Jun 2024 22:26:05 +0000 Subject: [PATCH 04/10] don't use memoryview --- python/cudf/cudf/_lib/pylibcudf/quantiles.pxd | 5 +- python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 50 ++++++++++--------- python/cudf/cudf/_lib/pylibcudf/types.pyx | 8 --- 3 files changed, 29 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd index aae96c3f0e0..f560ad5b901 100644 --- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd @@ -1,4 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.vector cimport vector from .column cimport Column from .table cimport Table @@ -7,7 +8,7 @@ from .types cimport interpolation, sorted cpdef Column quantile( Column input, - const double[:] q, + vector[double] q, interpolation interp = *, Column ordered_indices = *, bint exact = * @@ -15,7 +16,7 @@ cpdef Column quantile( cpdef Table quantiles( Table input, - const double[:] q, + vector[double] q, interpolation interp = *, sorted is_input_sorted = *, list column_order = *, diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx index 04c1ef2160b..c459d289b75 100644 --- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx @@ -21,7 +21,7 @@ from .types cimport interpolation cpdef Column quantile( Column input, - const double[:] q, + vector[double] q, interpolation interp = interpolation.LINEAR, Column ordered_indices = None, bool exact=True @@ -48,6 +48,8 @@ cpdef Column quantile( exact: bool, default True Returns doubles if True. Otherwise, returns same type as input + For details, see :cpp:func:`quantile`. + Returns ------- Column @@ -55,7 +57,6 @@ cpdef Column quantile( """ cdef: unique_ptr[column] c_result - vector[double] q_vec column_view ordered_indices_view if ordered_indices is None: @@ -63,15 +64,11 @@ cpdef Column quantile( else: ordered_indices_view = ordered_indices.view() - # Copy from memoryview into vector - if len(q) > 0: - q_vec.assign(&q[0], &q[0] + len(q)) - with nogil: c_result = move( cpp_quantile( input.view(), - q_vec, + q, interp, ordered_indices_view, exact, @@ -83,13 +80,11 @@ cpdef Column quantile( cpdef Table quantiles( Table input, - const double[:] q, + vector[double] q, interpolation interp = interpolation.NEAREST, sorted is_input_sorted = sorted.NO, - # cython-lint complains that this a dangerous default value but - # we don't modify these parameters, and so should be good to go - list column_order = [], # no-cython-lint - list null_precedence = [], # no-cython-lint + list column_order = None, + list null_precedence = None, ): """Computes row quantiles with interpolation. @@ -101,7 +96,7 @@ cpdef Table quantiles( ---------- input: Table The Table to calculate row quantiles on. - q: array-like that implements buffer-protocol + q: array-like The quantiles to calculate in range [0,1] interp: Interpolation, default Interpolation.LINEAR The strategy used to select between values adjacent to a specified quantile. @@ -111,15 +106,22 @@ cpdef Table quantiles( {`Interpolation.HIGHER`, `Interpolation.LOWER`, `Interpolation.NEAREST`}) is_input_sorted: Sorted, default Sorted.NO Whether the input table has been pre-sorted or not. - column_order: list, default [] - A list of `Order` enums, indicating the desired sort order for each column. + column_order: list, default None + A list of :py:class:`~cudf._lib.pylibcudf.types.Order` enums, + indicating the desired sort order for each column. + By default, will sort all columns so that they are in ascending order. Ignored if `is_input_sorted` is `Sorted.YES` - null_precedence: list, default [] - A list of `NullOrder` enums, indicating how nulls should be sorted. + null_precedence: list, default None + A list of :py:class:`~cudf._lib.pylibcudf.types.NullOrder` enums, + indicating how nulls should be sorted. + By default, will sort all columns so that nulls appear before + all other elements. Ignored if `is_input_sorted` is `Sorted.YES` + For details, see :cpp:func:`quantiles`. + Returns ------- Column @@ -127,19 +129,19 @@ cpdef Table quantiles( """ cdef: unique_ptr[table] c_result - vector[double] q_vec - vector[order] column_order_vec = column_order - vector[null_order] null_precedence_vec = null_precedence + vector[order] column_order_vec + vector[null_order] null_precedence_vec - # Copy from memoryview into vector - if len(q) > 0: - q_vec.assign(&q[0], &q[0] + len(q)) + if column_order is not None: + column_order_vec = column_order + if null_precedence is not None: + null_precedence_vec = null_precedence with nogil: c_result = move( cpp_quantiles( input.view(), - q_vec, + q, interp, is_input_sorted, column_order_vec, diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index 821d599901f..a5248ad0a1f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -14,14 +14,6 @@ from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder # no-cyth from cudf._lib.pylibcudf.libcudf.types import order as Order # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import sorted as Sorted # no-cython-lint, isort:skip -from cudf._lib.pylibcudf.libcudf.types import \ - interpolation as Interpolation # no-cython-lint -from cudf._lib.pylibcudf.libcudf.types import \ - null_order as NullOrder # no-cython-lint -from cudf._lib.pylibcudf.libcudf.types import \ - sorted as Sorted # no-cython-lint -from cudf._lib.pylibcudf.libcudf.types import order as Order # no-cython-lint - cdef class DataType: """Indicator for the logical data type of an element in a column. From 77ddf7186e0f1e5f32af3eabf0003107e289f996 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 4 Jun 2024 22:29:19 +0000 Subject: [PATCH 05/10] another one --- python/cudf/cudf/_lib/pylibcudf/quantiles.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd index f560ad5b901..70ff135ca77 100644 --- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.vector cimport vector +from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted + from .column cimport Column from .table cimport Table -from .types cimport interpolation, sorted cpdef Column quantile( From 0259dbc28a94d33af5e85d62e4104134ea52308c Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 4 Jun 2024 23:02:39 +0000 Subject: [PATCH 06/10] More tests --- python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 2 +- python/cudf/cudf/_lib/quantiles.pyx | 105 ++++++++++---- .../cudf/cudf/core/column/numerical_base.py | 3 - python/cudf/cudf/core/frame.py | 2 - .../cudf/pylibcudf_tests/test_quantiles.py | 133 ++++++++++++++---- 5 files changed, 182 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx index c459d289b75..1f3b5ca82a5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx @@ -98,7 +98,7 @@ cpdef Table quantiles( The Table to calculate row quantiles on. q: array-like The quantiles to calculate in range [0,1] - interp: Interpolation, default Interpolation.LINEAR + interp: Interpolation, default Interpolation.NEAREST The strategy used to select between values adjacent to a specified quantile. Must be a non-arithmetic interpolation strategy diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index bd250f2ab45..3d20454a7ce 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -3,69 +3,122 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector from cudf._lib.column cimport Column from cudf._lib.types cimport ( underlying_type_t_interpolation, + underlying_type_t_null_order, + underlying_type_t_order, underlying_type_t_sorted, ) from cudf._lib.types import Interpolation -from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view +from cudf._lib.pylibcudf.libcudf.quantiles cimport ( + quantile as cpp_quantile, + quantiles as cpp_quantile_table, +) +from cudf._lib.pylibcudf.libcudf.table.table cimport table +from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from cudf._lib.pylibcudf.libcudf.types cimport ( + interpolation, + null_order, + order, + sorted, +) +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns @acquire_spill_lock() def quantile( Column input, - double[:] q, + object q, str interp, Column ordered_indices, bool exact, + ): + cdef column_view c_input = input.view() + cdef column_view c_ordered_indices = ( + column_view() if ordered_indices is None + else ordered_indices.view() + ) cdef interpolation c_interp = ( Interpolation[interp.upper()] ) + cdef bool c_exact = exact - return Column.from_pylibcudf( - plc.quantiles.quantile( - input.to_pylibcudf(mode="read"), - q, - c_interp, - ordered_indices.to_pylibcudf(mode="read"), - exact + cdef vector[double] c_q + c_q.reserve(len(q)) + + for value in q: + c_q.push_back(value) + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_quantile( + c_input, + c_q, + c_interp, + c_ordered_indices, + c_exact, + ) ) - ) + + return Column.from_unique_ptr(move(c_result)) def quantile_table( list source_columns, - double[:] q, + vector[double] q, object interp, object is_input_sorted, list column_order, list null_precedence, ): - + cdef table_view c_input = table_view_from_columns(source_columns) + cdef vector[double] c_q = q cdef interpolation c_interp = ( interp ) cdef sorted c_is_input_sorted = ( is_input_sorted ) + cdef vector[order] c_column_order + cdef vector[null_order] c_null_precedence + + c_column_order.reserve(len(column_order)) + c_null_precedence.reserve(len(null_precedence)) - return columns_from_pylibcudf_table( - plc.quantiles.quantiles( - plc.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]), - q, - c_interp, - c_is_input_sorted, - column_order, - null_precedence + for value in column_order: + c_column_order.push_back( + ( value) ) - ) + + for value in null_precedence: + c_null_precedence.push_back( + ( value) + ) + + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_quantile_table( + c_input, + c_q, + c_interp, + c_is_input_sorted, + c_column_order, + c_null_precedence, + ) + ) + + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index ce1f1cdf521..541c32a2520 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -116,9 +116,6 @@ def quantile( indices = libcudf.sort.order_by( [self], [True], "first", stable=True ).slice(self.null_count, len(self)) - - q = np.asarray(q, dtype="float64") - result = libcudf.quantiles.quantile( self, q, interpolation, indices, exact ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3f1a59c919e..d60c206ac24 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -833,8 +833,6 @@ def _quantile_table( libcudf.types.NullOrder[key] for key in null_precedence ] - q = np.asarray(q, dtype="float64") - return self._from_columns_like_self( libcudf.quantiles.quantile_table( [*self._columns], diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py index dc2e0343e4a..a5d332a7795 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py +++ b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py @@ -61,8 +61,6 @@ def plc_tbl_data(request): @pytest.mark.parametrize("q", [[], [0], [0.5], [0.1, 0.5, 0.7, 0.9]]) @pytest.mark.parametrize("exact", [True, False]) def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): - q = np.asarray(q, dtype="float64") - ordered_indices = plc.interop.from_arrow( pc.cast(pc.sort_indices(pa_col_data), pa.int32()) ) @@ -87,38 +85,23 @@ def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): assert_column_eq(exp, res) -@pytest.mark.parametrize( - "q", [[], [0.1], [0.2], [0.3], [0.4], [0.5], [0.1, 0.5, 0.7, 0.9]] -) -@pytest.mark.parametrize( - "column_order", [[plc.types.Order.ASCENDING, plc.types.Order.ASCENDING]] -) -@pytest.mark.parametrize( - "null_precedence", - [ - [plc.types.NullOrder.BEFORE, plc.types.NullOrder.BEFORE], - [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], - ], -) -def test_quantiles( - plc_tbl_data, interp_opt, q, sorted_opt, column_order, null_precedence +def _pyarrow_quantiles( + pa_tbl_data, + q, + interp_opt=plc.types.Interpolation.NEAREST, + sorted_opt=plc.types.Sorted.NO, + column_order=None, + null_precedence=None, ): - if interp_opt in { - plc.types.Interpolation.LINEAR, - plc.types.Interpolation.MIDPOINT, - }: - pytest.skip( - "interp cannot be an arithmetic interpolation strategy for quantiles" - ) - - q = np.asarray(q, dtype="float64") - - pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"]) + """ + The pyarrow equivalent of plc.quantiles.quantiles - res = plc.quantiles.quantiles( - plc_tbl_data, q, interp_opt, sorted_opt, column_order, null_precedence - ) + Takes the same arguments (except input should be a pyarrow table instead of + of a pylibcudf table) + NOTE: This function doesn't support having different null precedences because of + a lack of support in pyarrow. + """ if len(q) > 0: # pyarrow quantile doesn't support empty q pa_interp_opt = interp_mapping[interp_opt] @@ -128,6 +111,25 @@ def test_quantiles( plc.types.Order.ASCENDING: "ascending", plc.types.Order.DESCENDING: "descending", } + if null_precedence is None: + null_precedence = [plc.types.NullOrder.BEFORE] * len( + pa_tbl_data.columns + ) + if column_order is None: + column_order = [plc.types.Order.ASCENDING] * len( + pa_tbl_data.columns + ) + + if not all( + [ + null_prec == null_precedence[0] + for null_prec in null_precedence + ] + ): + raise NotImplementedError( + "Having varying null precendences is not implemented!" + ) + pa_tbl_data = pa_tbl_data.sort_by( [ (name, order_mapper[order]) @@ -148,6 +150,47 @@ def test_quantiles( [[] for _ in range(len(pa_tbl_data.schema))], schema=pa_tbl_data.schema, ) + return exp + + +@pytest.mark.parametrize( + "q", [[], [0.1], [0.2], [0.3], [0.4], [0.5], [0.1, 0.5, 0.7, 0.9]] +) +@pytest.mark.parametrize( + "column_order", [[plc.types.Order.ASCENDING, plc.types.Order.ASCENDING]] +) +@pytest.mark.parametrize( + "null_precedence", + [ + [plc.types.NullOrder.BEFORE, plc.types.NullOrder.BEFORE], + [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], + ], +) +def test_quantiles( + plc_tbl_data, interp_opt, q, sorted_opt, column_order, null_precedence +): + if interp_opt in { + plc.types.Interpolation.LINEAR, + plc.types.Interpolation.MIDPOINT, + }: + pytest.skip( + "interp cannot be an arithmetic interpolation strategy for quantiles" + ) + + pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"]) + + exp = _pyarrow_quantiles( + pa_tbl_data, + q=q, + interp_opt=interp_opt, + sorted_opt=sorted_opt, + column_order=column_order, + null_precedence=null_precedence, + ) + + res = plc.quantiles.quantiles( + plc_tbl_data, q, interp_opt, sorted_opt, column_order, null_precedence + ) assert_table_eq(exp, res) @@ -161,3 +204,31 @@ def test_quantiles_invalid_interp(plc_tbl_data, invalid_interp): plc.quantiles.quantiles( plc_tbl_data, q=np.array([0.1]), interp=invalid_interp ) + + +@pytest.mark.parametrize( + "q", + [[0.1], (0.1,), np.array([0.1])], +) +def test_quantile_q_array_like(pa_col_data, plc_col_data, q): + ordered_indices = plc.interop.from_arrow( + pc.cast(pc.sort_indices(pa_col_data), pa.int32()) + ) + res = plc.quantiles.quantile( + plc_col_data, + q=q, + ordered_indices=ordered_indices, + ) + exp = pc.quantile(pa_col_data, q=q) + assert_column_eq(exp, res) + + +@pytest.mark.parametrize( + "q", + [[0.1], (0.1,), np.array([0.1])], +) +def test_quantiles_q_array_like(plc_tbl_data, q): + res = plc.quantiles.quantiles(plc_tbl_data, q=q) + pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"]) + exp = _pyarrow_quantiles(pa_tbl_data, q=q) + assert_table_eq(exp, res) From d9766356d364610ca5f008593b6e37b1502a3678 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 5 Jun 2024 15:45:20 +0000 Subject: [PATCH 07/10] address comments and revert accidental change --- python/cudf/cudf/_lib/quantiles.pyx | 102 +++++++--------------------- 1 file changed, 25 insertions(+), 77 deletions(-) diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 3d20454a7ce..7b50c00919a 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -3,76 +3,43 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column from cudf._lib.types cimport ( underlying_type_t_interpolation, - underlying_type_t_null_order, - underlying_type_t_order, underlying_type_t_sorted, ) from cudf._lib.types import Interpolation -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.quantiles cimport ( - quantile as cpp_quantile, - quantiles as cpp_quantile_table, -) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport ( - interpolation, - null_order, - order, - sorted, -) -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns +from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted +from cudf._lib.utils cimport columns_from_pylibcudf_table + +import cudf._lib.pylibcudf as plc @acquire_spill_lock() def quantile( Column input, - object q, + vector[double] q, str interp, Column ordered_indices, bool exact, - ): - cdef column_view c_input = input.view() - cdef column_view c_ordered_indices = ( - column_view() if ordered_indices is None - else ordered_indices.view() - ) cdef interpolation c_interp = ( Interpolation[interp.upper()] ) - cdef bool c_exact = exact - - cdef vector[double] c_q - c_q.reserve(len(q)) - - for value in q: - c_q.push_back(value) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_quantile( - c_input, - c_q, - c_interp, - c_ordered_indices, - c_exact, - ) + return Column.from_pylibcudf( + plc.quantiles.quantile( + input.to_pylibcudf(mode="read"), + q, + c_interp, + ordered_indices.to_pylibcudf(mode="read"), + exact ) - - return Column.from_unique_ptr(move(c_result)) + ) def quantile_table( @@ -83,42 +50,23 @@ def quantile_table( list column_order, list null_precedence, ): - cdef table_view c_input = table_view_from_columns(source_columns) - cdef vector[double] c_q = q + cdef interpolation c_interp = ( interp ) cdef sorted c_is_input_sorted = ( is_input_sorted ) - cdef vector[order] c_column_order - cdef vector[null_order] c_null_precedence - - c_column_order.reserve(len(column_order)) - c_null_precedence.reserve(len(null_precedence)) - - for value in column_order: - c_column_order.push_back( - ( value) - ) - for value in null_precedence: - c_null_precedence.push_back( - ( value) + return columns_from_pylibcudf_table( + plc.quantiles.quantiles( + plc.Table([ + c.to_pylibcudf(mode="read") for c in source_columns + ]), + q, + c_interp, + c_is_input_sorted, + column_order, + null_precedence ) - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_quantile_table( - c_input, - c_q, - c_interp, - c_is_input_sorted, - c_column_order, - c_null_precedence, - ) - ) - - return columns_from_unique_ptr(move(c_result)) + ) From 17169c9ed75f679d4e8d30b08b531397a83ebf70 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:52:03 -0700 Subject: [PATCH 08/10] Update quantiles.pyx --- python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx index 1f3b5ca82a5..b317966846e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx @@ -107,13 +107,13 @@ cpdef Table quantiles( is_input_sorted: Sorted, default Sorted.NO Whether the input table has been pre-sorted or not. column_order: list, default None - A list of :py:class:`~cudf._lib.pylibcudf.types.Order` enums, + A list of :py:class:`~.types.Order` enums, indicating the desired sort order for each column. By default, will sort all columns so that they are in ascending order. Ignored if `is_input_sorted` is `Sorted.YES` null_precedence: list, default None - A list of :py:class:`~cudf._lib.pylibcudf.types.NullOrder` enums, + A list of :py:class:`~.types.NullOrder` enums, indicating how nulls should be sorted. By default, will sort all columns so that nulls appear before all other elements. From 026964a3b72b605cc6b0c9e903455601ce758761 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 5 Jun 2024 18:42:54 +0000 Subject: [PATCH 09/10] Revert "Update quantiles.pyx" This reverts commit 17169c9ed75f679d4e8d30b08b531397a83ebf70. --- python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx index b317966846e..1f3b5ca82a5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx @@ -107,13 +107,13 @@ cpdef Table quantiles( is_input_sorted: Sorted, default Sorted.NO Whether the input table has been pre-sorted or not. column_order: list, default None - A list of :py:class:`~.types.Order` enums, + A list of :py:class:`~cudf._lib.pylibcudf.types.Order` enums, indicating the desired sort order for each column. By default, will sort all columns so that they are in ascending order. Ignored if `is_input_sorted` is `Sorted.YES` null_precedence: list, default None - A list of :py:class:`~.types.NullOrder` enums, + A list of :py:class:`~cudf._lib.pylibcudf.types.NullOrder` enums, indicating how nulls should be sorted. By default, will sort all columns so that nulls appear before all other elements. From c45d183c6f94fbc1faf5b23f0d5bb07f3609d46c Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 5 Jun 2024 15:39:34 -0700 Subject: [PATCH 10/10] don't reference --- python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx index 1f3b5ca82a5..c1f0e30ccd3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx @@ -107,13 +107,13 @@ cpdef Table quantiles( is_input_sorted: Sorted, default Sorted.NO Whether the input table has been pre-sorted or not. column_order: list, default None - A list of :py:class:`~cudf._lib.pylibcudf.types.Order` enums, + A list of `Order` enums, indicating the desired sort order for each column. By default, will sort all columns so that they are in ascending order. Ignored if `is_input_sorted` is `Sorted.YES` null_precedence: list, default None - A list of :py:class:`~cudf._lib.pylibcudf.types.NullOrder` enums, + A list of `NullOrder` enums, indicating how nulls should be sorted. By default, will sort all columns so that nulls appear before all other elements.