diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 50061f6e468..f6d9c8c404c 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -3,23 +3,9 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr -from libcpp.utility cimport move from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) -from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( - distinct as cpp_distinct, -) -from cudf._lib.pylibcudf.libcudf.types cimport ( - nan_equality, - null_equality, - null_order, - size_type, -) +from cudf._lib.pylibcudf.libcudf.types cimport null_order, size_type from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib import pylibcudf @@ -47,31 +33,13 @@ def explode_outer(list source_columns, int explode_column_idx): @acquire_spill_lock() def distinct(Column col, bool nulls_equal, bool nans_all_equal): - """ - nulls_equal == True indicates that libcudf should treat any two nulls as - equal, and as unequal otherwise. - nans_all_equal == True indicates that libcudf should treat any two - elements from {-nan, +nan} as equal, and as unequal otherwise. 
- """ - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) - ) - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL - ) - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_distinct(list_view.get()[0], - c_nulls_equal, - c_nans_equal) + return Column.from_pylibcudf( + pylibcudf.lists.distinct( + col.to_pylibcudf(mode="read"), + nulls_equal, + nans_all_equal, ) - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd index 22b91df7192..b1fcf7800b0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd @@ -11,8 +11,13 @@ from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality cdef extern from "cudf/lists/stream_compaction.hpp" \ namespace "cudf::lists" nogil: + cdef unique_ptr[column] apply_boolean_mask( + const lists_column_view& lists_column, + const lists_column_view& boolean_mask, + ) except + + cdef unique_ptr[column] distinct( - const lists_column_view lists_column, + const lists_column_view& lists_column, null_equality nulls_equal, nan_equality nans_equal ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 4e2406c2aea..17619b489d2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -47,3 +47,7 @@ cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*) cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool 
nans_equal=*) + +cpdef Column apply_boolean_mask(Column, Column) + +cpdef Column distinct(Column, bool, bool) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 7555c8c6970..c944fc35800 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -29,6 +29,10 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport ( sort_lists as cpp_sort_lists, stable_sort_lists as cpp_stable_sort_lists, ) +from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( + apply_boolean_mask as cpp_apply_boolean_mask, + distinct as cpp_distinct, +) from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport ( nan_equality, @@ -614,3 +618,70 @@ cpdef Column union_distinct( c_nans_equal, )) return Column.from_libcudf(move(c_result)) + + +cpdef Column apply_boolean_mask(Column input, Column boolean_mask): + """Filters elements in each row of the input lists column using a boolean mask + + For details, see :cpp:func:`apply_boolean_mask`. + + Parameters + ---------- + input : Column + The input column. + boolean_mask : Column + The boolean mask. + + Returns + ------- + Column + A Column of filtered elements based upon the boolean mask. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + cdef ListColumnView mask_view = boolean_mask.list_view() + with nogil: + c_result = move(cpp_apply_boolean_mask( + list_view.view(), + mask_view.view(), + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): + """Create a new list column without duplicate elements in each list. + + For details, see :cpp:func:`distinct`. + + Parameters + ---------- + input : Column + The input column. + nulls_equal : bool + If true, null elements are considered equal. Otherwise, unequal. 
+ nans_equal : bool + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. + + Returns + ------- + Column + A new list column without duplicate elements in each list. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_distinct( + list_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index f135ab4ccff..33f95a7d364 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -13,16 +13,26 @@ def test_data(): return [[[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]] +@pytest.fixture +def list_column(): + return [[0, 1], [2], [5], [6, 7]] + + @pytest.fixture def scalar(): return pa.scalar(1) @pytest.fixture -def column(): +def search_key_column(): return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32()) +@pytest.fixture +def bool_column(): + return pa.array([[False, True], [True], [True], [True, True]]) + + @pytest.fixture def set_lists_column(): lhs = [[np.nan, np.nan, 2, 1, 2], [1, 2, 3], None, [4, None, 5]] @@ -72,8 +82,7 @@ def test_concatenate_list_elements(test_data, dropna, expected): assert_column_eq(expect, res) -def test_contains_scalar(test_data, scalar): - list_column = test_data[0][0] +def test_contains_scalar(list_column, scalar): arr = pa.array(list_column) plc_column = plc.interop.from_arrow(arr) @@ -85,9 +94,9 @@ def test_contains_scalar(test_data, scalar): assert_column_eq(expect, res) -def test_contains_list_column(test_data): - list_column1 = test_data[0][0] - 
list_column2 = [1, 3, 5, 1] +def test_contains_list_column(list_column, search_key_column): + list_column1 = list_column + list_column2, _ = search_key_column arr1 = pa.array(list_column1) arr2 = pa.array(list_column2) @@ -95,7 +104,7 @@ def test_contains_list_column(test_data): plc_column2 = plc.interop.from_arrow(arr2) res = plc.lists.contains(plc_column1, plc_column2) - expect = pa.array([True, False, True, False]) + expect = pa.array([False, True, True, True]) assert_column_eq(expect, res) @@ -123,8 +132,7 @@ def test_contains_nulls(list_column, expected): assert_column_eq(expect, res) -def test_index_of_scalar(test_data, scalar): - list_column = test_data[0][0] +def test_index_of_scalar(list_column, scalar): arr = pa.array(list_column) plc_column = plc.interop.from_arrow(arr) @@ -136,21 +144,19 @@ def test_index_of_scalar(test_data, scalar): assert_column_eq(expect, res) -def test_index_of_list_column(test_data, column): - list_column = test_data[0][0] +def test_index_of_list_column(list_column, search_key_column): arr1 = pa.array(list_column) - arr2, expect = column + arr2, expect = search_key_column plc_column1 = plc.interop.from_arrow(arr1) plc_column2 = plc.interop.from_arrow(arr2) res = plc.lists.index_of(plc_column1, plc_column2, True) - expect = pa.array(column[1], type=pa.int32()) + expect = pa.array(search_key_column[1], type=pa.int32()) assert_column_eq(expect, res) -def test_reverse(test_data): - list_column = test_data[0][0] +def test_reverse(list_column): arr = pa.array(list_column) plc_column = plc.interop.from_arrow(arr) @@ -162,8 +168,7 @@ def test_reverse(test_data): def test_segmented_gather(test_data): - list_column1 = test_data[0][0] - list_column2 = test_data[0][1] + list_column1, list_column2 = test_data[0] plc_column1 = plc.interop.from_arrow(pa.array(list_column1)) plc_column2 = plc.interop.from_arrow(pa.array(list_column2)) @@ -175,19 +180,17 @@ def test_segmented_gather(test_data): assert_column_eq(expect, res) -def 
test_extract_list_element_scalar(test_data): - arr = pa.array(test_data[0][0]) - plc_column = plc.interop.from_arrow(arr) +def test_extract_list_element_scalar(list_column): + plc_column = plc.interop.from_arrow(pa.array(list_column)) res = plc.lists.extract_list_element(plc_column, 0) - expect = pa.compute.list_element(test_data[0][0], 0) + expect = pa.compute.list_element(list_column, 0) assert_column_eq(expect, res) -def test_extract_list_element_column(test_data): - arr = pa.array(test_data[0][0]) - plc_column = plc.interop.from_arrow(arr) +def test_extract_list_element_column(list_column): + plc_column = plc.interop.from_arrow(pa.array(list_column)) indices = plc.interop.from_arrow(pa.array([0, 1, -4, -1])) res = plc.lists.extract_list_element(plc_column, indices) @@ -343,3 +346,46 @@ def test_set_operations( else: expect = pa.array(expected) assert_column_eq(expect, res) + + +@pytest.mark.parametrize( + "nulls_equal,nans_equal,expected", + [ + (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]), + ( + False, + True, + [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]], + ), + ( + True, + False, + [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], + ), + ( + False, + False, + [ + [np.nan, np.nan, 0, 1, 2, 3], + [3, 1, 2], + None, + [4, None, None, 5], + ], + ), + ], +) +def test_distinct(nulls_equal, nans_equal, expected): + list_column = [ + [np.nan, np.nan, 0, 1, 2, 3, 2], + [3, 1, 2], + None, + [4, None, None, 5], + ] + arr = pa.array(list_column) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.distinct(plc_column, nulls_equal, nans_equal) + + expect = pa.array(expected) + + assert_column_eq(expect, res)