diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 93a815838b7..7a12db927e5 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -68,8 +68,18 @@ def emoji_failed(x):
 pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
 main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
 diff_df = pr_df - main_df
+total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call']
+pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1)
+pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1)
 
-pr_df = pr_df[["total", "passed", "failed", "skipped"]]
+cpu_usage_mean = pr_df['CPU Usage'].mean().round(2)
+gpu_usage_mean = pr_df['GPU Usage'].mean().round(2)
+
+# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns
+pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%'
+pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%'
+
+pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']]
 diff_df = diff_df[["total", "passed", "failed", "skipped"]]
 diff_df.columns = diff_df.columns + "_diff"
 diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
@@ -95,6 +105,8 @@ def emoji_failed(x):
 print(comment)
 print()
+print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%")
+print()
 print("Here are the results of running the Pandas tests against this PR:")
 print()
 print(df.to_markdown())
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index c6228a4ef33..1cc6aab242e 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -78,6 +78,8 @@ python -m ipykernel install --user --name python3
 # The third-party integration tests are ignored because they are run nightly in a separate CI job
 python -m pytest -p cudf.pandas \
     --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \
+    --numprocesses=8 \
+    --dist=worksteal \
     --cov-config=./python/cudf/.coveragerc \
     --cov=cudf \
     --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index be55b49870f..b0346327319 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -45,6 +45,8 @@ sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_dask_cudf.sh
 DEPENDENCIES=(
   cudf
   cudf_kafka
+  cugraph
+  cuml
   custreamz
   dask-cuda
   dask-cudf
@@ -57,7 +59,7 @@ DEPENDENCIES=(
   rmm
 )
 for DEP in "${DEPENDENCIES[@]}"; do
-  for FILE in dependencies.yaml conda/environments/*.yaml; do
+  for FILE in dependencies.yaml conda/environments/*.yaml python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml; do
     sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
   for FILE in python/*/pyproject.toml; do
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index 28ded2f8e0f..a701bfe15e0 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -39,6 +39,7 @@ rapids-logger "pytest pylibcudf"
 pushd python/pylibcudf/pylibcudf/tests
 python -m pytest \
   --cache-clear \
+  --numprocesses=8 \
   --dist=worksteal \
   .
 popd
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index 0d39807d56c..361a42ccda9 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -41,6 +41,7 @@ pushd python/dask_cudf/dask_cudf
 DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
   --numprocesses=8 \
+  --dist=worksteal \
   .
 popd
@@ -50,5 +51,6 @@ pushd python/dask_cudf/dask_cudf
 DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
   --numprocesses=8 \
+  --dist=worksteal \
   .
 popd
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index b257eef1e9e..4255faea702 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1497,8 +1497,7 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation);
 *
 * @tparam F Type of callable
 * @param k The `aggregation::Kind` value to dispatch
- * aram f The callable that accepts an `aggregation::Kind` non-type template
- * argument.
+ * @param f The callable function object that accepts an `aggregation::Kind` non-type template argument
 * @param args Parameter pack forwarded to the `operator()` invocation
 * @return Forwards the return value of the callable.
 */
@@ -1626,6 +1625,7 @@ struct dispatch_source {
 * parameter of the callable `F`
 * @param k The `aggregation::Kind` used to dispatch an `aggregation::Kind`
 * non-type template parameter for the second template parameter of the callable
+ * @param f The callable function object that accepts a `data_type` and an `aggregation::Kind`
 * @param args Parameter pack forwarded to the `operator()` invocation
 * `F`.
 */
@@ -1644,8 +1644,8 @@ CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(d
 * @brief Returns the target `data_type` for the specified aggregation k
 * performed on elements of type source_type.
 *
- * aram source_type The element type to be aggregated
- * aram k The aggregation
+ * @param source_type The element type to be aggregated
+ * @param k The aggregation kind
 * @return data_type The target_type of k performed on source_type
 * elements
 */
diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index 573101cefd9..86402a0e7de 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -22,6 +22,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -50,6 +51,11 @@ using mixed_multimap_type =
                          cudf::detail::cuco_allocator,
                          cuco::legacy::double_hashing<1, hash_type, hash_type>>;
 
+using semi_map_type = cuco::legacy::static_map<hash_value_type,
+                                               size_type,
+                                               cuda::thread_scope_device,
+                                               cudf::detail::cuco_allocator<char>>;
+
 using row_hash_legacy = cudf::row_hasher;
diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh
index 89c13285cfe..19701816867 100644
--- a/cpp/src/join/mixed_join_common_utils.cuh
+++ b/cpp/src/join/mixed_join_common_utils.cuh
@@ -25,7 +25,6 @@
 #include
 #include
-#include
 
 namespace cudf {
 namespace detail {
@@ -161,38 +160,6 @@ struct pair_expression_equality : public expression_equality {
   }
 };
 
-/**
- * @brief Equality comparator that composes two row_equality comparators.
- */ -struct double_row_equality_comparator { - row_equality const equality_comparator; - row_equality const conditional_comparator; - - __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept - { - using experimental::row::lhs_index_type; - using experimental::row::rhs_index_type; - - return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && - conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); - } -}; - -// A CUDA Cooperative Group of 4 threads for the hash set. -auto constexpr DEFAULT_MIXED_JOIN_CG_SIZE = 4; - -// The hash set type used by mixed_semi_join with the build_table. -using hash_set_type = cuco::static_set, - cuda::thread_scope_device, - double_row_equality_comparator, - cuco::linear_probing, - cudf::detail::cuco_allocator, - cuco::storage<1>>; - -// The hash_set_ref_type used by mixed_semi_join kerenels for probing. -using hash_set_ref_type = hash_set_type::ref_type; - } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index f2c5ff13638..7459ac3e99c 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -38,16 +38,12 @@ CUDF_KERNEL void __launch_bounds__(block_size) table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data) { - auto constexpr cg_size = hash_set_ref_type::cg_size; - - auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); - // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is // used to circumvent conflicts between arrays of different types between @@ -56,24 +52,24 @@ CUDF_KERNEL void __launch_bounds__(block_size) cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = - &intermediate_storage[tile.meta_group_rank() * device_expression_data.num_intermediates]; + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; + + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); + auto const outer_num_rows = left_num_rows; - cudf::size_type const outer_num_rows = left_table.num_rows(); - auto const outer_row_index = cudf::detail::grid_1d::global_thread_id() / cg_size; + cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; auto evaluator = cudf::ast::detail::expression_evaluator( left_table, right_table, device_expression_data); if (outer_row_index < outer_num_rows) { - // Make sure to swap_tables here as hash_set will use probe table as the left one. - auto constexpr swap_tables = true; // Figure out the number of elements for this key. 
auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + evaluator, thread_intermediate_storage, false, equality_probe}; - auto const set_ref_equality = set_ref.with_key_eq(equality); - auto const result = set_ref_equality.contains(tile, outer_row_index); - if (tile.thread_rank() == 0) left_table_keep_mask[outer_row_index] = result; + left_table_keep_mask[outer_row_index] = + hash_table_view.contains(outer_row_index, hash_probe, equality); } } @@ -82,8 +78,9 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, @@ -97,8 +94,9 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, + hash_probe, equality_probe, - set_ref, + hash_table_view, left_table_keep_mask, device_expression_data); } else { @@ -108,8 +106,9 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, + hash_probe, equality_probe, - set_ref, + hash_table_view, left_table_keep_mask, device_expression_data); } diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index b08298e64e4..43714ffb36a 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -45,8 +45,9 @@ namespace detail { * @param[in] right_table The right table * @param[in] probe The table with which to probe the hash table for matches. * @param[in] build The table with which the hash table was built. + * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] set_ref The hash table device view built from `build`. + * @param[in] hash_table_view The hash table built from `build`. * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired @@ -57,8 +58,9 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 719b1d47105..cfb785e242c 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -46,6 +46,45 @@ namespace cudf { namespace detail { +namespace { +/** + * @brief Device functor to create a pair of hash value and index for a given row. + */ +struct make_pair_function_semi { + __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept + { + // The value is irrelevant since we only ever use the hash map to check for + // membership of a particular row index. + return cuco::make_pair(static_cast(i), 0); + } +}; + +/** + * @brief Equality comparator that composes two row_equality comparators. 
+ */ +class double_row_equality { + public: + double_row_equality(row_equality equality_comparator, row_equality conditional_comparator) + : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator} + { + } + + __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept + { + using experimental::row::lhs_index_type; + using experimental::row::rhs_index_type; + + return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && + _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); + } + + private: + row_equality _equality_comparator; + row_equality _conditional_comparator; +}; + +} // namespace + std::unique_ptr> mixed_join_semi( table_view const& left_equality, table_view const& right_equality, @@ -57,7 +96,7 @@ std::unique_ptr> mixed_join_semi( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and + CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && (join_type != join_kind::FULL_JOIN), "Inner, left, and full joins should use mixed_join."); @@ -98,7 +137,7 @@ std::unique_ptr> mixed_join_semi( // output column and follow the null-supporting expression evaluation code // path. auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or + cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; auto const parser = ast::detail::expression_parser{ @@ -117,20 +156,27 @@ std::unique_ptr> mixed_join_semi( auto right_conditional_view = table_device_view::create(right_conditional, stream); auto const preprocessed_build = - cudf::experimental::row::equality::preprocessed_table::create(build, stream); + experimental::row::equality::preprocessed_table::create(build, stream); auto const preprocessed_probe = - cudf::experimental::row::equality::preprocessed_table::create(probe, stream); + experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe}; + cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); + semi_map_type hash_table{ + compute_hash_table_size(build.num_rows()), + cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_value{cudf::detail::JoinNoneValue}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - + auto const hash_build = row_hash_build.device_hasher(build_nulls); // Since we may see multiple rows that are identical in the equality tables // but differ in the conditional tables, the equality comparator used for // insertion must account for both sets of tables. 
An alternative solution @@ -145,28 +191,20 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream); + experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; auto const equality_build_conditional = row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); + double_row_equality equality_build{equality_build_equality, equality_build_conditional}; + make_pair_function_semi pair_func_build{}; - hash_set_type row_set{ - {compute_hash_table_size(build.num_rows())}, - cuco::empty_key{JoinNoneValue}, - {equality_build_equality, equality_build_conditional}, - {row_hash_build.device_hasher(build_nulls)}, - {}, - {}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - {stream.value()}}; - - auto iter = thrust::make_counting_iterator(0); + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); // skip rows that are null here. if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - row_set.insert(iter, iter + right_num_rows, stream.value()); + hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); } else { thrust::counting_iterator stencil(0); auto const [row_bitmask, _] = @@ -174,19 +212,18 @@ std::unique_ptr> mixed_join_semi( row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows - row_set.insert_if(iter, iter + right_num_rows, stencil, pred, stream.value()); + hash_table.insert_if( + iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); } + auto hash_table_view = hash_table.get_device_view(); + detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = - parser.shmem_per_thread * - cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size); + auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); - hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); - // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); @@ -195,8 +232,9 @@ std::unique_ptr> mixed_join_semi( *right_conditional_view, *probe_view, *build_view, + hash_probe, equality_probe, - row_set_ref, + hash_table_view, cudf::device_span(left_table_keep_mask), parser.device_expression_data, config, diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9d3a7ce5a4e..9824c472b20 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -132,6 +132,13 @@ struct cuda_event { cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); } virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); } + // Moveable but not copyable. 
+  cuda_event(const cuda_event&) = delete;
+  cuda_event& operator=(const cuda_event&) = delete;
+
+  cuda_event(cuda_event&&) = default;
+  cuda_event& operator=(cuda_event&&) = default;
+
   operator cudaEvent_t() { return e_; }
 
  private:
@@ -147,11 +154,12 @@ struct cuda_event {
  */
 cudaEvent_t event_for_thread()
 {
-  thread_local std::vector<std::unique_ptr<cuda_event>> thread_events(get_num_cuda_devices());
+  // The program may crash if this function is called from the main thread and the user application
+  // subsequently calls cudaDeviceReset().
+  // As a workaround, here we intentionally disable RAII and leak cudaEvent_t.
+  thread_local std::vector<cuda_event*> thread_events(get_num_cuda_devices());
   auto const device_id = get_current_cuda_device();
-  if (not thread_events[device_id.value()]) {
-    thread_events[device_id.value()] = std::make_unique<cuda_event>();
-  }
+  if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); }
   return *thread_events[device_id.value()];
 }
diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu
index 08a0136700d..6c147c8a128 100644
--- a/cpp/tests/join/mixed_join_tests.cu
+++ b/cpp/tests/join/mixed_join_tests.cu
@@ -778,21 +778,6 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality)
              {1});
 }
 
-TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap)
-{
-  auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
-  auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
-  auto left_one_greater_right_one =
-    cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
-
-  this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}},
-             {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}},
-             {0},
-             {1},
-             left_one_greater_right_one,
-             {2, 7, 8});
-}
-
 TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates)
 {
   this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}},
@@ -915,18 +900,3 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality)
              left_zero_eq_right_zero,
             {0, 1, 3});
 }
-
-TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap)
-{
-  auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
-  auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
-  auto left_one_greater_right_one =
-    cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
-
-  this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}},
-             {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}},
-             {0},
-             {1},
-             left_one_greater_right_one,
-             {0, 1, 3, 4, 5, 6, 9});
-}
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst
new file mode 100644
index 00000000000..06f74a38709
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst
@@ -0,0 +1,6 @@
+=======
+extract
+=======
+
+..
automodule:: pylibcudf.strings.extract + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 462a756a092..2518afc80a7 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -7,8 +7,10 @@ strings capitalize char_types contains + extract find regex_flags regex_program + repeat replace slice diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst new file mode 100644 index 00000000000..0041fe4c3da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst @@ -0,0 +1,6 @@ +====== +repeat +====== + +.. automodule:: pylibcudf.strings.repeat + :members: diff --git a/python/cudf/benchmarks/pytest.ini b/python/cudf/benchmarks/pytest.ini index db24415ef9e..187d91996b2 100644 --- a/python/cudf/benchmarks/pytest.ini +++ b/python/cudf/benchmarks/pytest.ini @@ -6,3 +6,4 @@ python_classes = Bench python_functions = bench_* markers = pandas_incompatible: mark a benchmark that cannot be run with pandas +addopts = --tb=native diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index e661059faa3..e6c2d136f0d 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -23,9 +23,9 @@ def concat_columns(object columns): def concat_tables(object tables, bool ignore_index=False): plc_tables = [] for table in tables: - cols = table._data.columns + cols = table._columns if not ignore_index: - cols = table._index._data.columns + cols + cols = table._index._columns + cols plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols])) return data_from_pylibcudf_table( diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 16182e31c08..49714091f46 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -384,7 +384,7 @@ cdef class _CPackedColumns: p.column_names = input_table._column_names p.column_dtypes = {} - for name, col in input_table._data.items(): + for name, col in input_table._column_labels_and_values: if isinstance(col.dtype, cudf.core.dtypes._BaseDtype): p.column_dtypes[name] = col.dtype diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 058e884e08b..9ad96f610b3 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -273,7 +273,7 @@ def read_csv( elif isinstance(dtype, abc.Collection): for index, col_dtype in enumerate(dtype): if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): - col_name = df._data.names[index] + col_name = df._column_names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) if names is not None and len(names) and isinstance(names[0], int): diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index b1900138d94..564daefbae2 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -179,7 +179,7 @@ cdef update_struct_field_names( ): # Deprecated, remove in favor of add_col_struct_names # when a reader is ported to pylibcudf - for i, (name, col) in enumerate(table._data.items()): + for i, (name, col) in enumerate(table._column_labels_and_values): table._data[name] = update_column_struct_field_names( col, schema_info[i] ) diff --git a/python/cudf/cudf/_lib/parquet.pyx 
b/python/cudf/cudf/_lib/parquet.pyx index e6c9d60b05b..fa2690c7f21 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -235,16 +235,16 @@ cdef object _process_metadata(object df, df._index = idx elif set(index_col).issubset(names): index_data = df[index_col] - actual_index_names = list(index_col_names.values()) - if len(index_data._data) == 1: + actual_index_names = iter(index_col_names.values()) + if index_data._num_columns == 1: idx = cudf.Index._from_column( - index_data._data.columns[0], - name=actual_index_names[0] + index_data._columns[0], + name=next(actual_index_names) ) else: idx = cudf.MultiIndex.from_frame( index_data, - names=actual_index_names + names=list(actual_index_names) ) df.drop(columns=index_col, inplace=True) df._index = idx @@ -252,7 +252,7 @@ cdef object _process_metadata(object df, if use_pandas_metadata: df.index.names = index_col - if len(df._data.names) == 0 and column_index_type is not None: + if df._num_columns == 0 and column_index_type is not None: df._data.label_dtype = cudf.dtype(column_index_type) return df diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 82f5e06c547..03b4887f200 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -1,27 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.contains cimport ( - count_re as cpp_count_re, - like as cpp_like, - matches_re as cpp_matches_re, -) -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar from pylibcudf.strings import contains from pylibcudf.strings.regex_program import RegexProgram @@ -45,21 +28,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column with count of occurrences of `reg_ex` in each string of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_count_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.count_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() @@ -68,21 +40,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column with each value True if the string matches `reg_ex` regular expression with each record of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result 
= move(cpp_matches_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() @@ -91,24 +52,9 @@ def like(Column source_strings, object py_pattern, object py_escape): Returns a Column with each value True if the string matches the `py_pattern` like expression with each record of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef DeviceScalar pattern = py_pattern.device_value - cdef DeviceScalar escape = py_escape.device_value - - cdef const string_scalar* scalar_ptn = ( - pattern.get_raw_ptr() - ) - cdef const string_scalar* scalar_esc = ( - escape.get_raw_ptr() + plc_column = contains.like( + source_strings.to_pylibcudf(mode="read"), + py_pattern.device_value.c_value, + py_escape.device_value.c_value, ) - - with nogil: - c_result = move(cpp_like( - source_view, - scalar_ptn[0], - scalar_esc[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 63f4d57e562..5bf336f4f3c 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -1,21 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.extract cimport extract as cpp_extract -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -26,21 +17,8 @@ def extract(Column source_strings, object pattern, uint32_t flags): The returning data contains one row for each subject string, and one column for each group. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_extract( - source_view, - dereference(c_prog) - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags) + plc_result = plc.strings.extract.extract( + source_strings.to_pylibcudf(mode="read"), prog ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns())) diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx index 42fcfa5d94e..43649d4defe 100644 --- a/python/cudf/cudf/_lib/strings/repeat.pyx +++ b/python/cudf/cudf/_lib/strings/repeat.pyx @@ -1,17 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def repeat_scalar(Column source_strings, @@ -21,16 +16,11 @@ def repeat_scalar(Column source_strings, each string in `source_strings` `repeats` number of times. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats + ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -41,14 +31,8 @@ def repeat_sequence(Column source_strings, each string in `source_strings` `repeats` number of times. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repeats_view = repeats.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index cae28d02ef4..8660cca9322 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -49,9 +49,9 @@ cdef table_view table_view_from_table(tbl, ignore_index=False) except*: If True, don't include the index in the columns. """ return table_view_from_columns( - tbl._index._data.columns + tbl._data.columns + tbl._index._columns + tbl._columns if not ignore_index and tbl._index is not None - else tbl._data.columns + else tbl._columns ) @@ -62,7 +62,7 @@ cpdef generate_pandas_metadata(table, index): index_descriptors = [] columns_to_convert = list(table._columns) # Columns - for name, col in table._data.items(): + for name, col in table._column_labels_and_values: if cudf.get_option("mode.pandas_compatible"): # in pandas-compat mode, non-string column names are stringified. 
col_names.append(str(name)) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ff114474aa4..a6abd63d042 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1951,7 +1951,7 @@ def drop_duplicates( return self._from_columns_like_self( drop_duplicates( list(self._columns), - keys=range(len(self._data)), + keys=range(len(self._columns)), keep=keep, nulls_are_equal=nulls_are_equal, ), diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e059917b0b8..4463e3280df 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -623,11 +623,9 @@ def extract( "unsupported value for `flags` parameter" ) - data, _ = libstrings.extract(self._column, pat, flags) + data = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: - data = next(iter(data.values())) - else: - data = data + _, data = data.popitem() return self._return_or_inplace(data, expand=expand) def contains( diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 09b0f453692..bc093fdaa9a 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -151,9 +151,9 @@ def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None: self.set_by_label(key, value) def __delitem__(self, key: abc.Hashable) -> None: - old_ncols = len(self._data) + old_ncols = len(self) del self._data[key] - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def __len__(self) -> int: @@ -213,7 +213,7 @@ def level_names(self) -> tuple[abc.Hashable, ...]: @property def nlevels(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 if not self.multiindex: return 1 @@ -226,7 +226,7 @@ def name(self) -> abc.Hashable: @cached_property def nrows(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 else: return len(next(iter(self.values()))) @@ -257,9 +257,9 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: Parameters ---------- old_ncols: int - len(self._data) before self._data was modified + len(self) before self._data was modified new_ncols: int - len(self._data) after self._data was modified + len(self) after self._data was modified """ cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: @@ -335,7 +335,7 @@ def insert( if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") - old_ncols = len(self._data) + old_ncols = len(self) if loc == -1: loc = old_ncols elif not (0 <= loc <= old_ncols): @@ -414,7 +414,7 @@ def get_labels_by_index(self, index: Any) -> tuple: tuple """ if isinstance(index, slice): - start, stop, step = index.indices(len(self._data)) + start, stop, step = index.indices(len(self)) return self.names[start:stop:step] elif pd.api.types.is_integer(index): return (self.names[index],) @@ -526,9 +526,9 @@ def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None: if len(self) > 0 and len(value) != self.nrows: raise ValueError("All columns must be of equal length") - old_ncols = len(self._data) + old_ncols = len(self) self._data[key] = value - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def _select_by_label_list_like(self, key: tuple) -> Self: @@ -718,12 +718,12 @@ def droplevel(self, level: int) -> None: if level < 0: level += self.nlevels - old_ncols = 
len(self._data) + old_ncols = len(self) self._data = { _remove_key_level(key, level): value # type: ignore[arg-type] for key, value in self._data.items() } - new_ncols = len(self._data) + new_ncols = len(self) self._level_names = ( self._level_names[:level] + self._level_names[level + 1 :] ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d73ad8225ca..16b0aa95c35 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -176,7 +176,7 @@ def _can_downcast_to_series(self, df, arg): return False @_performance_tracking - def _downcast_to_series(self, df, arg): + def _downcast_to_series(self, df: DataFrame, arg): """ "Downcast" from a DataFrame to a Series based on Pandas indexing rules @@ -203,16 +203,16 @@ def _downcast_to_series(self, df, arg): # take series along the axis: if axis == 1: - return df[df._data.names[0]] + return df[df._column_names[0]] else: if df._num_columns > 0: dtypes = df.dtypes.values.tolist() normalized_dtype = np.result_type(*dtypes) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: df[name] = col.astype(normalized_dtype) sr = df.T - return sr[sr._data.names[0]] + return sr[sr._column_names[0]] class _DataFrameLocIndexer(_DataFrameIndexer): @@ -258,7 +258,7 @@ def _getitem_tuple_arg(self, arg): and len(arg) > 1 and is_scalar(arg[1]) ): - return result._data.columns[0].element_indexing(0) + return result._columns[0].element_indexing(0) return result else: if isinstance(arg[0], slice): @@ -310,7 +310,7 @@ def _getitem_tuple_arg(self, arg): else: tmp_col_name = str(uuid4()) cantor_name = "_" + "_".join( - map(str, columns_df._data.names) + map(str, columns_df._column_names) ) if columns_df._data.multiindex: # column names must be appropriate length tuples @@ -1412,7 +1412,7 @@ def __setitem__(self, arg, value): else column.column_empty_like( col, masked=True, newsize=length ) - for key, col in self._data.items() + for key, col in self._column_labels_and_values ) self._data = self._data._from_columns_like_self( new_columns, verify=False @@ -1494,8 +1494,8 @@ def __delitem__(self, name): @_performance_tracking def memory_usage(self, index=True, deep=False) -> cudf.Series: - mem_usage = [col.memory_usage for col in self._data.columns] - names = [str(name) for name in self._data.names] + mem_usage = [col.memory_usage for col in self._columns] + names = [str(name) for name in self._column_names] if index: mem_usage.append(self.index.memory_usage()) names.append("Index") @@ -1725,7 +1725,7 @@ def _concat( [] if are_all_range_index or (ignore_index and not empty_has_index) - else list(f.index._data.columns) + else list(f.index._columns) ) + [f._data[name] if name in f._data else None for name in names] for f in objs @@ -1808,7 +1808,7 @@ def _concat( out.index.dtype, cudf.CategoricalDtype ): out = out.set_index(out.index) - for name, col in out._data.items(): + for name, col in out._column_labels_and_values: out._data[name] = col._with_type_metadata( tables[0]._data[name].dtype ) @@ -1831,13 +1831,13 @@ def astype( errors: Literal["raise", "ignore"] = "raise", ): if is_dict_like(dtype): - if len(set(dtype.keys()) - set(self._data.names)) > 0: + if len(set(dtype.keys()) - set(self._column_names)) > 0: raise KeyError( "Only a column name can be used for the " "key in a dtype mappings argument." 
) else: - dtype = {cc: dtype for cc in self._data.names} + dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) def _clean_renderable_dataframe(self, output): @@ -2601,7 +2601,7 @@ def equals(self, other) -> bool: # If all other checks matched, validate names. if ret: for self_name, other_name in zip( - self._data.names, other._data.names + self._column_names, other._column_names ): if self_name != other_name: ret = False @@ -2676,7 +2676,7 @@ def columns(self, columns): ) self._data = ColumnAccessor( - data=dict(zip(pd_columns, self._data.columns)), + data=dict(zip(pd_columns, self._columns)), multiindex=multiindex, level_names=level_names, label_dtype=label_dtype, @@ -2698,7 +2698,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: f"got {len(self)} elements" ) self._data = ColumnAccessor( - data=dict(zip(other.names, self._data.columns)), + data=dict(zip(other.names, self._columns)), multiindex=other.multiindex, rangeindex=other.rangeindex, level_names=other.level_names, @@ -2983,7 +2983,7 @@ def set_index( elif isinstance(col, (MultiIndex, pd.MultiIndex)): if isinstance(col, pd.MultiIndex): col = MultiIndex.from_pandas(col) - data_to_add.extend(col._data.columns) + data_to_add.extend(col._columns) names.extend(col.names) elif isinstance( col, (cudf.Series, cudf.Index, pd.Series, pd.Index) @@ -3110,7 +3110,9 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): ) out = [] - for (name, col), other_col in zip(self._data.items(), other_cols): + for (name, col), other_col in zip( + self._column_labels_and_values, other_cols + ): source_col, other_col = _check_and_cast_columns_with_other( source_col=col, other=other_col, @@ -3314,7 +3316,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): column.column_empty_like( col_data, masked=True, newsize=length ) - for col_data in self._data.values() + for col_data in self._columns ), verify=False, ) @@ -3664,7 +3666,7 @@ def rename( name: col.find_and_replace( to_replace, vals, is_all_na ) - for name, col in self.index._data.items() + for name, col in self.index._column_labels_and_values } ) except OverflowError: @@ -3686,9 +3688,7 @@ def add_prefix(self, prefix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - prefix + col_name for col_name in list(self._data.keys()) - ] + out.columns = [prefix + col_name for col_name in self._column_names] return out @_performance_tracking @@ -3697,9 +3697,7 @@ def add_suffix(self, suffix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - col_name + suffix for col_name in list(self._data.keys()) - ] + out.columns = [col_name + suffix for col_name in self._column_names] return out @_performance_tracking @@ -4805,7 +4803,7 @@ def _func(x): # pragma: no cover # TODO: naive implementation # this could be written as a single kernel result = {} - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: apply_sr = Series._from_column(col) result[name] = apply_sr.apply(_func)._column @@ -5444,7 +5442,7 @@ def to_pandas( out_index = self.index.to_pandas() out_data = { i: col.to_pandas(nullable=nullable, arrow_type=arrow_type) - for i, col in enumerate(self._data.columns) + for i, col in enumerate(self._columns) } out_df = 
pd.DataFrame(out_data, index=out_index) @@ -5665,14 +5663,16 @@ def to_arrow(self, preserve_index=None) -> pa.Table: index = index._as_int_index() index.name = "__index_level_0__" if isinstance(index, MultiIndex): - index_descr = list(index._data.names) + index_descr = index._column_names index_levels = index.levels else: index_descr = ( index.names if index.name is not None else ("index",) ) data = data.copy(deep=False) - for gen_name, col_name in zip(index_descr, index._data.names): + for gen_name, col_name in zip( + index_descr, index._column_names + ): data._insert( data.shape[1], gen_name, @@ -5681,7 +5681,7 @@ def to_arrow(self, preserve_index=None) -> pa.Table: out = super(DataFrame, data).to_arrow() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[self[col] for col in self._data.names], + columns_to_convert=[self[col] for col in self._column_names], df=self, column_names=out.schema.names, index_levels=index_levels, @@ -5724,12 +5724,12 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None): "column_dtypes is currently not supported." ) members = [("index", self.index.dtype)] if index else [] - members += [(col, self[col].dtype) for col in self._data.names] + members += list(self._dtypes) dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: ret["index"] = self.index.to_numpy() - for col in self._data.names: + for col in self._column_names: ret[col] = self[col].to_numpy() return ret @@ -6059,7 +6059,7 @@ def quantile( ) if columns is None: - columns = data_df._data.names + columns = set(data_df._column_names) if isinstance(q, numbers.Number): q_is_number = True @@ -6084,7 +6084,7 @@ def quantile( # Ensure that qs is non-scalar so that we always get a column back. interpolation = interpolation or "linear" result = {} - for k in data_df._data.names: + for k in data_df._column_names: if k in columns: ser = data_df[k] res = ser.quantile( @@ -6198,7 +6198,7 @@ def make_false_column_like_self(): if isinstance(values, DataFrame) else {name: values._column for name in self._data} ) - for col, self_col in self._data.items(): + for col, self_col in self._column_labels_and_values: if col in other_cols: other_col = other_cols[col] self_is_cat = isinstance(self_col, CategoricalColumn) @@ -6231,13 +6231,13 @@ def make_false_column_like_self(): else: result[col] = make_false_column_like_self() elif is_dict_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: if name in values: result[name] = col.isin(values[name]) else: result[name] = make_false_column_like_self() elif is_list_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: result[name] = col.isin(values) else: raise TypeError( @@ -6292,7 +6292,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): name: filtered._data[name]._get_mask_as_column() if filtered._data[name].nullable else as_column(True, length=len(filtered._data[name])) - for name in filtered._data.names + for name in filtered._column_names } ) mask = mask.all(axis=1) @@ -6342,7 +6342,7 @@ def count(self, axis=0, numeric_only=False): length = len(self) return Series._from_column( as_column([length - col.null_count for col in self._columns]), - index=cudf.Index(self._data.names), + index=cudf.Index(self._column_names), ) _SUPPORT_AXIS_LOOKUP = { @@ -6409,7 +6409,7 @@ def _reduce( return source._apply_cupy_method_axis_1(op, **kwargs) else: axis_0_results = [] - for col_label, col in 
source._data.items(): + for col_label, col in source._column_labels_and_values: try: axis_0_results.append(getattr(col, op)(**kwargs)) except AttributeError as err: @@ -6634,7 +6634,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared, mask, common_dtype = self._prepare_for_rowwise_op( method, skipna, numeric_only ) - for col in prepared._data.names: + for col in prepared._column_names: if prepared._data[col].nullable: prepared._data[col] = ( prepared._data[col] @@ -6820,7 +6820,7 @@ def select_dtypes(self, include=None, exclude=None): # remove all exclude types inclusion = inclusion - exclude_subtypes - for k, col in self._data.items(): + for k, col in self._column_labels_and_values: infered_type = cudf_dtype_from_pydata_dtype(col.dtype) if infered_type in inclusion: df._insert(len(df._data), k, col) @@ -7192,7 +7192,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Compute the column indices that serves as the input for # `interleave_columns` column_idx_df = pd.DataFrame( - data=range(len(self._data)), index=named_levels + data=range(self._num_columns), index=named_levels ) column_indices: list[list[int]] = [] @@ -7392,17 +7392,17 @@ def to_struct(self, name=None): ----- Note: a copy of the columns is made. """ - if not all(isinstance(name, str) for name in self._data.names): + if not all(isinstance(name, str) for name in self._column_names): warnings.warn( "DataFrame contains non-string column name(s). Struct column " "requires field name to be string. Non-string column names " "will be casted to string as the field name." ) - fields = {str(name): col.dtype for name, col in self._data.items()} + fields = {str(name): dtype for name, dtype in self._dtypes} col = StructColumn( data=None, dtype=cudf.StructDtype(fields=fields), - children=tuple(col.copy(deep=True) for col in self._data.columns), + children=tuple(col.copy(deep=True) for col in self._columns), size=len(self), offset=0, ) @@ -7984,7 +7984,7 @@ def value_counts( diff = set(subset) - set(self._data) if len(diff) != 0: raise KeyError(f"columns {diff} do not exist") - columns = list(self._data.names) if subset is None else subset + columns = list(self._column_names) if subset is None else subset result = ( self.groupby( by=columns, @@ -8105,7 +8105,7 @@ def func(left, right, output): right._column_names ) elif _is_scalar_or_zero_d_array(right): - for name, col in output._data.items(): + for name, col in output._column_labels_and_values: output._data[name] = col.fillna(value) return output else: @@ -8387,7 +8387,7 @@ def extract_col(df, col): and col not in df.index._data and not isinstance(df.index, MultiIndex) ): - return df.index._data.columns[0] + return df.index._column return df.index._data[col] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7b2bc85b13b..98af006f6e5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -75,8 +75,15 @@ def _columns(self) -> tuple[ColumnBase, ...]: return self._data.columns @property - def _dtypes(self) -> abc.Iterable: - return zip(self._data.names, (col.dtype for col in self._data.columns)) + def _column_labels_and_values( + self, + ) -> abc.Iterable[tuple[abc.Hashable, ColumnBase]]: + return zip(self._column_names, self._columns) + + @property + def _dtypes(self) -> abc.Generator[tuple[abc.Hashable, Dtype], None, None]: + for label, col in self._column_labels_and_values: + yield label, col.dtype @property def ndim(self) -> int: @@ -87,7 +94,7 @@ def serialize(self): # TODO: See if 
self._data can be serialized outright header = { "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(tuple(self._data.names)), + "column_names": pickle.dumps(self._column_names), "column_rangeindex": pickle.dumps(self._data.rangeindex), "column_multiindex": pickle.dumps(self._data.multiindex), "column_label_dtype": pickle.dumps(self._data.label_dtype), @@ -156,7 +163,7 @@ def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Self | None: if inplace: - for col in self._data: + for col in self._column_names: if col in result._data: self._data[col]._mimic_inplace( result._data[col], inplace=True @@ -267,7 +274,7 @@ def __len__(self) -> int: def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: casted = ( col.astype(dtype.get(col_name, col.dtype), copy=copy) - for col_name, col in self._data.items() + for col_name, col in self._column_labels_and_values ) ca = self._data._from_columns_like_self(casted, verify=False) return self._from_data_like_self(ca) @@ -338,9 +345,7 @@ def equals(self, other) -> bool: return all( self_col.equals(other_col, check_dtypes=True) - for self_col, other_col in zip( - self._data.values(), other._data.values() - ) + for self_col, other_col in zip(self._columns, other._columns) ) @_performance_tracking @@ -434,11 +439,9 @@ def to_array( if dtype is None: if ncol == 1: - dtype = next(iter(self._data.values())).dtype + dtype = next(self._dtypes)[1] else: - dtype = find_common_type( - [col.dtype for col in self._data.values()] - ) + dtype = find_common_type([dtype for _, dtype in self._dtypes]) if not isinstance(dtype, numpy.dtype): raise NotImplementedError( @@ -446,12 +449,12 @@ def to_array( ) if self.ndim == 1: - return to_array(self._data.columns[0], dtype) + return to_array(self._columns[0], dtype) else: matrix = module.empty( shape=(len(self), ncol), dtype=dtype, order="F" ) - for i, col in enumerate(self._data.values()): + for i, col in enumerate(self._columns): # TODO: col.values may fail if there is nullable data or an # unsupported dtype. We may want to catch and provide a more # suitable error. @@ -751,7 +754,7 @@ def fillna( filled_columns = [ col.fillna(value[name], method) if name in value else col.copy() - for name, col in self._data.items() + for name, col in self._column_labels_and_values ] return self._mimic_inplace( @@ -988,7 +991,10 @@ def to_arrow(self): index: [[1,2,3]] """ return pa.Table.from_pydict( - {str(name): col.to_arrow() for name, col in self._data.items()} + { + str(name): col.to_arrow() + for name, col in self._column_labels_and_values + } ) @_performance_tracking @@ -1012,7 +1018,9 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: See `ColumnBase._with_type_metadata` for more information. 
""" - for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes): + for (name, col), (_, dtype) in zip( + self._column_labels_and_values, other._dtypes + ): self._data.set_by_label(name, col._with_type_metadata(dtype)) return self @@ -1422,7 +1430,7 @@ def _split(self, splits): """ return [ self._from_columns_like_self( - libcudf.copying.columns_split([*self._data.columns], splits)[ + libcudf.copying.columns_split(list(self._columns), splits)[ split_idx ], self._column_names, @@ -1432,7 +1440,7 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode([*self._columns]) + columns, indices = libcudf.transform.table_encode(list(self._columns)) keys = self._from_columns_like_self(columns) return keys, indices @@ -1578,7 +1586,7 @@ def __neg__(self): col.unary_operator("not") if col.dtype.kind == "b" else -1 * col - for col in self._data.columns + for col in self._columns ) ) ) @@ -1840,9 +1848,7 @@ def __copy__(self): def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( - self._data._from_columns_like_self( - (~col for col in self._data.columns) - ) + self._data._from_columns_like_self((~col for col in self._columns)) ) @_performance_tracking diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6424c8af877..cb8cd0cd28b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -751,10 +751,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): ) and not libgroupby._is_all_scan_aggregate(normalized_aggs): # Even with `sort=False`, pandas guarantees that # groupby preserves the order of rows within each group. - left_cols = list( - self.grouping.keys.drop_duplicates()._data.columns - ) - right_cols = list(result_index._data.columns) + left_cols = list(self.grouping.keys.drop_duplicates()._columns) + right_cols = list(result_index._columns) join_keys = [ _match_join_keys(lcol, rcol, "left") for lcol, rcol in zip(left_cols, right_cols) @@ -1483,7 +1481,7 @@ def _post_process_chunk_results( # the column name should be, especially if we applied # a nameless UDF. result = result.to_frame( - name=grouped_values._data.names[0] + name=grouped_values._column_names[0] ) else: index_data = group_keys._data.copy(deep=True) @@ -1632,7 +1630,7 @@ def mult(df): if func in {"sum", "product"}: # For `sum` & `product`, boolean types # will need to result in `int64` type. 
- for name, col in res._data.items(): + for name, col in res._column_labels_and_values: if col.dtype.kind == "b": res._data[name] = col.astype("int") return res @@ -2715,11 +2713,8 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): def _reduce_numeric_only(self, op: str): columns = list( name - for name in self.obj._data.names - if ( - is_numeric_dtype(self.obj._data[name].dtype) - and name not in self.grouping.names - ) + for name, dtype in self.obj._dtypes + if (is_numeric_dtype(dtype) and name not in self.grouping.names) ) return self[columns].agg(op) @@ -3209,7 +3204,7 @@ def values(self) -> cudf.core.frame.Frame: """ # If the key columns are in `obj`, filter them out value_column_names = [ - x for x in self._obj._data.names if x not in self._named_columns + x for x in self._obj._column_names if x not in self._named_columns ] value_columns = self._obj._data.select_by_label(value_column_names) return self._obj.__class__._from_data(value_columns) @@ -3224,8 +3219,8 @@ def _handle_series(self, by): self.names.append(by.name) def _handle_index(self, by): - self._key_columns.extend(by._data.columns) - self.names.extend(by._data.names) + self._key_columns.extend(by._columns) + self.names.extend(by._column_names) def _handle_mapping(self, by): by = cudf.Series(by.values(), index=by.keys()) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b2bd20c4982..cd07c58c5d9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -122,13 +122,13 @@ def _lexsorted_equal_range( sort_inds = None sort_vals = idx lower_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="right", ascending=sort_vals.is_monotonic_increasing, @@ -286,6 +286,20 @@ def name(self): def name(self, value): self._name = value + @property + @_performance_tracking + def _column_names(self) -> tuple[Any]: + return (self.name,) + + @property + @_performance_tracking + def _columns(self) -> tuple[ColumnBase]: + return (self._values,) + + @property + def _column_labels_and_values(self) -> Iterable: + return zip(self._column_names, self._columns) + @property # type: ignore @_performance_tracking def start(self) -> int: @@ -1068,7 +1082,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: inputs = { name: (col, None, False, None) - for name, col in self._data.items() + for name, col in self._column_labels_and_values } data = self._apply_cupy_ufunc_to_operands( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fd6bf37f0e6..810d4ad74e7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -294,7 +294,7 @@ def _num_rows(self) -> int: @property def _index_names(self) -> tuple[Any, ...]: # TODO: Tuple[str]? 
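The refactor running through these files replaces direct `._data` access (e.g. `res._data.items()`, `index._data.names`) with Frame-level accessors. A minimal sketch of the pattern being standardized on, using a hypothetical FrameSketch class rather than cudf's actual Frame (which wraps a ColumnAccessor):

    # Thin properties over a label -> column mapping; call sites iterate
    # these instead of reaching into ._data directly.
    class FrameSketch:
        def __init__(self, data: dict):
            self._data = data

        @property
        def _column_names(self) -> tuple:
            return tuple(self._data)

        @property
        def _columns(self) -> tuple:
            return tuple(self._data.values())

        @property
        def _column_labels_and_values(self):
            return zip(self._column_names, self._columns)

        @property
        def _dtypes(self):
            # (label, dtype) pairs, so dtype-only consumers need not
            # materialize column objects.
            return iter(
                (name, col.dtype)
                for name, col in self._column_labels_and_values
            )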
- return self.index._data.names + return self.index._column_names @classmethod def _from_data( @@ -307,6 +307,7 @@ def _from_data( raise ValueError( f"index must be None or a cudf.Index not {type(index).__name__}" ) + # out._num_rows requires .index to be defined out._index = RangeIndex(out._data.nrows) if index is None else index return out @@ -882,7 +883,7 @@ def replace( columns_dtype_map=dict(self._dtypes), ) copy_data = [] - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: try: replaced = col.find_and_replace( to_replace_per_column[name], @@ -2703,11 +2704,11 @@ def sort_index( by.extend( filter( lambda n: n not in handled, - self.index._data.names, + self.index._column_names, ) ) else: - by = list(idx._data.names) + by = list(idx._column_names) inds = idx._get_sorted_inds( by=by, ascending=ascending, na_position=na_position @@ -3013,7 +3014,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: columns_to_slice = [ *( - self.index._data.columns + self.index._columns if keep_index and not has_range_index else [] ), @@ -3210,7 +3211,7 @@ def _empty_like(self, keep_index=True) -> Self: result = self._from_columns_like_self( libcudf.copying.columns_empty_like( [ - *(self.index._data.columns if keep_index else ()), + *(self.index._columns if keep_index else ()), *self._columns, ] ), @@ -3227,7 +3228,7 @@ def _split(self, splits, keep_index=True): columns_split = libcudf.copying.columns_split( [ - *(self.index._data.columns if keep_index else []), + *(self.index._columns if keep_index else []), *self._columns, ], splits, @@ -3763,8 +3764,8 @@ def _reindex( idx_dtype_match = (df.index.nlevels == index.nlevels) and all( _is_same_dtype(left_dtype, right_dtype) for left_dtype, right_dtype in zip( - (col.dtype for col in df.index._data.columns), - (col.dtype for col in index._data.columns), + (dtype for _, dtype in df.index._dtypes), + (dtype for _, dtype in index._dtypes), ) ) @@ -3783,7 +3784,7 @@ def _reindex( (name or 0) if isinstance(self, cudf.Series) else name: col - for name, col in df._data.items() + for name, col in df._column_labels_and_values }, index=df.index, ) @@ -3794,7 +3795,7 @@ def _reindex( index = index if index is not None else df.index if column_names is None: - names = list(df._data.names) + names = list(df._column_names) level_names = self._data.level_names multiindex = self._data.multiindex rangeindex = self._data.rangeindex @@ -3948,7 +3949,7 @@ def round(self, decimals=0, how="half_even"): col.round(decimals[name], how=how) if name in decimals and col.dtype.kind in "fiu" else col.copy(deep=True) - for name, col in self._data.items() + for name, col in self._column_labels_and_values ) return self._from_data_like_self( self._data._from_columns_like_self(cols) @@ -4270,7 +4271,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): else: thresh = len(df) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: check_col = col.nans_to_nulls() no_threshold_valid_count = ( len(col) - check_col.null_count @@ -4305,7 +4306,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( - [*self.index._data.columns, *data_columns], + [*self.index._columns, *data_columns], how=how, keys=self._positions_from_column_names( subset, offset_by_index_columns=True @@ -4853,7 +4854,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # This works for Index too inputs = { name: (col, None, False, None) 
- for name, col in self._data.items() + for name, col in self._column_labels_and_values } index = self.index @@ -4933,7 +4934,7 @@ def repeat(self, repeats, axis=None): """ res = self._from_columns_like_self( Frame._repeat( - [*self.index._data.columns, *self._columns], repeats, axis + [*self.index._columns, *self._columns], repeats, axis ), self._column_names, self._index_names, @@ -6224,7 +6225,7 @@ def _preprocess_subset(self, subset): not np.iterable(subset) or isinstance(subset, str) or isinstance(subset, tuple) - and subset in self._data.names + and subset in self._column_names ): subset = (subset,) diff = set(subset) - set(self._data) @@ -6306,8 +6307,8 @@ def rank( ) numeric_cols = ( name - for name in self._data.names - if _is_non_decimal_numeric_dtype(self._data[name]) + for name, dtype in self._dtypes + if _is_non_decimal_numeric_dtype(dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index b65bc7af832..cfeaca00888 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -140,11 +140,15 @@ def __init__( # right_on. self._using_left_index = bool(left_index) left_on = ( - lhs.index._data.names if left_index else left_on if left_on else on + lhs.index._column_names + if left_index + else left_on + if left_on + else on ) self._using_right_index = bool(right_index) right_on = ( - rhs.index._data.names + rhs.index._column_names if right_index else right_on if right_on @@ -334,18 +338,18 @@ def _merge_results( # All columns from the left table make it into the output. Non-key # columns that share a name with a column in the right table are # suffixed with the provided suffix. - common_names = set(left_result._data.names) & set( - right_result._data.names + common_names = set(left_result._column_names) & set( + right_result._column_names ) cols_to_suffix = common_names - self._key_columns_with_same_name data = { (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col - for name, col in left_result._data.items() + for name, col in left_result._column_labels_and_values } # The right table follows the same rule as the left table except that # key columns from the right table are removed. - for name, col in right_result._data.items(): + for name, col in right_result._column_labels_and_values: if name in common_names: if name not in self._key_columns_with_same_name: data[f"{name}{self.rsuffix}"] = col @@ -399,7 +403,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # producing the input result. by: list[Any] = [] if self._using_left_index and self._using_right_index: - by.extend(result.index._data.columns) + by.extend(result.index._columns) if not self._using_left_index: by.extend([result._data[col.name] for col in self._left_keys]) if not self._using_right_index: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e00890ac5c3..6de3981ba66 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -36,7 +36,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: - from collections.abc import Generator + from collections.abc import Generator, Hashable from typing_extensions import Self @@ -233,8 +233,8 @@ def names(self, value): # to unexpected behavior in some cases. This is # definitely buggy, but we can't disallow non-unique # names either... 
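The suffixing rule that `_merge_results` applies above can be shown with plain dictionaries. A hedged sketch, where `shared_keys` stands in for `self._key_columns_with_same_name` and the suffixes are illustrative defaults:

    # Shared non-key labels get a suffix from their side; shared key
    # labels are emitted once, from the left table.
    def merge_labels(left, right, shared_keys, lsuffix="_x", rsuffix="_y"):
        common = set(left) & set(right)
        to_suffix = common - set(shared_keys)
        out = {
            (f"{name}{lsuffix}" if name in to_suffix else name): col
            for name, col in left.items()
        }
        for name, col in right.items():
            if name in common:
                if name not in shared_keys:
                    out[f"{name}{rsuffix}"] = col
            else:
                out[name] = col
        return out

    # merge_labels({"k": 1, "v": 2}, {"k": 3, "v": 4}, {"k"})
    # -> {"k": 1, "v_x": 2, "v_y": 4}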
-        self._data = self._data.__class__(
-            dict(zip(value, self._data.values())),
+        self._data = type(self._data)(
+            dict(zip(value, self._columns)),
            level_names=self._data.level_names,
            verify=False,
        )
@@ -693,19 +693,25 @@ def where(self, cond, other=None, inplace=False):
    @_performance_tracking
    def _compute_validity_mask(self, index, row_tuple, max_length):
        """Computes the valid set of indices of values in the lookup"""
-        lookup = cudf.DataFrame()
+        lookup_dict = {}
        for i, row in enumerate(row_tuple):
            if isinstance(row, slice) and row == slice(None):
                continue
-            lookup[i] = cudf.Series(row)
-        frame = cudf.DataFrame(dict(enumerate(index._data.columns)))
+            lookup_dict[i] = row
+        lookup = cudf.DataFrame(lookup_dict)
+        frame = cudf.DataFrame._from_data(
+            ColumnAccessor(dict(enumerate(index._columns)), verify=False)
+        )
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", FutureWarning)
            data_table = cudf.concat(
                [
                    frame,
                    cudf.DataFrame._from_data(
-                        {"idx": column.as_column(range(len(frame)))}
+                        ColumnAccessor(
+                            {"idx": column.as_column(range(len(frame)))},
+                            verify=False,
+                        )
                    ),
                ],
                axis=1,
@@ -716,7 +722,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length):
        # TODO: Remove this after merge/join
        # obtain deterministic ordering.
        if cudf.get_option("mode.pandas_compatible"):
-            lookup_order = "_" + "_".join(map(str, lookup._data.names))
+            lookup_order = "_" + "_".join(map(str, lookup._column_names))
            lookup[lookup_order] = column.as_column(range(len(lookup)))
            postprocess = operator.methodcaller(
                "sort_values", by=[lookup_order, "idx"]
@@ -784,7 +790,7 @@ def _index_and_downcast(self, result, index, index_key):
            out_index.insert(
                out_index._num_columns,
                k,
-                cudf.Series._from_column(index._data.columns[k]),
+                cudf.Series._from_column(index._columns[k]),
            )

        # determine if we should downcast from a DataFrame to a Series
@@ -800,19 +806,19 @@ def _index_and_downcast(self, result, index, index_key):
            )
        if need_downcast:
            result = result.T
-            return result[result._data.names[0]]
+            return result[result._column_names[0]]

        if len(result) == 0 and not slice_access:
            # Pandas returns an empty Series with a tuple as name
            # the one expected result column
            result = cudf.Series._from_data(
-                {}, name=tuple(col[0] for col in index._data.columns)
+                {}, name=tuple(col[0] for col in index._columns)
            )
        elif out_index._num_columns == 1:
            # If there's only one column remaining in the output index, convert
            # it into an Index and name the final index values according
            # to that column's name.
-            *_, last_column = index._data.columns
+            last_column = index._columns[-1]
            out_index = cudf.Index._from_column(
                last_column, name=index.names[-1]
            )
@@ -894,7 +900,7 @@ def __eq__(self, other):
                [
                    self_col.equals(other_col)
                    for self_col, other_col in zip(
-                        self._data.values(), other._data.values()
+                        self._columns, other._columns
                    )
                ]
            )
@@ -1041,9 +1047,11 @@ def to_frame(
        )

    @_performance_tracking
-    def get_level_values(self, level) -> cudf.Index:
+    def _level_to_ca_label(self, level) -> tuple[Hashable, int]:
        """
-        Return the values at the requested level
+        Convert a level to a ColumnAccessor label and an integer position.
+
+        Useful if self._column_names != self.names.

        Parameters
        ----------
        level : int or label

        Returns
        -------
-        An Index containing the values at the requested level.
+ tuple[Hashable, int] + (ColumnAccessor label corresponding to level, integer position of the level) """ - colnames = self._data.names - if level not in colnames: + colnames = self._column_names + try: + level_idx = colnames.index(level) + except ValueError: if isinstance(level, int): if level < 0: level = level + len(colnames) @@ -1067,8 +1078,22 @@ def get_level_values(self, level) -> cudf.Index: level = colnames[level_idx] else: raise KeyError(f"Level not found: '{level}'") - else: - level_idx = colnames.index(level) + return level, level_idx + + @_performance_tracking + def get_level_values(self, level) -> cudf.Index: + """ + Return the values at the requested level + + Parameters + ---------- + level : int or label + + Returns + ------- + An Index containing the values at the requested level. + """ + level, level_idx = self._level_to_ca_label(level) level_values = cudf.Index._from_column( self._data[level], name=self.names[level_idx] ) @@ -1420,57 +1445,6 @@ def from_arrays( codes=codes, levels=levels, sortorder=sortorder, names=names ) - @_performance_tracking - def _poplevels(self, level) -> None | MultiIndex | cudf.Index: - """ - Remove and return the specified levels from self. - - Parameters - ---------- - level : level name or index, list - One or more levels to remove - - Returns - ------- - Index composed of the removed levels. If only a single level - is removed, a flat index is returned. If no levels are specified - (empty list), None is returned. - """ - if not pd.api.types.is_list_like(level): - level = (level,) - - ilevels = sorted(self._level_index_from_level(lev) for lev in level) - - if not ilevels: - return None - - popped_data = {} - popped_names = [] - names = list(self.names) - - # build the popped data and names - for i in ilevels: - n = self._data.names[i] - popped_data[n] = self._data[n] - popped_names.append(self.names[i]) - - # pop the levels out from self - # this must be done iterating backwards - for i in reversed(ilevels): - n = self._data.names[i] - names.pop(i) - popped_data[n] = self._data.pop(n) - - # construct the popped result - popped = cudf.core.index._index_from_data(popped_data) - popped.names = popped_names - - # update self - self.names = names - self._levels, self._codes = _compute_levels_and_codes(self._data) - - return popped - @_performance_tracking def swaplevel(self, i=-2, j=-1) -> Self: """ @@ -1507,10 +1481,10 @@ def swaplevel(self, i=-2, j=-1) -> Self: ('aa', 'b')], ) """ - name_i = self._data.names[i] if isinstance(i, int) else i - name_j = self._data.names[j] if isinstance(j, int) else j + name_i = self._column_names[i] if isinstance(i, int) else i + name_j = self._column_names[j] if isinstance(j, int) else j new_data = {} - for k, v in self._data.items(): + for k, v in self._column_labels_and_values: if k not in (name_i, name_j): new_data[k] = v elif k == name_i: @@ -1523,7 +1497,7 @@ def swaplevel(self, i=-2, j=-1) -> Self: return midx @_performance_tracking - def droplevel(self, level=-1) -> MultiIndex | cudf.Index: + def droplevel(self, level=-1) -> Self | cudf.Index: """ Removes the specified levels from the MultiIndex. 
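The label-or-position lookup that `_level_to_ca_label` factors out behaves roughly as below; this is a standalone sketch of the control flow, not the cudf method itself:

    # Resolve a level given either a column label or a (possibly
    # negative) integer position, mirroring the try/except above.
    def resolve_level(colnames: list, level):
        try:
            return level, colnames.index(level)
        except ValueError:
            pass
        if isinstance(level, int):
            if level < 0:
                level += len(colnames)
            if not 0 <= level < len(colnames):
                raise IndexError(f"Invalid level number: '{level}'")
            return colnames[level], level
        raise KeyError(f"Level not found: '{level}'")

    # resolve_level(["a", "b", "c"], -1) -> ("c", 2)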
@@ -1578,11 +1552,24 @@ def droplevel(self, level=-1) -> MultiIndex | cudf.Index: >>> idx.droplevel(["first", "second"]) Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') """ - mi = self.copy(deep=False) - mi._poplevels(level) - if mi.nlevels == 1: - return mi.get_level_values(mi.names[0]) + if is_scalar(level): + level = (level,) + elif len(level) == 0: + return self + + new_names = list(self.names) + new_data = self._data.copy(deep=False) + for i in sorted( + (self._level_index_from_level(lev) for lev in level), reverse=True + ): + new_names.pop(i) + new_data.pop(self._data.names[i]) + + if len(new_data) == 1: + return cudf.core.index._index_from_data(new_data) else: + mi = MultiIndex._from_data(new_data) + mi.names = new_names return mi @_performance_tracking @@ -1886,7 +1873,7 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented - def _level_index_from_level(self, level): + def _level_index_from_level(self, level) -> int: """ Return level index from given level name or index """ @@ -1935,7 +1922,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): join_keys = [ _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip(target._data.columns, self._data.columns) + for lcol, rcol in zip(target._columns, self._columns) ] join_keys = map(list, zip(*join_keys)) scatter_map, indices = libcudf.join.join( @@ -2132,7 +2119,7 @@ def _split_columns_by_levels( lv if isinstance(lv, int) else level_names.index(lv) for lv in levels } - for i, (name, col) in enumerate(zip(self.names, self._data.columns)): + for i, (name, col) in enumerate(zip(self.names, self._columns)): if in_levels and i in level_indices: name = f"level_{i}" if name is None else name yield name, col @@ -2173,9 +2160,7 @@ def _columns_for_reset_index( ) -> Generator[tuple[Any, column.ColumnBase], None, None]: """Return the columns and column names for .reset_index""" if levels is None: - for i, (col, name) in enumerate( - zip(self._data.columns, self.names) - ): + for i, (col, name) in enumerate(zip(self._columns, self.names)): yield f"level_{i}" if name is None else name, col else: yield from self._split_columns_by_levels(levels, in_levels=True) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index c026579b8b5..401fef67ee6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,6 +12,7 @@ from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default +from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty_like from cudf.core.column_accessor import ColumnAccessor @@ -409,7 +410,7 @@ def concat( result_columns = None if keys_objs is None: for o in objs: - for name, col in o._data.items(): + for name, col in o._column_labels_and_values: if name in result_data: raise NotImplementedError( f"A Column with duplicate name found: {name}, cuDF " @@ -437,7 +438,7 @@ def concat( else: # All levels in the multiindex label must have the same type has_multiple_level_types = ( - len({type(name) for o in objs for name in o._data.keys()}) > 1 + len({type(name) for o in objs for name in o._column_names}) > 1 ) if has_multiple_level_types: raise NotImplementedError( @@ -446,7 +447,7 @@ def concat( "the labels to the same type." 
            )
        for k, o in zip(keys_objs, objs):
-            for name, col in o._data.items():
+            for name, col in o._column_labels_and_values:
                # if only series, then only keep keys_objs as column labels
                # if the existing column is multiindex, prepend it
                # to handle cases where dfs and srs are concatenated
@@ -842,7 +843,7 @@ def get_dummies(
    else:
        result_data = {
            col_name: col
-            for col_name, col in data._data.items()
+            for col_name, col in data._column_labels_and_values
            if col_name not in columns
        }

@@ -942,7 +943,7 @@ def _merge_sorted(
    columns = [
        [
-            *(obj.index._data.columns if not ignore_index else ()),
+            *(obj.index._columns if not ignore_index else ()),
            *obj._columns,
        ]
        for obj in objs
@@ -984,7 +985,7 @@ def as_tuple(x):
        return x if isinstance(x, tuple) else (x,)

    nrows = len(index_labels)
-    for col_label, col in df._data.items():
+    for col_label, col in df._column_labels_and_values:
        names = [
            as_tuple(col_label) + as_tuple(name) for name in column_labels
        ]
@@ -1008,7 +1009,7 @@ def as_tuple(x):
    ca = ColumnAccessor(
        result,
        multiindex=True,
-        level_names=(None,) + columns._data.names,
+        level_names=(None,) + columns._column_names,
        verify=False,
    )
    return cudf.DataFrame._from_data(
@@ -1086,11 +1087,7 @@ def pivot(data, columns=None, index=no_default, values=no_default):
    # Create a DataFrame composed of columns from both
    # columns and index
    ca = ColumnAccessor(
-        dict(
-            enumerate(
-                itertools.chain(index._data.columns, columns._data.columns)
-            )
-        ),
+        dict(enumerate(itertools.chain(index._columns, columns._columns))),
        verify=False,
    )
    columns_index = cudf.DataFrame._from_data(ca)
@@ -1227,13 +1224,24 @@ def unstack(df, level, fill_value=None, sort: bool = True):
        )
        return res
    else:
-        df = df.copy(deep=False)
-        columns = df.index._poplevels(level)
-        index = df.index
-        result = _pivot(df, index, columns)
-        if result.index.nlevels == 1:
-            result.index = result.index.get_level_values(result.index.names[0])
-        return result
+        index = df.index.droplevel(level)
+        if is_scalar(level):
+            columns = df.index.get_level_values(level)
+        else:
+            new_names = []
+            ca_data = {}
+            for lev in level:
+                ca_level, level_idx = df.index._level_to_ca_label(lev)
+                new_names.append(df.index.names[level_idx])
+                ca_data[ca_level] = df.index._data[ca_level]
+            columns = type(df.index)._from_data(
+                ColumnAccessor(ca_data, verify=False)
+            )
+            columns.names = new_names
+        result = _pivot(df, index, columns)
+        if result.index.nlevels == 1:
+            result.index = result.index.get_level_values(result.index.names[0])
+        return result
@@ -1548,7 +1556,7 @@ def pivot_table(
    if values_passed and not values_multi and table._data.multiindex:
        column_names = table._data.level_names[1:]
        table_columns = tuple(
-            map(lambda column: column[1:], table._data.names)
+            map(lambda column: column[1:], table._column_names)
        )
        table.columns = pd.MultiIndex.from_tuples(
            tuples=table_columns, names=column_names
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 7197560b5a4..68f34fa28ff 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -186,7 +186,7 @@ def to_datetime(
    if isinstance(arg, cudf.DataFrame):
        # we require at least Ymd
        required = ["year", "month", "day"]
-        req = list(set(required) - set(arg._data.names))
+        req = list(set(required) - set(arg._column_names))
        if len(req):
            err_req = ",".join(req)
            raise ValueError(
@@ -196,7 +196,7 @@ def to_datetime(
            )

        # replace passed column name with values in _unit_map
-        got_units = {k: get_units(k) for k in arg._data.names}
+        got_units = {k: get_units(k) for k in arg._column_names}
        unit_rev = {v: k for k, v in got_units.items()}

        # keys we don't recognize
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index 265b87350ae..3af662b62ea 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -210,7 +210,7 @@ def _can_be_jitted(frame, func, args):
        # See https://github.com/numba/numba/issues/4587
        return False

-    if any(col.has_nulls() for col in frame._data.values()):
+    if any(col.has_nulls() for col in frame._columns):
        return False
    np_field_types = np.dtype(
        list(
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index 6d7362952c9..bfe716f0afc 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -126,25 +126,23 @@ def _get_udf_return_type(argty, func: Callable, args=()):

 def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES):
    return {
-        colname: col.dtype
-        if str(col.dtype) in supported_types
-        else np.dtype("O")
-        for colname, col in frame._data.items()
+        colname: dtype if str(dtype) in supported_types else np.dtype("O")
+        for colname, dtype in frame._dtypes
    }

 def _supported_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES):
    return {
-        colname: col.dtype
-        for colname, col in frame._data.items()
-        if str(col.dtype) in supported_types
+        colname: dtype
+        for colname, dtype in frame._dtypes
+        if str(dtype) in supported_types
    }

 def _supported_cols_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES):
    return {
        colname: col
-        for colname, col in frame._data.items()
+        for colname, col in frame._column_labels_and_values
        if str(col.dtype) in supported_types
    }
@@ -232,8 +230,8 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"):
        *cudautils.make_cache_key(
            func, tuple(_all_dtypes_from_frame(frame).values())
        ),
-        *(col.mask is None for col in frame._data.values()),
-        *frame._data.keys(),
+        *(col.mask is None for col in frame._columns),
+        *frame._column_names,
        scalar_argtypes,
        suffix,
    )
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index a9c20150930..3dc8915bfd1 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -186,13 +186,13 @@ def to_csv(
            "Dataframe doesn't have the labels provided in columns"
        )

-    for col in df._data.columns:
-        if isinstance(col, cudf.core.column.ListColumn):
+    for _, dtype in df._dtypes:
+        if isinstance(dtype, cudf.ListDtype):
            raise NotImplementedError(
                "Writing to csv format is not yet supported with "
                "list columns."
            )
-        elif isinstance(col, cudf.core.column.StructColumn):
+        elif isinstance(dtype, cudf.StructDtype):
            raise NotImplementedError(
                "Writing to csv format is not yet supported with "
                "Struct columns."
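The writer-side checks above now inspect dtypes rather than concrete column classes. With the public API, the same guard can be sketched like this (cudf.ListDtype and cudf.StructDtype are real cudf types; the frame contents and error text are illustrative):

    import cudf

    df = cudf.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    # df.dtypes.items() is the public counterpart of the internal _dtypes
    for name, dtype in df.dtypes.items():
        if isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)):
            raise NotImplementedError(
                f"Writing column {name!r} of type {dtype} is not supported"
            )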
@@ -203,12 +203,11 @@ def to_csv( # workaround once following issue is fixed: # https://github.com/rapidsai/cudf/issues/6661 if any( - isinstance(col, cudf.core.column.CategoricalColumn) - for col in df._data.columns + isinstance(dtype, cudf.CategoricalDtype) for _, dtype in df._dtypes ) or isinstance(df.index, cudf.CategoricalIndex): df = df.copy(deep=False) - for col_name, col in df._data.items(): - if isinstance(col, cudf.core.column.CategoricalColumn): + for col_name, col in df._column_labels_and_values: + if isinstance(col.dtype, cudf.CategoricalDtype): df._data[col_name] = col.astype(col.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 1347b2cc38f..fe8e446f9c0 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -79,13 +79,13 @@ def to_dlpack(cudf_obj): ) if any( - not cudf.api.types._is_non_decimal_numeric_dtype(col.dtype) - for col in gdf._data.columns + not cudf.api.types._is_non_decimal_numeric_dtype(dtype) + for _, dtype in gdf._dtypes ): raise TypeError("non-numeric data not yet supported") dtype = cudf.utils.dtypes.find_common_type( - [col.dtype for col in gdf._data.columns] + [dtype for _, dtype in gdf._dtypes] ) gdf = gdf.astype(dtype) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index fd246c6215f..c54293badbe 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -396,8 +396,8 @@ def to_orc( ): """{docstring}""" - for col in df._data.columns: - if isinstance(col, cudf.core.column.CategoricalColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "Writing to ORC format is not yet supported with " "Categorical columns." diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 0dbe8bc758b..0c1cda8810b 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -887,6 +887,20 @@ class ProxyFallbackError(Exception): pass +def _fast_function_call(): + """ + Placeholder fast function for pytest profiling purposes. + """ + return None + + +def _slow_function_call(): + """ + Placeholder slow function for pytest profiling purposes. + """ + return None + + def _fast_slow_function_call( func: Callable, /, @@ -916,6 +930,7 @@ def _fast_slow_function_call( # try slow path raise Exception() fast = True + _fast_function_call() if _env_get_bool("CUDF_PANDAS_DEBUGGING", False): try: with nvtx.annotate( @@ -962,6 +977,7 @@ def _fast_slow_function_call( from ._logger import log_fallback log_fallback(slow_args, slow_kwargs, err) + _slow_function_call() with disable_module_accelerator(): result = func(*slow_args, **slow_kwargs) return _maybe_wrap_result(result, func, *args, **kwargs), fast diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 505a40b0bfa..d12d2697729 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -1,10 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 import contextlib +import json import os import sys +import traceback +from collections import defaultdict from functools import wraps import pytest @@ -36,4 +39,58 @@ def patch_testing_functions(): pytest.raises = replace_kwargs({"match": None})(pytest.raises) +# Dictionary to store function call counts +function_call_counts = {} # type: ignore + +# The specific functions to track +FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"} + + +def find_pytest_file(frame): + stack = traceback.extract_stack() + absolute_paths = [frame.filename for frame in stack] + for file in absolute_paths: + if "pandas-testing/pandas-tests/tests" in file and file.rsplit("/", 1)[ + -1 + ].startswith("test_"): + return str(file).rsplit("pandas-tests/", 1)[-1] + return None + + +def trace_calls(frame, event, arg): + if event != "call": + return + code = frame.f_code + func_name = code.co_name + + if func_name in FUNCTION_NAME: + filename = find_pytest_file(frame) + if filename is None: + return + if filename not in function_call_counts: + function_call_counts[filename] = defaultdict(int) + function_call_counts[filename][func_name] += 1 + + +def pytest_sessionstart(session): + # Set the profile function to trace calls + sys.setprofile(trace_calls) + + +def pytest_sessionfinish(session, exitstatus): + # Remove the profile function + sys.setprofile(None) + + +@pytest.hookimpl(trylast=True) +def pytest_unconfigure(config): + if hasattr(config, "workerinput"): + # Running in xdist worker, write the counts before exiting + worker_id = config.workerinput["workerid"] + output_file = f"function_call_counts_worker_{worker_id}.json" + with open(output_file, "w") as f: + json.dump(function_call_counts, f, indent=4) + print(f"Function call counts have been written to {output_file}") + + sys.path.append(os.path.dirname(__file__)) diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 9c65b74d081..9b9ce026571 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -64,8 +64,6 @@ markers = [ "skip_ubsan: Tests known to fail UBSAN check", ] EOF - # append the contents of patch-confest.py to conftest.py - cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py # Substitute `pandas.tests` with a relative import. # This will depend on the location of the test module relative to @@ -137,7 +135,7 @@ and not test_eof_states \ and not test_array_tz" # TODO: Remove "not db" once a postgres & mysql container is set up on the CI -PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ +PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \ -v -m "not single_cpu and not db" \ -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \ --import-mode=importlib \ @@ -146,5 +144,4 @@ PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ mv *.json .. cd .. 
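The conftest patch above counts how often the proxy's fast and slow marker functions run by installing a profile hook. Stripped of the pytest and xdist plumbing, the mechanism reduces to this self-contained sketch:

    import sys
    from collections import defaultdict

    counts = defaultdict(int)
    TRACKED = {"_slow_function_call", "_fast_function_call"}

    def trace_calls(frame, event, arg):
        # sys.setprofile invokes this for every call event
        if event == "call" and frame.f_code.co_name in TRACKED:
            counts[frame.f_code.co_name] += 1

    sys.setprofile(trace_calls)
    # ... run code that hits the marker functions ...
    sys.setprofile(None)
    print(dict(counts))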
-    rm -rf pandas-testing/pandas-tests/
diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
index ffd2abb960d..4ea0b3b4413 100644
--- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py
+++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
@@ -12,7 +12,9 @@
 """

 import argparse
+import glob
 import json
+import os

 from rich.console import Console
 from rich.table import Table
@@ -57,6 +59,44 @@ def get_per_module_results(log_file_name):
            per_module_results[module_name].setdefault(outcome, 0)
            per_module_results[module_name]["total"] += 1
            per_module_results[module_name][outcome] += 1
+
+    directory = os.path.dirname(log_file_name)
+    pattern = os.path.join(directory, "function_call_counts_worker_*.json")
+    matching_files = glob.glob(pattern)
+    function_call_counts = {}
+
+    for file in matching_files:
+        with open(file) as f:
+            function_call_count = json.load(f)
+        if not function_call_counts:
+            function_call_counts.update(function_call_count)
+        else:
+            for key, value in function_call_count.items():
+                if key not in function_call_counts:
+                    function_call_counts[key] = value
+                else:
+                    if "_slow_function_call" not in function_call_counts[key]:
+                        function_call_counts[key]["_slow_function_call"] = 0
+                    if "_fast_function_call" not in function_call_counts[key]:
+                        function_call_counts[key]["_fast_function_call"] = 0
+                    function_call_counts[key]["_slow_function_call"] += (
+                        value.get("_slow_function_call", 0)
+                    )
+                    function_call_counts[key]["_fast_function_call"] += (
+                        value.get("_fast_function_call", 0)
+                    )
+
+    for key, value in per_module_results.items():
+        if key in function_call_counts:
+            per_module_results[key]["_slow_function_call"] = (
+                function_call_counts[key].get("_slow_function_call", 0)
+            )
+            per_module_results[key]["_fast_function_call"] = (
+                function_call_counts[key].get("_fast_function_call", 0)
+            )
+        else:
+            per_module_results[key]["_slow_function_call"] = 0
+            per_module_results[key]["_fast_function_call"] = 0
    return per_module_results
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index 31ad24a4664..668e7a77454 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -676,7 +676,7 @@ def assert_frame_equal(
    if check_like:
        left, right = left.reindex(index=right.index), right
-        right = right[list(left._data.names)]
+        right = right[list(left._column_names)]

    # index comparison
    assert_index_equal(
diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini
index 2136bca0e28..8a594794fac 100644
--- a/python/cudf/cudf/tests/pytest.ini
+++ b/python/cudf/cudf/tests/pytest.ini
@@ -14,3 +14,4 @@ filterwarnings =
    ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning
    # PerformanceWarning from cupy warming up the JIT cache
    ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning
+addopts = --tb=native
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index b1e095e8853..c41be3e4428 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -813,8 +813,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep):
        mi1 = gdf.groupby(["Date", "Symbol"]).mean().index
        mi2 = mi1.copy(deep=deep)

-        lchildren = [col.children for _, col in mi1._data.items()]
-        rchildren = [col.children for _, col in mi2._data.items()]
+        lchildren = [col.children for col in mi1._columns]
+        rchildren = [col.children for col in mi2._columns]

        # Flatten
        lchildren = reduce(operator.add, lchildren)
@@ -849,12 +849,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep):
        assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))

        # Assert ._data identity
-        lptrs = [
-            d.base_data.get_ptr(mode="read") for _, d in mi1._data.items()
-        ]
-        rptrs = [
-            d.base_data.get_ptr(mode="read") for _, d in mi2._data.items()
-        ]
+        lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns]
+        rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns]

        assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))
    cudf.set_option("copy_on_write", original_cow_setting)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini
index 817d98e6ba2..98459035298 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini
@@ -1,3 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
 [pytest]
 xfail_strict=true
 markers=
@@ -5,3 +7,4 @@ markers=
    xfail_gold: this test is expected to fail in the gold pass
    xfail_cudf_pandas: this test is expected to fail in the cudf_pandas pass
    xfail_compare: this test is expected to fail in the comparison pass
+addopts = --tb=native
diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/cudf_kafka/cudf_kafka/tests/pytest.ini
new file mode 100644
index 00000000000..7b0a9f29fb1
--- /dev/null
+++ b/python/cudf_kafka/cudf_kafka/tests/pytest.ini
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native
diff --git a/python/cudf_polars/tests/pytest.ini b/python/cudf_polars/tests/pytest.ini
new file mode 100644
index 00000000000..7b0a9f29fb1
--- /dev/null
+++ b/python/cudf_polars/tests/pytest.ini
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native
diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini
new file mode 100644
index 00000000000..7b0a9f29fb1
--- /dev/null
+++ b/python/custreamz/custreamz/tests/pytest.ini
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native
diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/dask_cudf/dask_cudf/tests/pytest.ini
new file mode 100644
index 00000000000..7b0a9f29fb1
--- /dev/null
+++ b/python/dask_cudf/dask_cudf/tests/pytest.ini
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
index c2fb5f0dce4..eac0f748257 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
@@ -24,4 +24,9 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil:
    cdef unique_ptr[column] like(
        column_view source_strings,
        string_scalar pattern,
-        string_scalar escape) except +
+        string_scalar escape_character) except +
+
+    cdef unique_ptr[column] like(
+        column_view source_strings,
+        column_view patterns,
+        string_scalar escape_character) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
index 12cd628fc1f..b7166167cfd 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
@@ -10,5 +10,9 @@ from pylibcudf.libcudf.table.table cimport table

 cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil:
    cdef unique_ptr[table] extract(
-        column_view source_strings,
-        regex_program) except +
+        column_view input,
+        regex_program prog) except +
+
+    cdef unique_ptr[column] extract_all_record(
+        column_view input,
+        regex_program prog) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
index 410ff58f299..59262820411 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
@@ -10,9 +10,9 @@ cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \
        nogil:

    cdef unique_ptr[column] repeat_strings(
-        column_view strings,
-        size_type repeat) except +
+        column_view input,
+        size_type repeat_times) except +

    cdef unique_ptr[column] repeat_strings(
-        column_view strings,
-        column_view repeats) except +
+        column_view input,
+        column_view repeat_times) except +
diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
index b499a127541..d3065cf8667 100644
--- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
@@ -12,8 +12,8 @@
 # the License.
 # =============================================================================

-set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
-    regex_program.pyx replace.pyx slice.pyx
+set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx
+    regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx slice.pyx
 )

 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
index d1f632d6d8e..6848c8e6e86 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd
@@ -5,6 +5,7 @@ from . cimport (
    case,
    char_types,
    contains,
+    extract,
    find,
    regex_flags,
    regex_program,
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index ef102aff2af..bba86e818cc 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -5,9 +5,11 @@
    case,
    char_types,
    contains,
+    extract,
    find,
    regex_flags,
    regex_program,
+    repeat,
    replace,
    slice,
 )
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd
index 2cd4891a0ea..6146a1119d6 100644
--- a/python/pylibcudf/pylibcudf/strings/contains.pxd
+++ b/python/pylibcudf/pylibcudf/strings/contains.pxd
@@ -1,7 +1,21 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
 from pylibcudf.strings.regex_program cimport RegexProgram

+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar

 cpdef Column contains_re(Column input, RegexProgram prog)
+
+cpdef Column count_re(Column input, RegexProgram prog)
+
+cpdef Column matches_re(Column input, RegexProgram prog)
+
+cpdef Column like(
+    Column input,
+    ColumnOrScalar pattern,
+    Scalar escape_character = *
+)
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx
index 1a2446f6e2c..82bd1fbea32 100644
--- a/python/pylibcudf/pylibcudf/strings/contains.pyx
+++ b/python/pylibcudf/pylibcudf/strings/contains.pyx
@@ -1,8 +1,14 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move

+from cython.operator import dereference
+
 from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
 from pylibcudf.libcudf.strings cimport contains as cpp_contains
 from pylibcudf.strings.regex_program cimport RegexProgram

@@ -32,9 +38,131 @@ cpdef Column contains_re(
    cdef unique_ptr[column] result

    with nogil:
-        result = cpp_contains.contains_re(
+        result = move(cpp_contains.contains_re(
+            input.view(),
+            prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column count_re(
+    Column input,
+    RegexProgram prog
+):
+    """Returns the number of times the given regex_program's pattern
+    matches in each string.
+
+    For details, see :cpp:func:`cudf::strings::count_re`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of match counts for each string
+    """
+
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = move(cpp_contains.count_re(
            input.view(),
            prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column matches_re(
+    Column input,
+    RegexProgram prog
+):
+    """Returns a boolean column identifying rows which
+    match the given regex_program object, but only at
+    the beginning of the string.
+
+    For details, see :cpp:func:`cudf::strings::matches_re`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of boolean results for each string
+    """
+
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = move(cpp_contains.matches_re(
+            input.view(),
+            prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character=None):
+    """
+    Returns a boolean column identifying rows which
+    match the given like pattern.
+
+    For details, see :cpp:func:`cudf::strings::like`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    pattern : Column or Scalar
+        Like patterns to match within each string
+    escape_character : Scalar
+        Optional character that specifies the escape prefix.
+        Default is no escape character.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of boolean results for each string
+    """
+    cdef unique_ptr[column] result
+
+    if escape_character is None:
+        escape_character = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
        )
+    cdef const string_scalar* c_escape_character = <const string_scalar*>(
+        escape_character.c_obj.get()
+    )
+    cdef const string_scalar* c_pattern
+
+    if ColumnOrScalar is Column:
+        with nogil:
+            result = move(cpp_contains.like(
+                input.view(),
+                pattern.view(),
+                dereference(c_escape_character)
+            ))
+    elif ColumnOrScalar is Scalar:
+        c_pattern = <const string_scalar*>(pattern.c_obj.get())
+        with nogil:
+            result = move(cpp_contains.like(
+                input.view(),
+                dereference(c_pattern),
+                dereference(c_escape_character)
+            ))
+    else:
+        raise ValueError("pattern must be a Column or a Scalar")
+    return Column.from_libcudf(move(result))
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/strings/extract.pxd
new file mode 100644
index 00000000000..3871f5a0e4e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/extract.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table extract(Column input, RegexProgram prog)
+
+cpdef Column extract_all_record(Column input, RegexProgram prog)
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx
new file mode 100644
index 00000000000..dcb11ca10ce
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/extract.pyx
@@ -0,0 +1,76 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport extract as cpp_extract
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table extract(Column input, RegexProgram prog):
+    """
+    Returns a table of strings columns where each column
+    corresponds to the matching group specified in the given
+    regex_program object.
+
+    For details, see :cpp:func:`cudf::strings::extract`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    Table
+        Columns of strings extracted from the input column.
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_extract.extract(
+                input.view(),
+                prog.c_obj.get()[0]
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef Column extract_all_record(Column input, RegexProgram prog):
+    """
+    Returns a lists column of strings, where each row contains the
+    strings for the matching groups specified in the given
+    regex_program object.
+
+    For details, see :cpp:func:`cudf::strings::extract_all_record`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    Column
+        Lists column containing strings extracted from the input column
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_extract.extract_all_record(
+                input.view(),
+                prog.c_obj.get()[0]
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/strings/repeat.pxd
new file mode 100644
index 00000000000..bc70926b6fa
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/repeat.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+
+ctypedef fused ColumnorSizeType:
+    Column
+    size_type
+
+cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times)
diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx
new file mode 100644
index 00000000000..5f627218f6e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx
@@ -0,0 +1,51 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport repeat as cpp_repeat
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times):
+    """
+    Repeat each string in the given strings column the number of
+    times given by a numeric column, or a fixed number of times
+    given by an integer.
+
+    For details, see :cpp:func:`cudf::strings::repeat_strings`.
+
+    Parameters
+    ----------
+    input : Column
+        The column containing strings to repeat.
+    repeat_times : Column or int
+        Number(s) of times that the corresponding input strings
+        for each row are repeated.
+
+    Returns
+    -------
+    Column
+        New column containing the repeated strings.
+ """ + cdef unique_ptr[column] c_result + + if ColumnorSizeType is Column: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times.view() + ) + ) + elif ColumnorSizeType is size_type: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times + ) + ) + else: + raise ValueError("repeat_times must be size_type or integer") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini index 1761c0f011c..f572f85ca49 100644 --- a/python/pylibcudf/pylibcudf/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/tests/pytest.ini @@ -6,3 +6,4 @@ filterwarnings = error ignore:::.*xdist.* ignore:::.*pytest.* +addopts = --tb=native diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py index 4f88e09183f..4e4dd7cbb00 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py @@ -48,3 +48,40 @@ def test_contains_re(target_col, pa_target_scalar, plc_target_pat): pa_target_col, pa_target_scalar.as_py() ) assert_column_eq(got, expected) + + +def test_count_re(): + pattern = "[1-9][a-z]" + arr = pa.array(["A1a2A3a4", "A1A2A3", None]) + result = plc.strings.contains.count_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + expected = pc.count_substring_regex(arr, pattern) + assert_column_eq(result, expected) + + +def test_match_re(): + pattern = "[1-9][a-z]" + arr = pa.array(["1a2b", "b1a2", None]) + result = plc.strings.contains.matches_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + expected = pc.match_substring_regex(arr, f"^{pattern}") + assert_column_eq(result, expected) + + +def test_like(): + pattern = "%a" + arr = pa.array(["1a2aa3aaa"]) + result = plc.strings.contains.like( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(pa.array([pattern])), + ) + expected = pc.match_like(arr, pattern) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py new file mode 100644 index 00000000000..788b86423c4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+
+
+def test_extract():
+    pattern = "([ab])(\\d)"
+    pa_pattern = "(?P<letter>[ab])(?P<digit>\\d)"
+    arr = pa.array(["a1", "b2", "c3"])
+    plc_result = plc.strings.extract.extract(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pc.extract_regex(arr, pa_pattern)
+    for i, result_col in enumerate(result.itercolumns()):
+        expected_col = pa.chunked_array(expected.field(i))
+        assert result_col.fill_null("").equals(expected_col)
+
+
+def test_extract_all_record():
+    pattern = "([ab])(\\d)"
+    arr = pa.array(["a1", "b2", "c3"])
+    plc_result = plc.strings.extract.extract_all_record(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pa.chunked_array(
+        [pa.array([["a", "1"], ["b", "2"], None], type=result.type)]
+    )
+    assert result.equals(expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
new file mode 100644
index 00000000000..18b5d8bf4d0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+import pytest
+
+
+@pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2])
+def test_repeat_strings(repeats):
+    arr = pa.array(["1", None])
+    plc_result = plc.strings.repeat.repeat_strings(
+        plc.interop.from_arrow(arr),
+        plc.interop.from_arrow(repeats)
+        if not isinstance(repeats, int)
+        else repeats,
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pa.chunked_array(pc.binary_repeat(arr, repeats))
+    assert result.equals(expected)
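Taken together, the new pylibcudf strings bindings compose as in this short end-to-end sketch, mirroring the interop round-trips the tests above use (pattern and data are illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["a1", "b2", None]))
    prog = plc.strings.regex_program.RegexProgram.create(
        "([ab])(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    counts = plc.strings.contains.count_re(col, prog)    # matches per row
    groups = plc.strings.extract.extract(col, prog)      # one column per group
    doubled = plc.strings.repeat.repeat_strings(col, 2)  # scalar fused-type path
    print(plc.interop.to_arrow(counts))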