diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d6d3e3fdd33..b5d17022a3a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-libcudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -81,7 +81,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-build-pylibcudf: needs: [wheel-publish-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -113,7 +113,7 @@ jobs: wheel-build-cudf: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,7 +134,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -157,7 +157,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -169,7 +169,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index d670132cca9..10c803f7921 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a4a8f036174..b515dbff9f3 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -37,7 +37,7 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -52,7 +52,7 @@ jobs: steps: - name: Get PR info id: get-pr-info - uses: rapidsai/shared-actions/get-pr-info@main + uses: nv-gha-runners/get-pr-info@main - name: Checkout code repo uses: actions/checkout@v4 with: @@ -104,39 +104,39 @@ jobs: - '!notebooks/**' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_cpp == 'true' with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -145,7 +145,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -153,7 +153,7 @@ jobs: conda-java-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 if: needs.changed-files.outputs.test_java == 'true' with: build_type: pull-request @@ -164,7 +164,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -174,7 +174,7 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 if: needs.changed-files.outputs.test_notebooks == 'true' with: build_type: pull-request @@ -185,7 +185,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -195,7 +195,7 @@ jobs: wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -204,21 +204,21 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -226,7 +226,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -235,7 +235,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -247,7 +247,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -256,7 +256,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -265,7 +265,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -276,7 +276,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -287,7 +287,7 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -299,7 +299,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index fe77ad4b6b2..45e5191eb54 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4af6a0d690d..8605fa46f68 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -126,7 +126,7 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index f62f7885d63..8f8c2adac2f 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12 ### Conda -cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel: +cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel: ```bash conda install -c rapidsai -c conda-forge -c nvidia \ diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py index 93a815838b7..7a12db927e5 100644 --- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py +++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py @@ -68,8 +68,18 @@ def emoji_failed(x): pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index() main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index() diff_df = pr_df - main_df +total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call'] +pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1) +pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1) -pr_df = pr_df[["total", "passed", "failed", "skipped"]] +cpu_usage_mean = pr_df['CPU Usage'].mean().round(2) +gpu_usage_mean = pr_df['GPU Usage'].mean().round(2) + +# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns +pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%' +pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%' + +pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']] diff_df = diff_df[["total", "passed", "failed", "skipped"]] diff_df.columns = diff_df.columns + "_diff" diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed) @@ -95,6 +105,8 @@ def emoji_failed(x): print(comment) print() +print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%") +print() print("Here are the results of running the Pandas tests against this PR:") print() print(df.to_markdown()) diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 28ded2f8e0f..a701bfe15e0 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -39,6 +39,7 @@ rapids-logger "pytest pylibcudf" pushd python/pylibcudf/pylibcudf/tests python -m pytest \ --cache-clear \ + --numprocesses=8 \ --dist=worksteal \ . popd diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 0d39807d56c..361a42ccda9 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -41,6 +41,7 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ + --dist=worksteal \ . popd @@ -50,5 +51,6 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ + --dist=worksteal \ . popd diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 3bf9d02b384..abc6f74fccf 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -177,11 +177,11 @@ ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp) # ################################################################################################## # * nds-h benchmark -------------------------------------------------------------------------------- -ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q01_NVBENCH ndsh/q01.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q05_NVBENCH ndsh/q05.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q06_NVBENCH ndsh/q06.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q09_NVBENCH ndsh/q09.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q10_NVBENCH ndsh/q10.cpp ndsh/utilities.cpp) # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- @@ -337,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index 61e79a47a50..e4ff0c8c4a7 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -50,7 +50,7 @@ static void bench_hash(nvbench::state& state) state.add_global_memory_reads(num_rows); // add memory read from bitmaks if (!no_nulls) { - state.add_global_memory_reads(2 * + state.add_global_memory_reads(2L * cudf::bitmask_allocation_size_bytes(num_rows)); } // memory written depends on used hash @@ -63,37 +63,37 @@ static void bench_hash(nvbench::state& state) }); } else if (hash_name == "md5") { // md5 creates a 32-byte string - state.add_global_memory_writes(32 * num_rows); + state.add_global_memory_writes(32L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); }); } else if (hash_name == "sha1") { // sha1 creates a 40-byte string - state.add_global_memory_writes(40 * num_rows); + state.add_global_memory_writes(40L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha1(data->view()); }); } else if (hash_name == "sha224") { // sha224 creates a 56-byte string - state.add_global_memory_writes(56 * num_rows); + state.add_global_memory_writes(56L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha224(data->view()); }); } else if (hash_name == "sha256") { // sha256 creates a 64-byte string - state.add_global_memory_writes(64 * num_rows); + state.add_global_memory_writes(64L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha256(data->view()); }); } else if (hash_name == "sha384") { // sha384 creates a 96-byte string - state.add_global_memory_writes(96 * num_rows); + state.add_global_memory_writes(96L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha384(data->view()); }); } else if (hash_name == "sha512") { // sha512 creates a 128-byte string - state.add_global_memory_writes(128 * num_rows); + state.add_global_memory_writes(128L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha512(data->view()); }); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 7563c823454..ce115fd7723 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64; void parquet_read_common(cudf::size_type num_rows_to_read, cudf::size_type num_cols_to_read, cuio_source_sink_pair& source_sink, - nvbench::state& state) + nvbench::state& state, + size_t table_data_size = data_size) { cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); @@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read, }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_element_count(static_cast(table_data_size) / time, "bytes_per_second"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); @@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list +void BM_parquet_read_wide_tables(nvbench::state& state, + nvbench::type_list> type_list) +{ + auto const d_type = get_type_or_group(static_cast(DataType)); + + auto const n_col = static_cast(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + +void BM_parquet_read_wide_tables_mixed(nvbench::state& state) +{ + auto const d_type = []() { + auto d_type1 = get_type_or_group(static_cast(data_type::INTEGRAL)); + auto d_type2 = get_type_or_group(static_cast(data_type::FLOAT)); + d_type1.reserve(d_type1.size() + d_type2.size()); + std::move(d_type2.begin(), d_type2.end(), std::back_inserter(d_type1)); + return d_type1; + }(); + + auto const n_col = static_cast(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + using d_type_list = nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table)) + .set_name("parquet_read_wide_tables") + .set_min_samples(4) + .set_type_axes_names({"data_type"}) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + +NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed) + .set_name("parquet_read_wide_tables_mixed") + .set_min_samples(4) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + // a benchmark for structs that only contain fixed-width types using d_type_list_struct_only = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only)) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index 3abd4280081..7121cb9f034 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -50,7 +50,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state) } std::tuple, size_t, size_t> write_file_data( - nvbench::state& state, std::vector const& d_types) + nvbench::state& state, std::vector const& d_types, io_type io_source_type) { cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); @@ -63,7 +63,7 @@ std::tuple, size_t, size_t> write_file_data( size_t total_file_size = 0; for (size_t i = 0; i < num_files; ++i) { - cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; + cuio_source_sink_pair source_sink{io_source_type}; auto const tbl = create_random_table( cycle_dtypes(d_types, num_cols), @@ -92,11 +92,13 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, { size_t const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -173,10 +175,12 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); size_t const input_limit = state.get_int64("input_limit"); size_t const output_limit = state.get_int64("output_limit"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -264,7 +268,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .set_name("parquet_multithreaded_read_decode_fixed_width") @@ -273,7 +278,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .set_name("parquet_multithreaded_read_decode_string") @@ -282,7 +288,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .set_name("parquet_multithreaded_read_decode_list") @@ -291,7 +298,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); // mixed data types: fixed width, strings NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) @@ -303,7 +311,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .set_name("parquet_multithreaded_read_decode_chunked_fixed_width") @@ -314,7 +323,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .set_name("parquet_multithreaded_read_decode_chunked_string") @@ -325,7 +335,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .set_name("parquet_multithreaded_read_decode_chunked_list") @@ -336,4 +347,5 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp new file mode 100644 index 00000000000..adc3dddc59c --- /dev/null +++ b/cpp/benchmarks/text/word_minhash.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +static void bench_word_minhash(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const base64 = state.get_int64("hash_type") == 64; + + data_profile const strings_profile = + data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); + auto strings_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + + auto const num_offsets = (num_rows / row_width) + 1; + auto offsets = cudf::sequence(num_offsets, + cudf::numeric_scalar(0), + cudf::numeric_scalar(row_width)); + + auto source = cudf::make_lists_column(num_offsets - 1, + std::move(offsets), + std::move(strings_table->release().front()), + 0, + rmm::device_buffer{}); + + data_profile const seeds_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), distribution_id::NORMAL, 0, 256); + auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); // output are hashes + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) + : nvtext::word_minhash(source->view(), seeds.view()); + }); +} + +NVBENCH_BENCH(bench_word_minhash) + .set_name("word_minhash") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) + .add_int64_axis("row_width", {10, 100, 1000}) + .add_int64_axis("seed_count", {2, 25}) + .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 8d206f245dc..6d1c91a5752 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions. **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen. +By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include: +- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`) +- Line separator (Unicode: `2028`, UTF-8: `E280A8`) +- Next line (Unicode: `0085`, UTF-8: `C285`) +- Carriage return (Unicode: `000D`, UTF-8: `0D`) + **Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following: - Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals. - Unmatched paired special characters like `()`, `[]`, and `{}`. diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index b257eef1e9e..4255faea702 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1497,8 +1497,7 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation); * * @tparam F Type of callable * @param k The `aggregation::Kind` value to dispatch - * aram f The callable that accepts an `aggregation::Kind` non-type template - * argument. + * @param f The callable that accepts an `aggregation::Kind` callable function object. * @param args Parameter pack forwarded to the `operator()` invocation * @return Forwards the return value of the callable. */ @@ -1626,6 +1625,7 @@ struct dispatch_source { * parameter of the callable `F` * @param k The `aggregation::Kind` used to dispatch an `aggregation::Kind` * non-type template parameter for the second template parameter of the callable + * @param f The callable that accepts `data_type` and `aggregation::Kind` function object. * @param args Parameter pack forwarded to the `operator()` invocation * `F`. */ @@ -1644,8 +1644,8 @@ CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(d * @brief Returns the target `data_type` for the specified aggregation k * performed on elements of type source_type. * - * aram source_type The element type to be aggregated - * aram k The aggregation + * @param source_type The element type to be aggregated + * @param k The aggregation kind * @return data_type The target_type of k performed on source_type * elements */ diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index 672b95e2d01..80a4460023f 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -143,29 +143,28 @@ std::unique_ptr make_tdigest_column(size_type num_rows, rmm::device_async_resource_ref mr); /** - * @brief Create a tdigest column of empty clusters. + * @brief Create an empty tdigest column. * - * The column created contains the specified number of rows of empty clusters. + * An empty tdigest column contains a single row of length 0 * * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns A tdigest column of empty clusters. + * @returns An empty tdigest column. */ CUDF_EXPORT -std::unique_ptr make_tdigest_column_of_empty_clusters(size_type num_rows, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** - * @brief Create a scalar of an empty tdigest cluster. + * @brief Create an empty tdigest scalar. * - * The returned scalar is a struct_scalar that contains a single row of an empty cluster. + * An empty tdigest scalar is a struct_scalar that contains a single row of length 0 * * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns A scalar of an empty tdigest cluster. + * @returns An empty tdigest scalar. */ std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 73ff17b2b93..940d03cdb41 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -69,11 +69,21 @@ void normalize_single_quotes(datasource::owning_buffer& inda * @brief Normalize unquoted whitespace (space and tab characters) using FST * * @param indata Input device buffer + * @param col_offsets Offsets to column contents in input buffer + * @param col_lengths Length of contents of each row in column * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation + * + * @returns Tuple of the normalized column, offsets to each row in column, and lengths of contents + * of each row */ -void normalize_whitespace(datasource::owning_buffer& indata, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + normalize_whitespace(device_span d_input, + device_span col_offsets, + device_span col_lengths, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace io::json::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ed7b2ac0850..ee03a382bec 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -39,8 +39,9 @@ namespace io { * @file */ -constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; ///< 128MB per row group -constexpr size_type default_row_group_size_rows = 1000000; ///< 1 million rows per row group +constexpr size_t default_row_group_size_bytes = + std::numeric_limits::max(); ///< Infinite bytes per row group +constexpr size_type default_row_group_size_rows = 1'000'000; ///< 1 million rows per row group constexpr size_t default_max_page_size_bytes = 512 * 1024; ///< 512KB per page constexpr size_type default_max_page_size_rows = 20000; ///< 20k rows per page constexpr int32_t default_column_index_truncate_length = 64; ///< truncate to 64 bytes diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f7108129dee..4f3fc7086f2 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -35,10 +35,11 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, ///< default - MULTILINE = 8, ///< the '^' and '$' honor new-line characters - DOTALL = 16, ///< the '.' matching includes new-line characters - ASCII = 256 ///< use only ASCII when matching built-in character classes + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16, ///< the '.' matching includes new-line characters + ASCII = 256, ///< use only ASCII when matching built-in character classes + EXT_NEWLINE = 512 ///< new-line matches extended characters }; /** @@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f) return (f & regex_flags::ASCII) == regex_flags::ASCII; } +/** + * @brief Returns true if the given flags contain EXT_NEWLINE + * + * @param f Regex flags to check + * @return true if `f` includes EXT_NEWLINE + */ +constexpr bool is_ext_newline(regex_flags const f) +{ + return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE; +} + /** * @brief Capture groups setting * diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index abb26d7ccb4..14695c3bb27 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper __device__ inline string_view::const_iterator& string_view::const_iterator::operator--() { - if (byte_pos > 0) - while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) - ; + if (byte_pos > 0) { + if (byte_pos == char_pos) { + --byte_pos; + } else { + while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) + ; + } + } --char_pos; return *this; } diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index be7d19b2227..1758790cd64 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op) static_cast(values).type(), tdigest_gen{}, op, values, delta); // NOTE: an empty tdigest column still has 1 row. - auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -562,12 +562,12 @@ template void tdigest_merge_empty(MergeFunc merge_op) { // 3 empty tdigests all in the same group - auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto b = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto b = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); cols.push_back(*b); @@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op) auto const delta = 1000; auto result = merge_op(*values, delta); - auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index c83a4260c19..7c909f1a948 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -72,7 +73,7 @@ std::unique_ptr minhash( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -133,7 +134,7 @@ std::unique_ptr minhash64( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -150,5 +151,61 @@ std::unique_ptr minhash64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column are seed results for the corresponding + * input row. The order of the elements in each row match the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr word_minhash( + cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column are seed results for the corresponding + * input row. The order of the elements in each row match the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x64_128 for the hash algorithm though + * only the first 64-bits of the hash are used in computing the output. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr word_minhash64( + cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 8890c786287..756047d383a 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -625,6 +626,8 @@ void make_device_json_column(device_span input, auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); std::vector is_mixed_type_column(num_columns, 0); std::vector is_pruned(num_columns, 0); + // for columns that are not mixed type but have been forced as string + std::vector forced_as_string_column(num_columns); columns.try_emplace(parent_node_sentinel, std::ref(root)); std::function remove_child_columns = @@ -695,11 +698,14 @@ void make_device_json_column(device_span input, // Struct, List, String, Value auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // if parent is mixed type column or this column is pruned, ignore this column. + // if parent is mixed type column or this column is pruned or if parent + // has been forced as string, ignore this column. if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) { + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || + forced_as_string_column[parent_col_id]) { ignore_vals[this_col_id] = 1; if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } + if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } continue; } @@ -765,22 +771,26 @@ void make_device_json_column(device_span input, } auto this_column_category = column_categories[this_col_id]; - if (is_enabled_mixed_types_as_string) { - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - is_mixed_type_column[this_col_id] = 1; - this_column_category = NC_STR; - } + // get path of this column, check if it is a struct/list forced as string, and enforce it + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + this_column_category = NC_STR; } CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); // move into parent device_json_column col(stream, mr); initialize_json_columns(this_col_id, col, this_column_category); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + col.forced_as_string_column = true; + forced_as_string_column[this_col_id] = true; + } + auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); if (not replaced) parent_col.column_order.push_back(name); @@ -802,12 +812,30 @@ void make_device_json_column(device_span input, is_mixed_type_column[this_col_id] == 1) column_categories[this_col_id] = NC_STR; } - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + } + + // ignore all children of columns forced as string + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { + forced_as_string_column[this_col_id] = true; + ignore_vals[this_col_id] = 1; + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and + forced_as_string_column[this_col_id]) + column_categories[this_col_id] = NC_STR; } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { @@ -982,39 +1010,58 @@ std::pair, std::vector> device_json_co "string offset, string length mismatch"); rmm::device_uvector d_string_data(col_size, stream); // TODO how about directly storing pair in json_column? - auto offset_length_it = - thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin()); - data_type target_type{}; + auto [result_bitmask, null_count] = make_validity(json_col); - if (schema.has_value()) { + data_type target_type{}; + std::unique_ptr col{}; + if (options.normalize_whitespace && json_col.forced_as_string_column) { + CUDF_EXPECTS(prune_columns || options.mixed_types_as_string, + "Whitespace normalization of nested columns requested as string requires " + "either prune_columns or mixed_types_as_string to be enabled"); + auto [normalized_d_input, col_offsets, col_lengths] = + cudf::io::json::detail::normalize_whitespace( + d_input, json_col.string_offsets, json_col.string_lengths, stream, mr); + auto offset_length_it = thrust::make_zip_iterator(col_offsets.begin(), col_lengths.begin()); + target_type = data_type{type_id::STRING}; + // Convert strings to the inferred data type + col = parse_data(normalized_d_input.data(), + offset_length_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + options.view(), + stream, + mr); + } else { + auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(), + json_col.string_lengths.begin()); + if (schema.has_value()) { #ifdef NJP_DEBUG_PRINT - std::cout << "-> explicit type: " - << (schema.has_value() ? std::to_string(static_cast(schema->type.id())) - : "n/a"); + std::cout << "-> explicit type: " + << (schema.has_value() ? std::to_string(static_cast(schema->type.id())) + : "n/a"); #endif - target_type = schema.value().type; - } else if (json_col.forced_as_string_column) { - target_type = data_type{type_id::STRING}; - } - // Infer column type, if we don't have an explicit type for it - else { - target_type = cudf::io::detail::infer_data_type( - options.json_view(), d_input, offset_length_it, col_size, stream); + target_type = schema.value().type; + } + // Infer column type, if we don't have an explicit type for it + else { + target_type = cudf::io::detail::infer_data_type( + options.json_view(), d_input, offset_length_it, col_size, stream); + } + // Convert strings to the inferred data type + col = parse_data(d_input.data(), + offset_length_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + options.view(), + stream, + mr); } - auto [result_bitmask, null_count] = make_validity(json_col); - // Convert strings to the inferred data type - auto col = parse_data(d_input.data(), - offset_length_it, - col_size, - target_type, - std::move(result_bitmask), - null_count, - options.view(), - stream, - mr); - // Reset nullable if we do not have nulls // This is to match the existing JSON reader's behaviour: // - Non-string columns will always be returned as nullable @@ -1120,11 +1167,15 @@ table_with_metadata device_parse_nested_json(device_span d_input, const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream, cudf::get_current_device_resource_ref()); // gpu tree generation - return get_tree_representation(tokens_gpu, - token_indices_gpu, - options.is_enabled_mixed_types_as_string(), - stream, - cudf::get_current_device_resource_ref()); + // Note that to normalize whitespaces in nested columns coerced to be string, we need the column + // to either be of mixed type or we need to request the column to be returned as string by + // pruning it with the STRING dtype + return get_tree_representation( + tokens_gpu, + token_indices_gpu, + options.is_enabled_mixed_types_as_string() || options.is_enabled_prune_columns(), + stream, + cudf::get_current_device_resource_ref()); }(); // IILE used to free memory of token data. #ifdef NJP_DEBUG_PRINT auto h_input = cudf::detail::make_host_vector_async(d_input, stream); diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 97d5884fef1..2d435dc8e1a 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -17,6 +17,7 @@ #include "io/fst/lookup_tables.cuh" #include +#include #include #include #include @@ -25,8 +26,17 @@ #include #include #include - +#include + +#include +#include +#include +#include +#include +#include #include +#include +#include #include #include @@ -215,14 +225,6 @@ std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ * | state is necessary to process escaped double-quote characters. Without this * | state, whitespaces following escaped double quotes inside strings may be removed. * - * NOTE: An important case NOT handled by this FST is that of whitespace following newline - * characters within a string. Consider the following example - * Input: {"a":"x\n y"} - * FST output: {"a":"x\ny"} - * Expected output: {"a":"x\n y"} - * Such strings are not part of the JSON standard (characters allowed within quotes should - * have ASCII at least 0x20 i.e. space character and above) but may be encountered while - * reading JSON files */ enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES }; // Aliases for readability of the transition table @@ -255,17 +257,17 @@ struct TransduceToNormalizedWS { // Let the alphabet set be Sigma // --------------------------------------- // ---------- NON-SPECIAL CASES: ---------- - // Output symbol same as input symbol + // Input symbol translates to output symbol // state | read_symbol -> output_symbol - // DQS | Sigma -> Sigma - // OOS | Sigma\{,\t} -> Sigma\{,\t} - // DEC | Sigma -> Sigma + // DQS | Sigma -> + // OOS | Sigma\{,\t} -> + // DEC | Sigma -> // ---------- SPECIAL CASES: -------------- - // Input symbol translates to output symbol - // OOS | {} -> - // OOS | {\t} -> + // Output symbol same as input symbol + // OOS | {} -> {} + // OOS | {\t} -> {\t} - // Case when read symbol is a space or tab but is unquoted + // Case when read symbol is not an unquoted space or tab // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function // However, since there is no output in this case i.e. the count returned by // operator()(state_id, match_id, read_symbol) is zero, this function is never called. @@ -287,8 +289,8 @@ struct TransduceToNormalizedWS { SymbolT const read_symbol) const { // Case when read symbol is a space or tab but is unquoted - if (match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && - state_id == static_cast(dfa_states::TT_OOS)) { + if (!(match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && + state_id == static_cast(dfa_states::TT_OOS))) { return 0; } return 1; @@ -328,33 +330,126 @@ void normalize_single_quotes(datasource::owning_buffer& inda std::swap(indata, outdata); } -void normalize_whitespace(datasource::owning_buffer& indata, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + normalize_whitespace(device_span d_input, + device_span col_offsets, + device_span col_lengths, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); - static constexpr std::int32_t min_out = 0; - static constexpr std::int32_t max_out = 2; + /* + * Algorithm: + 1. Create a single buffer by concatenating the rows of the string column. Create segment offsets + and lengths array for concatenated buffer + 2. Run a whitespace normalization FST that performs NOP for non-whitespace and quoted + whitespace characters, and outputs indices of unquoted whitespace characters + 3. Update segment lengths based on the number of output indices between segment offsets + 4. Remove characters at output indices from concatenated buffer. + 5. Return updated buffer, segment lengths and updated segment offsets + */ + auto inbuf_lengths = cudf::detail::make_device_uvector_async( + col_lengths, stream, cudf::get_current_device_resource_ref()); + size_t inbuf_lengths_size = inbuf_lengths.size(); + size_type inbuf_size = + thrust::reduce(rmm::exec_policy_nosync(stream), inbuf_lengths.begin(), inbuf_lengths.end()); + rmm::device_uvector inbuf(inbuf_size, stream); + rmm::device_uvector inbuf_offsets(inbuf_lengths_size, stream); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + inbuf_lengths.begin(), + inbuf_lengths.end(), + inbuf_offsets.begin(), + 0); + + auto input_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [d_input = d_input.begin(), col_offsets = col_offsets.begin()] __device__( + size_t i) -> char const* { return &d_input[col_offsets[i]]; })); + auto output_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [inbuf = inbuf.begin(), inbuf_offsets = inbuf_offsets.cbegin()] __device__( + size_t i) -> char* { return &inbuf[inbuf_offsets[i]]; })); + + { + // cub device batched copy + size_t temp_storage_bytes = 0; + cub::DeviceCopy::Batched(nullptr, + temp_storage_bytes, + input_it, + output_it, + inbuf_lengths.begin(), + inbuf_lengths_size, + stream.value()); + rmm::device_buffer temp_storage(temp_storage_bytes, stream); + cub::DeviceCopy::Batched(temp_storage.data(), + temp_storage_bytes, + input_it, + output_it, + inbuf_lengths.begin(), + inbuf_lengths_size, + stream.value()); + } + + // whitespace normalization : get the indices of the unquoted whitespace characters auto parser = fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), - fst::detail::make_translation_functor( + fst::detail::make_translation_functor( normalize_whitespace::TransduceToNormalizedWS{}), stream); - rmm::device_buffer outbuf(indata.size(), stream, mr); - rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(reinterpret_cast(indata.data()), - static_cast(indata.size()), - static_cast(outbuf.data()), + rmm::device_uvector outbuf_indices(inbuf.size(), stream, mr); + rmm::device_scalar outbuf_indices_size(stream, mr); + parser.Transduce(inbuf.data(), + static_cast(inbuf.size()), thrust::make_discard_iterator(), - outbuf_size.data(), + outbuf_indices.data(), + outbuf_indices_size.data(), normalize_whitespace::start_state, stream); - outbuf.resize(outbuf_size.value(stream), stream); - datasource::owning_buffer outdata(std::move(outbuf)); - std::swap(indata, outdata); + auto const num_deletions = outbuf_indices_size.value(stream); + outbuf_indices.resize(num_deletions, stream); + + // now these indices need to be removed + // TODO: is there a better way to do this? + thrust::for_each( + rmm::exec_policy_nosync(stream), + outbuf_indices.begin(), + outbuf_indices.end(), + [inbuf_offsets_begin = inbuf_offsets.begin(), + inbuf_offsets_end = inbuf_offsets.end(), + inbuf_lengths = inbuf_lengths.begin()] __device__(size_type idx) { + auto it = thrust::upper_bound(thrust::seq, inbuf_offsets_begin, inbuf_offsets_end, idx); + auto pos = thrust::distance(inbuf_offsets_begin, it) - 1; + cuda::atomic_ref ref{*(inbuf_lengths + pos)}; + ref.fetch_add(-1, cuda::std::memory_order_relaxed); + }); + + auto stencil = cudf::detail::make_zeroed_device_uvector_async( + static_cast(inbuf_size), stream, cudf::get_current_device_resource_ref()); + thrust::scatter(rmm::exec_policy_nosync(stream), + thrust::make_constant_iterator(true), + thrust::make_constant_iterator(true) + num_deletions, + outbuf_indices.begin(), + stencil.begin()); + thrust::remove_if(rmm::exec_policy_nosync(stream), + inbuf.begin(), + inbuf.end(), + stencil.begin(), + thrust::identity()); + inbuf.resize(inbuf_size - num_deletions, stream); + + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + inbuf_lengths.begin(), + inbuf_lengths.end(), + inbuf_offsets.begin(), + 0); + + stream.synchronize(); + return std::tuple{std::move(inbuf), std::move(inbuf_offsets), std::move(inbuf_lengths)}; } } // namespace detail diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 4e513d3495c..1c15e147b13 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2079,10 +2079,12 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; - parse_opts.dayfirst = options.is_enabled_dayfirst(); - parse_opts.keepquotes = options.is_enabled_keep_quotes(); - parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.dayfirst = options.is_enabled_dayfirst(); + parse_opts.keepquotes = options.is_enabled_keep_quotes(); + parse_opts.normalize_whitespace = options.is_enabled_normalize_whitespace(); + parse_opts.mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); std::vector na_values{"", "null"}; na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end()); parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index bd82b040359..99a5b17bce8 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -232,12 +232,6 @@ table_with_metadata read_batch(host_span> sources, normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref()); } - // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is - // enabled, invoke pre-processing FST - if (reader_opts.is_enabled_normalize_whitespace()) { - normalize_whitespace(bufview, stream, cudf::get_current_device_resource_ref()); - } - auto buffer = cudf::device_span(reinterpret_cast(bufview.data()), bufview.size()); stream.synchronize(); diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a3f91f6859b..9ed2929a70e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -893,7 +893,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, { bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; - constexpr int batch_size = 32; + constexpr int batch_size = cudf::detail::warp_size; int cur_leaf_count = target_leaf_count; while (s->error == 0 && s->nz_count < target_leaf_count && s->input_value_count < s->num_input_values) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 5d10472b0ae..7c985643887 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -203,10 +203,9 @@ struct SchemaElement { bool operator==(SchemaElement const& other) const { return type == other.type && converted_type == other.converted_type && - type_length == other.type_length && repetition_type == other.repetition_type && - name == other.name && num_children == other.num_children && - decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision && - field_id == other.field_id; + type_length == other.type_length && name == other.name && + num_children == other.num_children && decimal_scale == other.decimal_scale && + decimal_precision == other.decimal_precision && field_id == other.field_id; } // the parquet format is a little squishy when it comes to interpreting diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 125d35f6499..1390339c1ae 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -400,7 +400,8 @@ struct ColumnChunkDesc { int32_t src_col_schema_, column_chunk_info const* chunk_info_, float list_bytes_per_row_est_, - bool strings_to_categorical_) + bool strings_to_categorical_, + int32_t src_file_idx_) : compressed_data(compressed_data_), compressed_size(compressed_size_), num_values(num_values_), @@ -419,7 +420,8 @@ struct ColumnChunkDesc { src_col_schema(src_col_schema_), h_chunk_info(chunk_info_), list_bytes_per_row_est(list_bytes_per_row_est_), - is_strings_to_cat(strings_to_categorical_) + is_strings_to_cat(strings_to_categorical_), + src_file_idx(src_file_idx_) { } @@ -456,6 +458,7 @@ struct ColumnChunkDesc { bool is_strings_to_cat{}; // convert strings to hashes bool is_large_string_col{}; // `true` if string data uses 64-bit offsets + int32_t src_file_idx{}; // source file index }; /** diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 245e1829c72..c588fedb85c 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1511,10 +1511,13 @@ void reader::impl::create_global_chunk_info() std::transform( _input_columns.begin(), _input_columns.end(), column_mapping.begin(), [&](auto const& col) { // translate schema_idx into something we can use for the page indexes - if (auto it = std::find_if( - columns.begin(), - columns.end(), - [&col](auto const& col_chunk) { return col_chunk.schema_idx == col.schema_idx; }); + if (auto it = std::find_if(columns.begin(), + columns.end(), + [&](auto const& col_chunk) { + return col_chunk.schema_idx == + _metadata->map_schema_index(col.schema_idx, + rg.source_index); + }); it != columns.end()) { return std::distance(columns.begin(), it); } @@ -1535,7 +1538,8 @@ void reader::impl::create_global_chunk_info() auto col = _input_columns[i]; // look up metadata auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); + auto& schema = _metadata->get_schema( + _metadata->map_schema_index(col.schema_idx, rg.source_index), rg.source_index); auto [clock_rate, logical_type] = conversion_info(to_type_id(schema, _strings_to_categorical, _options.timestamp_type.id()), @@ -1574,9 +1578,9 @@ void reader::impl::create_global_chunk_info() col.schema_idx, chunk_info, list_bytes_per_row_est, - schema.type == BYTE_ARRAY and _strings_to_categorical)); + schema.type == BYTE_ARRAY and _strings_to_categorical, + rg.source_index)); } - // Adjust for skip_rows when updating the remaining rows after the first group remaining_rows -= (skip_rows) ? std::min(rg.start_row + row_group.num_rows - skip_rows, remaining_rows) diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 8b5678f202b..6d566b5815e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -423,8 +423,13 @@ void aggregate_reader_metadata::column_info_for_row_group(row_group_info& rg_inf std::vector chunks(rg.columns.size()); for (size_t col_idx = 0; col_idx < rg.columns.size(); col_idx++) { - auto const& col_chunk = rg.columns[col_idx]; - auto& schema = get_schema(col_chunk.schema_idx); + auto const& col_chunk = rg.columns[col_idx]; + auto const is_schema_idx_mapped = + is_schema_index_mapped(col_chunk.schema_idx, rg_info.source_index); + auto const mapped_schema_idx = is_schema_idx_mapped + ? map_schema_index(col_chunk.schema_idx, rg_info.source_index) + : col_chunk.schema_idx; + auto& schema = get_schema(mapped_schema_idx, is_schema_idx_mapped ? rg_info.source_index : 0); auto const max_def_level = schema.max_definition_level; auto const max_rep_level = schema.max_repetition_level; @@ -559,22 +564,40 @@ aggregate_reader_metadata::aggregate_reader_metadata( num_rows(calc_num_rows()), num_row_groups(calc_num_row_groups()) { - // Validate that all sources have the same schema unless we are reading select columns - // from mismatched sources, in which case, we will only check the projected columns later. - if (per_file_metadata.size() > 1 and not has_cols_from_mismatched_srcs) { - auto const& first_meta = per_file_metadata.front(); + if (per_file_metadata.size() > 1) { + auto& first_meta = per_file_metadata.front(); auto const num_cols = first_meta.row_groups.size() > 0 ? first_meta.row_groups.front().columns.size() : 0; - auto const& schema = first_meta.schema; - - // Verify that the input files have matching numbers of columns and schema. - for (auto const& pfm : per_file_metadata) { - if (pfm.row_groups.size() > 0) { - CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), - "All sources must have the same number of columns"); + auto& schema = first_meta.schema; + + // Validate that all sources have the same schema unless we are reading select columns + // from mismatched sources, in which case, we will only check the projected columns later. + if (not has_cols_from_mismatched_srcs) { + // Verify that the input files have matching numbers of columns and schema. + for (auto const& pfm : per_file_metadata) { + if (pfm.row_groups.size() > 0) { + CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), + "All sources must have the same number of columns"); + } + CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); } - CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); } + + // Mark the column schema in the first (default) source as nullable if it is nullable in any of + // the input sources. This avoids recomputing this within build_column() and + // populate_metadata(). + std::for_each( + thrust::make_counting_iterator(static_cast(1)), + thrust::make_counting_iterator(schema.size()), + [&](auto const schema_idx) { + if (schema[schema_idx].repetition_type == REQUIRED and + std::any_of( + per_file_metadata.begin() + 1, per_file_metadata.end(), [&](auto const& pfm) { + return pfm.schema[schema_idx].repetition_type != REQUIRED; + })) { + schema[schema_idx].repetition_type = OPTIONAL; + } + }); } // Collect and apply arrow:schema from Parquet's key value metadata section @@ -884,15 +907,8 @@ ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_t size_type src_idx, int schema_idx) const { - // schema_idx_maps will only have > 0 size when we are reading matching column projection from - // mismatched Parquet sources. - if (src_idx and not schema_idx_maps.empty()) { - auto const& schema_idx_map = schema_idx_maps[src_idx - 1]; - CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(), - "Unmapped schema index encountered in the specified source tree", - std::range_error); - schema_idx = schema_idx_map.at(schema_idx); - } + // Map schema index to the provided source file index + schema_idx = map_schema_index(schema_idx, src_idx); auto col = std::find_if(per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), @@ -924,6 +940,46 @@ aggregate_reader_metadata::get_rowgroup_metadata() const return rg_metadata; } +bool aggregate_reader_metadata::is_schema_index_mapped(int schema_idx, int pfm_idx) const +{ + // Check if schema_idx or pfm_idx is invalid + CUDF_EXPECTS( + schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), + "Parquet reader encountered an invalid schema_idx or pfm_idx", + std::out_of_range); + + // True if root index requested or zeroth file index or schema_idx maps doesn't exist. (i.e. + // schemas are identical). + if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return true; } + + // Check if mapped + auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + return schema_idx_map.find(schema_idx) != schema_idx_map.end(); +} + +int aggregate_reader_metadata::map_schema_index(int schema_idx, int pfm_idx) const +{ + // Check if schema_idx or pfm_idx is invalid + CUDF_EXPECTS( + schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), + "Parquet reader encountered an invalid schema_idx or pfm_idx", + std::out_of_range); + + // Check if pfm_idx is zero or root index requested or schema_idx_maps doesn't exist (i.e. + // schemas are identical). + if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return schema_idx; } + + // schema_idx_maps will only have > 0 size when we are reading matching column projection from + // mismatched Parquet sources. + auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(), + "Unmapped schema index encountered in the specified source tree", + std::out_of_range); + + // Return the mapped schema idx. + return schema_idx_map.at(schema_idx); +} + std::string aggregate_reader_metadata::get_pandas_index() const { // Assumes that all input files have the same metadata @@ -1185,8 +1241,8 @@ aggregate_reader_metadata::select_columns( // Compares two schema elements to be equal except their number of children auto const equal_to_except_num_children = [](SchemaElement const& lhs, SchemaElement const& rhs) { return lhs.type == rhs.type and lhs.converted_type == rhs.converted_type and - lhs.type_length == rhs.type_length and lhs.repetition_type == rhs.repetition_type and - lhs.name == rhs.name and lhs.decimal_scale == rhs.decimal_scale and + lhs.type_length == rhs.type_length and lhs.name == rhs.name and + lhs.decimal_scale == rhs.decimal_scale and lhs.decimal_precision == rhs.decimal_precision and lhs.field_id == rhs.field_id; }; @@ -1209,6 +1265,11 @@ aggregate_reader_metadata::select_columns( "the selected path", std::invalid_argument); + // Get the schema_idx_map for this data source (pfm) + auto& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + // Map the schema index from 0th tree (src) to the one in the current (dst) tree. + schema_idx_map[src_schema_idx] = dst_schema_idx; + // If src_schema_elem is a stub, it does not exist in the column_name_info and column_buffer // hierarchy. So continue on with mapping. if (src_schema_elem.is_stub()) { @@ -1262,15 +1323,6 @@ aggregate_reader_metadata::select_columns( pfm_idx); }); } - - // We're at a leaf and this is an input column (one with actual data stored) so map it. - if (src_schema_elem.num_children == 0) { - // Get the schema_idx_map for this data source (pfm) - auto& schema_idx_map = schema_idx_maps[pfm_idx - 1]; - - // Map the schema index from 0th tree (src) to the one in the current (dst) tree. - schema_idx_map[src_schema_idx] = dst_schema_idx; - } }; std::vector output_column_schemas; diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 6f2863136b2..6487c92f48f 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -234,6 +234,26 @@ class aggregate_reader_metadata { [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } + /** + * @brief Checks if a schema index from 0th source is mapped to the specified file index + * + * @param schema_idx The index of the SchemaElement in the zeroth file. + * @param pfm_idx The index of the file (per_file_metadata) to check mappings for. + * + * @return True if schema index is mapped + */ + [[nodiscard]] bool is_schema_index_mapped(int schema_idx, int pfm_idx) const; + + /** + * @brief Maps schema index from 0th source file to the specified file index + * + * @param schema_idx The index of the SchemaElement in the zeroth file. + * @param pfm_idx The index of the file (per_file_metadata) to map the schema_idx to. + * + * @return Mapped schema index + */ + [[nodiscard]] int map_schema_index(int schema_idx, int pfm_idx) const; + /** * @brief Extracts the schema_idx'th SchemaElement from the pfm_idx'th file * @@ -248,7 +268,7 @@ class aggregate_reader_metadata { CUDF_EXPECTS( schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), "Parquet reader encountered an invalid schema_idx or pfm_idx", - std::invalid_argument); + std::out_of_range); return per_file_metadata[pfm_idx].schema[schema_idx]; } @@ -256,7 +276,10 @@ class aggregate_reader_metadata { [[nodiscard]] auto&& get_key_value_metadata() && { return std::move(keyval_maps); } /** - * @brief Gets the concrete nesting depth of output cudf columns + * @brief Gets the concrete nesting depth of output cudf columns. + * + * Gets the nesting depth of the output cudf column for the given schema. + * The nesting depth must be equal for the given schema_index across all sources. * * @param schema_index Schema index of the input column * diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 52918f5bc80..8e67f233213 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -79,23 +79,30 @@ void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_str * is indicated when adding new values. This function generates the mappings of * the R/D levels to those start/end bounds * - * @param remap Maps column schema index to the R/D remapping vectors for that column - * @param src_col_schema The column schema to generate the new mapping for + * @param remap Maps column schema index to the R/D remapping vectors for that column for a + * particular input source file + * @param src_col_schema The source column schema to generate the new mapping for + * @param mapped_src_col_schema Mapped column schema for src_file_idx'th file + * @param src_file_idx The input source file index for the column schema * @param md File metadata information */ -void generate_depth_remappings(std::map, std::vector>>& remap, - int src_col_schema, - aggregate_reader_metadata const& md) +void generate_depth_remappings( + std::map, std::pair, std::vector>>& remap, + int const src_col_schema, + int const mapped_src_col_schema, + int const src_file_idx, + aggregate_reader_metadata const& md) { // already generated for this level - if (remap.find(src_col_schema) != remap.end()) { return; } - auto schema = md.get_schema(src_col_schema); - int max_depth = md.get_output_nesting_depth(src_col_schema); + if (remap.find({src_col_schema, src_file_idx}) != remap.end()) { return; } + auto const& schema = md.get_schema(mapped_src_col_schema, src_file_idx); + auto const max_depth = md.get_output_nesting_depth(src_col_schema); - CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(), + CUDF_EXPECTS(remap.find({src_col_schema, src_file_idx}) == remap.end(), "Attempting to remap a schema more than once"); auto inserted = - remap.insert(std::pair, std::vector>>{src_col_schema, {}}); + remap.insert(std::pair, std::pair, std::vector>>{ + {src_col_schema, src_file_idx}, {}}); auto& depth_remap = inserted.first->second; std::vector& rep_depth_remap = (depth_remap.first); @@ -136,15 +143,15 @@ void generate_depth_remappings(std::map, std::ve auto find_shallowest = [&](int r) { int shallowest = -1; int cur_depth = max_depth - 1; - int schema_idx = src_col_schema; + int schema_idx = mapped_src_col_schema; while (schema_idx > 0) { - auto cur_schema = md.get_schema(schema_idx); + auto& cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_repetition_level == r) { // if this is a repeated field, map it one level deeper shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth; } // if it's one-level encoding list - else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx))) { + else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx, src_file_idx))) { shallowest = cur_depth - 1; } if (!cur_schema.is_stub()) { cur_depth--; } @@ -159,10 +166,10 @@ void generate_depth_remappings(std::map, std::ve for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) { auto find_deepest = [&](int d) { SchemaElement prev_schema; - int schema_idx = src_col_schema; + int schema_idx = mapped_src_col_schema; int r1 = 0; while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); + SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_definition_level == d) { // if this is a repeated field, map it one level deeper r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level @@ -175,10 +182,10 @@ void generate_depth_remappings(std::map, std::ve // we now know R1 from above. return the deepest nesting level that has the // same repetition level - schema_idx = src_col_schema; + schema_idx = mapped_src_col_schema; int depth = max_depth - 1; while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); + SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_repetition_level == r1) { // if this is a repeated field, map it one level deeper depth = cur_schema.is_stub() ? depth + 1 : depth; @@ -783,9 +790,20 @@ void reader::impl::allocate_nesting_info() std::vector per_page_nesting_info_size(num_columns); auto iter = thrust::make_counting_iterator(size_type{0}); std::transform(iter, iter + num_columns, per_page_nesting_info_size.begin(), [&](size_type i) { + // Schema index of the current input column auto const schema_idx = _input_columns[i].schema_idx; - auto const& schema = _metadata->get_schema(schema_idx); - return max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(schema_idx)); + // Get the max_definition_level of this column across all sources. + auto max_definition_level = _metadata->get_schema(schema_idx).max_definition_level + 1; + std::for_each(thrust::make_counting_iterator(static_cast(1)), + thrust::make_counting_iterator(_sources.size()), + [&](auto const src_file_idx) { + auto const& schema = _metadata->get_schema( + _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx); + max_definition_level = + std::max(max_definition_level, schema.max_definition_level + 1); + }); + + return std::max(max_definition_level, _metadata->get_output_nesting_depth(schema_idx)); }); // compute total # of page_nesting infos needed and allocate space. doing this in one @@ -813,6 +831,8 @@ void reader::impl::allocate_nesting_info() page_nesting_decode_info.device_ptr() + src_info_index; pages[target_page_index + p_idx].nesting_info_size = per_page_nesting_info_size[idx]; + // Set the number of output nesting levels from the zeroth source as nesting must be + // identical across sources. pages[target_page_index + p_idx].num_output_nesting_levels = _metadata->get_output_nesting_depth(src_col_schema); @@ -821,25 +841,36 @@ void reader::impl::allocate_nesting_info() target_page_index += subpass.column_page_count[idx]; } + // Reset the target_page_index + target_page_index = 0; + // fill in int nesting_info_index = 0; - std::map, std::vector>> depth_remapping; for (size_t idx = 0; idx < _input_columns.size(); idx++) { auto const src_col_schema = _input_columns[idx].schema_idx; - // schema of the input column - auto& schema = _metadata->get_schema(src_col_schema); // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) + // nesting depth must be same across sources so getting it from the zeroth source is ok int const max_output_depth = _metadata->get_output_nesting_depth(src_col_schema); + // Map to store depths if this column has lists + std::map, std::pair, std::vector>> depth_remapping; // if this column has lists, generate depth remapping - std::map, std::vector>> depth_remapping; - if (schema.max_repetition_level > 0) { - generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); - } + std::for_each( + thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(_sources.size()), + [&](auto const src_file_idx) { + auto const mapped_schema_idx = _metadata->map_schema_index(src_col_schema, src_file_idx); + if (_metadata->get_schema(mapped_schema_idx, src_file_idx).max_repetition_level > 0) { + generate_depth_remappings( + depth_remapping, src_col_schema, mapped_schema_idx, src_file_idx, *_metadata); + } + }); // fill in host-side nesting info - int schema_idx = src_col_schema; + int schema_idx = src_col_schema; + // This is okay as we only use this to check stubness of cur_schema and + // to get its parent's indices, both of which are one to one mapped. auto cur_schema = _metadata->get_schema(schema_idx); int cur_depth = max_output_depth - 1; while (schema_idx > 0) { @@ -848,6 +879,9 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { + // Source file index for the current page. + auto const src_file_idx = + pass.chunks[pages[target_page_index + p_idx].chunk_idx].src_file_idx; PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; @@ -855,9 +889,11 @@ void reader::impl::allocate_nesting_info() &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; + auto const mapped_src_col_schema = + _metadata->map_schema_index(src_col_schema, src_file_idx); // if we have lists, set our start and end depth remappings - if (schema.max_repetition_level > 0) { - auto remap = depth_remapping.find(src_col_schema); + if (_metadata->get_schema(mapped_src_col_schema, src_file_idx).max_repetition_level > 0) { + auto remap = depth_remapping.find({src_col_schema, src_file_idx}); CUDF_EXPECTS(remap != depth_remapping.end(), "Could not find depth remapping for schema"); std::vector const& rep_depth_remap = (remap->second.first); @@ -871,11 +907,15 @@ void reader::impl::allocate_nesting_info() } } + // Get the schema from the current input source. + auto& actual_cur_schema = _metadata->get_schema( + _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx); + // values indexed by output column index - nesting_info[cur_depth].max_def_level = cur_schema.max_definition_level; + nesting_info[cur_depth].max_def_level = actual_cur_schema.max_definition_level; pni[cur_depth].size = 0; pni[cur_depth].type = - to_type_id(cur_schema, _strings_to_categorical, _options.timestamp_type.id()); + to_type_id(actual_cur_schema, _strings_to_categorical, _options.timestamp_type.id()); pni[cur_depth].nullable = cur_schema.repetition_type == OPTIONAL; } @@ -888,6 +928,8 @@ void reader::impl::allocate_nesting_info() cur_schema = _metadata->get_schema(schema_idx); } + // Offset the page and nesting info indices + target_page_index += subpass.column_page_count[idx]; nesting_info_index += (per_page_nesting_info_size[idx] * subpass.column_page_count[idx]); } diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 81fd4ab9f82..ec05f35d405 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1819,8 +1819,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const table_size = std::reduce(column_sizes.begin(), column_sizes.end()); auto const avg_row_len = util::div_rounding_up_safe(table_size, input.num_rows()); if (avg_row_len > 0) { - auto const rg_frag_size = util::div_rounding_up_safe(max_row_group_size, avg_row_len); - max_page_fragment_size = std::min(rg_frag_size, max_page_fragment_size); + // Ensure `rg_frag_size` is not bigger than size_type::max for default max_row_group_size + // value (=uint64::max) to avoid a sign overflow when comparing + auto const rg_frag_size = + std::min(std::numeric_limits::max(), + util::div_rounding_up_safe(max_row_group_size, avg_row_len)); + // Safe comparison as rg_frag_size fits in size_type + max_page_fragment_size = + std::min(static_cast(rg_frag_size), max_page_fragment_size); } // dividing page size by average row length will tend to overshoot the desired diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index bc2722441d0..734067582f7 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -67,6 +67,8 @@ struct parse_options_view { bool doublequote; bool dayfirst; bool skipblanklines; + bool normalize_whitespace; + bool mixed_types_as_string; cudf::detail::trie_view trie_true; cudf::detail::trie_view trie_false; cudf::detail::trie_view trie_na; @@ -85,6 +87,8 @@ struct parse_options { bool doublequote; bool dayfirst; bool skipblanklines; + bool normalize_whitespace; + bool mixed_types_as_string; cudf::detail::optional_trie trie_true; cudf::detail::optional_trie trie_false; cudf::detail::optional_trie trie_na; @@ -111,6 +115,8 @@ struct parse_options { doublequote, dayfirst, skipblanklines, + normalize_whitespace, + mixed_types_as_string, cudf::detail::make_trie_view(trie_true), cudf::detail::make_trie_view(trie_false), cudf::detail::make_trie_view(trie_na), diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 76cd55bf994..0d017cf1f13 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -292,33 +292,32 @@ std::unique_ptr make_tdigest_column(size_type num_rows, return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr); } -std::unique_ptr make_tdigest_column_of_empty_clusters(size_type num_rows, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { auto offsets = cudf::make_fixed_width_column( - data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr); + data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), offsets->mutable_view().begin(), offsets->mutable_view().end(), 0); - auto min_col = cudf::make_numeric_column( - data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); + auto min_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), min_col->mutable_view().begin(), min_col->mutable_view().end(), 0); - auto max_col = cudf::make_numeric_column( - data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); + auto max_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), max_col->mutable_view().begin(), max_col->mutable_view().end(), 0); - return make_tdigest_column(num_rows, - cudf::make_empty_column(type_id::FLOAT64), - cudf::make_empty_column(type_id::FLOAT64), + return make_tdigest_column(1, + make_empty_column(type_id::FLOAT64), + make_empty_column(type_id::FLOAT64), std::move(offsets), std::move(min_col), std::move(max_col), @@ -339,7 +338,7 @@ std::unique_ptr make_tdigest_column_of_empty_clusters(size_type num_rows std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto contents = make_tdigest_column_of_empty_clusters(1, stream, mr)->release(); + auto contents = make_empty_tdigest_column(stream, mr)->release(); return std::make_unique( std::move(*std::make_unique(std::move(contents.children))), true, stream, mr); } diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index d591fb5c171..2dd25a7b890 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -366,8 +366,8 @@ std::unique_ptr to_tdigest_scalar(std::unique_ptr&& tdigest, * @param group_cluster_wl Output. The set of cluster weight limits for each group. * @param group_num_clusters Output. The number of output clusters for each input group. * @param group_cluster_offsets Offsets per-group to the start of it's clusters - * @param may_have_empty_clusters Whether or not there could be empty clusters. Must only be - * set to false when there is no empty cluster, true otherwise. + * @param has_nulls Whether or not the input contains nulls + * */ template @@ -379,7 +379,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, double* group_cluster_wl, size_type* group_num_clusters, size_type const* group_cluster_offsets, - bool may_have_empty_clusters) + bool has_nulls) { int const tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -399,12 +399,11 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, // a group with nothing in it. group_num_clusters[group_index] = 0; if (total_weight <= 0) { - // If the input contains empty clusters, we can potentially have a group that also generates - // empty clusters because -all- of the input values are null or empty cluster. In that case, the - // `reduce_by_key` call in the tdigest generation step will need a location to store the unused - // reduction value for that group of nulls and empty clusters. These "stubs" will be - // postprocessed out afterwards. - if (may_have_empty_clusters) { group_num_clusters[group_index] = 1; } + // if the input contains nulls we can potentially have a group that generates no + // clusters because -all- of the input values are null. in that case, the reduce_by_key call + // in the tdigest generation step will need a location to store the unused reduction value for + // that group of nulls. these "stubs" will be postprocessed out afterwards. + if (has_nulls) { group_num_clusters[group_index] = 1; } return; } @@ -503,8 +502,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, * stream that falls before our current cluster limit * @param group_info A functor which returns the info for the specified group (total weight, * size and start offset) - * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be - * set to false only when there is no empty cluster. + * @param has_nulls Whether or not the input data contains nulls * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @@ -518,7 +516,7 @@ generate_group_cluster_info(int delta, NearestWeight nearest_weight, GroupInfo group_info, CumulativeWeight cumulative_weight, - bool may_have_empty_clusters, + bool has_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -537,7 +535,7 @@ generate_group_cluster_info(int delta, nullptr, group_num_clusters.begin(), nullptr, - may_have_empty_clusters); + has_nulls); // generate group cluster offsets (where the clusters for a given group start and end) auto group_cluster_offsets = cudf::make_numeric_column( @@ -569,7 +567,7 @@ generate_group_cluster_info(int delta, group_cluster_wl.begin(), group_num_clusters.begin(), group_cluster_offsets->view().begin(), - may_have_empty_clusters); + has_nulls); return {std::move(group_cluster_wl), std::move(group_cluster_offsets), @@ -582,7 +580,7 @@ std::unique_ptr build_output_column(size_type num_rows, std::unique_ptr&& offsets, std::unique_ptr&& min_col, std::unique_ptr&& max_col, - bool may_have_empty_clusters, + bool has_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -597,7 +595,7 @@ std::unique_ptr build_output_column(size_type num_rows, size_type i) { return is_stub_weight(offsets[i]) ? 1 : 0; }; size_type const num_stubs = [&]() { - if (!may_have_empty_clusters) { return 0; } + if (!has_nulls) { return 0; } auto iter = cudf::detail::make_counting_transform_iterator( 0, cuda::proclaim_return_type(is_stub_digest)); return thrust::reduce(rmm::exec_policy(stream), iter, iter + num_rows); @@ -663,10 +661,6 @@ std::unique_ptr build_output_column(size_type num_rows, mr); } -/** - * @brief A functor which returns the cluster index within a group that the value at - * the given value index falls into. - */ template struct compute_tdigests_keys_fn { int const delta; @@ -712,8 +706,8 @@ struct compute_tdigests_keys_fn { * boundaries. * * @param delta tdigest compression level - * @param centroids_begin Beginning of the range of centroids. - * @param centroids_end End of the range of centroids. + * @param values_begin Beginning of the range of input values. + * @param values_end End of the range of input values. * @param cumulative_weight Functor which returns cumulative weight and group information for * an absolute input value index. * @param min_col Column containing the minimum value per group. @@ -721,8 +715,7 @@ struct compute_tdigests_keys_fn { * @param group_cluster_wl Cluster weight limits for each group. * @param group_cluster_offsets R-value reference of offsets into the cluster weight limits. * @param total_clusters Total number of clusters in all groups. - * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be - * set to false only when there is no empty cluster. + * @param has_nulls Whether or not the input contains nulls * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @@ -738,7 +731,7 @@ std::unique_ptr compute_tdigests(int delta, rmm::device_uvector const& group_cluster_wl, std::unique_ptr&& group_cluster_offsets, size_type total_clusters, - bool may_have_empty_clusters, + bool has_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -757,9 +750,7 @@ std::unique_ptr compute_tdigests(int delta, // double // max // } // - if (total_clusters == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); - } + if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } // each input group represents an individual tdigest. within each tdigest, we want the keys // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall @@ -802,7 +793,7 @@ std::unique_ptr compute_tdigests(int delta, std::move(group_cluster_offsets), std::move(min_col), std::move(max_col), - may_have_empty_clusters, + has_nulls, stream, mr); } @@ -1154,13 +1145,8 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto merged = cudf::detail::concatenate(tdigest_views, stream, cudf::get_current_device_resource_ref()); - auto merged_weights = merged->get_column(1).view(); - // If there are no values, we can simply return a column that has only empty tdigests. - if (merged_weights.size() == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(num_groups, stream, mr); - } - // generate cumulative weights + auto merged_weights = merged->get_column(1).view(); auto cumulative_weights = cudf::make_numeric_column( data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream); auto keys = cudf::detail::make_counting_transform_iterator( @@ -1175,10 +1161,6 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto const delta = max_centroids; - // We do not know whether there is any empty cluster in the input without actually reading the - // data, which could be expensive. So, we just assume that there could be empty clusters. - auto const may_have_empty_clusters = true; - // generate cluster info auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( delta, @@ -1195,7 +1177,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, group_labels, group_offsets, {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, - may_have_empty_clusters, + false, stream, mr); @@ -1220,7 +1202,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, group_cluster_wl, std::move(group_cluster_offsets), total_clusters, - may_have_empty_clusters, + false, stream, mr); } @@ -1285,9 +1267,7 @@ std::unique_ptr group_tdigest(column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (col.size() == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); - } + if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } auto const delta = max_centroids; return cudf::type_dispatcher(col.type(), @@ -1313,7 +1293,7 @@ std::unique_ptr group_merge_tdigest(column_view const& input, tdigest_column_view tdv(input); if (num_groups == 0 || input.size() == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); + return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } // bring group offsets back to the host diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index adf650a4f27..7c4c89bd3fb 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -539,15 +539,26 @@ class regex_parser { : static_cast(LBRA); case ')': return RBRA; case '^': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return BOL; } case '$': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return EOL; } case '[': return build_cclass(); - case '.': return dot_type; + case '.': { + _chr = is_ext_newline(_flags) ? 'N' : chr; + return dot_type; + } } if (std::find(quantifiers.begin(), quantifiers.end(), static_cast(chr)) == @@ -959,7 +970,7 @@ class regex_compiler { _prog.inst_at(inst_id).u1.cls_id = class_id; } else if (token == CHAR) { _prog.inst_at(inst_id).u1.c = yy; - } else if (token == BOL || token == EOL) { + } else if (token == BOL || token == EOL || token == ANY) { _prog.inst_at(inst_id).u1.c = yy; } push_and(inst_id, inst_id); @@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags) case STAR: printf(" STAR next=%d", inst.u2.next_id); break; case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; - case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break; case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 3b899e4edc1..e34a1e12015 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist() list2 = tmp; } +/** + * @brief Check for supported new-line characters + * + * '\n, \r, \u0085, \u2028, or \u2029' + */ +constexpr bool is_newline(char32_t const ch) +{ + return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9); +} + /** * @brief Utility to check a specific character against this class instance. * @@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const if (checkstart) { auto startchar = static_cast(jnk.startchar); switch (jnk.starttype) { - case BOL: - if (pos == 0) break; - if (jnk.startchar != '^') { return cuda::std::nullopt; } + case BOL: { + if (pos == 0) { break; } + if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; } + if (startchar != '\n') { break; } --itr; startchar = static_cast('\n'); + [[fallthrough]]; + } case CHAR: { auto const find_itr = find_char(startchar, dstr, itr); if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; } @@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const id_activate = inst.u2.next_id; expanded = true; break; - case BOL: - if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) { + case BOL: { + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; + if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) || + ((inst.u1.c == 'S') && (is_newline(prev_c)))) { id_activate = inst.u2.next_id; expanded = true; } break; - case EOL: + } + case EOL: { // after the last character OR: // - for MULTILINE, if current character is new-line // - for non-MULTILINE, the very last character of the string can also be a new-line + bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n'); if (last_character || - ((c == '\n') && (inst.u1.c != 'Z') && - ((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) { + (nl && (inst.u1.c != 'Z') && + ((inst.u1.c == '$' || inst.u1.c == 'S') || + (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) { id_activate = inst.u2.next_id; expanded = true; } break; + } case BOW: case NBOW: { - auto const prev_c = pos > 0 ? dstr[pos - 1] : 0; + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; auto const word_class = reclass_device{CCLASS_W}; bool const curr_is_word = word_class.is_match(c, _codepoint_flags); bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags); @@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case CHAR: if (inst.u1.c == c) id_activate = inst.u2.next_id; break; - case ANY: - if (c != '\n') id_activate = inst.u2.next_id; - break; + case ANY: { + if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; } + [[fallthrough]]; + } case ANYNL: id_activate = inst.u2.next_id; break; case NCCLASS: case CCLASS: { diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index 978a844c476..4c39fc96397 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -122,26 +122,28 @@ CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings, break; } size_type const cc = (itr < end) && is_begin_utf8_char(*itr); - size_type const bc = (itr < end); + size_type const bc = (itr < end) ? bytes_in_utf8_byte(*itr) : 0; char_count += cg::reduce(warp, cc, cg::plus()); byte_count += cg::reduce(warp, bc, cg::plus()); itr += cudf::detail::warp_size; } + __syncwarp(); + if (warp.thread_rank() == 0) { if (start >= char_count) { d_output[str_idx] = string_index_pair{"", 0}; return; } - // we are just below start/stop and must now increment up to it from here + // we are just below start/stop and must now increment up to them from here auto first_byte = start_counts.second; if (start_counts.first < start) { auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte); first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first)); } - stop = max(stop, char_count); + stop = min(stop, char_count); auto last_byte = stop_counts.second; if (stop_counts.first < stop) { auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte); diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 605582f28a6..a03a34f5fa7 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -151,15 +153,111 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; minhash_kernel<<>>( *d_strings, seeds, width, d_hashes); return hashes; } -std::unique_ptr build_list_result(cudf::strings_column_view const& input, +/** + * @brief Compute the minhash of each list row of strings for each seed + * + * This is a warp-per-row algorithm where parallel threads within a warp + * work on strings in a single list row. + * + * @tparam HashFunction hash function to use on each string + * + * @param d_input List of strings to process + * @param seeds Seeds for hashing each string + * @param d_hashes Minhash output values (one per row) + */ +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input, + cudf::device_span seeds, + hash_value_type* d_hashes) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const row_idx = idx / cudf::detail::warp_size; + + if (row_idx >= d_input.size()) { return; } + if (d_input.is_null(row_idx)) { return; } + + auto const d_row = cudf::list_device_view(d_input, row_idx); + auto const d_output = d_hashes + (row_idx * seeds.size()); + + // initialize hashes output for this row + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + if (lane_idx == 0) { + auto const init = d_row.size() == 0 ? 0 : std::numeric_limits::max(); + thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); + } + __syncwarp(); + + // each lane hashes a string from the input row + for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) { + auto const hash_str = + d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element(str_idx); + for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { + auto const hasher = HashFunction(seeds[seed_idx]); + // hash string and store the min value + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values + // but only uses the first uint64 value as requested by the LLM team. + hv = thrust::get<0>(hasher(hash_str)); + } + cuda::atomic_ref ref{*(d_output + seed_idx)}; + ref.fetch_min(hv, cuda::std::memory_order_relaxed); + } + } +} + +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +std::unique_ptr word_minhash_fn(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); + CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < + static_cast(std::numeric_limits::max()), + "The number of seeds times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_input = cudf::column_device_view::create(input.parent(), stream); + + auto hashes = cudf::make_numeric_column(output_type, + input.size() * static_cast(seeds.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_hashes = hashes->mutable_view().data(); + auto lcdv = cudf::detail::lists_column_device_view(*d_input); + + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; + minhash_word_kernel + <<>>(lcdv, seeds, d_hashes); + + return hashes; +} + +std::unique_ptr build_list_result(cudf::column_view const& input, std::unique_ptr&& hashes, cudf::size_type seeds_size, rmm::cuda_stream_view stream, @@ -176,7 +274,7 @@ std::unique_ptr build_list_result(cudf::strings_column_view const& std::move(offsets), std::move(hashes), input.null_count(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input, stream, mr), stream, mr); // expect this condition to be very rare @@ -208,7 +306,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, @@ -232,7 +330,27 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } } // namespace detail @@ -276,4 +394,21 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash(input, seeds, stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash64(input, seeds, stream, mr); +} } // namespace nvtext diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9d3a7ce5a4e..9824c472b20 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -132,6 +132,13 @@ struct cuda_event { cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); } virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); } + // Moveable but not copyable. + cuda_event(const cuda_event&) = delete; + cuda_event& operator=(const cuda_event&) = delete; + + cuda_event(cuda_event&&) = default; + cuda_event& operator=(cuda_event&&) = default; + operator cudaEvent_t() { return e_; } private: @@ -147,11 +154,12 @@ struct cuda_event { */ cudaEvent_t event_for_thread() { - thread_local std::vector> thread_events(get_num_cuda_devices()); + // The program may crash if this function is called from the main thread and user application + // subsequently calls cudaDeviceReset(). + // As a workaround, here we intentionally disable RAII and leak cudaEvent_t. + thread_local std::vector thread_events(get_num_cuda_devices()); auto const device_id = get_current_cuda_device(); - if (not thread_events[device_id.value()]) { - thread_events[device_id.value()] = std::make_unique(); - } + if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); } return *thread_events[device_id.value()]; } diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 3780dbb1d95..baa59026b07 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups) cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0}; int const delta = 1000; - auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto b = cudf::type_dispatcher( static_cast(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta); - auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto d = cudf::type_dispatcher( static_cast(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta); - auto e = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto e = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); @@ -507,81 +507,3 @@ TEST_F(TDigestMergeTest, EmptyGroups) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); } - -std::unique_ptr do_agg( - cudf::column_view key, - cudf::column_view val, - std::function()> make_agg) -{ - std::vector keys; - keys.push_back(key); - cudf::table_view const key_table(keys); - - cudf::groupby::groupby gb(key_table); - std::vector requests; - cudf::groupby::aggregation_request req; - req.values = val; - req.aggregations.push_back(make_agg()); - requests.push_back(std::move(req)); - - auto result = gb.aggregate(std::move(requests)); - - std::vector> result_columns; - for (auto&& c : result.first->release()) { - result_columns.push_back(std::move(c)); - } - - EXPECT_EQ(result.second.size(), 1); - EXPECT_EQ(result.second[0].results.size(), 1); - result_columns.push_back(std::move(result.second[0].results[0])); - - return std::make_unique(std::move(result_columns)); -} - -TEST_F(TDigestMergeTest, AllGroupsHaveEmptyClusters) -{ - // The input must be sorted by the key. - // See `aggregate_result_functor::operator()` for details. - auto const keys = cudf::test::fixed_width_column_wrapper{{0, 0, 1, 1, 2}}; - auto const keys_view = cudf::column_view(keys); - auto val_elems = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); - auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - // All values are null - return false; - }); - auto const vals = cudf::test::fixed_width_column_wrapper{ - val_elems, val_elems + keys_view.size(), val_valids}; - - auto const delta = 10000; - - // Compute tdigest. The result should have 3 empty clusters, one per group. - auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() { - return cudf::make_tdigest_aggregation(delta); - }); - - auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; - cudf::column_view const expected_computed_keys_view{expected_computed_keys}; - auto const expected_computed_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - expected_computed_keys_view.size(), - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view()); - // The computed values are nullable even though the input values are not. - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(), - compute_result->get_column(1).view()); - - // Merge tdigest. The result should have 3 empty clusters, one per group. - auto const merge_result = - do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() { - return cudf::make_merge_tdigest_aggregation(delta); - }); - - auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; - cudf::column_view const expected_merged_keys_view{expected_merged_keys}; - auto const expected_merged_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - expected_merged_keys_view.size(), - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view()); -} diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 960c19fce2e..48bc982d0e3 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2856,4 +2856,47 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) } } +TEST_F(JsonReaderTest, JsonDtypeSchema) +{ + std::string data = R"( + {"a": 1, "b": {"0": "abc", "1": ["a", "b"]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {"0": "lolol "}, "c": true} + )"; + + std::map dtype_schema{{"c", {data_type{type_id::STRING}}}, + {"b", {data_type{type_id::STRING}}}, + {"a", {dtype()}}}; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(dtype_schema) + .prune_columns(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 3); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING); + + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + + // cudf::column::contents contents = result.tbl->get_column(1).release(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), float64_wrapper{{1, 1, 1}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), + cudf::test::strings_column_wrapper({"{\"0\": \"abc\", \"1\": [\"a\", \"b\"]}", + "{\"0\": \"abc\" }", + "{\"0\": \"lolol \"}"}), + cudf::test::debug_output_level::ALL_ERRORS); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), + cudf::test::strings_column_wrapper({"true", "false", "true"}), + cudf::test::debug_output_level::ALL_ERRORS); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu index 6d79fdc98ef..6a3bd69de81 100644 --- a/cpp/tests/io/json/json_whitespace_normalization_test.cu +++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu @@ -34,129 +34,127 @@ // Base test fixture for tests struct JsonWSNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& host_input, std::string const& expected_host_output) -{ - // Prepare cuda stream for data transfers & kernels - auto stream_view = cudf::test::get_default_stream(); - - auto device_input = rmm::device_buffer( - host_input.c_str(), host_input.size(), stream_view, cudf::get_current_device_resource_ref()); - - // Preprocessing FST - cudf::io::datasource::owning_buffer device_data(std::move(device_input)); - cudf::io::json::detail::normalize_whitespace( - device_data, stream_view, cudf::get_current_device_resource_ref()); - - std::string preprocessed_host_output(device_data.size(), 0); - CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), - device_data.data(), - preprocessed_host_output.size(), - cudaMemcpyDeviceToHost, - stream_view.value())); - - stream_view.synchronize(); - ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size()); - CUDF_TEST_EXPECT_VECTOR_EQUAL( - preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); -} - -TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces) +TEST_F(JsonWSNormalizationTest, ReadJsonOption) { - std::string input = R"({ "A" : "TEST" })"; - std::string output = R"({"A":"TEST"})"; - run_test(input, output); -} + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace -TEST_F(JsonWSNormalizationTest, GroundTruth_MoreSpaces) -{ - std::string input = R"({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": {"c": "d"}})"; - std::string output = R"({"a":[1,2,3,4,5,6,7,8],"b":{"c":"d"}})"; - run_test(input, output); -} + // Test input + std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true); -TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesInString) -{ - std::string input = R"({" a ":50})"; - std::string output = R"({" a ":50})"; - run_test(input, output); -} + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); -TEST_F(JsonWSNormalizationTest, GroundTruth_NewlineInString) -{ - std::string input = "{\"a\" : \"x\ny\"}\n{\"a\" : \"x\\ny\"}"; - std::string output = "{\"a\":\"x\ny\"}\n{\"a\":\"x\\ny\"}"; - run_test(input, output); -} + // Expected table + std::string const expected_input = R"({ "a" : {"b":"c"}})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false); -TEST_F(JsonWSNormalizationTest, GroundTruth_Tabs) -{ - std::string input = "{\"a\":\t\"b\"}"; - std::string output = R"({"a":"b"})"; - run_test(input, output); + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } -TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesAndTabs) +TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows) { - std::string input = "{\"A\" : \t\"TEST\" }"; - std::string output = R"({"A":"TEST"})"; - run_test(input, output); -} + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace -TEST_F(JsonWSNormalizationTest, GroundTruth_MultilineJSONWithSpacesAndTabs) -{ - std::string input = - "{ \"foo rapids\": [1,2,3], \"bar\trapids\": 123 }\n\t{ \"foo rapids\": { \"a\": 1 }, " - "\"bar\trapids\": 456 }"; - std::string output = - "{\"foo rapids\":[1,2,3],\"bar\trapids\":123}\n{\"foo rapids\":{\"a\":1},\"bar\trapids\":456}"; - run_test(input, output); -} + // Test input + std::string const host_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{ "YY": 1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [ { "EE": "efg" } ] } } + )"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); -TEST_F(JsonWSNormalizationTest, GroundTruth_PureJSONExample) -{ - std::string input = R"([{"a":50}, {"a" : 60}])"; - std::string output = R"([{"a":50},{"a":60}])"; - run_test(input, output); -} + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); -TEST_F(JsonWSNormalizationTest, GroundTruth_NoNormalizationRequired) -{ - std::string input = R"({"a\\n\r\a":50})"; - std::string output = R"({"a\\n\r\a":50})"; - run_test(input, output); -} + // Expected table + std::string const expected_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{"YY":1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [{"EE":"efg"}] } } + )"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); -TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput) -{ - std::string input = "{\"a\" : \"b }\n{ \"c \" :\t\"d\"}"; - std::string output = "{\"a\":\"b }\n{\"c \":\"d\"}"; - run_test(input, output); + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } -TEST_F(JsonWSNormalizationTest, ReadJsonOption) +TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows_NoMixedType) { // When mixed type fields are read as strings, the table read will differ depending the // value of normalize_whitespace // Test input - std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + std::string const host_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{ "YY": 1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [ { "EE": "efg" }, { "YY" : "abc" } ] } } + { "Root": { "Key": [ { "YY" : "abc" } ] } } + )"; + + std::map dtype_schema{ + {"Key", {cudf::data_type{cudf::type_id::STRING}}}}; + cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{host_input.data(), host_input.size()}) + .dtypes(dtype_schema) .lines(true) - .mixed_types_as_string(true) - .normalize_whitespace(true); + .prune_columns(true) + .normalize_whitespace(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); // Expected table - std::string const expected_input = R"({ "a" : {"b":"c"}})"; + std::string const expected_input = R"( + { "Root": { "Key": [ { "EE": tr ue } , { "YY" : 2 } ] } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{"YY":1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [{"EE":"efg"},{"YY":"abc"}] } } + { "Root": { "Key": [{"YY":"abc"}] } } + )"; + cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{expected_input.data(), expected_input.size()}) + .dtypes(dtype_schema) .lines(true) - .mixed_types_as_string(true) - .normalize_whitespace(false); + .prune_columns(true) + .normalize_whitespace(false) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 7359f0406fc..915717713df 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {}; TEST_F(PercentileApproxTest, EmptyInput) { - auto empty_ = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; std::vector input; diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index c816316d0ff..acf850c7a66 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -613,6 +615,63 @@ TEST_F(StringsContainsTests, MultiLine) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } +TEST_F(StringsContainsTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé", + "qqq\rzzé" LINE_SEPARATOR "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\nzzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto ml_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, ml_flags); + + auto expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + auto results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); + results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto counts = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + + pattern = std::string("q.*l"); + prog = cudf::strings::regex_program::create(pattern); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // inst ANY will stop matching on first 'newline' and so should not match anything here + prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // including the DOTALL flag accepts the newline characters + auto dot_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::DOTALL); + prog = cudf::strings::regex_program::create(pattern, dot_flags); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, EndOfString) { auto input = cudf::test::strings_column_wrapper( diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index b26cbd5a549..1491da758d5 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -14,9 +14,12 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include +#include #include #include @@ -200,6 +203,43 @@ TEST_F(StringsExtractTests, DotAll) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsExtractTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé", + "qqq" LINE_SEPARATOR "zzé\rlll", + "zzé", + "", + "zzé" NEXT_LINE, + "abc" PARAGRAPH_SEPARATOR "zzé\n"}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::extract(view, *prog); + auto expected = + cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::extract(view, *prog_ml); + expected = + cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + prog = cudf::strings::regex_program::create("q(q.*l)l"); + expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""}, + {0, 1, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + // expect no matches here since the newline(s) interrupts the pattern + prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4582dcb1e38..47606b9b3ed 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -80,6 +82,32 @@ TEST_F(StringsFindallTests, DotAll) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(StringsFindallTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé", + "qqq\nzzé" PARAGRAPH_SEPARATOR "lll", + "zzé", + "", + "zzé\r", + "zzé" LINE_SEPARATOR "zzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::findall(view, *prog); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::findall(view, *prog_ml); + LCW expected_ml( + {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 8c0482653fb..9847d8d6bb5 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -245,6 +247,53 @@ TEST_F(StringsReplaceRegexTest, Multiline) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); } +TEST_F(StringsReplaceRegexTest, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + auto view = cudf::strings_column_view(input); + auto repl = cudf::string_scalar("_"); + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::replace_re(view, *prog, repl); + auto expected = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_re(view, *prog_ml, repl); + expected = cudf::test::strings_column_wrapper({"_" NEXT_LINE "qqq" NEXT_LINE "_", + "qqq" NEXT_LINE "_" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\r_\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto repl_template = std::string("[\\1]"); + pattern = std::string("(^zzé$)"); + prog = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_with_backrefs(view, *prog, repl_template); + expected = cudf::test::strings_column_wrapper({"[zzé]" NEXT_LINE "qqq" NEXT_LINE "[zzé]", + "qqq" NEXT_LINE "[zzé]" NEXT_LINE "lll", + "[zzé]", + "", + "[zzé]" PARAGRAPH_SEPARATOR, + "abc\r[zzé]\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", diff --git a/cpp/tests/strings/slice_tests.cpp b/cpp/tests/strings/slice_tests.cpp index 52e439bd93f..7f7fd9d521b 100644 --- a/cpp/tests/strings/slice_tests.cpp +++ b/cpp/tests/strings/slice_tests.cpp @@ -268,6 +268,25 @@ TEST_F(StringsSliceTest, MaxPositions) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsSliceTest, MultiByteChars) +{ + auto input = cudf::test::strings_column_wrapper({ + // clang-format off + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "the following code snippet demonstrates how to use search for values in an ordered range " + // this placement tests proper multi-byte chars handling ------vvvvv + "it returns the last position where value could be inserted without the ééééé ordering ", + "algorithms execution is parallelized as determined by an execution policy; this is a 12345" + "continuation of previous row to make sure string boundaries are honored 012345678901234567" + // v--- this one also + "01234567890é34567890012345678901234567890" + // clang-format on + }); + + auto results = cudf::strings::slice_strings(cudf::strings_column_view(input), 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); +} + TEST_F(StringsSliceTest, Error) { cudf::test::strings_column_wrapper strings{"this string intentionally left blank"}; diff --git a/cpp/tests/strings/special_chars.h b/cpp/tests/strings/special_chars.h new file mode 100644 index 00000000000..0d630f6bb52 --- /dev/null +++ b/cpp/tests/strings/special_chars.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf::test { + +// special new-line characters for use with regex_flags::EXT_NEWLINE +#define NEXT_LINE "\xC2\x85" +#define LINE_SEPARATOR "\xE2\x80\xA8" +#define PARAGRAPH_SEPARATOR "\xE2\x80\xA9" + +} // namespace cudf::test diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 7575a3ba846..e23f3f6e7d8 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -139,6 +139,41 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } +TEST_F(MinHashTest, WordsMinHash) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto validity = cudf::test::iterators::null_at(1); + + LCWS input( + {LCWS({"hello", "abcdéfgh"}), + LCWS{}, + LCWS({"rapids", "moré", "test", "text"}), + LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, + validity); + + auto view = cudf::lists_column_view(input); + + auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); + auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); + using LCW32 = cudf::test::lists_column_wrapper; + LCW32 expected({LCW32{2069617641u, 1975382903u}, + LCW32{}, + LCW32{657297235u, 1010955999u}, + LCW32{644643885u, 310002789u}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); + auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); + using LCW64 = cudf::test::lists_column_wrapper; + LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, + LCW64{}, + LCW64{5331949571924938590ul, 2088583894581919741ul}, + LCW64{3400468157617183341ul, 2398577492366130055ul}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + TEST_F(MinHashTest, EmptyTest) { auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); diff --git a/dependencies.yaml b/dependencies.yaml index 483335c02ff..7a13043cc5f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -710,7 +710,16 @@ dependencies: - numpy==1.23.* - pandas==2.0.* - pyarrow==14.0.0 - - cupy==12.0.0 # ignored as pip constraint + - matrix: + packages: + - output_types: conda + matrices: + - matrix: {dependencies: "oldest", arch: "aarch64", cuda: "12.*"} + packages: + - cupy==12.2.0 # cupy 12.2.0 is the earliest with CUDA 12 ARM packages. + - matrix: {dependencies: "oldest"} + packages: + - cupy==12.0.0 - matrix: packages: - output_types: requirements diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md index 0398a8d7086..41838e01dd9 100644 --- a/docs/cudf/source/cudf_pandas/usage.md +++ b/docs/cudf/source/cudf_pandas/usage.md @@ -120,3 +120,23 @@ To profile a script being run from the command line, pass the ```bash python -m cudf.pandas --profile script.py ``` + +### cudf.pandas CLI Features + +Several of the ways to provide input to the `python` interpreter also work with `python -m cudf.pandas`, such as the REPL, the `-c` flag, and reading from stdin. + +Executing `python -m cudf.pandas` with no script name will enter a REPL (read-eval-print loop) similar to the behavior of the normal `python` interpreter. + +The `-c` flag accepts a code string to run, like this: + +```bash +$ python -m cudf.pandas -c "import pandas; print(pandas)" + +``` + +Users can also provide code to execute from stdin, like this: + +```bash +$ echo "import pandas; print(pandas)" | python -m cudf.pandas + +``` diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 2eaa75b3189..95f5f9734dd 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -5,9 +5,9 @@ "id": "4c6c548b", "metadata": {}, "source": [ - "# 10 Minutes to cuDF and Dask-cuDF\n", + "# 10 Minutes to cuDF and Dask cuDF\n", "\n", - "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly towards new users.\n", + "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask cuDF, geared mainly towards new users.\n", "\n", "## What are these Libraries?\n", "\n", @@ -18,13 +18,14 @@ "[Dask cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n", "\n", "\n", - "> [!NOTE] \n", - "> This notebook uses the explicit Dask cuDF API (`dask_cudf`) for clarity. However, we strongly recommend that you use Dask's [configuration infrastructure](https://docs.dask.org/en/latest/configuration.html) to set the `\"dataframe.backend\"` to `\"cudf\"`, and work with the `dask.dataframe` API directly. Please see the [Dask cuDF documentation](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) for more information.\n", + "
\n", + "Note: This notebook uses the explicit Dask cuDF API (dask_cudf) for clarity. However, we strongly recommend that you use Dask's configuration infrastructure to set the \"dataframe.backend\" option to \"cudf\", and work with the Dask DataFrame API directly. Please see the Dask cuDF documentation for more information.\n", + "
\n", "\n", "\n", - "## When to use cuDF and Dask-cuDF\n", + "## When to use cuDF and Dask cuDF\n", "\n", - "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF." + "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask cuDF." ] }, { @@ -115,7 +116,7 @@ "source": [ "ds = dask_cudf.from_cudf(s, npartitions=2)\n", "# Note the call to head here to show the first few entries, unlike\n", - "# cuDF objects, dask-cuDF objects do not have a printing\n", + "# cuDF objects, Dask-cuDF objects do not have a printing\n", "# representation that shows values since they may not be in local\n", "# memory.\n", "ds.head(n=3)" @@ -331,11 +332,11 @@ "id": "b17db919", "metadata": {}, "source": [ - "Now we will convert our cuDF dataframe into a dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n", + "Now we will convert our cuDF dataframe into a Dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n", "\n", - "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n", + "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the Dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a Dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n", "\n", - "*To understand more of the differences between how cuDF and dask-cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*" + "*To understand more of the differences between how cuDF and Dask cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*" ] }, { @@ -1680,7 +1681,7 @@ "id": "7aa0089f", "metadata": {}, "source": [ - "Note here we call `compute()` rather than `head()` on the dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)." + "Note here we call `compute()` rather than `head()` on the Dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)." ] }, { @@ -2393,7 +2394,7 @@ "id": "f6094cbe", "metadata": {}, "source": [ - "Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." + "Applying functions to a `Series`. Note that applying user defined functions directly with Dask cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." ] }, { @@ -3492,7 +3493,7 @@ "id": "5ac3b004", "metadata": {}, "source": [ - "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF." + "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask cuDF." ] }, { @@ -4181,7 +4182,7 @@ "id": "aa8a445b", "metadata": {}, "source": [ - "To convert the first few entries to pandas, we similarly call `.head()` on the dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert." + "To convert the first few entries to pandas, we similarly call `.head()` on the Dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert." ] }, { @@ -4899,7 +4900,7 @@ "id": "787eae14", "metadata": {}, "source": [ - "Note that for the dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU." + "Note that for the Dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU." ] }, { diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst new file mode 100644 index 00000000000..06f74a38709 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst @@ -0,0 +1,6 @@ +======= +extract +======= + +.. automodule:: pylibcudf.strings.extract + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 462a756a092..2518afc80a7 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -7,8 +7,10 @@ strings capitalize char_types contains + extract find regex_flags regex_program + repeat replace slice diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst new file mode 100644 index 00000000000..0041fe4c3da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst @@ -0,0 +1,6 @@ +====== +repeat +====== + +.. automodule:: pylibcudf.strings.repeat + :members: diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 9a216690384..7fe6cbd45fa 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -3,39 +3,42 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to dask-cudf's documentation! +Welcome to Dask cuDF's documentation! ===================================== -**Dask-cuDF** (pronounced "DASK KOO-dee-eff") is an extension +**Dask cuDF** (pronounced "DASK KOO-dee-eff") is an extension library for the `Dask `__ parallel computing -framework that provides a `cuDF -`__-backed distributed -dataframe with the same API as `Dask dataframes -`__. +framework. When installed, Dask cuDF is automatically registered +as the ``"cudf"`` dataframe backend for +`Dask DataFrame `__. + +.. note:: + Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU + or multi-node execution on their own. You must also deploy a + `dask.distributed ` cluster + to leverage multiple GPUs. We strongly recommend using `Dask-CUDA + `__ to simplify the + setup of the cluster, taking advantage of all features of the GPU + and networking hardware. If you are familiar with Dask and `pandas `__ or -`cuDF `__, then Dask-cuDF +`cuDF `__, then Dask cuDF should feel familiar to you. If not, we recommend starting with `10 minutes to Dask `__ followed -by `10 minutes to cuDF and Dask-cuDF +by `10 minutes to cuDF and Dask cuDF `__. -When running on multi-GPU systems, `Dask-CUDA -`__ is recommended to -simplify the setup of the cluster, taking advantage of all features of -the GPU and networking hardware. -Using Dask-cuDF +Using Dask cuDF --------------- -When installed, Dask-cuDF registers itself as a dataframe backend for -Dask. This means that in many cases, using cuDF-backed dataframes requires -only small changes to an existing workflow. The minimal change is to -select cuDF as the dataframe backend in :doc:`Dask's -configuration `. To do so, we must set the option -``dataframe.backend`` to ``cudf``. From Python, this can be achieved -like so:: +The Dask DataFrame API (Recommended) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Simply use the `Dask configuration ` system to +set the ``"dataframe.backend"`` option to ``"cudf"``. From Python, +this can be achieved like so:: import dask @@ -44,52 +47,157 @@ like so:: Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the environment before running your code. -Dataframe creation from on-disk formats -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If your workflow creates Dask dataframes from on-disk formats -(for example using :func:`dask.dataframe.read_parquet`), then setting -the backend may well be enough to migrate your workflow. - -For example, consider reading a dataframe from parquet:: +Once this is done, the public Dask DataFrame API will leverage +``cudf`` automatically when a new DataFrame collection is created +from an on-disk format using any of the following ``dask.dataframe`` +functions:: - import dask.dataframe as dd +* :func:`dask.dataframe.read_parquet` +* :func:`dask.dataframe.read_json` +* :func:`dask.dataframe.read_csv` +* :func:`dask.dataframe.read_orc` +* :func:`dask.dataframe.read_hdf` +* :func:`dask.dataframe.from_dict` - # By default, we obtain a pandas-backed dataframe - df = dd.read_parquet("data.parquet", ...) +For example:: + import dask.dataframe as dd -To obtain a cuDF-backed dataframe, we must set the -``dataframe.backend`` configuration option:: + # By default, we obtain a pandas-backed dataframe + df = dd.read_parquet("data.parquet", ...) import dask - import dask.dataframe as dd dask.config.set({"dataframe.backend": "cudf"}) - # This gives us a cuDF-backed dataframe + # This now gives us a cuDF-backed dataframe df = dd.read_parquet("data.parquet", ...) -This code will use cuDF's GPU-accelerated :func:`parquet reader -` to read partitions of the data. +When other functions are used to create a new collection +(e.g. :func:`from_map`, :func:`from_pandas`, :func:`from_delayed`, +and :func:`from_array`), the backend of the new collection will +depend on the inputs to those functions. For example:: + + import pandas as pd + import cudf + + # This gives us a pandas-backed dataframe + dd.from_pandas(pd.DataFrame({"a": range(10)})) + + # This gives us a cuDF-backed dataframe + dd.from_pandas(cudf.DataFrame({"a": range(10)})) + +An existing collection can always be moved to a specific backend +using the :func:`dask.dataframe.DataFrame.to_backend` API:: + + # This ensures that we have a cuDF-backed dataframe + df = df.to_backend("cudf") + + # This ensures that we have a pandas-backed dataframe + df = df.to_backend("pandas") + +The explicit Dask cuDF API +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In addition to providing the ``"cudf"`` backend for Dask DataFrame, +Dask cuDF also provides an explicit ``dask_cudf`` API:: + + import dask_cudf + + # This always gives us a cuDF-backed dataframe + df = dask_cudf.read_parquet("data.parquet", ...) + +This API is used implicitly by the Dask DataFrame API when the ``"cudf"`` +backend is enabled. Therefore, using it directly will not provide any +performance benefit over the CPU/GPU-portable ``dask.dataframe`` API. +Also, using some parts of the explicit API are incompatible with +automatic query planning (see the next section). + +The explicit Dask cuDF API +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). +As long as the ``"dataframe.query-planning"`` configuration is set to +``True`` (the default) when ``dask.dataframe`` is first imported, `Dask +Expressions `__ will be used under the hood. + +For example, the following code will automatically benefit from predicate +pushdown when the result is computed:: + + df = dd.read_parquet("/my/parquet/dataset/") + result = df.sort_values('B')['A'] + +Unoptimized expression graph (``df.pprint()``):: + + Projection: columns='A' + SortValues: by=['B'] shuffle_method='tasks' options={} + ReadParquetFSSpec: path='/my/parquet/dataset/' ... + +Simplified expression graph (``df.simplify().pprint()``):: + + Projection: columns='A' + SortValues: by=['B'] shuffle_method='tasks' options={} + ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ... + +.. note:: + Dask will automatically simplify the expression graph (within + :func:`optimize`) when the result is converted to a task graph + (via :func:`compute` or :func:`persist`). You do not need to call + :func:`simplify` yourself. + + +Using Multiple GPUs and Multiple Nodes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try +to partition your data into small-enough tasks to fit comfortably in the +memory of a single GPU. This means the necessary compute tasks needed to +compute a query can often be streamed to a single GPU process for +out-of-core computing. This also means that the compute tasks can be +executed in parallel over a multi-GPU cluster. + +In order to execute your Dask workflow on multiple GPUs, you will +typically need to use `Dask-CUDA `__ +to deploy distributed Dask cluster, and +`Distributed `__ +to define a client object. For example:: + + from dask_cuda import LocalCUDACluster + from distributed import Client + + if __name__ == "__main__": + + client = Client( + LocalCUDACluster( + CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) + rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations + enable_cudf_spill=True, # Improve device memory stability + local_directory="/fast/scratch/", # Use fast local storage for spilling + ) + ) + + df = dd.read_parquet("/my/parquet/dataset/") + agg = df.groupby('B').sum() + agg.compute() # This will use the cluster defined above + +.. note:: + This example uses :func:`compute` to materialize a concrete + ``cudf.DataFrame`` object in local memory. Never call :func:`compute` + on a large collection that cannot fit comfortably in the memory of a + single GPU! See Dask's `documentation on managing computation + `__ + for more details. -Dataframe creation from in-memory formats -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Please see the `Dask-CUDA `__ +documentation for more information about deploying GPU-aware clusters +(including `best practices +`__). -If you already have a dataframe in memory and want to convert it to a -cuDF-backend one, there are two options depending on whether the -dataframe is already a Dask one or not. If you have a Dask dataframe, -then you can call :func:`dask.dataframe.to_backend` passing ``"cudf"`` -as the backend; if you have a pandas dataframe then you can either -call :func:`dask.dataframe.from_pandas` followed by -:func:`~dask.dataframe.to_backend` or first convert the dataframe with -:func:`cudf.from_pandas` and then parallelise this with -:func:`dask_cudf.from_cudf`. API Reference ------------- -Generally speaking, Dask-cuDF tries to offer exactly the same API as -Dask itself. There are, however, some minor differences mostly because +Generally speaking, Dask cuDF tries to offer exactly the same API as +Dask DataFrame. There are, however, some minor differences mostly because cuDF does not :doc:`perfectly mirror ` the pandas API, or because cuDF provides additional configuration flags (these mostly occur in data reading and writing interfaces). @@ -97,7 +205,7 @@ flags (these mostly occur in data reading and writing interfaces). As a result, straightforward workflows can be migrated without too much trouble, but more complex ones that utilise more features may need a bit of tweaking. The API documentation describes details of the -differences and all functionality that Dask-cuDF supports. +differences and all functionality that Dask cuDF supports. .. toctree:: :maxdepth: 2 diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 5a0fbd224ad..6a0f0f6f169 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -218,7 +218,13 @@ static long initViewHandle(DType type, int numRows, int nullCount, od, vd, nullCount, numRows, childHandles); } - static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { + /** + * Creates a ColumnVector from a native column_view using a contiguous device allocation. + * + * @param columnViewAddress address of the native column_view + * @param buffer device buffer containing the data referenced by the column view + */ + public static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { return new ColumnVector(columnViewAddress, buffer); } diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java index 7b58817550d..8c8180436e6 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,9 +24,13 @@ */ public final class ParquetWriterOptions extends CompressionMetadataWriterOptions { private final StatisticsFrequency statsGranularity; + private int rowGroupSizeRows; + private long rowGroupSizeBytes; private ParquetWriterOptions(Builder builder) { super(builder); + this.rowGroupSizeRows = builder.rowGroupSizeRows; + this.rowGroupSizeBytes = builder.rowGroupSizeBytes; this.statsGranularity = builder.statsGranularity; } @@ -51,18 +55,38 @@ public static Builder builder() { return new Builder(); } + public int getRowGroupSizeRows() { + return rowGroupSizeRows; + } + + public long getRowGroupSizeBytes() { + return rowGroupSizeBytes; + } + public StatisticsFrequency getStatisticsFrequency() { return statsGranularity; } public static class Builder extends CompressionMetadataWriterOptions.Builder { + private int rowGroupSizeRows = 1000000; //Max of 1 million rows per row group + private long rowGroupSizeBytes = 128 * 1024 * 1024; //Max of 128MB per row group private StatisticsFrequency statsGranularity = StatisticsFrequency.ROWGROUP; public Builder() { super(); } + public Builder withRowGroupSizeRows(int rowGroupSizeRows) { + this.rowGroupSizeRows = rowGroupSizeRows; + return this; + } + + public Builder withRowGroupSizeBytes(long rowGroupSizeBytes) { + this.rowGroupSizeBytes = rowGroupSizeBytes; + return this; + } + public Builder withStatisticsFrequency(StatisticsFrequency statsGranularity) { this.statsGranularity = statsGranularity; return this; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b250d2d5e8b..115aa73ed95 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -334,20 +334,22 @@ private static native long[] readAvroFromDataSource(String[] filterColumnNames, /** * Setup everything to write parquet formatted data to a file. - * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param filename local output path + * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param filename local output path * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetFileBegin(String[] columnNames, @@ -357,6 +359,8 @@ private static native long writeParquetFileBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -368,20 +372,22 @@ private static native long writeParquetFileBegin(String[] columnNames, /** * Setup everything to write parquet formatted data to a buffer. - * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param consumer consumer of host buffers produced. + * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetBufferBegin(String[] columnNames, @@ -391,6 +397,8 @@ private static native long writeParquetBufferBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -1834,6 +1842,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), @@ -1854,6 +1864,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index fa10ebf03b8..edc40a315ac 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2156,6 +2156,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2211,6 +2213,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + .row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) @@ -2233,6 +2237,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2286,6 +2292,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + .row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 56fe63598d9..830f2b33b32 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -9122,7 +9122,11 @@ void testParquetWriteToBufferChunked() { columns.add(Columns.STRUCT.name); WriteUtils.buildWriterOptions(optBuilder, columns); ParquetWriterOptions options = optBuilder.build(); - ParquetWriterOptions optionsNoCompress = optBuilder.withCompressionType(CompressionType.NONE).build(); + ParquetWriterOptions optionsNoCompress = + optBuilder.withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) + .build(); try (Table table0 = getExpectedFileTable(columns); MyBufferConsumer consumer = new MyBufferConsumer()) { try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { @@ -9208,6 +9212,8 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException { .withDecimalColumn("_c7", 4) .withDecimalColumn("_c8", 6) .withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE) .build(); try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) { diff --git a/python/cudf/benchmarks/pytest.ini b/python/cudf/benchmarks/pytest.ini index db24415ef9e..187d91996b2 100644 --- a/python/cudf/benchmarks/pytest.ini +++ b/python/cudf/benchmarks/pytest.ini @@ -6,3 +6,4 @@ python_classes = Bench python_functions = bench_* markers = pandas_incompatible: mark a benchmark that cannot be run with pandas +addopts = --tb=native diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index e661059faa3..e6c2d136f0d 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -23,9 +23,9 @@ def concat_columns(object columns): def concat_tables(object tables, bool ignore_index=False): plc_tables = [] for table in tables: - cols = table._data.columns + cols = table._columns if not ignore_index: - cols = table._index._data.columns + cols + cols = table._index._columns + cols plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols])) return data_from_pylibcudf_table( diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 16182e31c08..49714091f46 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -384,7 +384,7 @@ cdef class _CPackedColumns: p.column_names = input_table._column_names p.column_dtypes = {} - for name, col in input_table._data.items(): + for name, col in input_table._column_labels_and_values: if isinstance(col.dtype, cudf.core.dtypes._BaseDtype): p.column_dtypes[name] = col.dtype diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 058e884e08b..9ad96f610b3 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -273,7 +273,7 @@ def read_csv( elif isinstance(dtype, abc.Collection): for index, col_dtype in enumerate(dtype): if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): - col_name = df._data.names[index] + col_name = df._column_names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) if names is not None and len(names) and isinstance(names[0], int): diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index b1900138d94..564daefbae2 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -179,7 +179,7 @@ cdef update_struct_field_names( ): # Deprecated, remove in favor of add_col_struct_names # when a reader is ported to pylibcudf - for i, (name, col) in enumerate(table._data.items()): + for i, (name, col) in enumerate(table._column_labels_and_values): table._data[name] = update_column_struct_field_names( col, schema_info[i] ) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5ee15d0e409..59cb8d51440 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, ) from pylibcudf.libcudf.types cimport size_type @@ -54,3 +56,39 @@ def minhash64(Column strings, Column seeds, int width): ) return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash64(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash64( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index a0155671a26..fa2690c7f21 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -235,16 +235,16 @@ cdef object _process_metadata(object df, df._index = idx elif set(index_col).issubset(names): index_data = df[index_col] - actual_index_names = list(index_col_names.values()) - if len(index_data._data) == 1: + actual_index_names = iter(index_col_names.values()) + if index_data._num_columns == 1: idx = cudf.Index._from_column( - index_data._data.columns[0], - name=actual_index_names[0] + index_data._columns[0], + name=next(actual_index_names) ) else: idx = cudf.MultiIndex.from_frame( index_data, - names=actual_index_names + names=list(actual_index_names) ) df.drop(columns=index_col, inplace=True) df._index = idx @@ -252,7 +252,7 @@ cdef object _process_metadata(object df, if use_pandas_metadata: df.index.names = index_col - if len(df._data.names) == 0 and column_index_type is not None: + if df._num_columns == 0 and column_index_type is not None: df._data.label_dtype = cudf.dtype(column_index_type) return df @@ -438,7 +438,7 @@ def write_parquet( object statistics="ROWGROUP", object metadata_file_path=None, object int96_timestamps=False, - object row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + object row_group_size_bytes=None, object row_group_size_rows=None, object max_page_size_bytes=None, object max_page_size_rows=None, @@ -616,9 +616,9 @@ cdef class ParquetWriter: Name of the compression to use. Use ``None`` for no compression. statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' Level at which column statistics should be included in file. - row_group_size_bytes: int, default 134217728 + row_group_size_bytes: int, default ``uint64 max`` Maximum size of each stripe of the output. - By default, 134217728 (128MB) will be used. + By default, a virtually infinite size equal to ``uint64 max`` will be used. row_group_size_rows: int, default 1000000 Maximum number of rows of each stripe of the output. By default, 1000000 (10^6 rows) will be used. @@ -661,11 +661,11 @@ cdef class ParquetWriter: def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", - int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, - int row_group_size_rows=1000000, - int max_page_size_bytes=524288, - int max_page_size_rows=20000, - int max_dictionary_size=1048576, + size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + size_type row_group_size_rows=1000000, + size_t max_page_size_bytes=524288, + size_type max_page_size_rows=20000, + size_t max_dictionary_size=1048576, bool use_dictionary=True, bool store_schema=False): filepaths_or_buffers = ( diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 47a194c4fda..4bf8a9b1a8f 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix from cudf._lib.nvtext.generate_ngrams import ( generate_character_ngrams, @@ -6,7 +6,12 @@ hash_character_ngrams, ) from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import minhash, minhash64 +from cudf._lib.nvtext.minhash import ( + minhash, + minhash64, + word_minhash, + word_minhash64, +) from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces from cudf._lib.nvtext.replace import filter_tokens, replace_tokens diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 82f5e06c547..03b4887f200 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -1,27 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.contains cimport ( - count_re as cpp_count_re, - like as cpp_like, - matches_re as cpp_matches_re, -) -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar from pylibcudf.strings import contains from pylibcudf.strings.regex_program import RegexProgram @@ -45,21 +28,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column with count of occurrences of `reg_ex` in each string of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_count_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.count_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() @@ -68,21 +40,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column with each value True if the string matches `reg_ex` regular expression with each record of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_matches_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() @@ -91,24 +52,9 @@ def like(Column source_strings, object py_pattern, object py_escape): Returns a Column with each value True if the string matches the `py_pattern` like expression with each record of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef DeviceScalar pattern = py_pattern.device_value - cdef DeviceScalar escape = py_escape.device_value - - cdef const string_scalar* scalar_ptn = ( - pattern.get_raw_ptr() - ) - cdef const string_scalar* scalar_esc = ( - escape.get_raw_ptr() + plc_column = contains.like( + source_strings.to_pylibcudf(mode="read"), + py_pattern.device_value.c_value, + py_escape.device_value.c_value, ) - - with nogil: - c_result = move(cpp_like( - source_view, - scalar_ptn[0], - scalar_esc[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 63f4d57e562..5bf336f4f3c 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -1,21 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.extract cimport extract as cpp_extract -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -26,21 +17,8 @@ def extract(Column source_strings, object pattern, uint32_t flags): The returning data contains one row for each subject string, and one column for each group. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_extract( - source_view, - dereference(c_prog) - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags) + plc_result = plc.strings.extract.extract( + source_strings.to_pylibcudf(mode="read"), prog ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns())) diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx index 42fcfa5d94e..43649d4defe 100644 --- a/python/cudf/cudf/_lib/strings/repeat.pyx +++ b/python/cudf/cudf/_lib/strings/repeat.pyx @@ -1,17 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def repeat_scalar(Column source_strings, @@ -21,16 +16,11 @@ def repeat_scalar(Column source_strings, each string in `source_strings` `repeats` number of times. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats + ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -41,14 +31,8 @@ def repeat_sequence(Column source_strings, each string in `source_strings` `repeats` number of times. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repeats_view = repeats.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index cae28d02ef4..8660cca9322 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -49,9 +49,9 @@ cdef table_view table_view_from_table(tbl, ignore_index=False) except*: If True, don't include the index in the columns. """ return table_view_from_columns( - tbl._index._data.columns + tbl._data.columns + tbl._index._columns + tbl._columns if not ignore_index and tbl._index is not None - else tbl._data.columns + else tbl._columns ) @@ -62,7 +62,7 @@ cpdef generate_pandas_metadata(table, index): index_descriptors = [] columns_to_convert = list(table._columns) # Columns - for name, col in table._data.items(): + for name, col in table._column_labels_and_values: if cudf.get_option("mode.pandas_compatible"): # in pandas-compat mode, non-string column names are stringified. col_names.append(str(name)) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ff114474aa4..a6abd63d042 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1951,7 +1951,7 @@ def drop_duplicates( return self._from_columns_like_self( drop_duplicates( list(self._columns), - keys=range(len(self._data)), + keys=range(len(self._columns)), keep=keep, nulls_are_equal=nulls_are_equal, ), diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 16e6908f308..4463e3280df 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -623,11 +623,9 @@ def extract( "unsupported value for `flags` parameter" ) - data, _ = libstrings.extract(self._column, pat, flags) + data = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: - data = next(iter(data.values())) - else: - data = data + _, data = data.popitem() return self._return_or_inplace(data, expand=expand) def contains( @@ -5349,6 +5347,76 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint32. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + >>> ls.str.word_minhash(seeds=seeds) + 0 [21141582, 1232889953, 1268336794] + 1 [962346254, 2321233602, 1354839212] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint32, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash(self._column, seeds_column) + ) + + def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + This function generates 2 uint64 values but only the first + uint64 value is used. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint64. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + >>> ls.str.word_minhash64(seeds) + 0 [2603139454418834912, 8644371945174847701, 5541030711534384340] + 1 [5240044617220523711, 5847101123925041457, 153762819128779913] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint64, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash64(self._column, seeds_column) + ) + def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ Compute the Jaccard index between this column and the given diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 09b0f453692..bc093fdaa9a 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -151,9 +151,9 @@ def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None: self.set_by_label(key, value) def __delitem__(self, key: abc.Hashable) -> None: - old_ncols = len(self._data) + old_ncols = len(self) del self._data[key] - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def __len__(self) -> int: @@ -213,7 +213,7 @@ def level_names(self) -> tuple[abc.Hashable, ...]: @property def nlevels(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 if not self.multiindex: return 1 @@ -226,7 +226,7 @@ def name(self) -> abc.Hashable: @cached_property def nrows(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 else: return len(next(iter(self.values()))) @@ -257,9 +257,9 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: Parameters ---------- old_ncols: int - len(self._data) before self._data was modified + len(self) before self._data was modified new_ncols: int - len(self._data) after self._data was modified + len(self) after self._data was modified """ cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: @@ -335,7 +335,7 @@ def insert( if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") - old_ncols = len(self._data) + old_ncols = len(self) if loc == -1: loc = old_ncols elif not (0 <= loc <= old_ncols): @@ -414,7 +414,7 @@ def get_labels_by_index(self, index: Any) -> tuple: tuple """ if isinstance(index, slice): - start, stop, step = index.indices(len(self._data)) + start, stop, step = index.indices(len(self)) return self.names[start:stop:step] elif pd.api.types.is_integer(index): return (self.names[index],) @@ -526,9 +526,9 @@ def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None: if len(self) > 0 and len(value) != self.nrows: raise ValueError("All columns must be of equal length") - old_ncols = len(self._data) + old_ncols = len(self) self._data[key] = value - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def _select_by_label_list_like(self, key: tuple) -> Self: @@ -718,12 +718,12 @@ def droplevel(self, level: int) -> None: if level < 0: level += self.nlevels - old_ncols = len(self._data) + old_ncols = len(self) self._data = { _remove_key_level(key, level): value # type: ignore[arg-type] for key, value in self._data.items() } - new_ncols = len(self._data) + new_ncols = len(self) self._level_names = ( self._level_names[:level] + self._level_names[level + 1 :] ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 58a16a6d504..16b0aa95c35 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -176,7 +176,7 @@ def _can_downcast_to_series(self, df, arg): return False @_performance_tracking - def _downcast_to_series(self, df, arg): + def _downcast_to_series(self, df: DataFrame, arg): """ "Downcast" from a DataFrame to a Series based on Pandas indexing rules @@ -203,16 +203,16 @@ def _downcast_to_series(self, df, arg): # take series along the axis: if axis == 1: - return df[df._data.names[0]] + return df[df._column_names[0]] else: if df._num_columns > 0: dtypes = df.dtypes.values.tolist() normalized_dtype = np.result_type(*dtypes) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: df[name] = col.astype(normalized_dtype) sr = df.T - return sr[sr._data.names[0]] + return sr[sr._column_names[0]] class _DataFrameLocIndexer(_DataFrameIndexer): @@ -258,7 +258,7 @@ def _getitem_tuple_arg(self, arg): and len(arg) > 1 and is_scalar(arg[1]) ): - return result._data.columns[0].element_indexing(0) + return result._columns[0].element_indexing(0) return result else: if isinstance(arg[0], slice): @@ -310,7 +310,7 @@ def _getitem_tuple_arg(self, arg): else: tmp_col_name = str(uuid4()) cantor_name = "_" + "_".join( - map(str, columns_df._data.names) + map(str, columns_df._column_names) ) if columns_df._data.multiindex: # column names must be appropriate length tuples @@ -1412,7 +1412,7 @@ def __setitem__(self, arg, value): else column.column_empty_like( col, masked=True, newsize=length ) - for key, col in self._data.items() + for key, col in self._column_labels_and_values ) self._data = self._data._from_columns_like_self( new_columns, verify=False @@ -1494,8 +1494,8 @@ def __delitem__(self, name): @_performance_tracking def memory_usage(self, index=True, deep=False) -> cudf.Series: - mem_usage = [col.memory_usage for col in self._data.columns] - names = [str(name) for name in self._data.names] + mem_usage = [col.memory_usage for col in self._columns] + names = [str(name) for name in self._column_names] if index: mem_usage.append(self.index.memory_usage()) names.append("Index") @@ -1725,7 +1725,7 @@ def _concat( [] if are_all_range_index or (ignore_index and not empty_has_index) - else list(f.index._data.columns) + else list(f.index._columns) ) + [f._data[name] if name in f._data else None for name in names] for f in objs @@ -1808,7 +1808,7 @@ def _concat( out.index.dtype, cudf.CategoricalDtype ): out = out.set_index(out.index) - for name, col in out._data.items(): + for name, col in out._column_labels_and_values: out._data[name] = col._with_type_metadata( tables[0]._data[name].dtype ) @@ -1831,13 +1831,13 @@ def astype( errors: Literal["raise", "ignore"] = "raise", ): if is_dict_like(dtype): - if len(set(dtype.keys()) - set(self._data.names)) > 0: + if len(set(dtype.keys()) - set(self._column_names)) > 0: raise KeyError( "Only a column name can be used for the " "key in a dtype mappings argument." ) else: - dtype = {cc: dtype for cc in self._data.names} + dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) def _clean_renderable_dataframe(self, output): @@ -2601,7 +2601,7 @@ def equals(self, other) -> bool: # If all other checks matched, validate names. if ret: for self_name, other_name in zip( - self._data.names, other._data.names + self._column_names, other._column_names ): if self_name != other_name: ret = False @@ -2676,7 +2676,7 @@ def columns(self, columns): ) self._data = ColumnAccessor( - data=dict(zip(pd_columns, self._data.columns)), + data=dict(zip(pd_columns, self._columns)), multiindex=multiindex, level_names=level_names, label_dtype=label_dtype, @@ -2698,7 +2698,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: f"got {len(self)} elements" ) self._data = ColumnAccessor( - data=dict(zip(other.names, self._data.columns)), + data=dict(zip(other.names, self._columns)), multiindex=other.multiindex, rangeindex=other.rangeindex, level_names=other.level_names, @@ -2983,7 +2983,7 @@ def set_index( elif isinstance(col, (MultiIndex, pd.MultiIndex)): if isinstance(col, pd.MultiIndex): col = MultiIndex.from_pandas(col) - data_to_add.extend(col._data.columns) + data_to_add.extend(col._columns) names.extend(col.names) elif isinstance( col, (cudf.Series, cudf.Index, pd.Series, pd.Index) @@ -3110,7 +3110,9 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): ) out = [] - for (name, col), other_col in zip(self._data.items(), other_cols): + for (name, col), other_col in zip( + self._column_labels_and_values, other_cols + ): source_col, other_col = _check_and_cast_columns_with_other( source_col=col, other=other_col, @@ -3314,7 +3316,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): column.column_empty_like( col_data, masked=True, newsize=length ) - for col_data in self._data.values() + for col_data in self._columns ), verify=False, ) @@ -3664,7 +3666,7 @@ def rename( name: col.find_and_replace( to_replace, vals, is_all_na ) - for name, col in self.index._data.items() + for name, col in self.index._column_labels_and_values } ) except OverflowError: @@ -3686,9 +3688,7 @@ def add_prefix(self, prefix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - prefix + col_name for col_name in list(self._data.keys()) - ] + out.columns = [prefix + col_name for col_name in self._column_names] return out @_performance_tracking @@ -3697,9 +3697,7 @@ def add_suffix(self, suffix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - col_name + suffix for col_name in list(self._data.keys()) - ] + out.columns = [col_name + suffix for col_name in self._column_names] return out @_performance_tracking @@ -4805,7 +4803,7 @@ def _func(x): # pragma: no cover # TODO: naive implementation # this could be written as a single kernel result = {} - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: apply_sr = Series._from_column(col) result[name] = apply_sr.apply(_func)._column @@ -5444,7 +5442,7 @@ def to_pandas( out_index = self.index.to_pandas() out_data = { i: col.to_pandas(nullable=nullable, arrow_type=arrow_type) - for i, col in enumerate(self._data.columns) + for i, col in enumerate(self._columns) } out_df = pd.DataFrame(out_data, index=out_index) @@ -5665,14 +5663,16 @@ def to_arrow(self, preserve_index=None) -> pa.Table: index = index._as_int_index() index.name = "__index_level_0__" if isinstance(index, MultiIndex): - index_descr = list(index._data.names) + index_descr = index._column_names index_levels = index.levels else: index_descr = ( index.names if index.name is not None else ("index",) ) data = data.copy(deep=False) - for gen_name, col_name in zip(index_descr, index._data.names): + for gen_name, col_name in zip( + index_descr, index._column_names + ): data._insert( data.shape[1], gen_name, @@ -5681,7 +5681,7 @@ def to_arrow(self, preserve_index=None) -> pa.Table: out = super(DataFrame, data).to_arrow() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[self[col] for col in self._data.names], + columns_to_convert=[self[col] for col in self._column_names], df=self, column_names=out.schema.names, index_levels=index_levels, @@ -5724,12 +5724,12 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None): "column_dtypes is currently not supported." ) members = [("index", self.index.dtype)] if index else [] - members += [(col, self[col].dtype) for col in self._data.names] + members += list(self._dtypes) dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: ret["index"] = self.index.to_numpy() - for col in self._data.names: + for col in self._column_names: ret[col] = self[col].to_numpy() return ret @@ -6059,7 +6059,7 @@ def quantile( ) if columns is None: - columns = data_df._data.names + columns = set(data_df._column_names) if isinstance(q, numbers.Number): q_is_number = True @@ -6084,7 +6084,7 @@ def quantile( # Ensure that qs is non-scalar so that we always get a column back. interpolation = interpolation or "linear" result = {} - for k in data_df._data.names: + for k in data_df._column_names: if k in columns: ser = data_df[k] res = ser.quantile( @@ -6198,7 +6198,7 @@ def make_false_column_like_self(): if isinstance(values, DataFrame) else {name: values._column for name in self._data} ) - for col, self_col in self._data.items(): + for col, self_col in self._column_labels_and_values: if col in other_cols: other_col = other_cols[col] self_is_cat = isinstance(self_col, CategoricalColumn) @@ -6231,13 +6231,13 @@ def make_false_column_like_self(): else: result[col] = make_false_column_like_self() elif is_dict_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: if name in values: result[name] = col.isin(values[name]) else: result[name] = make_false_column_like_self() elif is_list_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: result[name] = col.isin(values) else: raise TypeError( @@ -6292,7 +6292,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): name: filtered._data[name]._get_mask_as_column() if filtered._data[name].nullable else as_column(True, length=len(filtered._data[name])) - for name in filtered._data.names + for name in filtered._column_names } ) mask = mask.all(axis=1) @@ -6342,7 +6342,7 @@ def count(self, axis=0, numeric_only=False): length = len(self) return Series._from_column( as_column([length - col.null_count for col in self._columns]), - index=cudf.Index(self._data.names), + index=cudf.Index(self._column_names), ) _SUPPORT_AXIS_LOOKUP = { @@ -6409,7 +6409,7 @@ def _reduce( return source._apply_cupy_method_axis_1(op, **kwargs) else: axis_0_results = [] - for col_label, col in source._data.items(): + for col_label, col in source._column_labels_and_values: try: axis_0_results.append(getattr(col, op)(**kwargs)) except AttributeError as err: @@ -6634,7 +6634,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared, mask, common_dtype = self._prepare_for_rowwise_op( method, skipna, numeric_only ) - for col in prepared._data.names: + for col in prepared._column_names: if prepared._data[col].nullable: prepared._data[col] = ( prepared._data[col] @@ -6820,7 +6820,7 @@ def select_dtypes(self, include=None, exclude=None): # remove all exclude types inclusion = inclusion - exclude_subtypes - for k, col in self._data.items(): + for k, col in self._column_labels_and_values: infered_type = cudf_dtype_from_pydata_dtype(col.dtype) if infered_type in inclusion: df._insert(len(df._data), k, col) @@ -6840,7 +6840,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -7192,7 +7192,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Compute the column indices that serves as the input for # `interleave_columns` column_idx_df = pd.DataFrame( - data=range(len(self._data)), index=named_levels + data=range(self._num_columns), index=named_levels ) column_indices: list[list[int]] = [] @@ -7392,17 +7392,17 @@ def to_struct(self, name=None): ----- Note: a copy of the columns is made. """ - if not all(isinstance(name, str) for name in self._data.names): + if not all(isinstance(name, str) for name in self._column_names): warnings.warn( "DataFrame contains non-string column name(s). Struct column " "requires field name to be string. Non-string column names " "will be casted to string as the field name." ) - fields = {str(name): col.dtype for name, col in self._data.items()} + fields = {str(name): dtype for name, dtype in self._dtypes} col = StructColumn( data=None, dtype=cudf.StructDtype(fields=fields), - children=tuple(col.copy(deep=True) for col in self._data.columns), + children=tuple(col.copy(deep=True) for col in self._columns), size=len(self), offset=0, ) @@ -7984,7 +7984,7 @@ def value_counts( diff = set(subset) - set(self._data) if len(diff) != 0: raise KeyError(f"columns {diff} do not exist") - columns = list(self._data.names) if subset is None else subset + columns = list(self._column_names) if subset is None else subset result = ( self.groupby( by=columns, @@ -8105,7 +8105,7 @@ def func(left, right, output): right._column_names ) elif _is_scalar_or_zero_d_array(right): - for name, col in output._data.items(): + for name, col in output._column_labels_and_values: output._data[name] = col.fillna(value) return output else: @@ -8387,7 +8387,7 @@ def extract_col(df, col): and col not in df.index._data and not isinstance(df.index, MultiIndex) ): - return df.index._data.columns[0] + return df.index._column return df.index._data[col] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7b2bc85b13b..98af006f6e5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -75,8 +75,15 @@ def _columns(self) -> tuple[ColumnBase, ...]: return self._data.columns @property - def _dtypes(self) -> abc.Iterable: - return zip(self._data.names, (col.dtype for col in self._data.columns)) + def _column_labels_and_values( + self, + ) -> abc.Iterable[tuple[abc.Hashable, ColumnBase]]: + return zip(self._column_names, self._columns) + + @property + def _dtypes(self) -> abc.Generator[tuple[abc.Hashable, Dtype], None, None]: + for label, col in self._column_labels_and_values: + yield label, col.dtype @property def ndim(self) -> int: @@ -87,7 +94,7 @@ def serialize(self): # TODO: See if self._data can be serialized outright header = { "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(tuple(self._data.names)), + "column_names": pickle.dumps(self._column_names), "column_rangeindex": pickle.dumps(self._data.rangeindex), "column_multiindex": pickle.dumps(self._data.multiindex), "column_label_dtype": pickle.dumps(self._data.label_dtype), @@ -156,7 +163,7 @@ def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Self | None: if inplace: - for col in self._data: + for col in self._column_names: if col in result._data: self._data[col]._mimic_inplace( result._data[col], inplace=True @@ -267,7 +274,7 @@ def __len__(self) -> int: def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: casted = ( col.astype(dtype.get(col_name, col.dtype), copy=copy) - for col_name, col in self._data.items() + for col_name, col in self._column_labels_and_values ) ca = self._data._from_columns_like_self(casted, verify=False) return self._from_data_like_self(ca) @@ -338,9 +345,7 @@ def equals(self, other) -> bool: return all( self_col.equals(other_col, check_dtypes=True) - for self_col, other_col in zip( - self._data.values(), other._data.values() - ) + for self_col, other_col in zip(self._columns, other._columns) ) @_performance_tracking @@ -434,11 +439,9 @@ def to_array( if dtype is None: if ncol == 1: - dtype = next(iter(self._data.values())).dtype + dtype = next(self._dtypes)[1] else: - dtype = find_common_type( - [col.dtype for col in self._data.values()] - ) + dtype = find_common_type([dtype for _, dtype in self._dtypes]) if not isinstance(dtype, numpy.dtype): raise NotImplementedError( @@ -446,12 +449,12 @@ def to_array( ) if self.ndim == 1: - return to_array(self._data.columns[0], dtype) + return to_array(self._columns[0], dtype) else: matrix = module.empty( shape=(len(self), ncol), dtype=dtype, order="F" ) - for i, col in enumerate(self._data.values()): + for i, col in enumerate(self._columns): # TODO: col.values may fail if there is nullable data or an # unsupported dtype. We may want to catch and provide a more # suitable error. @@ -751,7 +754,7 @@ def fillna( filled_columns = [ col.fillna(value[name], method) if name in value else col.copy() - for name, col in self._data.items() + for name, col in self._column_labels_and_values ] return self._mimic_inplace( @@ -988,7 +991,10 @@ def to_arrow(self): index: [[1,2,3]] """ return pa.Table.from_pydict( - {str(name): col.to_arrow() for name, col in self._data.items()} + { + str(name): col.to_arrow() + for name, col in self._column_labels_and_values + } ) @_performance_tracking @@ -1012,7 +1018,9 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: See `ColumnBase._with_type_metadata` for more information. """ - for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes): + for (name, col), (_, dtype) in zip( + self._column_labels_and_values, other._dtypes + ): self._data.set_by_label(name, col._with_type_metadata(dtype)) return self @@ -1422,7 +1430,7 @@ def _split(self, splits): """ return [ self._from_columns_like_self( - libcudf.copying.columns_split([*self._data.columns], splits)[ + libcudf.copying.columns_split(list(self._columns), splits)[ split_idx ], self._column_names, @@ -1432,7 +1440,7 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode([*self._columns]) + columns, indices = libcudf.transform.table_encode(list(self._columns)) keys = self._from_columns_like_self(columns) return keys, indices @@ -1578,7 +1586,7 @@ def __neg__(self): col.unary_operator("not") if col.dtype.kind == "b" else -1 * col - for col in self._data.columns + for col in self._columns ) ) ) @@ -1840,9 +1848,7 @@ def __copy__(self): def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( - self._data._from_columns_like_self( - (~col for col in self._data.columns) - ) + self._data._from_columns_like_self((~col for col in self._columns)) ) @_performance_tracking diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6424c8af877..cb8cd0cd28b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -751,10 +751,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): ) and not libgroupby._is_all_scan_aggregate(normalized_aggs): # Even with `sort=False`, pandas guarantees that # groupby preserves the order of rows within each group. - left_cols = list( - self.grouping.keys.drop_duplicates()._data.columns - ) - right_cols = list(result_index._data.columns) + left_cols = list(self.grouping.keys.drop_duplicates()._columns) + right_cols = list(result_index._columns) join_keys = [ _match_join_keys(lcol, rcol, "left") for lcol, rcol in zip(left_cols, right_cols) @@ -1483,7 +1481,7 @@ def _post_process_chunk_results( # the column name should be, especially if we applied # a nameless UDF. result = result.to_frame( - name=grouped_values._data.names[0] + name=grouped_values._column_names[0] ) else: index_data = group_keys._data.copy(deep=True) @@ -1632,7 +1630,7 @@ def mult(df): if func in {"sum", "product"}: # For `sum` & `product`, boolean types # will need to result in `int64` type. - for name, col in res._data.items(): + for name, col in res._column_labels_and_values: if col.dtype.kind == "b": res._data[name] = col.astype("int") return res @@ -2715,11 +2713,8 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): def _reduce_numeric_only(self, op: str): columns = list( name - for name in self.obj._data.names - if ( - is_numeric_dtype(self.obj._data[name].dtype) - and name not in self.grouping.names - ) + for name, dtype in self.obj._dtypes + if (is_numeric_dtype(dtype) and name not in self.grouping.names) ) return self[columns].agg(op) @@ -3209,7 +3204,7 @@ def values(self) -> cudf.core.frame.Frame: """ # If the key columns are in `obj`, filter them out value_column_names = [ - x for x in self._obj._data.names if x not in self._named_columns + x for x in self._obj._column_names if x not in self._named_columns ] value_columns = self._obj._data.select_by_label(value_column_names) return self._obj.__class__._from_data(value_columns) @@ -3224,8 +3219,8 @@ def _handle_series(self, by): self.names.append(by.name) def _handle_index(self, by): - self._key_columns.extend(by._data.columns) - self.names.extend(by._data.names) + self._key_columns.extend(by._columns) + self.names.extend(by._column_names) def _handle_mapping(self, by): by = cudf.Series(by.values(), index=by.keys()) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b2bd20c4982..cd07c58c5d9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -122,13 +122,13 @@ def _lexsorted_equal_range( sort_inds = None sort_vals = idx lower_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="right", ascending=sort_vals.is_monotonic_increasing, @@ -286,6 +286,20 @@ def name(self): def name(self, value): self._name = value + @property + @_performance_tracking + def _column_names(self) -> tuple[Any]: + return (self.name,) + + @property + @_performance_tracking + def _columns(self) -> tuple[ColumnBase]: + return (self._values,) + + @property + def _column_labels_and_values(self) -> Iterable: + return zip(self._column_names, self._columns) + @property # type: ignore @_performance_tracking def start(self) -> int: @@ -1068,7 +1082,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: inputs = { name: (col, None, False, None) - for name, col in self._data.items() + for name, col in self._column_labels_and_values } data = self._apply_cupy_ufunc_to_operands( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fd6bf37f0e6..810d4ad74e7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -294,7 +294,7 @@ def _num_rows(self) -> int: @property def _index_names(self) -> tuple[Any, ...]: # TODO: Tuple[str]? - return self.index._data.names + return self.index._column_names @classmethod def _from_data( @@ -307,6 +307,7 @@ def _from_data( raise ValueError( f"index must be None or a cudf.Index not {type(index).__name__}" ) + # out._num_rows requires .index to be defined out._index = RangeIndex(out._data.nrows) if index is None else index return out @@ -882,7 +883,7 @@ def replace( columns_dtype_map=dict(self._dtypes), ) copy_data = [] - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: try: replaced = col.find_and_replace( to_replace_per_column[name], @@ -2703,11 +2704,11 @@ def sort_index( by.extend( filter( lambda n: n not in handled, - self.index._data.names, + self.index._column_names, ) ) else: - by = list(idx._data.names) + by = list(idx._column_names) inds = idx._get_sorted_inds( by=by, ascending=ascending, na_position=na_position @@ -3013,7 +3014,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: columns_to_slice = [ *( - self.index._data.columns + self.index._columns if keep_index and not has_range_index else [] ), @@ -3210,7 +3211,7 @@ def _empty_like(self, keep_index=True) -> Self: result = self._from_columns_like_self( libcudf.copying.columns_empty_like( [ - *(self.index._data.columns if keep_index else ()), + *(self.index._columns if keep_index else ()), *self._columns, ] ), @@ -3227,7 +3228,7 @@ def _split(self, splits, keep_index=True): columns_split = libcudf.copying.columns_split( [ - *(self.index._data.columns if keep_index else []), + *(self.index._columns if keep_index else []), *self._columns, ], splits, @@ -3763,8 +3764,8 @@ def _reindex( idx_dtype_match = (df.index.nlevels == index.nlevels) and all( _is_same_dtype(left_dtype, right_dtype) for left_dtype, right_dtype in zip( - (col.dtype for col in df.index._data.columns), - (col.dtype for col in index._data.columns), + (dtype for _, dtype in df.index._dtypes), + (dtype for _, dtype in index._dtypes), ) ) @@ -3783,7 +3784,7 @@ def _reindex( (name or 0) if isinstance(self, cudf.Series) else name: col - for name, col in df._data.items() + for name, col in df._column_labels_and_values }, index=df.index, ) @@ -3794,7 +3795,7 @@ def _reindex( index = index if index is not None else df.index if column_names is None: - names = list(df._data.names) + names = list(df._column_names) level_names = self._data.level_names multiindex = self._data.multiindex rangeindex = self._data.rangeindex @@ -3948,7 +3949,7 @@ def round(self, decimals=0, how="half_even"): col.round(decimals[name], how=how) if name in decimals and col.dtype.kind in "fiu" else col.copy(deep=True) - for name, col in self._data.items() + for name, col in self._column_labels_and_values ) return self._from_data_like_self( self._data._from_columns_like_self(cols) @@ -4270,7 +4271,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): else: thresh = len(df) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: check_col = col.nans_to_nulls() no_threshold_valid_count = ( len(col) - check_col.null_count @@ -4305,7 +4306,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( - [*self.index._data.columns, *data_columns], + [*self.index._columns, *data_columns], how=how, keys=self._positions_from_column_names( subset, offset_by_index_columns=True @@ -4853,7 +4854,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # This works for Index too inputs = { name: (col, None, False, None) - for name, col in self._data.items() + for name, col in self._column_labels_and_values } index = self.index @@ -4933,7 +4934,7 @@ def repeat(self, repeats, axis=None): """ res = self._from_columns_like_self( Frame._repeat( - [*self.index._data.columns, *self._columns], repeats, axis + [*self.index._columns, *self._columns], repeats, axis ), self._column_names, self._index_names, @@ -6224,7 +6225,7 @@ def _preprocess_subset(self, subset): not np.iterable(subset) or isinstance(subset, str) or isinstance(subset, tuple) - and subset in self._data.names + and subset in self._column_names ): subset = (subset,) diff = set(subset) - set(self._data) @@ -6306,8 +6307,8 @@ def rank( ) numeric_cols = ( name - for name in self._data.names - if _is_non_decimal_numeric_dtype(self._data[name]) + for name, dtype in self._dtypes + if _is_non_decimal_numeric_dtype(dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index b65bc7af832..cfeaca00888 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -140,11 +140,15 @@ def __init__( # right_on. self._using_left_index = bool(left_index) left_on = ( - lhs.index._data.names if left_index else left_on if left_on else on + lhs.index._column_names + if left_index + else left_on + if left_on + else on ) self._using_right_index = bool(right_index) right_on = ( - rhs.index._data.names + rhs.index._column_names if right_index else right_on if right_on @@ -334,18 +338,18 @@ def _merge_results( # All columns from the left table make it into the output. Non-key # columns that share a name with a column in the right table are # suffixed with the provided suffix. - common_names = set(left_result._data.names) & set( - right_result._data.names + common_names = set(left_result._column_names) & set( + right_result._column_names ) cols_to_suffix = common_names - self._key_columns_with_same_name data = { (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col - for name, col in left_result._data.items() + for name, col in left_result._column_labels_and_values } # The right table follows the same rule as the left table except that # key columns from the right table are removed. - for name, col in right_result._data.items(): + for name, col in right_result._column_labels_and_values: if name in common_names: if name not in self._key_columns_with_same_name: data[f"{name}{self.rsuffix}"] = col @@ -399,7 +403,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # producing the input result. by: list[Any] = [] if self._using_left_index and self._using_right_index: - by.extend(result.index._data.columns) + by.extend(result.index._columns) if not self._using_left_index: by.extend([result._data[col.name] for col in self._left_keys]) if not self._using_right_index: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e00890ac5c3..6de3981ba66 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -36,7 +36,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: - from collections.abc import Generator + from collections.abc import Generator, Hashable from typing_extensions import Self @@ -233,8 +233,8 @@ def names(self, value): # to unexpected behavior in some cases. This is # definitely buggy, but we can't disallow non-unique # names either... - self._data = self._data.__class__( - dict(zip(value, self._data.values())), + self._data = type(self._data)( + dict(zip(value, self._columns)), level_names=self._data.level_names, verify=False, ) @@ -693,19 +693,25 @@ def where(self, cond, other=None, inplace=False): @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" - lookup = cudf.DataFrame() + lookup_dict = {} for i, row in enumerate(row_tuple): if isinstance(row, slice) and row == slice(None): continue - lookup[i] = cudf.Series(row) - frame = cudf.DataFrame(dict(enumerate(index._data.columns))) + lookup_dict[i] = row + lookup = cudf.DataFrame(lookup_dict) + frame = cudf.DataFrame._from_data( + ColumnAccessor(dict(enumerate(index._columns)), verify=False) + ) with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) data_table = cudf.concat( [ frame, cudf.DataFrame._from_data( - {"idx": column.as_column(range(len(frame)))} + ColumnAccessor( + {"idx": column.as_column(range(len(frame)))}, + verify=False, + ) ), ], axis=1, @@ -716,7 +722,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): # TODO: Remove this after merge/join # obtain deterministic ordering. if cudf.get_option("mode.pandas_compatible"): - lookup_order = "_" + "_".join(map(str, lookup._data.names)) + lookup_order = "_" + "_".join(map(str, lookup._column_names)) lookup[lookup_order] = column.as_column(range(len(lookup))) postprocess = operator.methodcaller( "sort_values", by=[lookup_order, "idx"] @@ -784,7 +790,7 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( out_index._num_columns, k, - cudf.Series._from_column(index._data.columns[k]), + cudf.Series._from_column(index._columns[k]), ) # determine if we should downcast from a DataFrame to a Series @@ -800,19 +806,19 @@ def _index_and_downcast(self, result, index, index_key): ) if need_downcast: result = result.T - return result[result._data.names[0]] + return result[result._column_names[0]] if len(result) == 0 and not slice_access: # Pandas returns an empty Series with a tuple as name # the one expected result column result = cudf.Series._from_data( - {}, name=tuple(col[0] for col in index._data.columns) + {}, name=tuple(col[0] for col in index._columns) ) elif out_index._num_columns == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. - *_, last_column = index._data.columns + last_column = index._columns[-1] out_index = cudf.Index._from_column( last_column, name=index.names[-1] ) @@ -894,7 +900,7 @@ def __eq__(self, other): [ self_col.equals(other_col) for self_col, other_col in zip( - self._data.values(), other._data.values() + self._columns, other._columns ) ] ) @@ -1041,9 +1047,11 @@ def to_frame( ) @_performance_tracking - def get_level_values(self, level) -> cudf.Index: + def _level_to_ca_label(self, level) -> tuple[Hashable, int]: """ - Return the values at the requested level + Convert a level to a ColumAccessor label and an integer position. + + Useful if self._column_names != self.names. Parameters ---------- @@ -1051,10 +1059,13 @@ def get_level_values(self, level) -> cudf.Index: Returns ------- - An Index containing the values at the requested level. + tuple[Hashable, int] + (ColumnAccessor label corresponding to level, integer position of the level) """ - colnames = self._data.names - if level not in colnames: + colnames = self._column_names + try: + level_idx = colnames.index(level) + except ValueError: if isinstance(level, int): if level < 0: level = level + len(colnames) @@ -1067,8 +1078,22 @@ def get_level_values(self, level) -> cudf.Index: level = colnames[level_idx] else: raise KeyError(f"Level not found: '{level}'") - else: - level_idx = colnames.index(level) + return level, level_idx + + @_performance_tracking + def get_level_values(self, level) -> cudf.Index: + """ + Return the values at the requested level + + Parameters + ---------- + level : int or label + + Returns + ------- + An Index containing the values at the requested level. + """ + level, level_idx = self._level_to_ca_label(level) level_values = cudf.Index._from_column( self._data[level], name=self.names[level_idx] ) @@ -1420,57 +1445,6 @@ def from_arrays( codes=codes, levels=levels, sortorder=sortorder, names=names ) - @_performance_tracking - def _poplevels(self, level) -> None | MultiIndex | cudf.Index: - """ - Remove and return the specified levels from self. - - Parameters - ---------- - level : level name or index, list - One or more levels to remove - - Returns - ------- - Index composed of the removed levels. If only a single level - is removed, a flat index is returned. If no levels are specified - (empty list), None is returned. - """ - if not pd.api.types.is_list_like(level): - level = (level,) - - ilevels = sorted(self._level_index_from_level(lev) for lev in level) - - if not ilevels: - return None - - popped_data = {} - popped_names = [] - names = list(self.names) - - # build the popped data and names - for i in ilevels: - n = self._data.names[i] - popped_data[n] = self._data[n] - popped_names.append(self.names[i]) - - # pop the levels out from self - # this must be done iterating backwards - for i in reversed(ilevels): - n = self._data.names[i] - names.pop(i) - popped_data[n] = self._data.pop(n) - - # construct the popped result - popped = cudf.core.index._index_from_data(popped_data) - popped.names = popped_names - - # update self - self.names = names - self._levels, self._codes = _compute_levels_and_codes(self._data) - - return popped - @_performance_tracking def swaplevel(self, i=-2, j=-1) -> Self: """ @@ -1507,10 +1481,10 @@ def swaplevel(self, i=-2, j=-1) -> Self: ('aa', 'b')], ) """ - name_i = self._data.names[i] if isinstance(i, int) else i - name_j = self._data.names[j] if isinstance(j, int) else j + name_i = self._column_names[i] if isinstance(i, int) else i + name_j = self._column_names[j] if isinstance(j, int) else j new_data = {} - for k, v in self._data.items(): + for k, v in self._column_labels_and_values: if k not in (name_i, name_j): new_data[k] = v elif k == name_i: @@ -1523,7 +1497,7 @@ def swaplevel(self, i=-2, j=-1) -> Self: return midx @_performance_tracking - def droplevel(self, level=-1) -> MultiIndex | cudf.Index: + def droplevel(self, level=-1) -> Self | cudf.Index: """ Removes the specified levels from the MultiIndex. @@ -1578,11 +1552,24 @@ def droplevel(self, level=-1) -> MultiIndex | cudf.Index: >>> idx.droplevel(["first", "second"]) Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') """ - mi = self.copy(deep=False) - mi._poplevels(level) - if mi.nlevels == 1: - return mi.get_level_values(mi.names[0]) + if is_scalar(level): + level = (level,) + elif len(level) == 0: + return self + + new_names = list(self.names) + new_data = self._data.copy(deep=False) + for i in sorted( + (self._level_index_from_level(lev) for lev in level), reverse=True + ): + new_names.pop(i) + new_data.pop(self._data.names[i]) + + if len(new_data) == 1: + return cudf.core.index._index_from_data(new_data) else: + mi = MultiIndex._from_data(new_data) + mi.names = new_names return mi @_performance_tracking @@ -1886,7 +1873,7 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented - def _level_index_from_level(self, level): + def _level_index_from_level(self, level) -> int: """ Return level index from given level name or index """ @@ -1935,7 +1922,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): join_keys = [ _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip(target._data.columns, self._data.columns) + for lcol, rcol in zip(target._columns, self._columns) ] join_keys = map(list, zip(*join_keys)) scatter_map, indices = libcudf.join.join( @@ -2132,7 +2119,7 @@ def _split_columns_by_levels( lv if isinstance(lv, int) else level_names.index(lv) for lv in levels } - for i, (name, col) in enumerate(zip(self.names, self._data.columns)): + for i, (name, col) in enumerate(zip(self.names, self._columns)): if in_levels and i in level_indices: name = f"level_{i}" if name is None else name yield name, col @@ -2173,9 +2160,7 @@ def _columns_for_reset_index( ) -> Generator[tuple[Any, column.ColumnBase], None, None]: """Return the columns and column names for .reset_index""" if levels is None: - for i, (col, name) in enumerate( - zip(self._data.columns, self.names) - ): + for i, (col, name) in enumerate(zip(self._columns, self.names)): yield f"level_{i}" if name is None else name, col else: yield from self._split_columns_by_levels(levels, in_levels=True) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3d205957126..401fef67ee6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,6 +12,7 @@ from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default +from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty_like from cudf.core.column_accessor import ColumnAccessor @@ -409,7 +410,7 @@ def concat( result_columns = None if keys_objs is None: for o in objs: - for name, col in o._data.items(): + for name, col in o._column_labels_and_values: if name in result_data: raise NotImplementedError( f"A Column with duplicate name found: {name}, cuDF " @@ -437,7 +438,7 @@ def concat( else: # All levels in the multiindex label must have the same type has_multiple_level_types = ( - len({type(name) for o in objs for name in o._data.keys()}) > 1 + len({type(name) for o in objs for name in o._column_names}) > 1 ) if has_multiple_level_types: raise NotImplementedError( @@ -446,7 +447,7 @@ def concat( "the labels to the same type." ) for k, o in zip(keys_objs, objs): - for name, col in o._data.items(): + for name, col in o._column_labels_and_values: # if only series, then only keep keys_objs as column labels # if the existing column is multiindex, prepend it # to handle cases where dfs and srs are concatenated @@ -738,7 +739,8 @@ def get_dummies( sparse : boolean, optional Right now this is NON-FUNCTIONAL argument in rapids. drop_first : boolean, optional - Right now this is NON-FUNCTIONAL argument in rapids. + Whether to get k-1 dummies out of k categorical levels by removing the + first level. columns : sequence of str, optional Names of columns to encode. If not provided, will attempt to encode all columns. Note this is different from pandas default behavior, which @@ -806,9 +808,6 @@ def get_dummies( if sparse: raise NotImplementedError("sparse is not supported yet") - if drop_first: - raise NotImplementedError("drop_first is not supported yet") - if isinstance(data, cudf.DataFrame): encode_fallback_dtypes = ["object", "category"] @@ -844,7 +843,7 @@ def get_dummies( else: result_data = { col_name: col - for col_name, col in data._data.items() + for col_name, col in data._column_labels_and_values if col_name not in columns } @@ -862,6 +861,7 @@ def get_dummies( prefix=prefix_map.get(name, prefix), prefix_sep=prefix_sep_map.get(name, prefix_sep), dtype=dtype, + drop_first=drop_first, ) result_data.update(col_enc_data) return cudf.DataFrame._from_data(result_data, index=data.index) @@ -874,6 +874,7 @@ def get_dummies( prefix=prefix, prefix_sep=prefix_sep, dtype=dtype, + drop_first=drop_first, ) return cudf.DataFrame._from_data(data, index=ser.index) @@ -942,7 +943,7 @@ def _merge_sorted( columns = [ [ - *(obj.index._data.columns if not ignore_index else ()), + *(obj.index._columns if not ignore_index else ()), *obj._columns, ] for obj in objs @@ -984,7 +985,7 @@ def as_tuple(x): return x if isinstance(x, tuple) else (x,) nrows = len(index_labels) - for col_label, col in df._data.items(): + for col_label, col in df._column_labels_and_values: names = [ as_tuple(col_label) + as_tuple(name) for name in column_labels ] @@ -1008,7 +1009,7 @@ def as_tuple(x): ca = ColumnAccessor( result, multiindex=True, - level_names=(None,) + columns._data.names, + level_names=(None,) + columns._column_names, verify=False, ) return cudf.DataFrame._from_data( @@ -1086,11 +1087,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): # Create a DataFrame composed of columns from both # columns and index ca = ColumnAccessor( - dict( - enumerate( - itertools.chain(index._data.columns, columns._data.columns) - ) - ), + dict(enumerate(itertools.chain(index._columns, columns._columns))), verify=False, ) columns_index = cudf.DataFrame._from_data(ca) @@ -1227,13 +1224,24 @@ def unstack(df, level, fill_value=None, sort: bool = True): ) return res else: - df = df.copy(deep=False) - columns = df.index._poplevels(level) - index = df.index - result = _pivot(df, index, columns) - if result.index.nlevels == 1: - result.index = result.index.get_level_values(result.index.names[0]) - return result + index = df.index.droplevel(level) + if is_scalar(level): + columns = df.index.get_level_values(level) + else: + new_names = [] + ca_data = {} + for lev in level: + ca_level, level_idx = df.index._level_to_ca_label(lev) + new_names.append(df.index.names[level_idx]) + ca_data[ca_level] = df.index._data[ca_level] + columns = type(df.index)._from_data( + ColumnAccessor(ca_data, verify=False) + ) + columns.names = new_names + result = _pivot(df, index, columns) + if result.index.nlevels == 1: + result.index = result.index.get_level_values(result.index.names[0]) + return result def _get_unique(column: ColumnBase, dummy_na: bool) -> ColumnBase: @@ -1256,6 +1264,7 @@ def _one_hot_encode_column( prefix: str | None, prefix_sep: str | None, dtype: Dtype | None, + drop_first: bool, ) -> dict[str, ColumnBase]: """Encode a single column with one hot encoding. The return dictionary contains pairs of (category, encodings). The keys may be prefixed with @@ -1276,6 +1285,8 @@ def _one_hot_encode_column( ) data = one_hot_encode(column, categories) + if drop_first and len(data): + data.pop(next(iter(data))) if prefix is not None and prefix_sep is not None: data = {f"{prefix}{prefix_sep}{col}": enc for col, enc in data.items()} if dtype: @@ -1545,7 +1556,7 @@ def pivot_table( if values_passed and not values_multi and table._data.multiindex: column_names = table._data.level_names[1:] table_columns = tuple( - map(lambda column: column[1:], table._data.names) + map(lambda column: column[1:], table._column_names) ) table.columns = pd.MultiIndex.from_tuples( tuples=table_columns, names=column_names diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 7197560b5a4..68f34fa28ff 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -186,7 +186,7 @@ def to_datetime( if isinstance(arg, cudf.DataFrame): # we require at least Ymd required = ["year", "month", "day"] - req = list(set(required) - set(arg._data.names)) + req = list(set(required) - set(arg._column_names)) if len(req): err_req = ",".join(req) raise ValueError( @@ -196,7 +196,7 @@ def to_datetime( ) # replace passed column name with values in _unit_map - got_units = {k: get_units(k) for k in arg._data.names} + got_units = {k: get_units(k) for k in arg._column_names} unit_rev = {v: k for k, v in got_units.items()} # keys we don't recognize diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 265b87350ae..3af662b62ea 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -210,7 +210,7 @@ def _can_be_jitted(frame, func, args): # See https://github.com/numba/numba/issues/4587 return False - if any(col.has_nulls() for col in frame._data.values()): + if any(col.has_nulls() for col in frame._columns): return False np_field_types = np.dtype( list( diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 6d7362952c9..bfe716f0afc 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -126,25 +126,23 @@ def _get_udf_return_type(argty, func: Callable, args=()): def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - if str(col.dtype) in supported_types - else np.dtype("O") - for colname, col in frame._data.items() + colname: dtype if str(dtype) in supported_types else np.dtype("O") + for colname, dtype in frame._dtypes } def _supported_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - for colname, col in frame._data.items() - if str(col.dtype) in supported_types + colname: dtype + for colname, dtype in frame._dtypes + if str(dtype) in supported_types } def _supported_cols_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col - for colname, col in frame._data.items() + for colname, col in frame._column_labels_and_values if str(col.dtype) in supported_types } @@ -232,8 +230,8 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): *cudautils.make_cache_key( func, tuple(_all_dtypes_from_frame(frame).values()) ), - *(col.mask is None for col in frame._data.values()), - *frame._data.keys(), + *(col.mask is None for col in frame._columns), + *frame._column_names, scalar_argtypes, suffix, ) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index a9c20150930..3dc8915bfd1 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -186,13 +186,13 @@ def to_csv( "Dataframe doesn't have the labels provided in columns" ) - for col in df._data.columns: - if isinstance(col, cudf.core.column.ListColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.ListDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "list columns." ) - elif isinstance(col, cudf.core.column.StructColumn): + elif isinstance(dtype, cudf.StructDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "Struct columns." @@ -203,12 +203,11 @@ def to_csv( # workaround once following issue is fixed: # https://github.com/rapidsai/cudf/issues/6661 if any( - isinstance(col, cudf.core.column.CategoricalColumn) - for col in df._data.columns + isinstance(dtype, cudf.CategoricalDtype) for _, dtype in df._dtypes ) or isinstance(df.index, cudf.CategoricalIndex): df = df.copy(deep=False) - for col_name, col in df._data.items(): - if isinstance(col, cudf.core.column.CategoricalColumn): + for col_name, col in df._column_labels_and_values: + if isinstance(col.dtype, cudf.CategoricalDtype): df._data[col_name] = col.astype(col.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 1347b2cc38f..fe8e446f9c0 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -79,13 +79,13 @@ def to_dlpack(cudf_obj): ) if any( - not cudf.api.types._is_non_decimal_numeric_dtype(col.dtype) - for col in gdf._data.columns + not cudf.api.types._is_non_decimal_numeric_dtype(dtype) + for _, dtype in gdf._dtypes ): raise TypeError("non-numeric data not yet supported") dtype = cudf.utils.dtypes.find_common_type( - [col.dtype for col in gdf._data.columns] + [dtype for _, dtype in gdf._dtypes] ) gdf = gdf.astype(dtype) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index fd246c6215f..c54293badbe 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -396,8 +396,8 @@ def to_orc( ): """{docstring}""" - for col in df._data.columns: - if isinstance(col, cudf.core.column.CategoricalColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "Writing to ORC format is not yet supported with " "Categorical columns." diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 62be7378e9e..ce99f98b559 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -64,7 +64,7 @@ def _write_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -149,7 +149,7 @@ def write_to_dataset( return_metadata=False, statistics="ROWGROUP", int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -205,7 +205,7 @@ def write_to_dataset( If ``False``, timestamps will not be altered. row_group_size_bytes: integer or None, default None Maximum size of each stripe of the output. - If None, 134217728 (128MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -980,7 +980,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index 3a82829eb7a..e0d3d9101a9 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -10,6 +10,7 @@ """ import argparse +import code import runpy import sys import tempfile @@ -21,6 +22,8 @@ @contextmanager def profile(function_profile, line_profile, fn): + if fn is None and (line_profile or function_profile): + raise RuntimeError("Enabling the profiler requires a script name.") if line_profile: with open(fn) as f: lines = f.readlines() @@ -54,6 +57,11 @@ def main(): dest="module", nargs=1, ) + parser.add_argument( + "-c", + dest="cmd", + nargs=1, + ) parser.add_argument( "--profile", action="store_true", @@ -72,9 +80,18 @@ def main(): args = parser.parse_args() + if args.cmd: + f = tempfile.NamedTemporaryFile(mode="w+b", suffix=".py") + f.write(args.cmd[0].encode()) + f.seek(0) + args.args.insert(0, f.name) + install() - with profile(args.profile, args.line_profile, args.args[0]) as fn: - args.args[0] = fn + + script_name = args.args[0] if len(args.args) > 0 else None + with profile(args.profile, args.line_profile, script_name) as fn: + if script_name is not None: + args.args[0] = fn if args.module: (module,) = args.module # run the module passing the remaining arguments @@ -85,6 +102,21 @@ def main(): # Remove ourself from argv and continue sys.argv[:] = args.args runpy.run_path(args.args[0], run_name="__main__") + else: + if sys.stdin.isatty(): + banner = f"Python {sys.version} on {sys.platform}" + site_import = not sys.flags.no_site + if site_import: + cprt = 'Type "help", "copyright", "credits" or "license" for more information.' + banner += "\n" + cprt + else: + # Don't show prompts or banners if stdin is not a TTY + sys.ps1 = "" + sys.ps2 = "" + banner = "" + + # Launch an interactive interpreter + code.interact(banner=banner, exitmsg="") if __name__ == "__main__": diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index afa1ce5f86c..bf2ee6ae624 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -881,6 +881,20 @@ def _assert_fast_slow_eq(left, right): assert_eq(left, right) +def _fast_function_call(): + """ + Placeholder fast function for pytest profiling purposes. + """ + return None + + +def _slow_function_call(): + """ + Placeholder slow function for pytest profiling purposes. + """ + return None + + def _fast_slow_function_call( func: Callable, /, @@ -910,6 +924,7 @@ def _fast_slow_function_call( # try slow path raise Exception() fast = True + _fast_function_call() if _env_get_bool("CUDF_PANDAS_DEBUGGING", False): try: with nvtx.annotate( @@ -952,6 +967,7 @@ def _fast_slow_function_call( from ._logger import log_fallback log_fallback(slow_args, slow_kwargs, err) + _slow_function_call() with disable_module_accelerator(): result = func(*slow_args, **slow_kwargs) return _maybe_wrap_result(result, func, *args, **kwargs), fast diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 505a40b0bfa..d12d2697729 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -1,10 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 import contextlib +import json import os import sys +import traceback +from collections import defaultdict from functools import wraps import pytest @@ -36,4 +39,58 @@ def patch_testing_functions(): pytest.raises = replace_kwargs({"match": None})(pytest.raises) +# Dictionary to store function call counts +function_call_counts = {} # type: ignore + +# The specific functions to track +FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"} + + +def find_pytest_file(frame): + stack = traceback.extract_stack() + absolute_paths = [frame.filename for frame in stack] + for file in absolute_paths: + if "pandas-testing/pandas-tests/tests" in file and file.rsplit("/", 1)[ + -1 + ].startswith("test_"): + return str(file).rsplit("pandas-tests/", 1)[-1] + return None + + +def trace_calls(frame, event, arg): + if event != "call": + return + code = frame.f_code + func_name = code.co_name + + if func_name in FUNCTION_NAME: + filename = find_pytest_file(frame) + if filename is None: + return + if filename not in function_call_counts: + function_call_counts[filename] = defaultdict(int) + function_call_counts[filename][func_name] += 1 + + +def pytest_sessionstart(session): + # Set the profile function to trace calls + sys.setprofile(trace_calls) + + +def pytest_sessionfinish(session, exitstatus): + # Remove the profile function + sys.setprofile(None) + + +@pytest.hookimpl(trylast=True) +def pytest_unconfigure(config): + if hasattr(config, "workerinput"): + # Running in xdist worker, write the counts before exiting + worker_id = config.workerinput["workerid"] + output_file = f"function_call_counts_worker_{worker_id}.json" + with open(output_file, "w") as f: + json.dump(function_call_counts, f, indent=4) + print(f"Function call counts have been written to {output_file}") + + sys.path.append(os.path.dirname(__file__)) diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 9c65b74d081..9b9ce026571 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -64,8 +64,6 @@ markers = [ "skip_ubsan: Tests known to fail UBSAN check", ] EOF - # append the contents of patch-confest.py to conftest.py - cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py # Substitute `pandas.tests` with a relative import. # This will depend on the location of the test module relative to @@ -137,7 +135,7 @@ and not test_eof_states \ and not test_array_tz" # TODO: Remove "not db" once a postgres & mysql container is set up on the CI -PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ +PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \ -v -m "not single_cpu and not db" \ -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \ --import-mode=importlib \ @@ -146,5 +144,4 @@ PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ mv *.json .. cd .. - rm -rf pandas-testing/pandas-tests/ diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index ffd2abb960d..4ea0b3b4413 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -12,7 +12,9 @@ """ import argparse +import glob import json +import os from rich.console import Console from rich.table import Table @@ -57,6 +59,44 @@ def get_per_module_results(log_file_name): per_module_results[module_name].setdefault(outcome, 0) per_module_results[module_name]["total"] += 1 per_module_results[module_name][outcome] += 1 + + directory = os.path.dirname(log_file_name) + pattern = os.path.join(directory, "function_call_counts_worker_*.json") + matching_files = glob.glob(pattern) + function_call_counts = {} + + for file in matching_files: + with open(file) as f: + function_call_count = json.load(f) + if not function_call_counts: + function_call_counts.update(function_call_count) + else: + for key, value in function_call_count.items(): + if key not in function_call_counts: + function_call_counts[key] = value + else: + if "_slow_function_call" not in function_call_counts[key]: + function_call_counts[key]["_slow_function_call"] = 0 + if "_fast_function_call" not in function_call_counts[key]: + function_call_counts[key]["_fast_function_call"] = 0 + function_call_counts[key]["_slow_function_call"] += ( + value.get("_slow_function_call", 0) + ) + function_call_counts[key]["_fast_function_call"] += ( + value.get("_fast_function_call", 0) + ) + + for key, value in per_module_results.items(): + if key in function_call_counts: + per_module_results[key]["_slow_function_call"] = ( + function_call_counts[key].get("_slow_function_call", 0) + ) + per_module_results[key]["_fast_function_call"] = ( + function_call_counts[key].get("_fast_function_call", 0) + ) + else: + per_module_results[key]["_slow_function_call"] = 0 + per_module_results[key]["_fast_function_call"] = 0 return per_module_results diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 31ad24a4664..668e7a77454 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -676,7 +676,7 @@ def assert_frame_equal( if check_like: left, right = left.reindex(index=right.index), right - right = right[list(left._data.names)] + right = right[list(left._column_names)] # index comparison assert_index_equal( diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini index 2136bca0e28..8a594794fac 100644 --- a/python/cudf/cudf/tests/pytest.ini +++ b/python/cudf/cudf/tests/pytest.ini @@ -14,3 +14,4 @@ filterwarnings = ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning # PerformanceWarning from cupy warming up the JIT cache ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning +addopts = --tb=native diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index b1e095e8853..c41be3e4428 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -813,8 +813,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): mi1 = gdf.groupby(["Date", "Symbol"]).mean().index mi2 = mi1.copy(deep=deep) - lchildren = [col.children for _, col in mi1._data.items()] - rchildren = [col.children for _, col in mi2._data.items()] + lchildren = [col.children for col in mi1._columns] + rchildren = [col.children for col in mi2._columns] # Flatten lchildren = reduce(operator.add, lchildren) @@ -849,12 +849,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._data identity - lptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi1._data.items() - ] - rptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi2._data.items() - ] + lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] + rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) cudf.set_option("copy_on_write", original_cow_setting) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index cc17dc46e0a..e054143b438 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -161,3 +161,20 @@ def test_get_dummies_cats_deprecated(): df = cudf.DataFrame(range(3)) with pytest.warns(FutureWarning): cudf.get_dummies(df, cats={0: [0, 1, 2]}) + + +def test_get_dummies_drop_first_series(): + result = cudf.get_dummies(cudf.Series(list("abcaa")), drop_first=True) + expected = pd.get_dummies(pd.Series(list("abcaa")), drop_first=True) + assert_eq(result, expected) + + +def test_get_dummies_drop_first_dataframe(): + result = cudf.get_dummies( + cudf.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), + drop_first=True, + ) + expected = pd.get_dummies( + pd.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), drop_first=True + ) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 8b59a7eef08..7f1b0b1cd46 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3822,8 +3822,8 @@ def test_parquet_reader_with_mismatched_tables(store_schema): df1 = cudf.DataFrame( { "i32": cudf.Series([None, None, None], dtype="int32"), - "i64": cudf.Series([1234, None, 123], dtype="int64"), - "list": list([[1, 2], [None, 4], [5, 6]]), + "i64": cudf.Series([1234, 467, 123], dtype="int64"), + "list": list([[1, 2], None, [None, 6]]), "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), "str": ["vfd", None, "ghu"], "d_list": list( @@ -3838,14 +3838,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema): df2 = cudf.DataFrame( { - "str": ["abc", "def", None], + "str": ["abc", "def", "ghi"], "i64": cudf.Series([None, 65, 98], dtype="int64"), "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"), - "list": list([[7, 8], [9, 10], [None, 12]]), + "list": list([[7, 8], [9, 10], [11, 12]]), "d_list": list( [ [pd.Timedelta(minutes=4), None], - [None, None], + None, [pd.Timedelta(minutes=6), None], ] ), @@ -3900,38 +3900,27 @@ def test_parquet_reader_with_mismatched_structs(): { "a": 1, "b": { - "inner_a": 10, - "inner_b": {"inner_inner_b": 1, "inner_inner_a": 2}, + "a_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 2}, }, "c": 2, }, { "a": 3, - "b": {"inner_a": 30, "inner_b": {"inner_inner_a": 210}}, + "b": {"b_a": 30, "b_b": {"b_b_a": 210}}, "c": 4, }, - {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 5, "b": {"b_a": 50, "b_b": None}, "c": 6}, {"a": 7, "b": None, "c": 8}, - {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, - None, - { - "a": None, - "b": { - "inner_a": None, - "inner_b": {"inner_inner_b": None, "inner_inner_a": 10}, - }, - "c": 10, - }, + {"a": 5, "b": {"b_a": None, "b_b": None}, "c": None}, ] data2 = [ - {"a": 1, "b": {"inner_b": {"inner_inner_a": None}}}, - {"a": 3, "b": {"inner_b": {"inner_inner_a": 1}}}, - {"a": 5, "b": {"inner_b": None}}, - {"a": 7, "b": {"inner_b": {"inner_inner_b": 1, "inner_inner_a": 0}}}, - {"a": None, "b": {"inner_b": None}}, + {"a": 1, "b": {"b_b": {"b_b_a": None}}}, + {"a": 5, "b": {"b_b": None}}, + {"a": 7, "b": {"b_b": {"b_b_b": 1, "b_b_a": 0}}}, + {"a": None, "b": {"b_b": None}}, None, - {"a": None, "b": {"inner_b": {"inner_inner_a": 1}}}, ] # cuDF tables from struct data @@ -3949,20 +3938,20 @@ def test_parquet_reader_with_mismatched_structs(): # Read the struct.b.inner_b.inner_inner_a column from parquet got = cudf.read_parquet( [buf1, buf2], - columns=["struct.b.inner_b.inner_inner_a"], + columns=["struct.b.b_b.b_b_a"], allow_mismatched_pq_schemas=True, ) got = ( cudf.Series(got["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a") + .struct.field("b_b") + .struct.field("b_b_a") ) # Read with chunked reader got_chunked = read_parquet_chunked( [buf1, buf2], - columns=["struct.b.inner_b.inner_inner_a"], + columns=["struct.b.b_b.b_b_a"], chunk_read_limit=240, pass_read_limit=240, allow_mismatched_pq_schemas=True, @@ -3970,8 +3959,8 @@ def test_parquet_reader_with_mismatched_structs(): got_chunked = ( cudf.Series(got_chunked["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a") + .struct.field("b_b") + .struct.field("b_b_a") ) # Construct the expected series @@ -3979,12 +3968,12 @@ def test_parquet_reader_with_mismatched_structs(): [ cudf.Series(df1["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a"), + .struct.field("b_b") + .struct.field("b_b_a"), cudf.Series(df2["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a"), + .struct.field("b_b") + .struct.field("b_b_a"), ] ).reset_index(drop=True) @@ -4023,12 +4012,12 @@ def test_parquet_reader_with_mismatched_schemas_error(): ) data1 = [ - {"a": 1, "b": {"inner_a": 1, "inner_b": 6}}, - {"a": 3, "b": {"inner_a": None, "inner_b": 2}}, + {"a": 1, "b": {"b_a": 1, "b_b": 6}}, + {"a": 3, "b": {"b_a": None, "b_b": 2}}, ] data2 = [ - {"b": {"inner_a": 1}, "c": "str"}, - {"b": {"inner_a": None}, "c": None}, + {"b": {"b_a": 1}, "c": "str"}, + {"b": {"b_a": None}, "c": None}, ] # cuDF tables from struct data @@ -4059,6 +4048,191 @@ def test_parquet_reader_with_mismatched_schemas_error(): ): cudf.read_parquet( [buf1, buf2], - columns=["struct.b.inner_b"], + columns=["struct.b.b_b"], allow_mismatched_pq_schemas=True, ) + + +def test_parquet_reader_mismatched_nullability(): + # Ensure that we can faithfully read the tables with mismatched nullabilities + df1 = cudf.DataFrame( + { + "timedelta": cudf.Series([12, 54, 1231], dtype="timedelta64[ms]"), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + None, + [pd.Timedelta(minutes=8), None], + ], + None, + ], + None, + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, None, 4123], dtype="int64"), + "int32": cudf.Series([1234, 123, 4123], dtype="int32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "string": cudf.Series(["kitten", "puppy", "cub"]), + } + ) + + df2 = cudf.DataFrame( + { + "timedelta": cudf.Series( + [None, None, None], dtype="timedelta64[ms]" + ), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=1)], + ], + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, 123, 4123], dtype="int64"), + "int32": cudf.Series([1234, None, 4123], dtype="int32"), + "list": list([[1, 2], None, [1, 2]]), + "datetime": cudf.Series( + [1234, None, 4123], dtype="datetime64[ms]" + ), + "string": cudf.Series(["kitten", None, "cub"]), + } + ) + + # Write tables to parquet with arrow schema for compatibility for duration column(s) + fname1 = BytesIO() + df1.to_parquet(fname1, store_schema=True) + fname2 = BytesIO() + df2.to_parquet(fname2, store_schema=True) + + # Read tables back with cudf and arrow in either order and compare + assert_eq( + cudf.read_parquet([fname1, fname2]), + cudf.concat([df1, df2]).reset_index(drop=True), + ) + assert_eq( + cudf.read_parquet([fname2, fname1]), + cudf.concat([df2, df1]).reset_index(drop=True), + ) + + +def test_parquet_reader_mismatched_nullability_structs(tmpdir): + data1 = [ + { + "a": "a", + "b": { + "b_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 12}, + }, + "c": [1, 2], + }, + { + "a": "b", + "b": { + "b_a": 30, + "b_b": {"b_b_b": 2, "b_b_a": 2}, + }, + "c": [3, 4], + }, + { + "a": "c", + "b": { + "b_a": 50, + "b_b": {"b_b_b": 4, "b_b_a": 5}, + }, + "c": [5, 6], + }, + { + "a": "d", + "b": { + "b_a": 135, + "b_b": {"b_b_b": 12, "b_b_a": 32}, + }, + "c": [7, 8], + }, + { + "a": "e", + "b": { + "b_a": 1, + "b_b": {"b_b_b": 1, "b_b_a": 5}, + }, + "c": [9, 10], + }, + { + "a": "f", + "b": { + "b_a": 32, + "b_b": {"b_b_b": 1, "b_b_a": 6}, + }, + "c": [11, 12], + }, + ] + + data2 = [ + { + "a": "g", + "b": { + "b_a": 10, + "b_b": {"b_b_b": None, "b_b_a": 2}, + }, + "c": None, + }, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]}, + {"a": "j", "b": None, "c": [8, 10]}, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": None}, + None, + { + "a": None, + "b": {"b_a": None, "b_b": {"b_b_b": 1}}, + "c": [18, 19], + }, + {"a": None, "b": None, "c": None}, + ] + + pa_table1 = pa.Table.from_pydict({"struct": data1}) + df1 = cudf.DataFrame.from_arrow(pa_table1) + + pa_table2 = pa.Table.from_pydict({"struct": data2}) + df2 = cudf.DataFrame.from_arrow(pa_table2) + + # Write tables to parquet + buf1 = BytesIO() + df1.to_parquet(buf1) + buf2 = BytesIO() + df2.to_parquet(buf2) + + # Read tables back with cudf and compare with expected. + assert_eq( + cudf.read_parquet([buf1, buf2]), + cudf.concat([df1, df2]).reset_index(drop=True), + ) + assert_eq( + cudf.read_parquet([buf2, buf1]), + cudf.concat([df2, df1]).reset_index(drop=True), + ) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 52179f55da3..997ca357986 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -946,6 +946,66 @@ def test_minhash(): strings.str.minhash64(seeds=seeds) +def test_word_minhash(): + ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + + expected = cudf.Series( + [ + cudf.Series([21141582], dtype=np.uint32), + cudf.Series([962346254], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), + cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash(seeds=seeds) + assert_eq(expected, actual) + + expected = cudf.Series( + [ + cudf.Series([2603139454418834912], dtype=np.uint64), + cudf.Series([5240044617220523711], dtype=np.uint64), + ] + ) + actual = ls.str.word_minhash64() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [ + 2603139454418834912, + 8644371945174847701, + 5541030711534384340, + ], + dtype=np.uint64, + ), + cudf.Series( + [5240044617220523711, 5847101123925041457, 153762819128779913], + dtype=np.uint64, + ), + ] + ) + actual = ls.str.word_minhash64(seeds=seeds) + assert_eq(expected, actual) + + # test wrong seed types + with pytest.raises(ValueError): + ls.str.word_minhash(seeds="a") + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.int32) + ls.str.word_minhash(seeds=seeds) + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + ls.str.word_minhash64(seeds=seeds) + + def test_jaccard_index(): str1 = cudf.Series(["the brown dog", "jumped about"]) str2 = cudf.Series(["the black cat", "jumped around"]) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1627107b57d..1180da321e6 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -27,7 +27,7 @@ fsspec_parquet = None _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 -_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 +_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max _docstring_remote_sources = """ - cuDF supports local and remote data stores. See configuration details for @@ -275,10 +275,9 @@ timestamp[us] to the int96 format, which is the number of Julian days and the number of nanoseconds since midnight of 1970-01-01. If ``False``, timestamps will not be altered. -row_group_size_bytes: integer, default {row_group_size_bytes_val} +row_group_size_bytes: integer, default None Maximum size of each stripe of the output. - If None, {row_group_size_bytes_val} - ({row_group_size_bytes_val_in_mb} MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -346,10 +345,7 @@ See Also -------- cudf.read_parquet -""".format( - row_group_size_bytes_val=_ROW_GROUP_SIZE_BYTES_DEFAULT, - row_group_size_bytes_val_in_mb=_ROW_GROUP_SIZE_BYTES_DEFAULT / 1024 / 1024, -) +""" doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet) _docstring_merge_parquet_filemetadata = """ diff --git a/python/cudf/cudf_pandas_tests/test_main.py b/python/cudf/cudf_pandas_tests/test_main.py new file mode 100644 index 00000000000..326224c8fc0 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_main.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import subprocess +import tempfile +import textwrap + + +def _run_python(*, cudf_pandas, command): + executable = "python " + if cudf_pandas: + executable += "-m cudf.pandas " + return subprocess.run( + executable + command, + shell=True, + capture_output=True, + check=True, + text=True, + ) + + +def test_run_cudf_pandas_with_script(): + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=True) as f: + code = textwrap.dedent( + """ + import pandas as pd + df = pd.DataFrame({'a': [1, 2, 3]}) + print(df['a'].sum()) + """ + ) + f.write(code) + f.flush() + + res = _run_python(cudf_pandas=True, command=f.name) + expect = _run_python(cudf_pandas=False, command=f.name) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args(): + input_args_and_code = """-c 'import pandas as pd; df = pd.DataFrame({"a": [1, 2, 3]}); print(df["a"].sum())'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args_check_cudf(): + """Verify that cudf is active with -m cudf.pandas.""" + input_args_and_code = """-c 'import pandas as pd; print(pd)'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert "cudf" in res.stdout + assert "cudf" not in expect.stdout + + +def test_cudf_pandas_script_repl(): + def start_repl_process(cmd): + return subprocess.Popen( + cmd.split(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True, + ) + + def get_repl_output(process, commands): + for command in commands: + process.stdin.write(command) + process.stdin.flush() + return process.communicate() + + p1 = start_repl_process("python -m cudf.pandas") + p2 = start_repl_process("python") + commands = [ + "import pandas as pd\n", + "print(pd.Series(range(2)).sum())\n", + "print(pd.Series(range(5)).sum())\n", + "import sys\n", + "print(pd.Series(list('abcd')), out=sys.stderr)\n", + ] + + res = get_repl_output(p1, commands) + expect = get_repl_output(p2, commands) + + # Check stdout + assert res[0] != "" + assert res[0] == expect[0] + + # Check stderr + assert res[1] != "" + assert res[1] == expect[1] + + p1.kill() + p2.kill() diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini index 817d98e6ba2..98459035298 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini @@ -1,3 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + [pytest] xfail_strict=true markers= @@ -5,3 +7,4 @@ markers= xfail_gold: this test is expected to fail in the gold pass xfail_cudf_pandas: this test is expected to fail in the cudf_pandas pass xfail_compare: this test is expected to fail in the comparison pass +addopts = --tb=native diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/cudf_kafka/cudf_kafka/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/cudf_polars/tests/pytest.ini b/python/cudf_polars/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_polars/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/custreamz/README.md b/python/custreamz/README.md index 1509dac9e61..8da17ef09dc 100644 --- a/python/custreamz/README.md +++ b/python/custreamz/README.md @@ -54,7 +54,7 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids ### Conda -cuStreamz is installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` or `rapidsai-nightly` channel: +cuStraamz can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel: Release: ```bash diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/custreamz/custreamz/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md index 6edb9f87d48..4655d2165f0 100644 --- a/python/dask_cudf/README.md +++ b/python/dask_cudf/README.md @@ -1,135 +1,63 @@ #
 Dask cuDF - A GPU Backend for Dask DataFrame
-Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html). When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame. - -## Using Dask cuDF - -### The Dask DataFrame API (Recommended) - -Simply set the `"dataframe.backend"` [configuration](https://docs.dask.org/en/stable/configuration.html) to `"cudf"` in Dask, and the public Dask DataFrame API will leverage `cudf` automatically: - -```python -import dask -dask.config.set({"dataframe.backend": "cudf"}) - -import dask.dataframe as dd -# This gives us a cuDF-backed dataframe -df = dd.read_parquet("data.parquet", ...) -``` +Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html) that provides a Pandas-like API for parallel and larger-than-memory DataFrame computing on GPUs. When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame. > [!IMPORTANT] -> The `"dataframe.backend"` configuration will only be used for collection creation when the following APIs are used: `read_parquet`, `read_json`, `read_csv`, `read_orc`, `read_hdf`, and `from_dict`. For example, if `from_map`, `from_pandas`, `from_delayed`, or `from_array` are used, the backend of the new collection will depend on the input to the function: - -```python -import pandas as pd -import cudf - -# This gives us a Pandas-backed dataframe -dd.from_pandas(pd.DataFrame({"a": range(10)})) - -# This gives us a cuDF-backed dataframe -dd.from_pandas(cudf.DataFrame({"a": range(10)})) -``` - -A cuDF-backed DataFrame collection can be moved to the `"pandas"` backend: - -```python -df = df.to_backend("pandas") -``` - -Similarly, a Pandas-backed DataFrame collection can be moved to the `"cudf"` backend: - -```python -df = df.to_backend("cudf") -``` - -### The Explicit Dask cuDF API - -In addition to providing the `"cudf"` backend for Dask DataFrame, Dask cuDF also provides an explicit `dask_cudf` API: - -```python -import dask_cudf - -# This always gives us a cuDF-backed dataframe -df = dask_cudf.read_parquet("data.parquet", ...) -``` - -> [!NOTE] -> This API is used implicitly by the Dask DataFrame API when the `"cudf"` backend is enabled. Therefore, using it directly will not provide any performance benefit over the CPU/GPU-portable `dask.dataframe` API. Also, using some parts of the explicit API are incompatible with automatic query planning (see the next section). +> Dask cuDF does not provide support for multi-GPU or multi-node execution on its own. You must also deploy a distributed cluster (ideally with [Dask-CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs efficiently. -See the [Dask cuDF's API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) for further information. - -## Query Planning - -Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the `"dataframe.query-planning"` configuration is set to `True` (the default) when `dask.dataframe` is first imported, [Dask Expressions](https://github.com/dask/dask-expr) will be used under the hood. - -For example, the following user code will automatically benefit from predicate pushdown when the result is computed. - -```python -df = dd.read_parquet("/my/parquet/dataset/") -result = df.sort_values('B')['A'] -``` - -Unoptimized expression graph (`df.pprint()`): -``` -Projection: columns='A' - SortValues: by=['B'] shuffle_method='tasks' options={} - ReadParquetFSSpec: path='/my/parquet/dataset/' ... -``` +## Using Dask cuDF -Simplified expression graph (`df.simplify().pprint()`): -``` -Projection: columns='A' - SortValues: by=['B'] shuffle_method='tasks' options={} - ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ... -``` +Please visit [the official documentation page](https://docs.rapids.ai/api/dask-cudf/stable/) for detailed information about using Dask cuDF. -> [!NOTE] -> Dask will automatically simplify the expression graph (within `optimize`) when the result is converted to a task graph (via `compute` or `persist`). The user does not need to call `simplify` themself. +## Installation +See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages. -## Using Multiple GPUs and Multiple Nodes +## Resources -Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try to partition your data into small-enough tasks to fit comfortably in the memory of a single GPU. This means the necessary compute tasks needed to compute a query can often be streamed to a single GPU process for out-of-core computing. This also means that the compute tasks can be executed in parallel over a multi-GPU cluster. +- [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/) +- [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/) +- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) +- [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) +- [Deployment](https://docs.rapids.ai/deployment/stable/) +- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. -> [!IMPORTANT] -> Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must deploy a distributed cluster (ideally with [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs. +### Quick-start example -In order to execute your Dask workflow on multiple GPUs, you will typically need to use [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) to deploy distributed Dask cluster, and [Distributed](https://distributed.dask.org/en/stable/client.html) to define a `client` object. For example: +A very common Dask cuDF use case is single-node multi-GPU data processing. These workflows typically use the following pattern: ```python - +import dask +import dask.dataframe as dd from dask_cuda import LocalCUDACluster from distributed import Client -client = Client( +if __name__ == "__main__": + + # Define a GPU-aware cluster to leverage multiple GPUs + client = Client( LocalCUDACluster( - CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) - rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations - enable_cudf_spill=True, # Improve device memory stability - local_directory="/fast/scratch/", # Use fast local storage for spilling + CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) + rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations + enable_cudf_spill=True, # Improve device memory stability + local_directory="/fast/scratch/", # Use fast local storage for spilling ) -) + ) -df = dd.read_parquet("/my/parquet/dataset/") -agg = df.groupby('B').sum() -agg.compute() # This will use the cluster defined above -``` + # Set the default dataframe backend to "cudf" + dask.config.set({"dataframe.backend": "cudf"}) -> [!NOTE] -> This example uses `compute` to materialize a concrete `cudf.DataFrame` object in local memory. Never call `compute` on a large collection that cannot fit comfortably in the memory of a single GPU! See Dask's [documentation on managing computation](https://distributed.dask.org/en/stable/manage-computation.html) for more details. + # Create your DataFrame collection from on-disk + # or in-memory data + df = dd.read_parquet("/my/parquet/dataset/") -Please see the [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) documentation for more information about deploying GPU-aware clusters (including [best practices](https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/)). + # Use cudf-like syntax to transform and/or query your data + query = df.groupby('item')['price'].mean() -## Install - -See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages. + # Compute, persist, or write out the result + query.head() +``` -## Resources +If you do not have multiple GPUs available, using `LocalCUDACluster` is optional. However, it is still a good idea to [enable cuDF spilling](https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory). -- [Dask cuDF API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) -- [cuDF API documentation](https://docs.rapids.ai/api/cudf/stable/) -- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) -- [Dask CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) -- [Deployment](https://docs.rapids.ai/deployment/stable/) -- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. +If you wish to scale across multiple nodes, you will need to use a different mechanism to deploy your Dask-CUDA workers. Please see [the RAPIDS deployment documentation](https://docs.rapids.ai/deployment/stable/) for more instructions. diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index f60e4ff81ef..97e1dffc65b 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -49,8 +49,24 @@ def to_dask_dataframe(self, **kwargs): return self.to_backend("pandas", **kwargs) + def _prepare_cov_corr(self, min_periods, numeric_only): + # Upstream version of this method sets min_periods + # to 2 by default (which is not supported by cudf) + # TODO: Remove when cudf supports both min_periods + # and numeric_only + # See: https://github.com/rapidsai/cudf/issues/12626 + # See: https://github.com/rapidsai/cudf/issues/9009 + self._meta.cov(min_periods=min_periods) + + frame = self + if numeric_only: + numerics = self._meta._get_numeric_data() + if len(numerics.columns) != len(self.columns): + frame = frame[list(numerics.columns)] + return frame, min_periods + # var can be removed if cudf#15179 is addressed. - # See: https://github.com/rapidsai/cudf/issues/15179 + # See: https://github.com/rapidsai/cudf/issues/14935 def var( self, axis=0, diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index e793d4381d1..a781b8242fe 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -23,7 +23,6 @@ from cudf.io import write_to_dataset from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT class CudfEngine(ArrowDatasetEngine): @@ -341,9 +340,7 @@ def write_partition( return_metadata=return_metadata, statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), - row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT - ), + row_group_size_bytes=kwargs.get("row_group_size_bytes", None), row_group_size_rows=kwargs.get("row_group_size_rows", None), max_page_size_bytes=kwargs.get("max_page_size_bytes", None), max_page_size_rows=kwargs.get("max_page_size_rows", None), @@ -365,7 +362,7 @@ def write_partition( statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT + "row_group_size_bytes", None ), row_group_size_rows=kwargs.get( "row_group_size_rows", None diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/dask_cudf/dask_cudf/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 905d8c08135..7aa0f6320f2 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1007,3 +1007,20 @@ def test_to_backend_simplify(): df2 = df.to_backend("cudf")[["y"]].simplify() df3 = df[["y"]].to_backend("cudf").to_backend("cudf").simplify() assert df2._name == df3._name + + +@pytest.mark.parametrize("numeric_only", [True, False]) +@pytest.mark.parametrize("op", ["corr", "cov"]) +def test_cov_corr(op, numeric_only): + df = cudf.DataFrame.from_dict( + { + "x": np.random.randint(0, 5, size=10), + "y": np.random.normal(size=10), + } + ) + ddf = dd.from_pandas(df, npartitions=2) + res = getattr(ddf, op)(numeric_only=numeric_only) + # Use to_pandas until cudf supports numeric_only + # (See: https://github.com/rapidsai/cudf/issues/12626) + expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only) + dd.assert_eq(res, expect) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 0c352a5068b..f2dd22f43aa 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -19,3 +19,13 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &seeds, const size_type width, ) except + + + cdef unique_ptr[column] word_minhash( + const column_view &input, + const column_view &seeds + ) except + + + cdef unique_ptr[column] word_minhash64( + const column_view &input, + const column_view &seeds + ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd index c2fb5f0dce4..eac0f748257 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd @@ -24,4 +24,9 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] like( column_view source_strings, string_scalar pattern, - string_scalar escape) except + + string_scalar escape_character) except + + + cdef unique_ptr[column] like( + column_view source_strings, + column_view patterns, + string_scalar escape_character) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd index 12cd628fc1f..b7166167cfd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd @@ -10,5 +10,9 @@ from pylibcudf.libcudf.table.table cimport table cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[table] extract( - column_view source_strings, - regex_program) except + + column_view input, + regex_program prog) except + + + cdef unique_ptr[column] extract_all_record( + column_view input, + regex_program prog) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd index 410ff58f299..59262820411 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd @@ -10,9 +10,9 @@ cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] repeat_strings( - column_view strings, - size_type repeat) except + + column_view input, + size_type repeat_times) except + cdef unique_ptr[column] repeat_strings( - column_view strings, - column_view repeats) except + + column_view input, + column_view repeat_times) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index b499a127541..d3065cf8667 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx replace.pyx slice.pyx +set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx + regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx slice.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d1f632d6d8e..6848c8e6e86 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,6 +5,7 @@ from . cimport ( case, char_types, contains, + extract, find, regex_flags, regex_program, diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index ef102aff2af..bba86e818cc 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,9 +5,11 @@ case, char_types, contains, + extract, find, regex_flags, regex_program, + repeat, replace, slice, ) diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd index 2cd4891a0ea..6146a1119d6 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/strings/contains.pxd @@ -1,7 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_program cimport RegexProgram +ctypedef fused ColumnOrScalar: + Column + Scalar cpdef Column contains_re(Column input, RegexProgram prog) + +cpdef Column count_re(Column input, RegexProgram prog) + +cpdef Column matches_re(Column input, RegexProgram prog) + +cpdef Column like( + Column input, + ColumnOrScalar pattern, + Scalar escape_character = * +) diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 1a2446f6e2c..82bd1fbea32 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -1,8 +1,14 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from cython.operator import dereference + from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) from pylibcudf.libcudf.strings cimport contains as cpp_contains from pylibcudf.strings.regex_program cimport RegexProgram @@ -32,9 +38,131 @@ cpdef Column contains_re( cdef unique_ptr[column] result with nogil: - result = cpp_contains.contains_re( + result = move(cpp_contains.contains_re( + input.view(), + prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column count_re( + Column input, + RegexProgram prog +): + """Returns the number of times the given regex_program's pattern + matches in each string. + + For details, see :cpp:func:`cudf::strings::count_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of match counts for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = move(cpp_contains.count_re( input.view(), prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column matches_re( + Column input, + RegexProgram prog +): + """Returns a boolean column identifying rows which + matching the given regex_program object but only at + the beginning the string. + + For details, see :cpp:func:`cudf::strings::matches_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = move(cpp_contains.matches_re( + input.view(), + prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character=None): + """ + Returns a boolean column identifying rows which + match the given like pattern. + + For details, see :cpp:func:`cudf::strings::like`. + + Parameters + ---------- + input : Column + The input strings + pattern : Column or Scalar + Like patterns to match within each string + escape_character : Scalar + Optional character specifies the escape prefix. + Default is no escape character. + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + cdef unique_ptr[column] result + + if escape_character is None: + escape_character = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) ) + cdef const string_scalar* c_escape_character = ( + escape_character.c_obj.get() + ) + cdef const string_scalar* c_pattern + + if ColumnOrScalar is Column: + with nogil: + result = move(cpp_contains.like( + input.view(), + pattern.view(), + dereference(c_escape_character) + )) + elif ColumnOrScalar is Scalar: + c_pattern = (pattern.c_obj.get()) + with nogil: + result = move(cpp_contains.like( + input.view(), + dereference(c_pattern), + dereference(c_escape_character) + )) + else: + raise ValueError("pattern must be a Column or a Scalar") + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/strings/extract.pxd new file mode 100644 index 00000000000..3871f5a0e4e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/extract.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + + +cpdef Table extract(Column input, RegexProgram prog) + +cpdef Column extract_all_record(Column input, RegexProgram prog) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx new file mode 100644 index 00000000000..dcb11ca10ce --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport extract as cpp_extract +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + + +cpdef Table extract(Column input, RegexProgram prog): + """ + Returns a table of strings columns where each column + corresponds to the matching group specified in the given + egex_program object. + + For details, see :cpp:func:`cudf::strings::extract`. + + Parameters + ---------- + input : Column + Strings instance for this operation + prog : RegexProgram + Regex program instance + + Returns + ------- + Table + Columns of strings extracted from the input column. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_extract.extract( + input.view(), + prog.c_obj.get()[0] + ) + ) + + return Table.from_libcudf(move(c_result)) + + +cpdef Column extract_all_record(Column input, RegexProgram prog): + """ + Returns a lists column of strings where each string column + row corresponds to the matching group specified in the given + regex_program object. + + For details, see :cpp:func:`cudf::strings::extract_all_record`. + + Parameters + ---------- + input : Column + Strings instance for this operation + prog : RegexProgram + Regex program instance + + Returns + ------- + Column + Lists column containing strings extracted from the input column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_extract.extract_all_record( + input.view(), + prog.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/strings/repeat.pxd new file mode 100644 index 00000000000..bc70926b6fa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + +ctypedef fused ColumnorSizeType: + Column + size_type + +cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx new file mode 100644 index 00000000000..5f627218f6e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport repeat as cpp_repeat +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): + """ + Repeat each string in the given strings column by the numbers + of times given in another numeric column. + + For details, see :cpp:func:`cudf::strings::repeat`. + + Parameters + ---------- + input : Column + The column containing strings to repeat. + repeat_times : Column or int + Number(s) of times that the corresponding input strings + for each row are repeated. + + Returns + ------- + Column + New column containing the repeated strings. + """ + cdef unique_ptr[column] c_result + + if ColumnorSizeType is Column: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times.view() + ) + ) + elif ColumnorSizeType is size_type: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times + ) + ) + else: + raise ValueError("repeat_times must be size_type or integer") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini index 1761c0f011c..f572f85ca49 100644 --- a/python/pylibcudf/pylibcudf/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/tests/pytest.ini @@ -6,3 +6,4 @@ filterwarnings = error ignore:::.*xdist.* ignore:::.*pytest.* +addopts = --tb=native diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py index 4f88e09183f..4e4dd7cbb00 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py @@ -48,3 +48,40 @@ def test_contains_re(target_col, pa_target_scalar, plc_target_pat): pa_target_col, pa_target_scalar.as_py() ) assert_column_eq(got, expected) + + +def test_count_re(): + pattern = "[1-9][a-z]" + arr = pa.array(["A1a2A3a4", "A1A2A3", None]) + result = plc.strings.contains.count_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + expected = pc.count_substring_regex(arr, pattern) + assert_column_eq(result, expected) + + +def test_match_re(): + pattern = "[1-9][a-z]" + arr = pa.array(["1a2b", "b1a2", None]) + result = plc.strings.contains.matches_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + expected = pc.match_substring_regex(arr, f"^{pattern}") + assert_column_eq(result, expected) + + +def test_like(): + pattern = "%a" + arr = pa.array(["1a2aa3aaa"]) + result = plc.strings.contains.like( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(pa.array([pattern])), + ) + expected = pc.match_like(arr, pattern) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py new file mode 100644 index 00000000000..788b86423c4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc + + +def test_extract(): + pattern = "([ab])(\\d)" + pa_pattern = "(?P[ab])(?P\\d)" + arr = pa.array(["a1", "b2", "c3"]) + plc_result = plc.strings.extract.extract( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + result = plc.interop.to_arrow(plc_result) + expected = pc.extract_regex(arr, pa_pattern) + for i, result_col in enumerate(result.itercolumns()): + expected_col = pa.chunked_array(expected.field(i)) + assert result_col.fill_null("").equals(expected_col) + + +def test_extract_all_record(): + pattern = "([ab])(\\d)" + arr = pa.array(["a1", "b2", "c3"]) + plc_result = plc.strings.extract.extract_all_record( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array( + [pa.array([["a", "1"], ["b", "2"], None], type=result.type)] + ) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py new file mode 100644 index 00000000000..18b5d8bf4d0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest + + +@pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2]) +def test_repeat_strings(repeats): + arr = pa.array(["1", None]) + plc_result = plc.strings.repeat.repeat_strings( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(repeats) + if not isinstance(repeats, int) + else repeats, + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.binary_repeat(arr, repeats)) + assert result.equals(expected)