diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d6d3e3fdd33..b5d17022a3a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-libcudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -81,7 +81,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-build-pylibcudf: needs: [wheel-publish-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -113,7 +113,7 @@ jobs: wheel-build-cudf: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,7 +134,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -157,7 +157,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -169,7 +169,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index d670132cca9..10c803f7921 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a4a8f036174..b515dbff9f3 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -37,7 +37,7 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -52,7 +52,7 @@ jobs: steps: - name: Get PR info id: get-pr-info - uses: rapidsai/shared-actions/get-pr-info@main + uses: nv-gha-runners/get-pr-info@main - name: Checkout code repo uses: actions/checkout@v4 with: @@ -104,39 +104,39 @@ jobs: - '!notebooks/**' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_cpp == 'true' with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -145,7 +145,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -153,7 +153,7 @@ jobs: conda-java-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 if: needs.changed-files.outputs.test_java == 'true' with: build_type: pull-request @@ -164,7 +164,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -174,7 +174,7 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 if: needs.changed-files.outputs.test_notebooks == 'true' with: build_type: pull-request @@ -185,7 +185,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -195,7 +195,7 @@ jobs: wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -204,21 +204,21 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -226,7 +226,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -235,7 +235,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -247,7 +247,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -256,7 +256,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -265,7 +265,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -276,7 +276,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -287,7 +287,7 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -299,7 +299,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index fe77ad4b6b2..45e5191eb54 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4af6a0d690d..8605fa46f68 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -126,7 +126,7 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 6c5f4a68a4c..abc6f74fccf 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -177,11 +177,11 @@ ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp) # ################################################################################################## # * nds-h benchmark -------------------------------------------------------------------------------- -ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q01_NVBENCH ndsh/q01.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q05_NVBENCH ndsh/q05.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q06_NVBENCH ndsh/q06.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q09_NVBENCH ndsh/q09.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q10_NVBENCH ndsh/q10.cpp ndsh/utilities.cpp) # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 73ff17b2b93..940d03cdb41 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -69,11 +69,21 @@ void normalize_single_quotes(datasource::owning_buffer& inda * @brief Normalize unquoted whitespace (space and tab characters) using FST * * @param indata Input device buffer + * @param col_offsets Offsets to column contents in input buffer + * @param col_lengths Length of contents of each row in column * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation + * + * @returns Tuple of the normalized column, offsets to each row in column, and lengths of contents + * of each row */ -void normalize_whitespace(datasource::owning_buffer& indata, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + normalize_whitespace(device_span d_input, + device_span col_offsets, + device_span col_lengths, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace io::json::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ed7b2ac0850..ee03a382bec 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -39,8 +39,9 @@ namespace io { * @file */ -constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; ///< 128MB per row group -constexpr size_type default_row_group_size_rows = 1000000; ///< 1 million rows per row group +constexpr size_t default_row_group_size_bytes = + std::numeric_limits::max(); ///< Infinite bytes per row group +constexpr size_type default_row_group_size_rows = 1'000'000; ///< 1 million rows per row group constexpr size_t default_max_page_size_bytes = 512 * 1024; ///< 512KB per page constexpr size_type default_max_page_size_rows = 20000; ///< 20k rows per page constexpr int32_t default_column_index_truncate_length = 64; ///< truncate to 64 bytes diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 8890c786287..756047d383a 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -625,6 +626,8 @@ void make_device_json_column(device_span input, auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); std::vector is_mixed_type_column(num_columns, 0); std::vector is_pruned(num_columns, 0); + // for columns that are not mixed type but have been forced as string + std::vector forced_as_string_column(num_columns); columns.try_emplace(parent_node_sentinel, std::ref(root)); std::function remove_child_columns = @@ -695,11 +698,14 @@ void make_device_json_column(device_span input, // Struct, List, String, Value auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // if parent is mixed type column or this column is pruned, ignore this column. + // if parent is mixed type column or this column is pruned or if parent + // has been forced as string, ignore this column. if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) { + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || + forced_as_string_column[parent_col_id]) { ignore_vals[this_col_id] = 1; if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } + if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } continue; } @@ -765,22 +771,26 @@ void make_device_json_column(device_span input, } auto this_column_category = column_categories[this_col_id]; - if (is_enabled_mixed_types_as_string) { - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - is_mixed_type_column[this_col_id] = 1; - this_column_category = NC_STR; - } + // get path of this column, check if it is a struct/list forced as string, and enforce it + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + this_column_category = NC_STR; } CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); // move into parent device_json_column col(stream, mr); initialize_json_columns(this_col_id, col, this_column_category); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + col.forced_as_string_column = true; + forced_as_string_column[this_col_id] = true; + } + auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); if (not replaced) parent_col.column_order.push_back(name); @@ -802,12 +812,30 @@ void make_device_json_column(device_span input, is_mixed_type_column[this_col_id] == 1) column_categories[this_col_id] = NC_STR; } - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + } + + // ignore all children of columns forced as string + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { + forced_as_string_column[this_col_id] = true; + ignore_vals[this_col_id] = 1; + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and + forced_as_string_column[this_col_id]) + column_categories[this_col_id] = NC_STR; } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { @@ -982,39 +1010,58 @@ std::pair, std::vector> device_json_co "string offset, string length mismatch"); rmm::device_uvector d_string_data(col_size, stream); // TODO how about directly storing pair in json_column? - auto offset_length_it = - thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin()); - data_type target_type{}; + auto [result_bitmask, null_count] = make_validity(json_col); - if (schema.has_value()) { + data_type target_type{}; + std::unique_ptr col{}; + if (options.normalize_whitespace && json_col.forced_as_string_column) { + CUDF_EXPECTS(prune_columns || options.mixed_types_as_string, + "Whitespace normalization of nested columns requested as string requires " + "either prune_columns or mixed_types_as_string to be enabled"); + auto [normalized_d_input, col_offsets, col_lengths] = + cudf::io::json::detail::normalize_whitespace( + d_input, json_col.string_offsets, json_col.string_lengths, stream, mr); + auto offset_length_it = thrust::make_zip_iterator(col_offsets.begin(), col_lengths.begin()); + target_type = data_type{type_id::STRING}; + // Convert strings to the inferred data type + col = parse_data(normalized_d_input.data(), + offset_length_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + options.view(), + stream, + mr); + } else { + auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(), + json_col.string_lengths.begin()); + if (schema.has_value()) { #ifdef NJP_DEBUG_PRINT - std::cout << "-> explicit type: " - << (schema.has_value() ? std::to_string(static_cast(schema->type.id())) - : "n/a"); + std::cout << "-> explicit type: " + << (schema.has_value() ? std::to_string(static_cast(schema->type.id())) + : "n/a"); #endif - target_type = schema.value().type; - } else if (json_col.forced_as_string_column) { - target_type = data_type{type_id::STRING}; - } - // Infer column type, if we don't have an explicit type for it - else { - target_type = cudf::io::detail::infer_data_type( - options.json_view(), d_input, offset_length_it, col_size, stream); + target_type = schema.value().type; + } + // Infer column type, if we don't have an explicit type for it + else { + target_type = cudf::io::detail::infer_data_type( + options.json_view(), d_input, offset_length_it, col_size, stream); + } + // Convert strings to the inferred data type + col = parse_data(d_input.data(), + offset_length_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + options.view(), + stream, + mr); } - auto [result_bitmask, null_count] = make_validity(json_col); - // Convert strings to the inferred data type - auto col = parse_data(d_input.data(), - offset_length_it, - col_size, - target_type, - std::move(result_bitmask), - null_count, - options.view(), - stream, - mr); - // Reset nullable if we do not have nulls // This is to match the existing JSON reader's behaviour: // - Non-string columns will always be returned as nullable @@ -1120,11 +1167,15 @@ table_with_metadata device_parse_nested_json(device_span d_input, const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream, cudf::get_current_device_resource_ref()); // gpu tree generation - return get_tree_representation(tokens_gpu, - token_indices_gpu, - options.is_enabled_mixed_types_as_string(), - stream, - cudf::get_current_device_resource_ref()); + // Note that to normalize whitespaces in nested columns coerced to be string, we need the column + // to either be of mixed type or we need to request the column to be returned as string by + // pruning it with the STRING dtype + return get_tree_representation( + tokens_gpu, + token_indices_gpu, + options.is_enabled_mixed_types_as_string() || options.is_enabled_prune_columns(), + stream, + cudf::get_current_device_resource_ref()); }(); // IILE used to free memory of token data. #ifdef NJP_DEBUG_PRINT auto h_input = cudf::detail::make_host_vector_async(d_input, stream); diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 97d5884fef1..2d435dc8e1a 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -17,6 +17,7 @@ #include "io/fst/lookup_tables.cuh" #include +#include #include #include #include @@ -25,8 +26,17 @@ #include #include #include - +#include + +#include +#include +#include +#include +#include +#include #include +#include +#include #include #include @@ -215,14 +225,6 @@ std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ * | state is necessary to process escaped double-quote characters. Without this * | state, whitespaces following escaped double quotes inside strings may be removed. * - * NOTE: An important case NOT handled by this FST is that of whitespace following newline - * characters within a string. Consider the following example - * Input: {"a":"x\n y"} - * FST output: {"a":"x\ny"} - * Expected output: {"a":"x\n y"} - * Such strings are not part of the JSON standard (characters allowed within quotes should - * have ASCII at least 0x20 i.e. space character and above) but may be encountered while - * reading JSON files */ enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES }; // Aliases for readability of the transition table @@ -255,17 +257,17 @@ struct TransduceToNormalizedWS { // Let the alphabet set be Sigma // --------------------------------------- // ---------- NON-SPECIAL CASES: ---------- - // Output symbol same as input symbol + // Input symbol translates to output symbol // state | read_symbol -> output_symbol - // DQS | Sigma -> Sigma - // OOS | Sigma\{,\t} -> Sigma\{,\t} - // DEC | Sigma -> Sigma + // DQS | Sigma -> + // OOS | Sigma\{,\t} -> + // DEC | Sigma -> // ---------- SPECIAL CASES: -------------- - // Input symbol translates to output symbol - // OOS | {} -> - // OOS | {\t} -> + // Output symbol same as input symbol + // OOS | {} -> {} + // OOS | {\t} -> {\t} - // Case when read symbol is a space or tab but is unquoted + // Case when read symbol is not an unquoted space or tab // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function // However, since there is no output in this case i.e. the count returned by // operator()(state_id, match_id, read_symbol) is zero, this function is never called. @@ -287,8 +289,8 @@ struct TransduceToNormalizedWS { SymbolT const read_symbol) const { // Case when read symbol is a space or tab but is unquoted - if (match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && - state_id == static_cast(dfa_states::TT_OOS)) { + if (!(match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && + state_id == static_cast(dfa_states::TT_OOS))) { return 0; } return 1; @@ -328,33 +330,126 @@ void normalize_single_quotes(datasource::owning_buffer& inda std::swap(indata, outdata); } -void normalize_whitespace(datasource::owning_buffer& indata, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + normalize_whitespace(device_span d_input, + device_span col_offsets, + device_span col_lengths, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); - static constexpr std::int32_t min_out = 0; - static constexpr std::int32_t max_out = 2; + /* + * Algorithm: + 1. Create a single buffer by concatenating the rows of the string column. Create segment offsets + and lengths array for concatenated buffer + 2. Run a whitespace normalization FST that performs NOP for non-whitespace and quoted + whitespace characters, and outputs indices of unquoted whitespace characters + 3. Update segment lengths based on the number of output indices between segment offsets + 4. Remove characters at output indices from concatenated buffer. + 5. Return updated buffer, segment lengths and updated segment offsets + */ + auto inbuf_lengths = cudf::detail::make_device_uvector_async( + col_lengths, stream, cudf::get_current_device_resource_ref()); + size_t inbuf_lengths_size = inbuf_lengths.size(); + size_type inbuf_size = + thrust::reduce(rmm::exec_policy_nosync(stream), inbuf_lengths.begin(), inbuf_lengths.end()); + rmm::device_uvector inbuf(inbuf_size, stream); + rmm::device_uvector inbuf_offsets(inbuf_lengths_size, stream); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + inbuf_lengths.begin(), + inbuf_lengths.end(), + inbuf_offsets.begin(), + 0); + + auto input_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [d_input = d_input.begin(), col_offsets = col_offsets.begin()] __device__( + size_t i) -> char const* { return &d_input[col_offsets[i]]; })); + auto output_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [inbuf = inbuf.begin(), inbuf_offsets = inbuf_offsets.cbegin()] __device__( + size_t i) -> char* { return &inbuf[inbuf_offsets[i]]; })); + + { + // cub device batched copy + size_t temp_storage_bytes = 0; + cub::DeviceCopy::Batched(nullptr, + temp_storage_bytes, + input_it, + output_it, + inbuf_lengths.begin(), + inbuf_lengths_size, + stream.value()); + rmm::device_buffer temp_storage(temp_storage_bytes, stream); + cub::DeviceCopy::Batched(temp_storage.data(), + temp_storage_bytes, + input_it, + output_it, + inbuf_lengths.begin(), + inbuf_lengths_size, + stream.value()); + } + + // whitespace normalization : get the indices of the unquoted whitespace characters auto parser = fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), - fst::detail::make_translation_functor( + fst::detail::make_translation_functor( normalize_whitespace::TransduceToNormalizedWS{}), stream); - rmm::device_buffer outbuf(indata.size(), stream, mr); - rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(reinterpret_cast(indata.data()), - static_cast(indata.size()), - static_cast(outbuf.data()), + rmm::device_uvector outbuf_indices(inbuf.size(), stream, mr); + rmm::device_scalar outbuf_indices_size(stream, mr); + parser.Transduce(inbuf.data(), + static_cast(inbuf.size()), thrust::make_discard_iterator(), - outbuf_size.data(), + outbuf_indices.data(), + outbuf_indices_size.data(), normalize_whitespace::start_state, stream); - outbuf.resize(outbuf_size.value(stream), stream); - datasource::owning_buffer outdata(std::move(outbuf)); - std::swap(indata, outdata); + auto const num_deletions = outbuf_indices_size.value(stream); + outbuf_indices.resize(num_deletions, stream); + + // now these indices need to be removed + // TODO: is there a better way to do this? + thrust::for_each( + rmm::exec_policy_nosync(stream), + outbuf_indices.begin(), + outbuf_indices.end(), + [inbuf_offsets_begin = inbuf_offsets.begin(), + inbuf_offsets_end = inbuf_offsets.end(), + inbuf_lengths = inbuf_lengths.begin()] __device__(size_type idx) { + auto it = thrust::upper_bound(thrust::seq, inbuf_offsets_begin, inbuf_offsets_end, idx); + auto pos = thrust::distance(inbuf_offsets_begin, it) - 1; + cuda::atomic_ref ref{*(inbuf_lengths + pos)}; + ref.fetch_add(-1, cuda::std::memory_order_relaxed); + }); + + auto stencil = cudf::detail::make_zeroed_device_uvector_async( + static_cast(inbuf_size), stream, cudf::get_current_device_resource_ref()); + thrust::scatter(rmm::exec_policy_nosync(stream), + thrust::make_constant_iterator(true), + thrust::make_constant_iterator(true) + num_deletions, + outbuf_indices.begin(), + stencil.begin()); + thrust::remove_if(rmm::exec_policy_nosync(stream), + inbuf.begin(), + inbuf.end(), + stencil.begin(), + thrust::identity()); + inbuf.resize(inbuf_size - num_deletions, stream); + + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + inbuf_lengths.begin(), + inbuf_lengths.end(), + inbuf_offsets.begin(), + 0); + + stream.synchronize(); + return std::tuple{std::move(inbuf), std::move(inbuf_offsets), std::move(inbuf_lengths)}; } } // namespace detail diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 4e513d3495c..1c15e147b13 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2079,10 +2079,12 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; - parse_opts.dayfirst = options.is_enabled_dayfirst(); - parse_opts.keepquotes = options.is_enabled_keep_quotes(); - parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.dayfirst = options.is_enabled_dayfirst(); + parse_opts.keepquotes = options.is_enabled_keep_quotes(); + parse_opts.normalize_whitespace = options.is_enabled_normalize_whitespace(); + parse_opts.mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); std::vector na_values{"", "null"}; na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end()); parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index bd82b040359..99a5b17bce8 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -232,12 +232,6 @@ table_with_metadata read_batch(host_span> sources, normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref()); } - // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is - // enabled, invoke pre-processing FST - if (reader_opts.is_enabled_normalize_whitespace()) { - normalize_whitespace(bufview, stream, cudf::get_current_device_resource_ref()); - } - auto buffer = cudf::device_span(reinterpret_cast(bufview.data()), bufview.size()); stream.synchronize(); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 81fd4ab9f82..ec05f35d405 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1819,8 +1819,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const table_size = std::reduce(column_sizes.begin(), column_sizes.end()); auto const avg_row_len = util::div_rounding_up_safe(table_size, input.num_rows()); if (avg_row_len > 0) { - auto const rg_frag_size = util::div_rounding_up_safe(max_row_group_size, avg_row_len); - max_page_fragment_size = std::min(rg_frag_size, max_page_fragment_size); + // Ensure `rg_frag_size` is not bigger than size_type::max for default max_row_group_size + // value (=uint64::max) to avoid a sign overflow when comparing + auto const rg_frag_size = + std::min(std::numeric_limits::max(), + util::div_rounding_up_safe(max_row_group_size, avg_row_len)); + // Safe comparison as rg_frag_size fits in size_type + max_page_fragment_size = + std::min(static_cast(rg_frag_size), max_page_fragment_size); } // dividing page size by average row length will tend to overshoot the desired diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index bc2722441d0..734067582f7 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -67,6 +67,8 @@ struct parse_options_view { bool doublequote; bool dayfirst; bool skipblanklines; + bool normalize_whitespace; + bool mixed_types_as_string; cudf::detail::trie_view trie_true; cudf::detail::trie_view trie_false; cudf::detail::trie_view trie_na; @@ -85,6 +87,8 @@ struct parse_options { bool doublequote; bool dayfirst; bool skipblanklines; + bool normalize_whitespace; + bool mixed_types_as_string; cudf::detail::optional_trie trie_true; cudf::detail::optional_trie trie_false; cudf::detail::optional_trie trie_na; @@ -111,6 +115,8 @@ struct parse_options { doublequote, dayfirst, skipblanklines, + normalize_whitespace, + mixed_types_as_string, cudf::detail::make_trie_view(trie_true), cudf::detail::make_trie_view(trie_false), cudf::detail::make_trie_view(trie_na), diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 86402a0e7de..573101cefd9 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -51,11 +50,6 @@ using mixed_multimap_type = cudf::detail::cuco_allocator, cuco::legacy::double_hashing<1, hash_type, hash_type>>; -using semi_map_type = cuco::legacy::static_map>; - using row_hash_legacy = cudf::row_hasher; diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh index 19701816867..89c13285cfe 100644 --- a/cpp/src/join/mixed_join_common_utils.cuh +++ b/cpp/src/join/mixed_join_common_utils.cuh @@ -25,6 +25,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -160,6 +161,38 @@ struct pair_expression_equality : public expression_equality { } }; +/** + * @brief Equality comparator that composes two row_equality comparators. + */ +struct double_row_equality_comparator { + row_equality const equality_comparator; + row_equality const conditional_comparator; + + __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept + { + using experimental::row::lhs_index_type; + using experimental::row::rhs_index_type; + + return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && + conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); + } +}; + +// A CUDA Cooperative Group of 4 threads for the hash set. +auto constexpr DEFAULT_MIXED_JOIN_CG_SIZE = 4; + +// The hash set type used by mixed_semi_join with the build_table. +using hash_set_type = cuco::static_set, + cuda::thread_scope_device, + double_row_equality_comparator, + cuco::linear_probing, + cudf::detail::cuco_allocator, + cuco::storage<1>>; + +// The hash_set_ref_type used by mixed_semi_join kerenels for probing. +using hash_set_ref_type = hash_set_type::ref_type; + } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 7459ac3e99c..f2c5ff13638 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -38,12 +38,16 @@ CUDF_KERNEL void __launch_bounds__(block_size) table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data) { + auto constexpr cg_size = hash_set_ref_type::cg_size; + + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is // used to circumvent conflicts between arrays of different types between @@ -52,24 +56,24 @@ CUDF_KERNEL void __launch_bounds__(block_size) cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = - &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; - - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = left_num_rows; + &intermediate_storage[tile.meta_group_rank() * device_expression_data.num_intermediates]; - cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; + cudf::size_type const outer_num_rows = left_table.num_rows(); + auto const outer_row_index = cudf::detail::grid_1d::global_thread_id() / cg_size; auto evaluator = cudf::ast::detail::expression_evaluator( left_table, right_table, device_expression_data); if (outer_row_index < outer_num_rows) { + // Make sure to swap_tables here as hash_set will use probe table as the left one. + auto constexpr swap_tables = true; // Figure out the number of elements for this key. auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, false, equality_probe}; + evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - left_table_keep_mask[outer_row_index] = - hash_table_view.contains(outer_row_index, hash_probe, equality); + auto const set_ref_equality = set_ref.with_key_eq(equality); + auto const result = set_ref_equality.contains(tile, outer_row_index); + if (tile.thread_rank() == 0) left_table_keep_mask[outer_row_index] = result; } } @@ -78,9 +82,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, @@ -94,9 +97,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } else { @@ -106,9 +108,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index 43714ffb36a..b08298e64e4 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -45,9 +45,8 @@ namespace detail { * @param[in] right_table The right table * @param[in] probe The table with which to probe the hash table for matches. * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] hash_table_view The hash table built from `build`. + * @param[in] set_ref The hash table device view built from `build`. * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired @@ -58,9 +57,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index cfb785e242c..719b1d47105 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -46,45 +46,6 @@ namespace cudf { namespace detail { -namespace { -/** - * @brief Device functor to create a pair of hash value and index for a given row. - */ -struct make_pair_function_semi { - __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept - { - // The value is irrelevant since we only ever use the hash map to check for - // membership of a particular row index. - return cuco::make_pair(static_cast(i), 0); - } -}; - -/** - * @brief Equality comparator that composes two row_equality comparators. - */ -class double_row_equality { - public: - double_row_equality(row_equality equality_comparator, row_equality conditional_comparator) - : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator} - { - } - - __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept - { - using experimental::row::lhs_index_type; - using experimental::row::rhs_index_type; - - return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && - _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); - } - - private: - row_equality _equality_comparator; - row_equality _conditional_comparator; -}; - -} // namespace - std::unique_ptr> mixed_join_semi( table_view const& left_equality, table_view const& right_equality, @@ -96,7 +57,7 @@ std::unique_ptr> mixed_join_semi( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && + CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and (join_type != join_kind::FULL_JOIN), "Inner, left, and full joins should use mixed_join."); @@ -137,7 +98,7 @@ std::unique_ptr> mixed_join_semi( // output column and follow the null-supporting expression evaluation code // path. auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || + cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; auto const parser = ast::detail::expression_parser{ @@ -156,27 +117,20 @@ std::unique_ptr> mixed_join_semi( auto right_conditional_view = table_device_view::create(right_conditional, stream); auto const preprocessed_build = - experimental::row::equality::preprocessed_table::create(build, stream); + cudf::experimental::row::equality::preprocessed_table::create(build, stream); auto const preprocessed_probe = - experimental::row::equality::preprocessed_table::create(probe, stream); + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; + cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - semi_map_type hash_table{ - compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - auto const hash_build = row_hash_build.device_hasher(build_nulls); + // Since we may see multiple rows that are identical in the equality tables // but differ in the conditional tables, the equality comparator used for // insertion must account for both sets of tables. An alternative solution @@ -191,20 +145,28 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create(right_conditional, stream); + cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; auto const equality_build_conditional = row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); - double_row_equality equality_build{equality_build_equality, equality_build_conditional}; - make_pair_function_semi pair_func_build{}; - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); + hash_set_type row_set{ + {compute_hash_table_size(build.num_rows())}, + cuco::empty_key{JoinNoneValue}, + {equality_build_equality, equality_build_conditional}, + {row_hash_build.device_hasher(build_nulls)}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + {stream.value()}}; + + auto iter = thrust::make_counting_iterator(0); // skip rows that are null here. if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); + row_set.insert(iter, iter + right_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); auto const [row_bitmask, _] = @@ -212,18 +174,19 @@ std::unique_ptr> mixed_join_semi( row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows - hash_table.insert_if( - iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); + row_set.insert_if(iter, iter + right_num_rows, stencil, pred, stream.value()); } - auto hash_table_view = hash_table.get_device_view(); - detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; + auto const shmem_size_per_block = + parser.shmem_per_thread * + cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size); auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); + hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); + // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); @@ -232,9 +195,8 @@ std::unique_ptr> mixed_join_semi( *right_conditional_view, *probe_view, *build_view, - hash_probe, equality_probe, - hash_table_view, + row_set_ref, cudf::device_span(left_table_keep_mask), parser.device_expression_data, config, diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 960c19fce2e..48bc982d0e3 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2856,4 +2856,47 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) } } +TEST_F(JsonReaderTest, JsonDtypeSchema) +{ + std::string data = R"( + {"a": 1, "b": {"0": "abc", "1": ["a", "b"]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {"0": "lolol "}, "c": true} + )"; + + std::map dtype_schema{{"c", {data_type{type_id::STRING}}}, + {"b", {data_type{type_id::STRING}}}, + {"a", {dtype()}}}; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(dtype_schema) + .prune_columns(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 3); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING); + + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + + // cudf::column::contents contents = result.tbl->get_column(1).release(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), float64_wrapper{{1, 1, 1}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), + cudf::test::strings_column_wrapper({"{\"0\": \"abc\", \"1\": [\"a\", \"b\"]}", + "{\"0\": \"abc\" }", + "{\"0\": \"lolol \"}"}), + cudf::test::debug_output_level::ALL_ERRORS); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), + cudf::test::strings_column_wrapper({"true", "false", "true"}), + cudf::test::debug_output_level::ALL_ERRORS); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu index 6d79fdc98ef..6a3bd69de81 100644 --- a/cpp/tests/io/json/json_whitespace_normalization_test.cu +++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu @@ -34,129 +34,127 @@ // Base test fixture for tests struct JsonWSNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& host_input, std::string const& expected_host_output) -{ - // Prepare cuda stream for data transfers & kernels - auto stream_view = cudf::test::get_default_stream(); - - auto device_input = rmm::device_buffer( - host_input.c_str(), host_input.size(), stream_view, cudf::get_current_device_resource_ref()); - - // Preprocessing FST - cudf::io::datasource::owning_buffer device_data(std::move(device_input)); - cudf::io::json::detail::normalize_whitespace( - device_data, stream_view, cudf::get_current_device_resource_ref()); - - std::string preprocessed_host_output(device_data.size(), 0); - CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), - device_data.data(), - preprocessed_host_output.size(), - cudaMemcpyDeviceToHost, - stream_view.value())); - - stream_view.synchronize(); - ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size()); - CUDF_TEST_EXPECT_VECTOR_EQUAL( - preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); -} - -TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces) +TEST_F(JsonWSNormalizationTest, ReadJsonOption) { - std::string input = R"({ "A" : "TEST" })"; - std::string output = R"({"A":"TEST"})"; - run_test(input, output); -} + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace -TEST_F(JsonWSNormalizationTest, GroundTruth_MoreSpaces) -{ - std::string input = R"({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": {"c": "d"}})"; - std::string output = R"({"a":[1,2,3,4,5,6,7,8],"b":{"c":"d"}})"; - run_test(input, output); -} + // Test input + std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true); -TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesInString) -{ - std::string input = R"({" a ":50})"; - std::string output = R"({" a ":50})"; - run_test(input, output); -} + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); -TEST_F(JsonWSNormalizationTest, GroundTruth_NewlineInString) -{ - std::string input = "{\"a\" : \"x\ny\"}\n{\"a\" : \"x\\ny\"}"; - std::string output = "{\"a\":\"x\ny\"}\n{\"a\":\"x\\ny\"}"; - run_test(input, output); -} + // Expected table + std::string const expected_input = R"({ "a" : {"b":"c"}})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false); -TEST_F(JsonWSNormalizationTest, GroundTruth_Tabs) -{ - std::string input = "{\"a\":\t\"b\"}"; - std::string output = R"({"a":"b"})"; - run_test(input, output); + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } -TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesAndTabs) +TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows) { - std::string input = "{\"A\" : \t\"TEST\" }"; - std::string output = R"({"A":"TEST"})"; - run_test(input, output); -} + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace -TEST_F(JsonWSNormalizationTest, GroundTruth_MultilineJSONWithSpacesAndTabs) -{ - std::string input = - "{ \"foo rapids\": [1,2,3], \"bar\trapids\": 123 }\n\t{ \"foo rapids\": { \"a\": 1 }, " - "\"bar\trapids\": 456 }"; - std::string output = - "{\"foo rapids\":[1,2,3],\"bar\trapids\":123}\n{\"foo rapids\":{\"a\":1},\"bar\trapids\":456}"; - run_test(input, output); -} + // Test input + std::string const host_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{ "YY": 1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [ { "EE": "efg" } ] } } + )"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); -TEST_F(JsonWSNormalizationTest, GroundTruth_PureJSONExample) -{ - std::string input = R"([{"a":50}, {"a" : 60}])"; - std::string output = R"([{"a":50},{"a":60}])"; - run_test(input, output); -} + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); -TEST_F(JsonWSNormalizationTest, GroundTruth_NoNormalizationRequired) -{ - std::string input = R"({"a\\n\r\a":50})"; - std::string output = R"({"a\\n\r\a":50})"; - run_test(input, output); -} + // Expected table + std::string const expected_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{"YY":1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [{"EE":"efg"}] } } + )"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); -TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput) -{ - std::string input = "{\"a\" : \"b }\n{ \"c \" :\t\"d\"}"; - std::string output = "{\"a\":\"b }\n{\"c \":\"d\"}"; - run_test(input, output); + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } -TEST_F(JsonWSNormalizationTest, ReadJsonOption) +TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows_NoMixedType) { // When mixed type fields are read as strings, the table read will differ depending the // value of normalize_whitespace // Test input - std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + std::string const host_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{ "YY": 1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [ { "EE": "efg" }, { "YY" : "abc" } ] } } + { "Root": { "Key": [ { "YY" : "abc" } ] } } + )"; + + std::map dtype_schema{ + {"Key", {cudf::data_type{cudf::type_id::STRING}}}}; + cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{host_input.data(), host_input.size()}) + .dtypes(dtype_schema) .lines(true) - .mixed_types_as_string(true) - .normalize_whitespace(true); + .prune_columns(true) + .normalize_whitespace(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); // Expected table - std::string const expected_input = R"({ "a" : {"b":"c"}})"; + std::string const expected_input = R"( + { "Root": { "Key": [ { "EE": tr ue } , { "YY" : 2 } ] } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{"YY":1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [{"EE":"efg"},{"YY":"abc"}] } } + { "Root": { "Key": [{"YY":"abc"}] } } + )"; + cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{expected_input.data(), expected_input.size()}) + .dtypes(dtype_schema) .lines(true) - .mixed_types_as_string(true) - .normalize_whitespace(false); + .prune_columns(true) + .normalize_whitespace(false) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index 6c147c8a128..08a0136700d 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -778,6 +778,21 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality) {1}); } +TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {2, 7, 8}); +} + TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates) { this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}}, @@ -900,3 +915,18 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality) left_zero_eq_right_zero, {0, 1, 3}); } + +TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {0, 1, 3, 4, 5, 6, 9}); +} diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index a0155671a26..e6c9d60b05b 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -438,7 +438,7 @@ def write_parquet( object statistics="ROWGROUP", object metadata_file_path=None, object int96_timestamps=False, - object row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + object row_group_size_bytes=None, object row_group_size_rows=None, object max_page_size_bytes=None, object max_page_size_rows=None, @@ -616,9 +616,9 @@ cdef class ParquetWriter: Name of the compression to use. Use ``None`` for no compression. statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' Level at which column statistics should be included in file. - row_group_size_bytes: int, default 134217728 + row_group_size_bytes: int, default ``uint64 max`` Maximum size of each stripe of the output. - By default, 134217728 (128MB) will be used. + By default, a virtually infinite size equal to ``uint64 max`` will be used. row_group_size_rows: int, default 1000000 Maximum number of rows of each stripe of the output. By default, 1000000 (10^6 rows) will be used. @@ -661,11 +661,11 @@ cdef class ParquetWriter: def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", - int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, - int row_group_size_rows=1000000, - int max_page_size_bytes=524288, - int max_page_size_rows=20000, - int max_dictionary_size=1048576, + size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + size_type row_group_size_rows=1000000, + size_t max_page_size_bytes=524288, + size_type max_page_size_rows=20000, + size_t max_dictionary_size=1048576, bool use_dictionary=True, bool store_schema=False): filepaths_or_buffers = ( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 58a16a6d504..d73ad8225ca 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6840,7 +6840,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 62be7378e9e..ce99f98b559 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -64,7 +64,7 @@ def _write_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -149,7 +149,7 @@ def write_to_dataset( return_metadata=False, statistics="ROWGROUP", int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -205,7 +205,7 @@ def write_to_dataset( If ``False``, timestamps will not be altered. row_group_size_bytes: integer or None, default None Maximum size of each stripe of the output. - If None, 134217728 (128MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -980,7 +980,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1627107b57d..1180da321e6 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -27,7 +27,7 @@ fsspec_parquet = None _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 -_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 +_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max _docstring_remote_sources = """ - cuDF supports local and remote data stores. See configuration details for @@ -275,10 +275,9 @@ timestamp[us] to the int96 format, which is the number of Julian days and the number of nanoseconds since midnight of 1970-01-01. If ``False``, timestamps will not be altered. -row_group_size_bytes: integer, default {row_group_size_bytes_val} +row_group_size_bytes: integer, default None Maximum size of each stripe of the output. - If None, {row_group_size_bytes_val} - ({row_group_size_bytes_val_in_mb} MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -346,10 +345,7 @@ See Also -------- cudf.read_parquet -""".format( - row_group_size_bytes_val=_ROW_GROUP_SIZE_BYTES_DEFAULT, - row_group_size_bytes_val_in_mb=_ROW_GROUP_SIZE_BYTES_DEFAULT / 1024 / 1024, -) +""" doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet) _docstring_merge_parquet_filemetadata = """ diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index e793d4381d1..a781b8242fe 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -23,7 +23,6 @@ from cudf.io import write_to_dataset from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT class CudfEngine(ArrowDatasetEngine): @@ -341,9 +340,7 @@ def write_partition( return_metadata=return_metadata, statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), - row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT - ), + row_group_size_bytes=kwargs.get("row_group_size_bytes", None), row_group_size_rows=kwargs.get("row_group_size_rows", None), max_page_size_bytes=kwargs.get("max_page_size_bytes", None), max_page_size_rows=kwargs.get("max_page_size_rows", None), @@ -365,7 +362,7 @@ def write_partition( statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT + "row_group_size_bytes", None ), row_group_size_rows=kwargs.get( "row_group_size_rows", None