From 0870051b6fbe8ad5a5cec93035d1784e9b18cbd8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 23 Sep 2024 11:41:42 -0500 Subject: [PATCH 1/7] Improve Polars docs (#16820) This PR improves the docs by reducing the size of the Polars heading (too many words) and tightening up the writing of the docs page. --------- Co-authored-by: Ray Douglass --- .github/workflows/build.yaml | 2 +- .github/workflows/pr.yaml | 6 +++--- .github/workflows/test.yaml | 6 +++--- docs/cudf/source/cudf_polars/index.rst | 12 ++++++------ 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2e5959338b0..379f39ac965 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 25f11863b0d..0fe4533f68e 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -89,7 +89,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_java.sh" static-configure: needs: checks @@ -109,7 +109,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -119,7 +119,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: 
"rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/build_docs.sh" wheel-build-cudf: needs: checks diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 36c9088d93c..a10117a45e6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit @@ -81,7 +81,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -93,7 +93,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst index cc7aabd124f..0a3a0d86b2c 100644 --- a/docs/cudf/source/cudf_polars/index.rst +++ b/docs/cudf/source/cudf_polars/index.rst @@ -1,7 +1,7 @@ -cuDF-based GPU backend for Polars [Open Beta] -============================================= +Polars GPU engine +================= -cuDF supports an in-memory, GPU-accelerated execution engine for Python users of the Polars Lazy API. +cuDF provides an in-memory, GPU-accelerated execution engine for Python users of the Polars Lazy API. The engine supports most of the core expressions and data types as well as a growing set of more advanced dataframe manipulations and data file formats. When using the GPU engine, Polars will convert expressions into an optimized query plan and determine whether the plan is supported on the GPU. 
If it is not, the execution will transparently fall back to the standard Polars engine @@ -16,7 +16,7 @@ We reproduced the `Polars Decision Support (PDS) `__ on the Polars website. +The GPU engine for Polars is now available in Open Beta and the engine is undergoing rapid development. To learn more, visit the `GPU Support page `__ on the Polars website. Launch on Google Colab ---------------------- @@ -38,4 +38,4 @@ Launch on Google Colab :width: 200px :target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb - Take the cuDF backend for Polars for a test-drive in a free GPU-enabled notebook environment using your Google account by `launching on Colab `__. + Try out the GPU engine for Polars in a free GPU notebook environment. Sign in with your Google account and `launch the demo on Colab `__. From 389208c9a46fd6583efacfe9c1875c862e8d0c90 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 23 Sep 2024 14:03:57 -0500 Subject: [PATCH 2/7] Ignore numba warning specific to ARM runners (#16872) This PR ignores numba warnings that are showing up in arm runners: https://github.com/numba/numba/issues/6589#issuecomment-748595076 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16872 --- python/cudf/cudf/tests/pytest.ini | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini index 8a594794fac..d05ba9aaacc 100644 --- a/python/cudf/cudf/tests/pytest.ini +++ b/python/cudf/cudf/tests/pytest.ini @@ -14,4 +14,6 @@ filterwarnings = ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning # PerformanceWarning from cupy warming up the JIT cache ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning + # Ignore numba PEP 456 warning 
specific to arm machines + ignore:FNV hashing is not implemented in Numba.*:UserWarning addopts = --tb=native From 8b12cf4e66b4b1f8ec248493c27deb65ee625bbf Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 23 Sep 2024 15:35:32 -0500 Subject: [PATCH 3/7] Update fmt (to 11.0.2) and spdlog (to 1.14.1). (#16806) ## Description Replaces #15603 Contributes to: * https://github.com/rapidsai/build-planning/issues/54 * https://github.com/rapidsai/build-planning/issues/56 * https://github.com/rapidsai/rapids-cmake/issues/387 Now that most of `conda-forge` has been updated to `fmt >=11.0.1,<12` and `spdlog>=1.14.1,<1.15` (https://github.com/rapidsai/build-planning/issues/56#issuecomment-2334281452), we're attempting to upgrade RAPIDS to similar versions of those libraries. This improves the likelihood that RAPIDS will be installable alongside newer versions of its dependencies and complementary packages on conda-forge. ## Notes for Reviewers This PR is testing changes made in https://github.com/rapidsai/rapids-cmake/pull/689. It shouldn't be merged until those `rapids-cmake` changes are merged and any testing-specific details have been removed. 
--- .../all_cuda-118_arch-x86_64.yaml | 4 ++-- .../all_cuda-125_arch-x86_64.yaml | 4 ++-- conda/recipes/libcudf/conda_build_config.yaml | 4 ++-- cpp/CMakeLists.txt | 2 +- cpp/cmake/thirdparty/get_spdlog.cmake | 21 ++++++------------- dependencies.yaml | 4 ++-- 6 files changed, 15 insertions(+), 24 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index c96e8706d27..16b3d112992 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -31,7 +31,7 @@ dependencies: - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - fsspec>=0.6.0 - gcc_linux-64=11.* - hypothesis @@ -84,7 +84,7 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index e54a44d9f6e..cce2e0eea84 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - fsspec>=0.6.0 - gcc_linux-64=11.* - hypothesis @@ -82,7 +82,7 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 33fa4b4eccf..dc75eb4b252 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -26,13 +26,13 @@ librdkafka_version: - ">=2.5.0,<2.6.0a0" fmt_version: - - ">=10.1.1,<11" + - ">=11.0.2,<12" flatbuffers_version: - "=24.3.25" spdlog_version: - - ">=1.12.0,<1.13" + - 
">=1.14.1,<1.15" nvcomp_version: - "=4.0.1" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 26c086046a8..84b462bb884 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -798,7 +798,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm $ + PUBLIC CCCL::CCCL rmm::rmm $ spdlog::spdlog_header_only PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake index c0e07d02d94..90b0f4d8a8e 100644 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ b/cpp/cmake/thirdparty/get_spdlog.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,21 +16,12 @@ function(find_and_configure_spdlog) include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET cudf-exports) - rapids_export_package(BUILD spdlog cudf-exports) + rapids_cpm_spdlog( + FMT_OPTION "EXTERNAL_FMT_HO" + INSTALL_EXPORT_SET cudf-exports + BUILD_EXPORT_SET cudf-exports + ) - if(spdlog_ADDED) - rapids_export( - BUILD spdlog - EXPORT_SET spdlog - GLOBAL_TARGETS spdlog spdlog_header_only - NAMESPACE spdlog:: - ) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports - ) - endif() endfunction() find_and_configure_spdlog() diff --git a/dependencies.yaml b/dependencies.yaml index 2f2d7ba679e..01edcb3889a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -350,12 +350,12 @@ dependencies: common: - output_types: conda packages: - - fmt>=10.1.1,<11 + - fmt>=11.0.2,<12 - flatbuffers==24.3.25 - librdkafka>=2.5.0,<2.6.0a0 # Align nvcomp version with rapids-cmake - nvcomp==4.0.1 - - spdlog>=1.12.0,<1.13 + - spdlog>=1.14.1,<1.15 rapids_build_skbuild: common: - output_types: [conda, requirements, pyproject] From 6badd6b183e966f7f882708a0f4b2c4d0f2b5368 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 24 Sep 2024 08:17:53 -0500 Subject: [PATCH 4/7] Add in support for setting delim when parsing JSON through java (#16867) (#16880) This is a back-port of #16867 to 24.10. 
Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Alessandro Bellina (https://github.com/abellina) URL: https://github.com/rapidsai/cudf/pull/16880 --- .../main/java/ai/rapids/cudf/JSONOptions.java | 16 ++++++++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 19 ++++++++++++++----- java/src/main/native/src/TableJni.cpp | 12 ++++++++++-- .../test/java/ai/rapids/cudf/TableTest.java | 19 ++++++++++++++++++- 4 files changed, 58 insertions(+), 8 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index c8308ca17ec..17b497be5ee 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -38,6 +38,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean allowLeadingZeros; private final boolean allowNonNumericNumbers; private final boolean allowUnquotedControlChars; + private final byte lineDelimiter; private JSONOptions(Builder builder) { super(builder); @@ -52,6 +53,11 @@ private JSONOptions(Builder builder) { allowLeadingZeros = builder.allowLeadingZeros; allowNonNumericNumbers = builder.allowNonNumericNumbers; allowUnquotedControlChars = builder.allowUnquotedControlChars; + lineDelimiter = builder.lineDelimiter; + } + + public byte getLineDelimiter() { + return lineDelimiter; } public boolean isDayFirst() { @@ -123,6 +129,16 @@ public static final class Builder extends ColumnFilterOptions.Builder Byte.MAX_VALUE) { + throw new IllegalArgumentException("Only basic ASCII values are supported as line delimiters " + delimiter); + } + lineDelimiter = (byte)delimiter; + return this; + } + /** * Should json validation be strict or not */ diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 09da43374ae..19c72809cea 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ 
b/java/src/main/java/ai/rapids/cudf/Table.java @@ -258,7 +258,8 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean strictValidation, boolean allowLeadingZeros, boolean allowNonNumericNumbers, - boolean allowUnquotedControl) throws CudfException; + boolean allowUnquotedControl, + byte lineDelimiter) throws CudfException; private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames, int[] dTypeIds, int[] dTypeScales, @@ -272,6 +273,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, + byte lineDelimiter, long dsHandle) throws CudfException; private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines, @@ -284,6 +286,7 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, + byte lineDelimiter, long dsHandle) throws CudfException; private static native long readAndInferJSON(long address, long length, @@ -297,7 +300,8 @@ private static native long readAndInferJSON(long address, long length, boolean strictValidation, boolean allowLeadingZeros, boolean allowNonNumericNumbers, - boolean allowUnquotedControl) throws CudfException; + boolean allowUnquotedControl, + byte lineDelimiter) throws CudfException; /** * Read in Parquet formatted data. 
@@ -1321,7 +1325,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars()))) { + opts.unquotedControlChars(), + opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, -1); } @@ -1404,7 +1409,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer, opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars())); + opts.unquotedControlChars(), + opts.getLineDelimiter())); } /** @@ -1426,6 +1432,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) { opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + opts.getLineDelimiter(), dsHandle)); return twm; } finally { @@ -1479,7 +1486,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars()))) { + opts.unquotedControlChars(), + opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, emptyRowCount); } } @@ -1518,6 +1526,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + opts.getLineDelimiter(), dsHandle))) { return gatherJSONColumns(schema, twm, emptyRowCount); } finally { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 92e213bcb60..96d4c2c4eeb 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1627,6 +1627,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, + jbyte line_delimiter, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no 
data source handle given", 0); @@ -1646,6 +1647,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes); if (strict_validation) { @@ -1676,7 +1678,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, jboolean strict_validation, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, - jboolean allow_unquoted_control) + jboolean allow_unquoted_control, + jbyte line_delimiter) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1700,6 +1703,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .normalize_whitespace(static_cast(normalize_whitespace)) .strict_validation(strict_validation) .mixed_types_as_string(mixed_types_as_string) + .delimiter(static_cast(line_delimiter)) .keep_quotes(keep_quotes); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -1814,6 +1818,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, + jbyte line_delimiter, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1848,6 +1853,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes); if (strict_validation) { @@ -1908,7 +1914,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean strict_validation, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, - 
jboolean allow_unquoted_control) + jboolean allow_unquoted_control, + jbyte line_delimiter) { bool read_buffer = true; if (buffer == 0) { @@ -1957,6 +1964,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes); if (strict_validation) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 830f2b33b32..c7fcb1756b6 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -40,7 +40,6 @@ import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; -import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import java.io.*; @@ -656,6 +655,24 @@ void testJSONValidationUnquotedControl() { } } + private static final byte[] CR_JSON_TEST_BUFFER = ("{\"a\":\"12\n3\"}\0" + + "{\"a\":\"AB\nC\"}\0").getBytes(StandardCharsets.UTF_8); + + @Test + void testReadJSONDelim() { + Schema schema = Schema.builder().addColumn(DType.STRING, "a").build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withLineDelimiter('\0') + .build(); + try (Table expected = new Table.TestBuilder() + .column("12\n3", "AB\nC") + .build(); + Table found = Table.readJSON(schema, opts, CR_JSON_TEST_BUFFER)) { + assertTablesAreEqual(expected, found); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" + From b3518ab7e10f5eabf5ef06a495cc659079e0447c Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 24 Sep 2024 10:15:38 -0500 Subject: [PATCH 5/7] Add in option for Java JSON APIs 
to do column pruning in CUDF (#16796) This adds the option to enable column pruning when reading JSON through the Java APIs. This is still in draft because there are test failures if this is turned on for those tests. https://github.com/rapidsai/cudf/issues/16797 That said, the performance impact from enabling column pruning on some queries is huge. For one query in particular, the current code takes 161.5 seconds, and with CUDF column pruning it takes just 16.5 seconds. That is a 10x speedup for a fairly real-world workload. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Alessandro Bellina (https://github.com/abellina) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16796 --- .../main/java/ai/rapids/cudf/JSONOptions.java | 12 ++++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 17 +++++++++++++++++ java/src/main/native/src/TableJni.cpp | 12 +++++++++--- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 17b497be5ee..2bb74c3e3b1 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -38,6 +38,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean allowLeadingZeros; private final boolean allowNonNumericNumbers; private final boolean allowUnquotedControlChars; + private final boolean cudfPruneSchema; private final byte lineDelimiter; private JSONOptions(Builder builder) { @@ -53,9 +54,14 @@ private JSONOptions(Builder builder) { allowLeadingZeros = builder.allowLeadingZeros; allowNonNumericNumbers = builder.allowNonNumericNumbers; allowUnquotedControlChars = builder.allowUnquotedControlChars; + cudfPruneSchema = builder.cudfPruneSchema; lineDelimiter = builder.lineDelimiter; } + public boolean shouldCudfPruneSchema() { + return cudfPruneSchema; + } + public byte 
getLineDelimiter() { return lineDelimiter; } @@ -129,8 +135,14 @@ public static final class Builder extends ColumnFilterOptions.Builder Byte.MAX_VALUE) { throw new IllegalArgumentException("Only basic ASCII values are supported as line delimiters " + delimiter); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 19c72809cea..6d370ca27b2 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -259,6 +259,7 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, + boolean pruneColumns, byte lineDelimiter) throws CudfException; private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames, @@ -273,6 +274,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, + boolean pruneColumns, byte lineDelimiter, long dsHandle) throws CudfException; @@ -1312,6 +1314,10 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp * @return the file parsed as a table on the GPU. 
*/ public static Table readJSON(Schema schema, JSONOptions opts, File path) { + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta( readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), @@ -1326,6 +1332,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + cudfPruneSchema, opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, -1); @@ -1472,6 +1479,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, @@ -1487,6 +1498,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + cudfPruneSchema, opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, emptyRowCount); } @@ -1513,6 +1525,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { */ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + 
schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), @@ -1526,6 +1542,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + cudfPruneSchema, opts.getLineDelimiter(), dsHandle))) { return gatherJSONColumns(schema, twm, emptyRowCount); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 96d4c2c4eeb..0f77da54152 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1649,7 +1649,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, .mixed_types_as_string(mixed_types_as_string) .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) - .keep_quotes(keep_quotes); + .keep_quotes(keep_quotes) + .prune_columns(false); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) @@ -1703,6 +1704,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .normalize_whitespace(static_cast(normalize_whitespace)) .strict_validation(strict_validation) .mixed_types_as_string(mixed_types_as_string) + .prune_columns(false) .delimiter(static_cast(line_delimiter)) .keep_quotes(keep_quotes); if (strict_validation) { @@ -1818,6 +1820,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, + jboolean prune_columns, jbyte line_delimiter, jlong ds_handle) { @@ -1855,7 +1858,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .mixed_types_as_string(mixed_types_as_string) .delimiter(static_cast(line_delimiter)) 
.strict_validation(strict_validation) - .keep_quotes(keep_quotes); + .keep_quotes(keep_quotes) + .prune_columns(prune_columns); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) @@ -1915,6 +1919,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, + jboolean prune_columns, jbyte line_delimiter) { bool read_buffer = true; @@ -1966,7 +1971,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .mixed_types_as_string(mixed_types_as_string) .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) - .keep_quotes(keep_quotes); + .keep_quotes(keep_quotes) + .prune_columns(prune_columns); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) From f8db575330dddf5f32df049ec9928018697fdef3 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Tue, 24 Sep 2024 14:11:02 -0500 Subject: [PATCH 6/7] Update update-version.sh to use packaging lib (#16891) This PR updates the update-version.sh script to use the packaging library, given that setuptools is no longer included by default in Python 3.12. 
--- ci/release/update-version.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index b0346327319..f73e88bc0c8 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -25,9 +25,9 @@ NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} # Need to distutils-normalize the versions for some use cases -CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))") -NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") -PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))") +CURRENT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${CURRENT_SHORT_TAG}'))") +NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))") +PATCH_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_PATCH}'))") echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" From 73fa557186932fa867a0516f8947bb25b97d0f29 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 24 Sep 2024 18:43:02 -0500 Subject: [PATCH 7/7] Update oldest deps for `pyarrow` & `numpy` (#16883) We recently pinned our `dask-expr` version to `1.1.14` (https://github.com/rapidsai/rapids-dask-dependency/pull/64). That pin, combined with the latest `dask`, appears to require `pyarrow` `14.0.1` or newer. This is causing failures in our CI matrix while running tests with the oldest dependencies. This PR bumps the minimum pyarrow version in our oldest deps. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16883 --- ci/cudf_pandas_scripts/run_tests.sh | 4 ++-- ci/test_python_common.sh | 4 ++-- ci/test_python_cudf.sh | 2 +- ci/test_python_other.sh | 2 +- dependencies.yaml | 36 +++++++++++++++++++++++++---- 5 files changed, 38 insertions(+), 10 deletions(-) diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index c6228a4ef33..f6bdc6f9484 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -56,10 +56,10 @@ else echo "" > ./constraints.txt if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - # `test_python` constraints are for `[test]` not `[cudf-pandas-tests]` + # `test_python_cudf_pandas` constraints are for `[test]` not `[cudf-pandas-tests]` rapids-dependency-file-generator \ --output requirements \ - --file-key test_python \ + --file-key test_python_cudf_pandas \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ | tee ./constraints.txt fi diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index d0675b0431a..dc70661a17a 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -10,10 +10,10 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" ENV_YAML_DIR="$(mktemp -d)" - +FILE_KEY=$1 rapids-dependency-file-generator \ --output conda \ - --file-key test_python \ + --file-key ${FILE_KEY} \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ | tee "${ENV_YAML_DIR}/env.yaml" diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index ae34047e87f..2386414b32e 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -5,7 +5,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../; # Common setup steps shared by Python test jobs -source 
./ci/test_python_common.sh +source ./ci/test_python_common.sh test_python_cudf rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index 06a24773cae..67c97ad29a5 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -5,7 +5,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ # Common setup steps shared by Python test jobs -source ./ci/test_python_common.sh +source ./ci/test_python_common.sh test_python_other rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ diff --git a/dependencies.yaml b/dependencies.yaml index 01edcb3889a..7a9c9b8486d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -43,15 +43,28 @@ files: includes: - cuda_version - test_cpp - test_python: + test_python_cudf_pandas: output: none includes: - cuda_version - py_version - test_python_common - test_python_cudf - - test_python_dask_cudf - test_python_cudf_pandas + test_python_cudf: + output: none + includes: + - cuda_version + - py_version + - test_python_common + - test_python_cudf + test_python_other: + output: none + includes: + - cuda_version + - py_version + - test_python_common + - test_python_dask_cudf test_java: output: none includes: @@ -707,9 +720,7 @@ dependencies: - matrix: {dependencies: "oldest"} packages: - numba==0.57.* - - numpy==1.23.* - pandas==2.0.* - - pyarrow==14.0.0 - matrix: packages: - output_types: conda @@ -764,6 +775,14 @@ dependencies: - &transformers transformers==4.39.3 - tzdata specific: + - output_types: [conda, requirements] + matrices: + - matrix: {dependencies: "oldest"} + packages: + - numpy==1.23.* + - pyarrow==14.0.0 + - matrix: + packages: - output_types: conda matrices: - matrix: @@ -783,6 +802,15 @@ dependencies: packages: - dask-cuda==24.10.*,>=0.0.0a0 - *numba + specific: + - output_types: [conda, requirements] + matrices: + - matrix: {dependencies: "oldest"} + packages: + - numpy==1.24.* + - pyarrow==14.0.1 + - matrix: + packages: depends_on_libcudf: common: - 
output_types: conda