From 93eec064bc8664659a25df5c104ee32902c61021 Mon Sep 17 00:00:00 2001
From: Emma Qiao
Date: Thu, 11 Apr 2024 16:33:36 -0700
Subject: [PATCH] Some updates for release 24.04

---
 .gitlab-ci.yml                                        |  8 ++++----
 HugeCTR/src/hps/CMakeLists.txt                        |  3 +++
 ci/integration_test/criteo/criteo_multi_node.sub      |  6 ++++--
 ci/integration_test/dcn/dcn_multi_node.sub            | 11 +++++++----
 ci/integration_test/inference/inference_hps.sub       |  3 +--
 ci/integration_test/notebooks/hps_demo.sub            |  3 +--
 ci/integration_test/py_interface/py_multi_node.sub    |  5 ++++-
 .../hps_dlrm_benchmark_scripts/create_trt_engines.py  |  1 -
 hps_tf/hps_cc/kernels/lookup_kernel.cpp               |  3 ++-
 hps_trt/test/integration/test_for_hugectr.py          |  1 -
 hps_trt/test/integration/test_for_pytorch.py          |  1 -
 hps_trt/test/integration/test_for_tf.py               |  1 -
 hps_trt/test/unit/test_hps.py                         |  1 -
 tools/dlrm_script/CMakeLists.txt                      |  4 ++--
 tools/dlrm_script/dlrm_raw.cu                         |  4 ++--
 15 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index cf92cd73a5..7a74a45b65 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -177,7 +177,7 @@ build_inference:
 build_sok_tf2:
   extends: .build_sok
   variables:
-    FROM_IMAGE: ${IMAGE_SOK_TF2}
+    FROM_IMAGE: ${IMAGE_ALL}
     DST_IMAGE: $SOK_IMAGE_VERSIONED_TF2
     CMAKE_OPTION: "-DSM=\"60;61;70;75;80;90\""
     BUILD_SOK: 1
@@ -207,7 +207,7 @@ build_hugectr_hps_trt_plugin:
 build_tf_hps_trt_plugin:
   extends: .build_hugectr
   variables:
-    FROM_IMAGE: ${IMAGE_SOK_TF2}
+    FROM_IMAGE: ${IMAGE_ALL}
     DST_IMAGE: $TF_TRT_IMAGE_VERSIONED
     BUILD_TF_PLUGIN: 1
     BUILD_TRT_PLUGIN: 1
@@ -219,7 +219,7 @@ build_tf_hps_trt_plugin:
 build_pytorch_hps_trt_plugin:
   extends: .build_hugectr
   variables:
-    FROM_IMAGE: ${IMAGE_PYTORCH}
+    FROM_IMAGE: ${IMAGE_ALL}
     DST_IMAGE: $PYTORCH_TRT_IMAGE_VERSIONED
     BUILD_TORCH_PLUGIN: 1
     BUILD_TRT_PLUGIN: 1
@@ -445,7 +445,7 @@ test_sok_pypi:
     - build_sok_tf2
   variables:
     CONT: $SOK_IMAGE_VERSIONED_TF2
-    CI_SLURM_TIME: "00:15:00"
+    CI_SLURM_TIME: "00:30:00"
     TEST_CMD: ./ci/integration_test/sok/test_sok_pypi.sub

 wdl_check:
diff --git a/HugeCTR/src/hps/CMakeLists.txt b/HugeCTR/src/hps/CMakeLists.txt
index e648e1a826..bee38a1783 100644
--- a/HugeCTR/src/hps/CMakeLists.txt
+++ b/HugeCTR/src/hps/CMakeLists.txt
@@ -30,6 +30,9 @@ list(APPEND huge_ctr_hps_src
   "../io/gcs_filesystem.cpp"
 )

+# This manual definition is a workaround (WAR); the RMM team will fix it in the future.
+add_compile_definitions(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
+
 add_library(huge_ctr_hps SHARED ${huge_ctr_hps_src})

 if(ENABLE_HDFS)
diff --git a/ci/integration_test/criteo/criteo_multi_node.sub b/ci/integration_test/criteo/criteo_multi_node.sub
index 700d2aa3be..9d55b18d17 100644
--- a/ci/integration_test/criteo/criteo_multi_node.sub
+++ b/ci/integration_test/criteo/criteo_multi_node.sub
@@ -1,5 +1,7 @@
 #!/bin/bash

+#TODO: NCCL_IB_HCA restriction added for draco-oci; may need to be removed on other clusters
 srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    cd /dataset/criteo_kaggle/criteo_parquet && \
-    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/criteo_2node_4gpu.json"
\ No newline at end of file
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/criteo_parquet && \
+    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/criteo_2node_4gpu.json"
diff --git a/ci/integration_test/dcn/dcn_multi_node.sub b/ci/integration_test/dcn/dcn_multi_node.sub
index 90e381dcda..863596f304 100644
--- a/ci/integration_test/dcn/dcn_multi_node.sub
+++ b/ci/integration_test/dcn/dcn_multi_node.sub
@@ -1,11 +1,14 @@
 #!/bin/bash
 set -e

+#TODO: NCCL_IB_HCA restriction added for draco-oci; may need to be removed on other clusters
 srun --ntasks=4 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    cd /dataset/criteo_kaggle/dcn_parquet &&
-    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/dcn_parquet && \
+    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"

 srun --ntasks=2 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    cd /dataset/criteo_kaggle/dcn_parquet &&
-    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_2node_4gpu.json"
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/dcn_parquet && \
+    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_2node_4gpu.json"

diff --git a/ci/integration_test/inference/inference_hps.sub b/ci/integration_test/inference/inference_hps.sub
index 0face950ea..950f137499 100755
--- a/ci/integration_test/inference/inference_hps.sub
+++ b/ci/integration_test/inference/inference_hps.sub
@@ -3,6 +3,5 @@
 srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
     cd /dataset/criteo_kaggle/dcn && \
     python3 /workdir/test/inference/hps/lookup_session_test.py hps_lookup /hugectr/test/utest/wdl_test_files/wdl0_sparse_2000.model,/hugectr/test/utest/wdl_test_files/wdl1_sparse_2000.model /hugectr/test/utest/wdl_test_files/first_ten.csv && \
-    pip install torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 && \
-    pip install tensorflow && \
+    pip install torchvision==0.17.2+cu121 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 && \
     python3 /workdir/test/inference/hps/hpsdlpack.py hpsdlpack /hugectr/test/utest/wdl_test_files/wdl0_sparse_2000.model,/hugectr/test/utest/wdl_test_files/wdl1_sparse_2000.model /hugectr/test/utest/wdl_test_files/first_ten.csv"
diff --git a/ci/integration_test/notebooks/hps_demo.sub b/ci/integration_test/notebooks/hps_demo.sub
index d58903a09e..39bb904e6e 100644
--- a/ci/integration_test/notebooks/hps_demo.sub
+++ b/ci/integration_test/notebooks/hps_demo.sub
@@ -2,6 +2,5 @@

 srun --ntasks=1 --container-image="${CONT}" bash -cx " \
     chmod +x /usr/local/hugectr/bin/* && \
-    pip install torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 && \
-    pip install tensorflow protobuf==3.20.3 && \
+    pip install torchvision==0.17.2+cu121 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 && \
     cd /workdir/test/notebook_test && pytest hps_demo.py"
diff --git a/ci/integration_test/py_interface/py_multi_node.sub b/ci/integration_test/py_interface/py_multi_node.sub
index a5799c525b..7f75184342 100644
--- a/ci/integration_test/py_interface/py_multi_node.sub
+++ b/ci/integration_test/py_interface/py_multi_node.sub
@@ -1,10 +1,13 @@
 #!/bin/bash
 set -e

+#TODO: NCCL_IB_HCA restriction added for draco-oci; may need to be removed on other clusters
 srun --ntasks=4 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    cd /dataset/criteo_kaggle/dcn_parquet &&
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/dcn_parquet && \
     python3 /workdir/test/pybind_test/dcn_4node_2gpu.py /workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"

 srun --ntasks=2 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
     cd /dataset/criteo_kaggle/criteo_parquet && \
     python3 /workdir/test/pybind_test/criteo_2node_4gpu.py /workdir/test/scripts/criteo_2node_4gpu.json"
diff --git a/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_trt_engines.py b/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_trt_engines.py
index db4ae622db..4beea1c414 100644
--- a/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_trt_engines.py
+++ b/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_trt_engines.py
@@ -47,7 +47,6 @@ def onnx_surgery(args):


 def create_hps_plugin_creator(args):
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     plugin_lib_name = args["hps_trt_plugin_lib_path"]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
diff --git a/hps_tf/hps_cc/kernels/lookup_kernel.cpp b/hps_tf/hps_cc/kernels/lookup_kernel.cpp
index 4f33ecf6ca..15f028862b 100644
--- a/hps_tf/hps_cc/kernels/lookup_kernel.cpp
+++ b/hps_tf/hps_cc/kernels/lookup_kernel.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */

-#include
 #ifndef TF_GE_211
 #include
 #include
@@ -27,6 +26,8 @@
 #include
 #endif

+#include
+
 #include

 namespace tensorflow {
diff --git a/hps_trt/test/integration/test_for_hugectr.py b/hps_trt/test/integration/test_for_hugectr.py
index 993ed94aff..08fe05f19b 100644
--- a/hps_trt/test/integration/test_for_hugectr.py
+++ b/hps_trt/test/integration/test_for_hugectr.py
@@ -122,7 +122,6 @@ def build_engine_from_onnx(onnx_model_path):


 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
     plg_registry = trt.get_plugin_registry()
diff --git a/hps_trt/test/integration/test_for_pytorch.py b/hps_trt/test/integration/test_for_pytorch.py
index 519011fcae..a050e8d523 100644
--- a/hps_trt/test/integration/test_for_pytorch.py
+++ b/hps_trt/test/integration/test_for_pytorch.py
@@ -342,7 +342,6 @@ def build_engine_from_onnx(onnx_model_path):


 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
     plg_registry = trt.get_plugin_registry()
diff --git a/hps_trt/test/integration/test_for_tf.py b/hps_trt/test/integration/test_for_tf.py
index b355aa3b39..a27299dd03 100644
--- a/hps_trt/test/integration/test_for_tf.py
+++ b/hps_trt/test/integration/test_for_tf.py
@@ -302,7 +302,6 @@ def build_engine_from_onnx(onnx_model_path):


 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
     plg_registry = trt.get_plugin_registry()
diff --git a/hps_trt/test/unit/test_hps.py b/hps_trt/test/unit/test_hps.py
index d68a4cd060..cba581a30b 100644
--- a/hps_trt/test/unit/test_hps.py
+++ b/hps_trt/test/unit/test_hps.py
@@ -120,7 +120,6 @@ def _generate_embedding_tables():


 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     plugin_lib_name = PLUGIN_LIB_PATH
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
diff --git a/tools/dlrm_script/CMakeLists.txt b/tools/dlrm_script/CMakeLists.txt
index 0bbcc07cfb..5849658b68 100644
--- a/tools/dlrm_script/CMakeLists.txt
+++ b/tools/dlrm_script/CMakeLists.txt
@@ -43,8 +43,8 @@ if (NOT CUDF_RESULT)
   list(GET CUDF_VERSION_LIST 1 CUDF_VERSION_PATCH)
   add_compile_definitions(CUDF_VERSION_MAJOR=${CUDF_VERSION_MAJOR})
   add_compile_definitions(CUDF_VERSION_MINOR=${CUDF_VERSION_MINOR})
-  if(${CUDF_VERSION_MAJOR} EQUAL 23 AND ${CUDF_VERSION_MINOR} GREATER 6)
-    add_definitions(-DCUDF_GE_2306)
+  if(${CUDF_VERSION} VERSION_GREATER 23.06)
+    add_definitions(-DCUDF_GE_2306)
   endif()
 else()
   message(FATAL_ERROR "Can not detect cudf in your environment! ")
diff --git a/tools/dlrm_script/dlrm_raw.cu b/tools/dlrm_script/dlrm_raw.cu
index 56b7aa9355..1a457098fe 100644
--- a/tools/dlrm_script/dlrm_raw.cu
+++ b/tools/dlrm_script/dlrm_raw.cu
@@ -141,7 +141,7 @@ void process_kaggle_dataset(const std::string &input_dir_path, const std::string
   cudf_io::table_with_metadata tbl_w_metadata =
       cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
-  auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+  auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #else
   cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
 #endif
@@ -501,7 +501,7 @@ void process_terabyte_dataset(const std::string &input_dir_path, const std::stri
   cudf_io::table_with_metadata tbl_w_metadata =
       cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
-  auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+  auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #else
   cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
 #endif
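
Note on the tools/dlrm_script/CMakeLists.txt hunk: the old guard
(CUDF_VERSION_MAJOR EQUAL 23 AND CUDF_VERSION_MINOR GREATER 6) only matched
minor versions above 6 within the 23.x series, so it silently skipped
CUDF_GE_2306 for 24.x releases. CMake's VERSION_GREATER compares version
strings component-wise, which covers both cases. A minimal standalone sketch
of the new check (the CUDF_VERSION value below is hypothetical, purely for
illustration; the real script detects it from the cudf installation):

    cmake_minimum_required(VERSION 3.17)
    project(cudf_version_check NONE)

    # Hypothetical value for illustration; dlrm_script detects this from cudf.
    set(CUDF_VERSION "24.02")

    # VERSION_GREATER compares component-wise: 24.02 > 23.06 and 23.08 > 23.06
    # both hold, whereas the old MAJOR==23 && MINOR>6 test rejected all 24.x.
    if(${CUDF_VERSION} VERSION_GREATER 23.06)
      add_definitions(-DCUDF_GE_2306)
      message(STATUS "cudf ${CUDF_VERSION}: defining CUDF_GE_2306")
    endif()

Dropped into a scratch CMakeLists.txt, running cmake on it prints the status
line for any version newer than 23.06.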