From 93eec064bc8664659a25df5c104ee32902c61021 Mon Sep 17 00:00:00 2001
From: Emma Qiao
Date: Thu, 11 Apr 2024 16:33:36 -0700
Subject: [PATCH] Some updates for release 24.04

---
 .gitlab-ci.yml                                        |  8 ++++----
 HugeCTR/src/hps/CMakeLists.txt                        |  3 +++
 ci/integration_test/criteo/criteo_multi_node.sub      |  6 ++++--
 ci/integration_test/dcn/dcn_multi_node.sub            | 11 +++++++----
 ci/integration_test/inference/inference_hps.sub       |  3 +--
 ci/integration_test/notebooks/hps_demo.sub            |  3 +--
 ci/integration_test/py_interface/py_multi_node.sub    |  5 ++++-
 .../hps_dlrm_benchmark_scripts/create_trt_engines.py  |  1 -
 hps_tf/hps_cc/kernels/lookup_kernel.cpp               |  3 ++-
 hps_trt/test/integration/test_for_hugectr.py          |  1 -
 hps_trt/test/integration/test_for_pytorch.py          |  1 -
 hps_trt/test/integration/test_for_tf.py               |  1 -
 hps_trt/test/unit/test_hps.py                         |  1 -
 tools/dlrm_script/CMakeLists.txt                      |  4 ++--
 tools/dlrm_script/dlrm_raw.cu                         |  4 ++--
 15 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index cf92cd73a5..7a74a45b65 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -177,7 +177,7 @@ build_inference:
 build_sok_tf2:
   extends: .build_sok
   variables:
-    FROM_IMAGE: ${IMAGE_SOK_TF2}
+    FROM_IMAGE: ${IMAGE_ALL}
     DST_IMAGE: $SOK_IMAGE_VERSIONED_TF2
     CMAKE_OPTION: "-DSM=\"60;61;70;75;80;90\""
     BUILD_SOK: 1
@@ -207,7 +207,7 @@ build_hugectr_hps_trt_plugin:
 build_tf_hps_trt_plugin:
   extends: .build_hugectr
   variables:
-    FROM_IMAGE: ${IMAGE_SOK_TF2}
+    FROM_IMAGE: ${IMAGE_ALL}
     DST_IMAGE: $TF_TRT_IMAGE_VERSIONED
     BUILD_TF_PLUGIN: 1
     BUILD_TRT_PLUGIN: 1
@@ -219,7 +219,7 @@ build_tf_hps_trt_plugin:
 build_pytorch_hps_trt_plugin:
   extends: .build_hugectr
   variables:
-    FROM_IMAGE: ${IMAGE_PYTORCH}
+    FROM_IMAGE: ${IMAGE_ALL}
     DST_IMAGE: $PYTORCH_TRT_IMAGE_VERSIONED
     BUILD_TORCH_PLUGIN: 1
     BUILD_TRT_PLUGIN: 1
@@ -445,7 +445,7 @@ test_sok_pypi:
     - build_sok_tf2
   variables:
     CONT: $SOK_IMAGE_VERSIONED_TF2
-    CI_SLURM_TIME: "00:15:00"
+    CI_SLURM_TIME: "00:30:00"
     TEST_CMD: ./ci/integration_test/sok/test_sok_pypi.sub

 wdl_check:
diff --git a/HugeCTR/src/hps/CMakeLists.txt b/HugeCTR/src/hps/CMakeLists.txt
index e648e1a826..bee38a1783 100644
--- a/HugeCTR/src/hps/CMakeLists.txt
+++ b/HugeCTR/src/hps/CMakeLists.txt
@@ -30,6 +30,9 @@ list(APPEND huge_ctr_hps_src
   "../io/gcs_filesystem.cpp"
 )

+# This manual definition is a workaround (WAR); the RMM team will fix it in the future.
+add_compile_definitions(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
+
 add_library(huge_ctr_hps SHARED ${huge_ctr_hps_src})

 if(ENABLE_HDFS)
diff --git a/ci/integration_test/criteo/criteo_multi_node.sub b/ci/integration_test/criteo/criteo_multi_node.sub
index 700d2aa3be..9d55b18d17 100644
--- a/ci/integration_test/criteo/criteo_multi_node.sub
+++ b/ci/integration_test/criteo/criteo_multi_node.sub
@@ -1,5 +1,7 @@
 #!/bin/bash

+#TODO: NCCL_IB_HCA restriction added for draco-oci; may need to be removed on other clusters
 srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    cd /dataset/criteo_kaggle/criteo_parquet && \
-    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/criteo_2node_4gpu.json"
\ No newline at end of file
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/criteo_parquet && \
+    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/criteo_2node_4gpu.json"
diff --git a/ci/integration_test/dcn/dcn_multi_node.sub b/ci/integration_test/dcn/dcn_multi_node.sub
index 90e381dcda..863596f304 100644
--- a/ci/integration_test/dcn/dcn_multi_node.sub
+++ b/ci/integration_test/dcn/dcn_multi_node.sub
@@ -1,11 +1,14 @@
 #!/bin/bash
 set -e

+#TODO: NCCL_IB_HCA restriction added for draco-oci; may need to be removed on other clusters
 srun --ntasks=4 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    cd /dataset/criteo_kaggle/dcn_parquet &&
-    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/dcn_parquet && \
+    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"

 srun --ntasks=2 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    cd /dataset/criteo_kaggle/dcn_parquet &&
-    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_2node_4gpu.json"
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/dcn_parquet && \
+    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_2node_4gpu.json"

diff --git a/ci/integration_test/inference/inference_hps.sub b/ci/integration_test/inference/inference_hps.sub
index 0face950ea..950f137499 100755
--- a/ci/integration_test/inference/inference_hps.sub
+++ b/ci/integration_test/inference/inference_hps.sub
@@ -3,6 +3,5 @@
 srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
     cd /dataset/criteo_kaggle/dcn && \
     python3 /workdir/test/inference/hps/lookup_session_test.py hps_lookup /hugectr/test/utest/wdl_test_files/wdl0_sparse_2000.model,/hugectr/test/utest/wdl_test_files/wdl1_sparse_2000.model /hugectr/test/utest/wdl_test_files/first_ten.csv && \
-    pip install torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 && \
-    pip install tensorflow && \
+    pip install torchvision==0.17.2+cu121 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 && \
     python3 /workdir/test/inference/hps/hpsdlpack.py hpsdlpack /hugectr/test/utest/wdl_test_files/wdl0_sparse_2000.model,/hugectr/test/utest/wdl_test_files/wdl1_sparse_2000.model /hugectr/test/utest/wdl_test_files/first_ten.csv"
diff --git a/ci/integration_test/notebooks/hps_demo.sub b/ci/integration_test/notebooks/hps_demo.sub
index d58903a09e..39bb904e6e 100644
--- a/ci/integration_test/notebooks/hps_demo.sub
+++ b/ci/integration_test/notebooks/hps_demo.sub
@@ -2,6 +2,5 @@

 srun --ntasks=1 --container-image="${CONT}" bash -cx " \
     chmod +x /usr/local/hugectr/bin/* && \
-    pip install torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 && \
-    pip install tensorflow protobuf==3.20.3 && \
+    pip install torchvision==0.17.2+cu121 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 && \
     cd /workdir/test/notebook_test && pytest hps_demo.py"
diff --git a/ci/integration_test/py_interface/py_multi_node.sub b/ci/integration_test/py_interface/py_multi_node.sub
index a5799c525b..7f75184342 100644
--- a/ci/integration_test/py_interface/py_multi_node.sub
+++ b/ci/integration_test/py_interface/py_multi_node.sub
@@ -1,10 +1,13 @@
 #!/bin/bash
 set -e

+#TODO: NCCL_IB_HCA restriction added for draco-oci; may need to be removed on other clusters
 srun --ntasks=4 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    cd /dataset/criteo_kaggle/dcn_parquet &&
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/dcn_parquet && \
     python3 /workdir/test/pybind_test/dcn_4node_2gpu.py /workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"

 srun --ntasks=2 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
     cd /dataset/criteo_kaggle/criteo_parquet && \
     python3 /workdir/test/pybind_test/criteo_2node_4gpu.py /workdir/test/scripts/criteo_2node_4gpu.json"
diff --git a/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_trt_engines.py b/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_trt_engines.py
index db4ae622db..4beea1c414 100644
--- a/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_trt_engines.py
+++ b/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_trt_engines.py
@@ -47,7 +47,6 @@ def onnx_surgery(args):


 def create_hps_plugin_creator(args):
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     plugin_lib_name = args["hps_trt_plugin_lib_path"]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
diff --git a/hps_tf/hps_cc/kernels/lookup_kernel.cpp b/hps_tf/hps_cc/kernels/lookup_kernel.cpp
index 4f33ecf6ca..15f028862b 100644
--- a/hps_tf/hps_cc/kernels/lookup_kernel.cpp
+++ b/hps_tf/hps_cc/kernels/lookup_kernel.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */

-#include
 #ifndef TF_GE_211
 #include
 #include
@@ -27,6 +26,8 @@
 #include
 #endif

+#include
+
 #include

 namespace tensorflow {
diff --git a/hps_trt/test/integration/test_for_hugectr.py b/hps_trt/test/integration/test_for_hugectr.py
index 993ed94aff..08fe05f19b 100644
--- a/hps_trt/test/integration/test_for_hugectr.py
+++ b/hps_trt/test/integration/test_for_hugectr.py
@@ -122,7 +122,6 @@ def build_engine_from_onnx(onnx_model_path):


 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
     plg_registry = trt.get_plugin_registry()
diff --git a/hps_trt/test/integration/test_for_pytorch.py b/hps_trt/test/integration/test_for_pytorch.py
index 519011fcae..a050e8d523 100644
--- a/hps_trt/test/integration/test_for_pytorch.py
+++ b/hps_trt/test/integration/test_for_pytorch.py
@@ -342,7 +342,6 @@ def build_engine_from_onnx(onnx_model_path):


 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
     plg_registry = trt.get_plugin_registry()
diff --git a/hps_trt/test/integration/test_for_tf.py b/hps_trt/test/integration/test_for_tf.py
index b355aa3b39..a27299dd03 100644
--- a/hps_trt/test/integration/test_for_tf.py
+++ b/hps_trt/test/integration/test_for_tf.py
@@ -302,7 +302,6 @@ def build_engine_from_onnx(onnx_model_path):


 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
     plg_registry = trt.get_plugin_registry()
diff --git a/hps_trt/test/unit/test_hps.py b/hps_trt/test/unit/test_hps.py
index d68a4cd060..cba581a30b 100644
--- a/hps_trt/test/unit/test_hps.py
+++ b/hps_trt/test/unit/test_hps.py
@@ -120,7 +120,6 @@ def _generate_embedding_tables():


 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     plugin_lib_name = PLUGIN_LIB_PATH
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
diff --git a/tools/dlrm_script/CMakeLists.txt b/tools/dlrm_script/CMakeLists.txt
index 0bbcc07cfb..5849658b68 100644
--- a/tools/dlrm_script/CMakeLists.txt
+++ b/tools/dlrm_script/CMakeLists.txt
@@ -43,8 +43,8 @@ if (NOT CUDF_RESULT)
   list(GET CUDF_VERSION_LIST 1 CUDF_VERSION_PATCH)
   add_compile_definitions(CUDF_VERSION_MAJOR=${CUDF_VERSION_MAJOR})
   add_compile_definitions(CUDF_VERSION_MINOR=${CUDF_VERSION_MINOR})
-  if(${CUDF_VERSION_MAJOR} EQUAL 23 AND ${CUDF_VERSION_MINOR} GREATER 6)
-    add_definitions(-DCUDF_GE_2306)
+  if(${CUDF_VERSION} VERSION_GREATER 23.06)
+    add_definitions(-DCUDF_GE_2306)
   endif()
 else()
   message(FATAL_ERROR "Can not detect cudf in your environment! ")
diff --git a/tools/dlrm_script/dlrm_raw.cu b/tools/dlrm_script/dlrm_raw.cu
index 56b7aa9355..1a457098fe 100644
--- a/tools/dlrm_script/dlrm_raw.cu
+++ b/tools/dlrm_script/dlrm_raw.cu
@@ -141,7 +141,7 @@ void process_kaggle_dataset(const std::string &input_dir_path, const std::string
   cudf_io::table_with_metadata tbl_w_metadata =
       cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
-  auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+  auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #else
   cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
 #endif
@@ -501,7 +501,7 @@ void process_terabyte_dataset(const std::string &input_dir_path, const std::stri
   cudf_io::table_with_metadata tbl_w_metadata =
       cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
-  auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+  auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #else
   cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
 #endif
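
Note on the tools/dlrm_script/CMakeLists.txt hunk: the old guard
(CUDF_VERSION_MAJOR EQUAL 23 AND CUDF_VERSION_MINOR GREATER 6) only matched
minor versions above 6 within the 23.x series, so it silently skipped
CUDF_GE_2306 for 24.x releases. CMake's VERSION_GREATER compares version
strings component-wise, which covers both cases. A minimal standalone sketch
of the new check (the CUDF_VERSION value below is hypothetical, purely for
illustration; the real script detects it from the cudf installation):

    cmake_minimum_required(VERSION 3.17)
    project(cudf_version_check NONE)

    # Hypothetical value for illustration; dlrm_script detects this from cudf.
    set(CUDF_VERSION "24.02")

    # VERSION_GREATER compares component-wise: 24.02 > 23.06 and 23.08 > 23.06
    # both hold, whereas the old MAJOR==23 && MINOR>6 test rejected all 24.x.
    if(${CUDF_VERSION} VERSION_GREATER 23.06)
      add_definitions(-DCUDF_GE_2306)
      message(STATUS "cudf ${CUDF_VERSION}: defining CUDF_GE_2306")
    endif()

Dropped into a scratch CMakeLists.txt, running cmake on it prints the status
line for any version newer than 23.06.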