Merge branch 'emma/update_for_release_2404' into 'main'

Some updates for release 24.04. See merge request dl/hugectr/hugectr!1528
NVIDIA-Merlin · Apr 11, 2024 · 19e6017 · 19e6017
2 parents 5e37270 + 93eec06
commit 19e6017
Show file tree

Hide file tree

Showing 15 changed files with 30 additions and 25 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -177,7 +177,7 @@ build_inference:
 build_sok_tf2:
   extends: .build_sok
   variables:
-    FROM_IMAGE: ${IMAGE_SOK_TF2}
+    FROM_IMAGE: ${IMAGE_ALL}
     DST_IMAGE: $SOK_IMAGE_VERSIONED_TF2
     CMAKE_OPTION: "-DSM=\"60;61;70;75;80;90\""
     BUILD_SOK: 1
@@ -207,7 +207,7 @@ build_hugectr_hps_trt_plugin:
 build_tf_hps_trt_plugin:
   extends: .build_hugectr
   variables:
-    FROM_IMAGE: ${IMAGE_SOK_TF2}
+    FROM_IMAGE: ${IMAGE_ALL}
     DST_IMAGE: $TF_TRT_IMAGE_VERSIONED
     BUILD_TF_PLUGIN: 1
     BUILD_TRT_PLUGIN: 1
@@ -219,7 +219,7 @@ build_tf_hps_trt_plugin:
 build_pytorch_hps_trt_plugin:
   extends: .build_hugectr
   variables:
-    FROM_IMAGE: ${IMAGE_PYTORCH}
+    FROM_IMAGE: ${IMAGE_ALL}
     DST_IMAGE: $PYTORCH_TRT_IMAGE_VERSIONED
     BUILD_TORCH_PLUGIN: 1
     BUILD_TRT_PLUGIN: 1
@@ -445,7 +445,7 @@ test_sok_pypi:
     - build_sok_tf2
   variables:
     CONT: $SOK_IMAGE_VERSIONED_TF2
-    CI_SLURM_TIME: "00:15:00"
+    CI_SLURM_TIME: "00:30:00"
     TEST_CMD: ./ci/integration_test/sok/test_sok_pypi.sub
 
 wdl_check:

diff --git a/HugeCTR/src/hps/CMakeLists.txt b/HugeCTR/src/hps/CMakeLists.txt
@@ -30,6 +30,9 @@ list(APPEND huge_ctr_hps_src
   "../io/gcs_filesystem.cpp"
 )
 
+# This manual definition is a workaround (WAR); the RMM team will fix it in the future.
+add_compile_definitions(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
+
 add_library(huge_ctr_hps SHARED ${huge_ctr_hps_src})
 
 if(ENABLE_HDFS)

diff --git a/ci/integration_test/criteo/criteo_multi_node.sub b/ci/integration_test/criteo/criteo_multi_node.sub
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+# TODO: NCCL_IB_HCA is restricted here for the draco-oci cluster; this may need to be removed for other clusters.
 srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-      cd /dataset/criteo_kaggle/criteo_parquet && \
-      python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/criteo_2node_4gpu.json"
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/criteo_parquet && \
+    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/criteo_2node_4gpu.json"
diff --git a/ci/integration_test/dcn/dcn_multi_node.sub b/ci/integration_test/dcn/dcn_multi_node.sub
@@ -1,11 +1,14 @@
 #!/bin/bash
 set -e
 
+# TODO: NCCL_IB_HCA is restricted here for the draco-oci cluster; this may need to be removed for other clusters.
 srun --ntasks=4 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-      cd /dataset/criteo_kaggle/dcn_parquet &&
-      python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/dcn_parquet && \
+    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"
 
 
 srun --ntasks=2 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-      cd /dataset/criteo_kaggle/dcn_parquet &&
-      python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_2node_4gpu.json"
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/dcn_parquet && \
+    python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_2node_4gpu.json"
diff --git a/ci/integration_test/inference/inference_hps.sub b/ci/integration_test/inference/inference_hps.sub
@@ -3,6 +3,5 @@
 srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
       cd /dataset/criteo_kaggle/dcn && \
       python3 /workdir/test/inference/hps/lookup_session_test.py hps_lookup /hugectr/test/utest/wdl_test_files/wdl0_sparse_2000.model,/hugectr/test/utest/wdl_test_files/wdl1_sparse_2000.model  /hugectr/test/utest/wdl_test_files/first_ten.csv && \
-      pip install torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 && \
-      pip install tensorflow && \
+      pip install torchvision==0.17.2+cu121 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 && \
       python3 /workdir/test/inference/hps/hpsdlpack.py hpsdlpack /hugectr/test/utest/wdl_test_files/wdl0_sparse_2000.model,/hugectr/test/utest/wdl_test_files/wdl1_sparse_2000.model  /hugectr/test/utest/wdl_test_files/first_ten.csv"
diff --git a/ci/integration_test/notebooks/hps_demo.sub b/ci/integration_test/notebooks/hps_demo.sub
@@ -2,6 +2,5 @@
 
 srun --ntasks=1 --container-image="${CONT}" bash -cx " \
     chmod +x /usr/local/hugectr/bin/* && \
-    pip install torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 && \
-    pip install tensorflow protobuf==3.20.3 && \
+    pip install torchvision==0.17.2+cu121 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 && \
     cd /workdir/test/notebook_test && pytest hps_demo.py"
diff --git a/ci/integration_test/py_interface/py_multi_node.sub b/ci/integration_test/py_interface/py_multi_node.sub
@@ -1,10 +1,13 @@
 #!/bin/bash
 set -e
 
+# TODO: NCCL_IB_HCA is restricted here for the draco-oci cluster; this may need to be removed for other clusters.
 srun --ntasks=4 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
-    cd /dataset/criteo_kaggle/dcn_parquet &&
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
+    cd /dataset/criteo_kaggle/dcn_parquet && \
     python3 /workdir/test/pybind_test/dcn_4node_2gpu.py /workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"
 
 srun --ntasks=2 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
+    export NCCL_IB_HCA=\"=mlx5_1\" && \
     cd /dataset/criteo_kaggle/criteo_parquet && \
     python3 /workdir/test/pybind_test/criteo_2node_4gpu.py /workdir/test/scripts/criteo_2node_4gpu.json"
diff --git a/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_trt_engines.py b/docs/source/hierarchical_parameter_server/hps_dlrm_benchmark_scripts/create_trt_engines.py
@@ -47,7 +47,6 @@ def onnx_surgery(args):
 
 
 def create_hps_plugin_creator(args):
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     plugin_lib_name = args["hps_trt_plugin_lib_path"]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")

diff --git a/hps_tf/hps_cc/kernels/lookup_kernel.cpp b/hps_tf/hps_cc/kernels/lookup_kernel.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <tensorflow/core/framework/op_kernel.h>
 #ifndef TF_GE_211
 #include <tensorflow/stream_executor/cuda/cuda_activation.h>
 #include <tensorflow/stream_executor/gpu/gpu_stream.h>
@@ -27,6 +26,8 @@
 #include <tensorflow/compiler/xla/stream_executor/stream_executor.h>
 #endif
 
+#include <tensorflow/core/framework/op_kernel.h>
+
 #include <hps/plugin/facade.hpp>
 
 namespace tensorflow {

diff --git a/hps_trt/test/integration/test_for_hugectr.py b/hps_trt/test/integration/test_for_hugectr.py
@@ -122,7 +122,6 @@ def build_engine_from_onnx(onnx_model_path):
 
 
 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
     plg_registry = trt.get_plugin_registry()

diff --git a/hps_trt/test/integration/test_for_pytorch.py b/hps_trt/test/integration/test_for_pytorch.py
@@ -342,7 +342,6 @@ def build_engine_from_onnx(onnx_model_path):
 
 
 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
     plg_registry = trt.get_plugin_registry()

diff --git a/hps_trt/test/integration/test_for_tf.py b/hps_trt/test/integration/test_for_tf.py
@@ -302,7 +302,6 @@ def build_engine_from_onnx(onnx_model_path):
 
 
 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")
     plg_registry = trt.get_plugin_registry()

diff --git a/hps_trt/test/unit/test_hps.py b/hps_trt/test/unit/test_hps.py
@@ -120,7 +120,6 @@ def _generate_embedding_tables():
 
 
 def create_hps_plugin_creator():
-    trt_version = [int(n) for n in trt.__version__.split(".")]
     plugin_lib_name = PLUGIN_LIB_PATH
     handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
     trt.init_libnvinfer_plugins(TRT_LOGGER, "")

diff --git a/tools/dlrm_script/CMakeLists.txt b/tools/dlrm_script/CMakeLists.txt
@@ -43,8 +43,8 @@ if (NOT CUDF_RESULT)
     list(GET CUDF_VERSION_LIST 1 CUDF_VERSION_PATCH)
     add_compile_definitions(CUDF_VERSION_MAJOR=${CUDF_VERSION_MAJOR})
     add_compile_definitions(CUDF_VERSION_MINOR=${CUDF_VERSION_MINOR})
-    if(${CUDF_VERSION_MAJOR} EQUAL 23 AND ${CUDF_VERSION_MINOR} GREATER 6)
-       add_definitions(-DCUDF_GE_2306)
+    if(${CUDF_VERSION} VERSION_GREATER 23.06)
+        add_definitions(-DCUDF_GE_2306)
     endif()
 else()
     message(FATAL_ERROR "Can not detect cudf in your environment! ")

diff --git a/tools/dlrm_script/dlrm_raw.cu b/tools/dlrm_script/dlrm_raw.cu
@@ -141,7 +141,7 @@ void process_kaggle_dataset(const std::string &input_dir_path, const std::string
     cudf_io::table_with_metadata tbl_w_metadata =
         cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
-    auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+    auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #else
     cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
 #endif
@@ -501,7 +501,7 @@ void process_terabyte_dataset(const std::string &input_dir_path, const std::stri
       cudf_io::table_with_metadata tbl_w_metadata =
           cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
-      auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+      auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
 #else
       cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);