Skip to content

Commit

Permalink
Merge branch 'emma/update_for_release_2404' into 'main'
Browse files Browse the repository at this point in the history
Some updates for release 24.04

See merge request dl/hugectr/hugectr!1528
  • Loading branch information
minseokl committed Apr 11, 2024
2 parents 5e37270 + 93eec06 commit 19e6017
Show file tree
Hide file tree
Showing 15 changed files with 30 additions and 25 deletions.
8 changes: 4 additions & 4 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ build_inference:
build_sok_tf2:
extends: .build_sok
variables:
FROM_IMAGE: ${IMAGE_SOK_TF2}
FROM_IMAGE: ${IMAGE_ALL}
DST_IMAGE: $SOK_IMAGE_VERSIONED_TF2
CMAKE_OPTION: "-DSM=\"60;61;70;75;80;90\""
BUILD_SOK: 1
Expand Down Expand Up @@ -207,7 +207,7 @@ build_hugectr_hps_trt_plugin:
build_tf_hps_trt_plugin:
extends: .build_hugectr
variables:
FROM_IMAGE: ${IMAGE_SOK_TF2}
FROM_IMAGE: ${IMAGE_ALL}
DST_IMAGE: $TF_TRT_IMAGE_VERSIONED
BUILD_TF_PLUGIN: 1
BUILD_TRT_PLUGIN: 1
Expand All @@ -219,7 +219,7 @@ build_tf_hps_trt_plugin:
build_pytorch_hps_trt_plugin:
extends: .build_hugectr
variables:
FROM_IMAGE: ${IMAGE_PYTORCH}
FROM_IMAGE: ${IMAGE_ALL}
DST_IMAGE: $PYTORCH_TRT_IMAGE_VERSIONED
BUILD_TORCH_PLUGIN: 1
BUILD_TRT_PLUGIN: 1
Expand Down Expand Up @@ -445,7 +445,7 @@ test_sok_pypi:
- build_sok_tf2
variables:
CONT: $SOK_IMAGE_VERSIONED_TF2
CI_SLURM_TIME: "00:15:00"
CI_SLURM_TIME: "00:30:00"
TEST_CMD: ./ci/integration_test/sok/test_sok_pypi.sub

wdl_check:
Expand Down
3 changes: 3 additions & 0 deletions HugeCTR/src/hps/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ list(APPEND huge_ctr_hps_src
"../io/gcs_filesystem.cpp"
)

# This manual definition is a workaround (WAR); the RMM team is expected to fix it in the future.
add_compile_definitions(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)

add_library(huge_ctr_hps SHARED ${huge_ctr_hps_src})

if(ENABLE_HDFS)
Expand Down
6 changes: 4 additions & 2 deletions ci/integration_test/criteo/criteo_multi_node.sub
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

# TODO: NCCL_IB_HCA is restricted below for the draco-oci cluster; this may need to be removed for other clusters.
srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
cd /dataset/criteo_kaggle/criteo_parquet && \
python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/criteo_2node_4gpu.json"
export NCCL_IB_HCA=\"=mlx5_1\" && \
cd /dataset/criteo_kaggle/criteo_parquet && \
python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/criteo_2node_4gpu.json"
11 changes: 7 additions & 4 deletions ci/integration_test/dcn/dcn_multi_node.sub
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
#!/bin/bash
set -e

# TODO: NCCL_IB_HCA is restricted below for the draco-oci cluster; this may need to be removed for other clusters.
srun --ntasks=4 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
cd /dataset/criteo_kaggle/dcn_parquet &&
python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"
export NCCL_IB_HCA=\"=mlx5_1\" && \
cd /dataset/criteo_kaggle/dcn_parquet && \
python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"


srun --ntasks=2 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
cd /dataset/criteo_kaggle/dcn_parquet &&
python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_2node_4gpu.json"
export NCCL_IB_HCA=\"=mlx5_1\" && \
cd /dataset/criteo_kaggle/dcn_parquet && \
python3 /workdir/test/pybind_test/multi_node_test.py --json-file=/workdir/test/scripts/dcn_parquet_localized_2node_4gpu.json"
3 changes: 1 addition & 2 deletions ci/integration_test/inference/inference_hps.sub
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@
srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
cd /dataset/criteo_kaggle/dcn && \
python3 /workdir/test/inference/hps/lookup_session_test.py hps_lookup /hugectr/test/utest/wdl_test_files/wdl0_sparse_2000.model,/hugectr/test/utest/wdl_test_files/wdl1_sparse_2000.model /hugectr/test/utest/wdl_test_files/first_ten.csv && \
pip install torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 && \
pip install tensorflow && \
pip install torchvision==0.17.2+cu121 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 && \
python3 /workdir/test/inference/hps/hpsdlpack.py hpsdlpack /hugectr/test/utest/wdl_test_files/wdl0_sparse_2000.model,/hugectr/test/utest/wdl_test_files/wdl1_sparse_2000.model /hugectr/test/utest/wdl_test_files/first_ten.csv"
3 changes: 1 addition & 2 deletions ci/integration_test/notebooks/hps_demo.sub
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,5 @@

srun --ntasks=1 --container-image="${CONT}" bash -cx " \
chmod +x /usr/local/hugectr/bin/* && \
pip install torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 && \
pip install tensorflow protobuf==3.20.3 && \
pip install torchvision==0.17.2+cu121 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 && \
cd /workdir/test/notebook_test && pytest hps_demo.py"
5 changes: 4 additions & 1 deletion ci/integration_test/py_interface/py_multi_node.sub
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
#!/bin/bash
set -e

# TODO: NCCL_IB_HCA is restricted below for the draco-oci cluster; this may need to be removed for other clusters.
srun --ntasks=4 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
cd /dataset/criteo_kaggle/dcn_parquet &&
export NCCL_IB_HCA=\"=mlx5_1\" && \
cd /dataset/criteo_kaggle/dcn_parquet && \
python3 /workdir/test/pybind_test/dcn_4node_2gpu.py /workdir/test/scripts/dcn_parquet_distributed_4node_2gpu.json"

srun --ntasks=2 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \
export NCCL_IB_HCA=\"=mlx5_1\" && \
cd /dataset/criteo_kaggle/criteo_parquet && \
python3 /workdir/test/pybind_test/criteo_2node_4gpu.py /workdir/test/scripts/criteo_2node_4gpu.json"
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def onnx_surgery(args):


def create_hps_plugin_creator(args):
trt_version = [int(n) for n in trt.__version__.split(".")]
plugin_lib_name = args["hps_trt_plugin_lib_path"]
handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")
Expand Down
3 changes: 2 additions & 1 deletion hps_tf/hps_cc/kernels/lookup_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
* limitations under the License.
*/

#include <tensorflow/core/framework/op_kernel.h>
#ifndef TF_GE_211
#include <tensorflow/stream_executor/cuda/cuda_activation.h>
#include <tensorflow/stream_executor/gpu/gpu_stream.h>
Expand All @@ -27,6 +26,8 @@
#include <tensorflow/compiler/xla/stream_executor/stream_executor.h>
#endif

#include <tensorflow/core/framework/op_kernel.h>

#include <hps/plugin/facade.hpp>

namespace tensorflow {
Expand Down
1 change: 0 additions & 1 deletion hps_trt/test/integration/test_for_hugectr.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@ def build_engine_from_onnx(onnx_model_path):


def create_hps_plugin_creator():
trt_version = [int(n) for n in trt.__version__.split(".")]
handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")
plg_registry = trt.get_plugin_registry()
Expand Down
1 change: 0 additions & 1 deletion hps_trt/test/integration/test_for_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,6 @@ def build_engine_from_onnx(onnx_model_path):


def create_hps_plugin_creator():
trt_version = [int(n) for n in trt.__version__.split(".")]
handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")
plg_registry = trt.get_plugin_registry()
Expand Down
1 change: 0 additions & 1 deletion hps_trt/test/integration/test_for_tf.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,6 @@ def build_engine_from_onnx(onnx_model_path):


def create_hps_plugin_creator():
trt_version = [int(n) for n in trt.__version__.split(".")]
handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")
plg_registry = trt.get_plugin_registry()
Expand Down
1 change: 0 additions & 1 deletion hps_trt/test/unit/test_hps.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ def _generate_embedding_tables():


def create_hps_plugin_creator():
trt_version = [int(n) for n in trt.__version__.split(".")]
plugin_lib_name = PLUGIN_LIB_PATH
handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")
Expand Down
4 changes: 2 additions & 2 deletions tools/dlrm_script/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ if (NOT CUDF_RESULT)
list(GET CUDF_VERSION_LIST 1 CUDF_VERSION_PATCH)
add_compile_definitions(CUDF_VERSION_MAJOR=${CUDF_VERSION_MAJOR})
add_compile_definitions(CUDF_VERSION_MINOR=${CUDF_VERSION_MINOR})
if(${CUDF_VERSION_MAJOR} EQUAL 23 AND ${CUDF_VERSION_MINOR} GREATER 6)
add_definitions(-DCUDF_GE_2306)
if(${CUDF_VERSION} VERSION_GREATER 23.06)
add_definitions(-DCUDF_GE_2306)
endif()
else()
message(FATAL_ERROR "Can not detect cudf in your environment! ")
Expand Down
4 changes: 2 additions & 2 deletions tools/dlrm_script/dlrm_raw.cu
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ void process_kaggle_dataset(const std::string &input_dir_path, const std::string
cudf_io::table_with_metadata tbl_w_metadata =
cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
#elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
#else
cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
#endif
Expand Down Expand Up @@ -501,7 +501,7 @@ void process_terabyte_dataset(const std::string &input_dir_path, const std::stri
cudf_io::table_with_metadata tbl_w_metadata =
cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
#elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
#else
cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);

Expand Down

0 comments on commit 19e6017

Please sign in to comment.