From 5ce1081420d2718e8381dccf75da694909aa2567 Mon Sep 17 00:00:00 2001
From: Igor Gambarin
Date: Fri, 20 Dec 2024 00:54:04 +0200
Subject: [PATCH] SynapseAi 1.19.0 release

* Update dockerfiles with 1.19.0 content
---
 dockerfiles/base/Dockerfile.amzn2             | 105 ------------------
 dockerfiles/base/Dockerfile.rhel8.6           |  19 ++--
 dockerfiles/base/Dockerfile.rhel9.2           |   5 +-
 dockerfiles/base/Dockerfile.rhel9.4           |  33 +++---
 dockerfiles/base/Dockerfile.suse15.5          |  21 ++--
 dockerfiles/base/Dockerfile.tencentos3.1      |   3 +-
 dockerfiles/base/Dockerfile.ubuntu22.04       |   3 +-
 dockerfiles/base/Dockerfile.ubuntu24.04       |   3 +-
 dockerfiles/base/install-python310.sh         |  29 +----
 dockerfiles/common.mk                         |   6 +-
 dockerfiles/pytorch/Dockerfile.amzn2          |  79 -------------
 dockerfiles/pytorch/Dockerfile.rhel8.6        |  12 +-
 dockerfiles/pytorch/Dockerfile.rhel9.2        |  32 +++---
 dockerfiles/pytorch/Dockerfile.rhel9.4        |  42 +++----
 dockerfiles/pytorch/Dockerfile.suse15.5       |  20 +---
 dockerfiles/pytorch/Dockerfile.tencentos3.1   |  30 +++--
 dockerfiles/pytorch/Dockerfile.ubuntu         |  33 +++---
 dockerfiles/pytorch/install_packages.sh       |   3 -
 utils/check_framework_env.py                  |  30 +++--
 utils/intel_gaudi_health_screen/IGNodes.py    |   1 +
 utils/intel_gaudi_health_screen/README.md     |   4 +-
 utils/intel_gaudi_health_screen/config.yaml   |   2 +-
 .../intel_gaudi_health_screen/system_utils.py |  14 ++-
 .../k8s/intel-gaudi-health-screen-L1.yaml     |   4 +-
 ...ntel-gaudi-health-screen-L2_hccl-demo.yaml |   4 +-
 utils/intel_gaudi_health_screen/utilities.py  |   2 +-
 26 files changed, 155 insertions(+), 384 deletions(-)
 delete mode 100644 dockerfiles/base/Dockerfile.amzn2
 delete mode 100644 dockerfiles/pytorch/Dockerfile.amzn2

diff --git a/dockerfiles/base/Dockerfile.amzn2 b/dockerfiles/base/Dockerfile.amzn2
deleted file mode 100644
index dfc548d..0000000
--- a/dockerfiles/base/Dockerfile.amzn2
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright (c) 2023 Habana Labs, Ltd.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# HabanaLabs Dockerfile base installer layer for Amazon Linux 2
-FROM amazonlinux:2
-ARG ARTIFACTORY_URL
-ARG VERSION
-ARG REVISION
-
-RUN amazon-linux-extras enable python3.8 && \
-    yum update -y && yum install -y \
-    git \
-    unzip \
-    ethtool-4.8-10.amzn2.x86_64 \
-    openssh-clients \
-    openssh-server \
-    bzip2-devel \
-    python38 \
-    python38-devel \
-    python38-pip \
-    python38-tkinter \
-    which \
-    wget \
-    lsof \
-    tar \
-    mesa-libGL \
-    sox-devel && \
-    yum clean all && rm -rf /var/cache/yum && \
-    rm -f /etc/ssh/ssh_host_*_key*
-
-# Install jemalloc-3.6.0-1.el7.x86_64 package with required /lib64/libjemalloc.so.1 lib need for topologies
-RUN yum install -y https://archives.fedoraproject.org/pub/archive/epel/7/x86_64/Packages/e/epel-release-7-14.noarch.rpm && \
-    yum install -y jemalloc && \
-    yum clean all && rm -rf /var/cache/yum
-
-# Install development tools and cmake for habana-horovod compilation sdist package
-RUN yum groupinstall -y "Development Tools"
-RUN yum install -y sudo system-lsb-core cmake
-
-COPY install_efa.sh .
-RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
-
-ENV LIBFABRIC_VERSION="1.22.0"
-ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
-ENV MPI_ROOT=/opt/amazon/openmpi
-ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
-ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH
-ENV OPAL_PREFIX=${MPI_ROOT}
-ENV MPICC=${MPI_ROOT}/bin/mpicc
-ENV FI_EFA_FORK_SAFE=1
-ENV RDMAV_FORK_SAFE=1
-ENV FI_EFA_USE_DEVICE_RDMA=1
-ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src
-ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib
-
-RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
-    echo "name=Habana AWS Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
-    echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2" >> /etc/yum.repos.d/habanalabs.repo && \
-    echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo
-
-RUN yum makecache && \
-    yum install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".amzn2 && \
-    yum install -y habanalabs-thunk-"$VERSION"-"$REVISION".amzn2 && \
-    yum install -y habanalabs-firmware-tools-"$VERSION"-"$REVISION".amzn2 && \
-    yum install -y habanalabs-graph-"$VERSION"-"$REVISION".amzn2 && \
-    rpm -V habanalabs-rdma-core && rpm -V habanalabs-thunk && rpm -V habanalabs-firmware-tools && rpm -V habanalabs-graph && \
-    rm -f /etc/yum.repos.d/habanalabs.repo && \
-    yum clean all && rm -rf /var/cache/yum
-
-RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 && \
-    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
-
-# SSH configuration necessary to support mpi-operator v2
-RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
-    sed -i 's/[ #]\(.*ForwardAgent \).*/ \1yes/g' /etc/ssh/ssh_config && \
-    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
-    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
-    mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc
-
-# There is no need to store pip installation files inside docker image
-ENV PIP_NO_CACHE_DIR=on
-ENV PIP_DISABLE_PIP_VERSION_CHECK=1
-
-RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/ofiwg/libfabric/releases/download/v${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \
-    cd /tmp/ && tar xf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \
-    cd /tmp/libfabric-${LIBFABRIC_VERSION} && \
-    ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \
-    make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION}
-
-RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \
-    unzip /tmp/main.zip -d /tmp && \
-    cd /tmp/hccl_ofi_wrapper-main && \
-    make && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so && \
-    cd / && \
-    rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main
-
-RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
-
-RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
-
-ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
-ENV HABANA_LOGS=/var/log/habana_logs/
-ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw
-ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins
\ No newline at end of file
diff --git a/dockerfiles/base/Dockerfile.rhel8.6 b/dockerfiles/base/Dockerfile.rhel8.6
index 2e836c7..2192035 100644
--- a/dockerfiles/base/Dockerfile.rhel8.6
+++ b/dockerfiles/base/Dockerfile.rhel8.6
@@ -37,7 +37,8 @@ RUN dnf install -y \
     unzip \
     llvm \
     lsof \
-    python38-devel \
+    python3.11-devel \
+    python3.11-pip \
     bzip2 \
     bzip2-devel \
     openssh-clients \
@@ -93,6 +94,11 @@ RUN echo "[powertools]" > /etc/yum.repos.d/powertools.repo && \
     echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/powertools.repo && \
     echo "gpgcheck=1" >> /etc/yum.repos.d/powertools.repo
 
+ENV PYTHON_VERSION=3.11
+RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \
+    alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \
+    alternatives --set python3 /usr/bin/python3.11
+
 RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el8 \
     habanalabs-thunk-"$VERSION"-"$REVISION".el8 \
     habanalabs-firmware-tools-"$VERSION"-"$REVISION".el8 \
     habanalabs-graph-"$VERSION"-"$REVISION".el8 && \
@@ -127,14 +133,9 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi
     cd / && \
     rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main
 
-ENV PYTHON_VERSION=3.8
-RUN python3.8 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
-
-RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 && \
-    alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \
-    alternatives --set python3 /usr/bin/python3.8
+RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
 
-RUN python3.8 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
+RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
 
 # SSH configuration necessary to support mpi-operator v2
 RUN mkdir -p /var/run/sshd && \
@@ -147,4 +148,4 @@ RUN mkdir -p /var/run/sshd && \
 ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
 ENV HABANA_LOGS=/var/log/habana_logs/
 ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw
-ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins
+ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins
\ No newline at end of file
diff --git a/dockerfiles/base/Dockerfile.rhel9.2 b/dockerfiles/base/Dockerfile.rhel9.2
index 4900b21..273fb7f 100644
--- a/dockerfiles/base/Dockerfile.rhel9.2
+++ b/dockerfiles/base/Dockerfile.rhel9.2
@@ -68,6 +68,7 @@ RUN dnf install -y \
 ENV PYTHON_VERSION=3.10
 COPY install-python310.sh .
 RUN ./install-python310.sh rhel9.2 && rm install-python310.sh
+RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/python.conf && ldconfig
 ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 
 COPY install_efa.sh .
@@ -128,11 +129,11 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi
     cd / && \
     rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main
 
-RUN python3.10 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
+RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
 
 RUN ln -s /usr/bin/python3 /usr/bin/python
 
-RUN python3.10 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
+RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
 
 # SSH configuration necessary to support mpi-operator v2
 RUN mkdir -p /var/run/sshd && \
diff --git a/dockerfiles/base/Dockerfile.rhel9.4 b/dockerfiles/base/Dockerfile.rhel9.4
index a00aa3a..654def2 100644
--- a/dockerfiles/base/Dockerfile.rhel9.4
+++ b/dockerfiles/base/Dockerfile.rhel9.4
@@ -31,10 +31,15 @@ RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo
 
-RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
-    dnf clean all && rm -rf /var/cache/yum
-
 RUN dnf install -y \
+    python3-dnf-plugin-versionlock && \
+    dnf versionlock add redhat-release* && \
+    dnf clean all
+
+RUN dnf update -y && dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+    dnf clean all
+
+RUN dnf update -y && dnf install -y \
     clang \
     cmake3 \
     cpp \
@@ -51,8 +56,8 @@ RUN dnf install -y \
     lsof \
     python3-devel \
     openssh-clients \
-    openssl-1:3.0.7-28.el9_4 \
-    openssl-devel-1:3.0.7-28.el9_4 \
+    openssl \
+    openssl-devel \
     libjpeg-devel \
     openssh-server \
     lsb_release \
@@ -68,12 +73,9 @@ RUN dnf install -y \
     python3.11-pip \
     python3.11-devel \
    python3.11-rpm \
-    ffmpeg-free \
-    python3-dnf-plugin-versionlock && \
-    # update pkgs (except OS version) for resolving potentials CVEs
-    dnf versionlock add redhat-release* openssl* libcurl-minimal curl-minimal ima-evm-utils python3-rpm rpm* && \
-    dnf update -y && \
-    dnf clean all && rm -rf /var/cache/yum && \
+    ffmpeg-free && \
+    dnf versionlock add python3-rpm rpm* && \
+    dnf clean all && \
     rm -f /etc/ssh/ssh_host_*_key*
 
 RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \
@@ -111,8 +113,9 @@ RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \
     habanalabs-thunk-"$VERSION"-"$REVISION".el9 \
     habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \
     habanalabs-graph-"$VERSION"-"$REVISION".el9 && \
-    rm -f /etc/yum.repos.d/habanalabs.repo && rm -f /etc/yum.repos.d/habana.repo && rm -rf /tmp/* && \
-    dnf clean all && rm -rf /var/cache/yum
+    chmod +t /var/log/habana_logs && \
+    rm -f /etc/yum.repos.d/habanalabs.repo && rm -f /etc/yum.repos.d/habana.repo && \
+    dnf clean all
 
 RUN rpm -V habanalabs-rdma-core && rpm -V habanalabs-thunk && rpm -V habanalabs-firmware-tools && rpm -V habanalabs-graph
 
@@ -141,11 +144,11 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi
     cd / && \
     rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main
 
-RUN python3.11 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
+RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
 
 RUN ln -s /usr/bin/python3 /usr/bin/python
 
-RUN python3.11 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
+RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
 
 # SSH configuration necessary to support mpi-operator v2
 RUN mkdir -p /var/run/sshd && \
diff --git a/dockerfiles/base/Dockerfile.suse15.5 b/dockerfiles/base/Dockerfile.suse15.5
index f53bd3c..8def199 100644
--- a/dockerfiles/base/Dockerfile.suse15.5
+++ b/dockerfiles/base/Dockerfile.suse15.5
@@ -44,7 +44,8 @@ RUN zypper install -y --allow-downgrade \
     Mesa-libGL1 \
     openssh-clients \
     openssh-server \
-    openssl openssl-devel \
+    openssl \
+    openssl-devel \
     python311 \
     python311-devel \
     python311-pip \
@@ -53,11 +54,14 @@ RUN zypper install -y --allow-downgrade \
     zlib-devel && \
     rm -f /etc/ssh/ssh_host_*_key*
 
+ENV PIP_NO_CACHE_DIR=on
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1
+
 RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \
     alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \
-    alternatives --set python3 /usr/bin/python3.11 && \
-    alternatives --install /usr/bin/pip3 pip3 /usr/bin/pip3.11 1 && \
-    alternatives --set pip3 /usr/bin/pip3.11
+    alternatives --set python3 /usr/bin/python3.11
+
+RUN python3 -m pip install setuptools==75.1.0 wheel==0.44.0
 
 COPY install_efa.sh .
 RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
@@ -82,10 +86,8 @@ RUN zypper --gpg-auto-import-keys install -y habanalabs-rdma-core-"$VERSION"-"$R
     habanalabs-thunk-"$VERSION"-"$REVISION" \
     habanalabs-firmware-tools-"$VERSION"-"$REVISION" \
     habanalabs-graph-"$VERSION"-"$REVISION" && \
-    rm -f /etc/yum.repos.d/habanalabs.repo
+    rm -f /etc/zypp/repos.d/habanalabs.repo
 
-ENV PIP_NO_CACHE_DIR=on
-ENV PIP_DISABLE_PIP_VERSION_CHECK=1
 ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src
 ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib
 
@@ -102,10 +104,7 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi
     cd / && \
     rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main
 
-RUN python3.11 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
-
-
-RUN python3.11 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
+RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
 
 # SSH configuration necessary to support mpi-operator v2
 RUN mkdir -p /var/run/sshd && \
diff --git a/dockerfiles/base/Dockerfile.tencentos3.1 b/dockerfiles/base/Dockerfile.tencentos3.1
index c5a28a7..2743b9a 100644
--- a/dockerfiles/base/Dockerfile.tencentos3.1
+++ b/dockerfiles/base/Dockerfile.tencentos3.1
@@ -47,13 +47,14 @@ RUN dnf install -y \
 
 COPY install-python310.sh .
 RUN ./install-python310.sh tencentos3.1 && rm install-python310.sh
+RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/python.conf && ldconfig
 ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 
 COPY install_efa.sh .
 COPY tencentos_efa_patch.txt /tmp/tencentos_efa_patch.txt
 RUN ./install_efa.sh && rm -f install_efa.sh /tmp/tencentos_efa_patch.txt && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
 
-ENV LIBFABRIC_VERSION="1.20.0"
+ENV LIBFABRIC_VERSION="1.22.0"
 ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
 ENV MPI_ROOT=/opt/amazon/openmpi
 ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
diff --git a/dockerfiles/base/Dockerfile.ubuntu22.04 b/dockerfiles/base/Dockerfile.ubuntu22.04
index b322cbd..59ea3d6 100644
--- a/dockerfiles/base/Dockerfile.ubuntu22.04
+++ b/dockerfiles/base/Dockerfile.ubuntu22.04
@@ -32,6 +32,7 @@ RUN apt-get update && \
     libgl1 \
     libgoogle-glog0v5 \
     libjemalloc2 \
+    libjpeg-dev \
     libpq-dev \
     lsof \
     make \
@@ -60,7 +61,7 @@ RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
 
 COPY install_efa.sh .
 RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
 
-ENV LIBFABRIC_VERSION="1.20.0"
+ENV LIBFABRIC_VERSION="1.22.0"
 ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
 ENV MPI_ROOT=/opt/amazon/openmpi
 ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
diff --git a/dockerfiles/base/Dockerfile.ubuntu24.04 b/dockerfiles/base/Dockerfile.ubuntu24.04
index 7f47c08..625eb04 100644
--- a/dockerfiles/base/Dockerfile.ubuntu24.04
+++ b/dockerfiles/base/Dockerfile.ubuntu24.04
@@ -32,6 +32,7 @@ RUN apt-get update && \
     libgl1 \
     libgoogle-glog0v6t64 \
     libjemalloc2 \
+    libjpeg-dev \
     libpq-dev \
     lsof \
     make \
@@ -60,7 +61,7 @@ RUN python3 -m pip install pip==24.0 setuptools==75.1.0 wheel==0.42.0 --break-sy
 
 COPY install_efa.sh .
 RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
 
-ENV LIBFABRIC_VERSION="1.20.0"
+ENV LIBFABRIC_VERSION="1.22.0"
 ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
 ENV MPI_ROOT=/opt/amazon/openmpi
 ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
diff --git a/dockerfiles/base/install-python310.sh b/dockerfiles/base/install-python310.sh
index 78acab4..6dbe65b 100755
--- a/dockerfiles/base/install-python310.sh
+++ b/dockerfiles/base/install-python310.sh
@@ -10,12 +10,12 @@ case "${_BASE_NAME}" in
         echo "Skip install Python3.10 from source on Ubuntu22.04"
         exit 0;
         ;;
-    *debian* | *ubuntu*)
+    *ubuntu*)
         apt update
         apt install -y libsqlite3-dev libreadline-dev
         ;;
     *rhel*)
-        yum install -y sqlite-devel readline-devel xz-devel
+        dnf install -y sqlite-devel readline-devel xz-devel
         ;;
     *tencentos3.1*)
         dnf install -y sqlite-devel readline-devel zlib-devel xz-devel bzip2-devel libffi-devel
@@ -28,21 +28,6 @@ case "${_BASE_NAME}" in
         make && make install
         ln -s /etc/pki/tls/cert.pem /usr/local/openssl-1.1.1w/ssl/cert.pem
 
-        PATH=$PATH:/usr/local/protoc/bin:/usr/local/openssl-1.1.1w/bin
-        LD_LIBRARY_PATH=/usr/local/openssl-1.1.1w/lib:$LD_LIBRARY_PATH
-        _SSL_LIB="--with-openssl=/usr/local/openssl-1.1.1w"
-        ;;
-    *amzn2*)
-        yum install -y sqlite-devel readline-devel
-        wget -nv -O /opt/openssl-1.1.1w.tar.gz https://github.com/openssl/openssl/releases/download/OpenSSL_1_1_1w/openssl-1.1.1w.tar.gz && \
-        cd /opt/ && \
-        tar xzf openssl-1.1.1w.tar.gz && \
-        rm -rf openssl-1.1.1w.tar.gz && \
-        cd openssl-1.1.1w && \
-        ./config --prefix=/usr/local/openssl-1.1.1w shared zlib && \
-        make && make install
-        ln -s /etc/pki/tls/cert.pem /usr/local/openssl-1.1.1w/ssl/cert.pem
-
         PATH=$PATH:/usr/local/protoc/bin:/usr/local/openssl-1.1.1w/bin
         LD_LIBRARY_PATH=/usr/local/openssl-1.1.1w/lib:$LD_LIBRARY_PATH
         _SSL_LIB="--with-openssl=/usr/local/openssl-1.1.1w"
@@ -74,16 +59,6 @@ case "${_BASE_NAME}" in
         alternatives --set python3 /usr/local/bin/python3.10
         export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
         ;;
-    *amzn2*)
-        update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 3 && \
-        update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 && \
-        update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
-        ;;
-    *debian*)
-        update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 3
-        update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.8 2
-        update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
-        ;;
 esac
 
 python3 -m pip install --upgrade pip setuptools
diff --git a/dockerfiles/common.mk b/dockerfiles/common.mk
index 6e29640..27f09c3 100644
--- a/dockerfiles/common.mk
+++ b/dockerfiles/common.mk
@@ -5,9 +5,9 @@ BUILD_OS ?= ubuntu22.04
 BUILD_DIR ?= $(CURDIR)/dockerbuild
 REPO_SERVER ?= vault.habana.ai
 
-PT_VERSION ?= 2.4.0
-RELEASE_VERSION ?= 1.18.0
-RELEASE_BUILD_ID ?= 524
+PT_VERSION ?= 2.5.1
+RELEASE_VERSION ?= 1.19.0
+RELEASE_BUILD_ID ?= 561
 
 BASE_IMAGE_URL ?= base-installer-$(BUILD_OS)
 IMAGE_URL = $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID)
diff --git a/dockerfiles/pytorch/Dockerfile.amzn2 b/dockerfiles/pytorch/Dockerfile.amzn2
deleted file mode 100644
index 9d7ff28..0000000
--- a/dockerfiles/pytorch/Dockerfile.amzn2
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) 2023 HabanaLabs, Ltd.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# HabanaLabs Dockerfile PyTorch installer layer for Amazon Linux 2
-ARG BASE_NAME
-ARG VERSION
-ARG REVISION
-FROM ${BASE_NAME}:${VERSION}-${REVISION}
-ARG PT_VERSION
-ARG VERSION
-ARG REVISION
-ARG BASE_NAME
-ARG ARTIFACTORY_URL
-ARG GPERFTOOLS_PREFIX="/usr/local"
-ARG GPERFTOOLS="gperftools-2.7"
-
-ENV PYTHONPATH=/root:/usr/lib/habanalabs/
-
-RUN yum install -y \
-    curl \
-    redhat-lsb-core \
-    numactl-devel \
-    cairo-devel \
-    iproute \
-    libjpeg-devel \
-    zlib-devel \
-    lapack-devel \
-    openblas-devel \
-    pdsh \
-    numactl \
-    yum-utils \
-    libmkl-dev && \
-    yum clean all
-
-RUN amazon-linux-extras install epel -y
-RUN yum install -y \
-    moreutils && \
-    yum clean all
-
-# Since there is an open bug in gperftools 2.6 installed from yum install,
-# Hence, Compile & Install gperftools 2.7. Later it might be removed and installed through yum install.
-RUN wget --no-verbose https://github.com/gperftools/gperftools/releases/download/${GPERFTOOLS}/${GPERFTOOLS}.tar.gz && \
-    tar -xvf ${GPERFTOOLS}.tar.gz && \
-    cd ${GPERFTOOLS} && \
-    ./configure --prefix="${GPERFTOOLS_PREFIX}" && \
-    make -j && \
-    make install && \
-    ln -s /usr/local/include/google /usr/include/gperftools && \
-    cd - && \
-    rm -rf ${GPERFTOOLS}* && \
-    /sbin/ldconfig
-
-# Install and configure GCC 11
-RUN /usr/bin/python2 `which yum-config-manager` --add-repo http://archive.kernel.org/centos-vault/centos/7/sclo/x86_64/rh/ && \
-    wget http://archive.kernel.org/centos-vault/centos/7/os/x86_64/Packages/libgfortran5-8.3.1-2.1.1.el7.x86_64.rpm && \
-    yum install libgfortran5-8.3.1-2.1.1.el7.x86_64.rpm -y && \
-    rm -f libgfortran5-8.3.1-2.1.1.el7.x86_64.rpm && \
-    sudo yum install -y devtoolset-11 --nogpgcheck && \
-    yum clean all && rm -rf /var/cache/yum && \
-    sed -i '/# define _GLIBCXX_USE_CXX11_ABI 0/c\# define _GLIBCXX_USE_CXX11_ABI 1' /opt/rh/devtoolset-11/root/usr/include/c++/11/x86_64-redhat-linux/bits/c++config.h
-
-ENV PATH=/opt/rh/devtoolset-11/root/usr/bin:${PATH}
-ENV MANPATH=/opt/rh/devtoolset-11/root/usr/share/man:${MANPATH}
-ENV INFOPATH=/opt/rh/devtoolset-11/root/usr/share/info:${INFOPATH}
-ENV PCP_DIR=/opt/rh/devtoolset-11/root
-ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-11/root/usr/lib64:/opt/rh/devtoolset-11/root/usr/lib:/opt/rh/devtoolset-11/root/usr/lib64/dyninst:/opt/rh/devtoolset-11/root/usr/lib/dyninst:${LD_LIBRARY_PATH}
-ENV PKG_CONFIG_PATH=/opt/rh/devtoolset-11/root/usr/lib64/pkgconfig:${PKG_CONFIG_PATH}
-
-COPY install_packages.sh .
-
-RUN ./install_packages.sh && rm -f install_packages.sh && \
-    /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc
-
-ENV LD_PRELOAD="${GPERFTOOLS_PREFIX}/lib/libtcmalloc.so"
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-ENV LDFLAGS="-Wl,--copy-dt-needed-entries"
-
-RUN rm -rf /tmp/*
\ No newline at end of file
diff --git a/dockerfiles/pytorch/Dockerfile.rhel8.6 b/dockerfiles/pytorch/Dockerfile.rhel8.6
index 7ec8dc6..2193edc 100644
--- a/dockerfiles/pytorch/Dockerfile.rhel8.6
+++ b/dockerfiles/pytorch/Dockerfile.rhel8.6
@@ -17,7 +17,7 @@ LABEL name="PyTorch Installer"
 LABEL summary="Habanalabs PyTorch installer layer for RHEL8.6"
 LABEL description="Image with pre installed Habanalabs packages for PyTorch"
 
-RUN echo "/usr/lib/habanalabs" > $(python3.8 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth
+RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth
 
 RUN dnf install -y \
     curl \
@@ -32,11 +32,7 @@ RUN dnf install -y \
     pdsh \
     gcc-toolset-11 \
     gperftools-devel && \
-    dnf clean all && rm -rf /var/cache/yum
-
-RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \
-    dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \
-    dnf clean all && rm -rf /var/cache/yum
+    dnf clean all
 
 COPY install_packages.sh .
 
@@ -52,6 +48,4 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64:/opt/rh/gcc-toolset-11
 ENV PKG_CONFIG_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64/pkgconfig:${PKG_CONFIG_PATH}
 
 ENV LD_PRELOAD=/lib64/libtcmalloc.so.4
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-
-RUN rm -rf /tmp/*
\ No newline at end of file
+ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
\ No newline at end of file
diff --git a/dockerfiles/pytorch/Dockerfile.rhel9.2 b/dockerfiles/pytorch/Dockerfile.rhel9.2
index b504f62..1d2dd4d 100644
--- a/dockerfiles/pytorch/Dockerfile.rhel9.2
+++ b/dockerfiles/pytorch/Dockerfile.rhel9.2
@@ -17,7 +17,7 @@ LABEL name="PyTorch Installer"
 LABEL summary="Habanalabs PyTorch installer layer for RHEL9.2"
 LABEL description="Image with pre installed Habanalabs packages for PyTorch"
 
-RUN echo "/usr/lib/habanalabs" > $(python3.10 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth
+RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth
 
 RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
@@ -26,21 +26,17 @@ RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo
 
 RUN dnf install --allowerasing -y \
-    curl \
-    cairo-devel \
-    numactl-devel \
-    iproute \
-    which \
-    zlib-devel \
-    lapack-devel \
-    openblas-devel \
-    numactl \
-    gperftools-devel && \
-    dnf clean all && rm -rf /var/cache/yum
-
-RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \
-    dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \
-    dnf clean all && rm -rf /var/cache/yum
+    curl \
+    cairo-devel \
+    numactl-devel \
+    iproute \
+    which \
+    zlib-devel \
+    lapack-devel \
+    openblas-devel \
+    numactl \
+    gperftools-devel && \
+    dnf clean all
 
 COPY install_packages.sh .
@@ -50,6 +46,4 @@ RUN ./install_packages.sh && rm -f install_packages.sh && \
 # Set LD_PRELOAD after all required installations to
 # avoid warnings during docker creation
 ENV LD_PRELOAD=/lib64/libtcmalloc.so.4
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-
-RUN rm -rf /tmp/*
\ No newline at end of file
+ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
\ No newline at end of file
diff --git a/dockerfiles/pytorch/Dockerfile.rhel9.4 b/dockerfiles/pytorch/Dockerfile.rhel9.4
index d09fafe..99a2270 100644
--- a/dockerfiles/pytorch/Dockerfile.rhel9.4
+++ b/dockerfiles/pytorch/Dockerfile.rhel9.4
@@ -17,7 +17,7 @@ LABEL name="PyTorch Installer"
 LABEL summary="Habanalabs PyTorch installer layer for RHEL9.4"
 LABEL description="Image with pre installed Habanalabs packages for PyTorch"
 
-RUN echo "/usr/lib/habanalabs" > $(python3.11 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pt
+RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pt
 
 RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
@@ -25,31 +25,17 @@ RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo
 
-RUN dnf install --allowerasing -y \
-    curl-7.76.1-29.el9_4.1 \
-    cairo-devel \
-    numactl-devel \
-    iproute \
-    which \
-    zlib-devel \
-    lapack-devel \
-    openblas-devel \
-    numactl \
-    gperftools-devel && \
-    dnf clean all && rm -rf /var/cache/yum
-
-RUN echo "[oneAPI]" >> /etc/yum.repos.d/oneAPI.repo && \
-    echo "name=Intel® oneAPI repository" >> /etc/yum.repos.d/oneAPI.repo && \
-    echo "baseurl=https://yum.repos.intel.com/oneapi" >> /etc/yum.repos.d/oneAPI.repo && \
-    echo 'enabled=1' >> /etc/yum.repos.d/oneAPI.repo && \
-    echo "gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \
-    echo "repo_gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \
-    echo "gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" >> /etc/yum.repos.d/oneAPI.repo
-
-RUN dnf install --allowerasing -y intel-oneapi-mkl-2024.2.0 && \
-    dnf clean all && rm -rf /var/cache/yum
-
-ENV LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.2/lib:${LD_LIBRARY_PATH}
+RUN dnf update -y && dnf install --nodocs --setopt=install_weak_deps=false --allowerasing -y \
+    cairo-devel \
+    numactl-devel \
+    iproute \
+    which \
+    zlib-devel \
+    lapack-devel \
+    openblas-devel \
+    numactl \
+    gperftools-devel && \
+    dnf clean all
 
 COPY install_packages.sh .
@@ -59,6 +45,4 @@ RUN ./install_packages.sh && rm -f install_packages.sh && \
 # Set LD_PRELOAD after all required installations to
 # avoid warnings during docker creation
 ENV LD_PRELOAD=/lib64/libtcmalloc.so.4
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-
-RUN rm -rf /tmp/*
\ No newline at end of file
+ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
\ No newline at end of file
diff --git a/dockerfiles/pytorch/Dockerfile.suse15.5 b/dockerfiles/pytorch/Dockerfile.suse15.5
index 8fe9f54..b9a3617 100644
--- a/dockerfiles/pytorch/Dockerfile.suse15.5
+++ b/dockerfiles/pytorch/Dockerfile.suse15.5
@@ -21,18 +21,10 @@ LABEL description="Image with pre installed Habanalabs packages for PyTorch"
 
 ENV PYTHONPATH=/root:/usr/lib/habanalabs/
 
 RUN zypper install -y --allow-downgrade \
-    cairo-devel \
-    numactl \
-    lapack-devel \
-    numactl \
-    gperftools-devel
-
-RUN zypper addrepo -f https://yum.repos.intel.com/oneapi oneAPI && \
-    echo "gpgcheck=0" >> /etc/zypp/repos.d/oneAPI.repo && \
-    echo "repo_gpgcheck=0" >> /etc/zypp/repos.d/oneAPI.repo
-
-RUN zypper install -y intel-oneapi-mkl-2021.1.1 intel-oneapi-mkl-devel-2021.1.1
-
+    cairo-devel \
+    numactl \
+    lapack-devel \
+    gperftools-devel
 
 COPY install_packages.sh .
 
@@ -42,6 +34,4 @@ RUN ./install_packages.sh && rm -f install_packages.sh && \
 # Set LD_PRELOAD after all required installations to
 # avoid warnings during docker creation
 ENV LD_PRELOAD=/usr/lib64/libtcmalloc.so.4
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-
-RUN rm -rf /tmp/*
\ No newline at end of file
+ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
\ No newline at end of file
diff --git a/dockerfiles/pytorch/Dockerfile.tencentos3.1 b/dockerfiles/pytorch/Dockerfile.tencentos3.1
index 9a98ea6..4be3703 100644
--- a/dockerfiles/pytorch/Dockerfile.tencentos3.1
+++ b/dockerfiles/pytorch/Dockerfile.tencentos3.1
@@ -20,19 +20,19 @@ LABEL description="Image with pre installed Habanalabs packages for PyTorch"
 
 ENV PYTHONPATH=/root:/usr/lib/habanalabs/
 
 RUN dnf install -y \
-    curl \
-    cairo-devel \
-    numactl-devel \
-    iproute \
-    which \
-    zlib-devel \
-    lapack-devel \
-    openblas-devel \
-    numactl \
-    pdsh \
-    gcc-toolset-11 \
-    gperftools-devel && \
-    dnf clean all && rm -rf /var/cache/yum
+    curl \
+    cairo-devel \
+    numactl-devel \
+    iproute \
+    which \
+    zlib-devel \
+    lapack-devel \
+    openblas-devel \
+    numactl \
+    pdsh \
+    gcc-toolset-11 \
+    gperftools-devel && \
+    dnf clean all
 
 # Configure GCC 11
 ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:${PATH}
@@ -42,10 +42,6 @@ ENV PCP_DIR=/opt/rh/gcc-toolset-11/root
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64:/opt/rh/gcc-toolset-11/root/usr/lib:/opt/rh/gcc-toolset-11/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-11/root/usr/lib/dyninst:${LD_LIBRARY_PATH}
 ENV PKG_CONFIG_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64/pkgconfig:${PKG_CONFIG_PATH}
 
-RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \
-    dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \
-    dnf clean all && rm -rf /var/cache/yum
-
 COPY install_packages.sh .
 RUN ./install_packages.sh && rm -f install_packages.sh && \
diff --git a/dockerfiles/pytorch/Dockerfile.ubuntu b/dockerfiles/pytorch/Dockerfile.ubuntu
index 7f52a20..b3c5a3c 100644
--- a/dockerfiles/pytorch/Dockerfile.ubuntu
+++ b/dockerfiles/pytorch/Dockerfile.ubuntu
@@ -16,23 +16,18 @@ ARG ARTIFACTORY_URL
 
 ENV PYTHONPATH=/root:/usr/lib/habanalabs/
 
 RUN apt-get update && apt-get install -y \
-    curl \
-    libcurl4 \
-    moreutils \
-    iproute2 \
-    libcairo2-dev \
-    libglib2.0-dev \
-    libhdf5-dev \
-    libselinux1-dev \
-    libnuma-dev \
-    libpcre2-dev \
-    libjpeg-dev \
-    liblapack-dev \
-    libopenblas-dev \
-    numactl \
-    pdsh \
-    libmkl-dev \
-    libgoogle-perftools-dev && \
+    curl \
+    libcurl4 \
+    moreutils \
+    iproute2 \
+    libhdf5-dev \
+    libnuma-dev \
+    libjpeg-dev \
+    liblapack-dev \
+    libopenblas-dev \
+    numactl \
+    pdsh \
+    libgoogle-perftools-dev && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
 RUN bash -c "\
@@ -48,6 +43,4 @@ RUN ./install_packages.sh && rm -f install_packages.sh && \
     /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc
 
 ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-
-RUN rm -rf /tmp/*
\ No newline at end of file
+ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
\ No newline at end of file
diff --git a/dockerfiles/pytorch/install_packages.sh b/dockerfiles/pytorch/install_packages.sh
index 396ab29..4c8f75d 100755
--- a/dockerfiles/pytorch/install_packages.sh
+++ b/dockerfiles/pytorch/install_packages.sh
@@ -16,9 +16,6 @@ case "${BASE_NAME}" in
     *rhel8*)
         os_string="rhel86"
         ;;
-    *amzn2*)
-        os_string="amzn2"
-        ;;
     *tencentos*)
         os_string="tencentos31"
         ;;
diff --git a/utils/check_framework_env.py b/utils/check_framework_env.py
index c12bf28..599f780 100755
--- a/utils/check_framework_env.py
+++ b/utils/check_framework_env.py
@@ -36,7 +36,8 @@ def pytorch_test(device_id=0):
         device_id (int, optional): ID of Intel Gaudi. Defaults to 0.
""" - os.environ["ID"] = str(device_id) + os.environ["HLS_MODULE_ID"] = str(device_id) + os.environ["HABANA_VISIBLE_MODULES"] = str(device_id) try: import torch @@ -52,20 +53,27 @@ def pytorch_test(device_id=0): assert y == 4, 'Sanity check failed: Wrong Add output' assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Intel Gaudi Card' except (RuntimeError, AssertionError) as e: - print(f"Card {device_id} Failure: {e}") + print(f"Card Module ID {device_id} Failure: {e}") raise + return device_id if __name__ == '__main__': args = parse_arguments() + passed_cards = set() - try: - with concurrent.futures.ProcessPoolExecutor() as executor: - for device_id, res in zip(range(args.cards), executor.map(pytorch_test, range(args.cards))): - print(f"Card {device_id} PASSED") - except Exception as e: - print(f"Failed to initialize on Intel Gaudi, error: {str(e)}") - print(f"Check FAILED") - exit(1) + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [executor.submit(pytorch_test, device_id) for device_id in range(args.cards)] + for future in concurrent.futures.as_completed(futures): + try: + dev_id = future.result() + passed_cards.add(dev_id) + print(f"Card module_id {dev_id} PASSED") + + except Exception as e: + print(f"Failed to initialize on Intel Gaudi, error: {str(e)}") + + failed_cards = set(range(args.cards)) - passed_cards - print(f"Check PASSED for {args.cards} cards") \ No newline at end of file + print(f"Failed cards Module ID: {failed_cards}") + print(f"Passed cards Module ID: {passed_cards}") \ No newline at end of file diff --git a/utils/intel_gaudi_health_screen/IGNodes.py b/utils/intel_gaudi_health_screen/IGNodes.py index 865a46a..6bebfde 100644 --- a/utils/intel_gaudi_health_screen/IGNodes.py +++ b/utils/intel_gaudi_health_screen/IGNodes.py @@ -270,6 +270,7 @@ def check_device_acquire_fail(self): self.device_acquire_fail = False os.environ["ID"] = str(self.module_id) + os.environ["HABANA_VISIBLE_MODULES"] = str(self.module_id) try: import torch diff --git a/utils/intel_gaudi_health_screen/README.md b/utils/intel_gaudi_health_screen/README.md index f0a537c..143bed4 100644 --- a/utils/intel_gaudi_health_screen/README.md +++ b/utils/intel_gaudi_health_screen/README.md @@ -149,7 +149,7 @@ system-info: tcp-interface: "10.3.124.0/24" # Image to run Intel Gaudi Health Screen -image: "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest" +image: "vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" # Node Label used to identify a Intel Gaudi Node gaudi-node-label: "habana.ai/gaudi:NoSchedule" @@ -233,7 +233,7 @@ system-info: tcp-interface: "10.3.124.0/24" # Image to run Intel Gaudi Health Screen -image: "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest" +image: "vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" # Node Label used to identify a Intel Gaudi Node gaudi-node-label: "habana.ai/gaudi:NoSchedule" diff --git a/utils/intel_gaudi_health_screen/config.yaml b/utils/intel_gaudi_health_screen/config.yaml index f3aef5b..fcac869 100644 --- a/utils/intel_gaudi_health_screen/config.yaml +++ b/utils/intel_gaudi_health_screen/config.yaml @@ -12,7 +12,7 @@ system-info: tcp-interface: "10.3.124.0/24" # Image to run Intel Gaudi Health Screen -image: "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest" +image: 
"vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" # Node Label used to identify a Intel Gaudi Node gaudi-node-label: "habana.ai/gaudi:NoSchedule" diff --git a/utils/intel_gaudi_health_screen/system_utils.py b/utils/intel_gaudi_health_screen/system_utils.py index 76551eb..fe41b18 100644 --- a/utils/intel_gaudi_health_screen/system_utils.py +++ b/utils/intel_gaudi_health_screen/system_utils.py @@ -29,6 +29,12 @@ def __init__(self, image, log_dir, remote_path="/tmp/ighs"): self.log_dir = log_dir self.remote_path = remote_path + def clear_tmp_ighs(self): + ighs_path = "/tmp/intel_gaudi_health_screen" + + _logger.info(f"Clearing out {ighs_path}") + shutil.rmtree(ighs_path) + def clear_jobs(self): if not os.path.exists(self.job_path): os.makedirs(self.job_path) @@ -57,6 +63,7 @@ def __init__(self, image, hostfile, namespace, log_dir): self.hostfile = hostfile def initialize_system(self): + self.clear_tmp_ighs() self.clear_ighs_pods() self.clear_ighs_pods(job_type="mpijobs") self.clear_jobs() @@ -145,6 +152,11 @@ def initialize_node_jobs(self, level, return nodes_initialized def cp_ighs(self, namespace, cwd, metadata_app): + def ignore_dirs(dir, contents): + return [f for f in contents if os.path.isdir(os.path.join(dir, f)) and f in ["logs", ".git"]] + + shutil.copytree(cwd, f"/tmp/intel_gaudi_health_screen", ignore=ignore_dirs, dirs_exist_ok=True) + pods_done = dict() cmd = f"kubectl get pods -n {namespace} -l app={metadata_app} -o=custom-columns='NAME:.metadata.name,STATUS:.status.phase' --no-headers" output = run_cmd(cmd).strip() @@ -159,7 +171,7 @@ def cp_ighs(self, namespace, cwd, metadata_app): for p in pods: p_name, state = p.split() if p_name not in pods_done and state == "Running": - cmd = f"kubectl cp -n {namespace} {cwd} {p_name}:/workdir/intel_gaudi_health_screen" + cmd = f"kubectl cp -n {namespace} /tmp/intel_gaudi_health_screen {p_name}:/workdir/intel_gaudi_health_screen" output = run_cmd(cmd).strip() pods_done[p_name] = True diff --git a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml index d1f6941..a79e420 100644 --- a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml +++ b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml @@ -42,7 +42,7 @@ spec: while [ ! -d /workdir/intel_gaudi_health_screen ]; do sleep 2s; done; - sleep 2s; + sleep 10s; cd /workdir/intel_gaudi_health_screen; python /workdir/intel_gaudi_health_screen/screen.py --ighs-check node --logs-dir $LOG_DIR; @@ -72,8 +72,10 @@ spec: limits: habana.ai/gaudi: 8 hugepages-2Mi: 29000Mi + memory: 200Gi cpu: 95 requests: habana.ai/gaudi: 8 hugepages-2Mi: 29000Mi + memory: 200Gi cpu: 95 diff --git a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml index 04c50c0..7078ea0 100644 --- a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml +++ b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml @@ -56,7 +56,7 @@ spec: while [ ! 
-d /workdir/intel_gaudi_health_screen ]; do sleep 2s; done; - sleep 2s; + sleep 10s; declare -xr HOSTSFILE=$OMPI_MCA_orte_default_hostfile; @@ -131,9 +131,11 @@ spec: habana.ai/gaudi: 8 hugepages-2Mi: 29000Mi cpu: 95 + memory: 200Gi requests: habana.ai/gaudi: 8 hugepages-2Mi: 29000Mi + memory: 200Gi cpu: 95 volumeMounts: - name: mydir diff --git a/utils/intel_gaudi_health_screen/utilities.py b/utils/intel_gaudi_health_screen/utilities.py index cfcd893..fb80e63 100644 --- a/utils/intel_gaudi_health_screen/utilities.py +++ b/utils/intel_gaudi_health_screen/utilities.py @@ -74,7 +74,7 @@ def create_logger(logger_name, logger_file_name, f_path="", level=logging.INFO, return t_logger, d_path -def run_cmd(cmd, timeout_s=1_800, verbose=False): +def run_cmd(cmd, timeout_s=900, verbose=False): """ Run Command through subprocess.run() Args: