From 5ce1081420d2718e8381dccf75da694909aa2567 Mon Sep 17 00:00:00 2001
From: Igor Gambarin
Date: Fri, 20 Dec 2024 00:54:04 +0200
Subject: [PATCH] SynapseAi 1.19.0 release

* Update dockerfiles with 1.19.0 content
---
 dockerfiles/base/Dockerfile.amzn2             | 105 ------------------
 dockerfiles/base/Dockerfile.rhel8.6           |  19 ++--
 dockerfiles/base/Dockerfile.rhel9.2           |   5 +-
 dockerfiles/base/Dockerfile.rhel9.4           |  33 +++---
 dockerfiles/base/Dockerfile.suse15.5          |  21 ++--
 dockerfiles/base/Dockerfile.tencentos3.1      |   3 +-
 dockerfiles/base/Dockerfile.ubuntu22.04       |   3 +-
 dockerfiles/base/Dockerfile.ubuntu24.04       |   3 +-
 dockerfiles/base/install-python310.sh         |  29 +----
 dockerfiles/common.mk                         |   6 +-
 dockerfiles/pytorch/Dockerfile.amzn2          |  79 -------------
 dockerfiles/pytorch/Dockerfile.rhel8.6        |  12 +-
 dockerfiles/pytorch/Dockerfile.rhel9.2        |  32 +++---
 dockerfiles/pytorch/Dockerfile.rhel9.4        |  42 +++----
 dockerfiles/pytorch/Dockerfile.suse15.5       |  20 +---
 dockerfiles/pytorch/Dockerfile.tencentos3.1   |  30 +++--
 dockerfiles/pytorch/Dockerfile.ubuntu         |  33 +++---
 dockerfiles/pytorch/install_packages.sh       |   3 -
 utils/check_framework_env.py                  |  30 +++--
 utils/intel_gaudi_health_screen/IGNodes.py    |   1 +
 utils/intel_gaudi_health_screen/README.md     |   4 +-
 utils/intel_gaudi_health_screen/config.yaml   |   2 +-
 .../intel_gaudi_health_screen/system_utils.py |  14 ++-
 .../k8s/intel-gaudi-health-screen-L1.yaml     |   4 +-
 ...ntel-gaudi-health-screen-L2_hccl-demo.yaml |   4 +-
 utils/intel_gaudi_health_screen/utilities.py  |   2 +-
 26 files changed, 155 insertions(+), 384 deletions(-)
 delete mode 100644 dockerfiles/base/Dockerfile.amzn2
 delete mode 100644 dockerfiles/pytorch/Dockerfile.amzn2

diff --git a/dockerfiles/base/Dockerfile.amzn2 b/dockerfiles/base/Dockerfile.amzn2
deleted file mode 100644
index dfc548d..0000000
--- a/dockerfiles/base/Dockerfile.amzn2
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright (c) 2023 Habana Labs, Ltd.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# HabanaLabs Dockerfile base installer layer for Amazon Linux 2
-FROM amazonlinux:2
-ARG ARTIFACTORY_URL
-ARG VERSION
-ARG REVISION
-
-RUN amazon-linux-extras enable python3.8 && \
-    yum update -y && yum install -y \
-    git \
-    unzip \
-    ethtool-4.8-10.amzn2.x86_64 \
-    openssh-clients \
-    openssh-server \
-    bzip2-devel \
-    python38 \
-    python38-devel \
-    python38-pip \
-    python38-tkinter \
-    which \
-    wget \
-    lsof \
-    tar \
-    mesa-libGL \
-    sox-devel && \
-    yum clean all && rm -rf /var/cache/yum && \
-    rm -f /etc/ssh/ssh_host_*_key*
-
-# Install jemalloc-3.6.0-1.el7.x86_64 package with required /lib64/libjemalloc.so.1 lib need for topologies
-RUN yum install -y https://archives.fedoraproject.org/pub/archive/epel/7/x86_64/Packages/e/epel-release-7-14.noarch.rpm && \
-    yum install -y jemalloc && \
-    yum clean all && rm -rf /var/cache/yum
-
-# Install development tools and cmake for habana-horovod compilation sdist package
-RUN yum groupinstall -y "Development Tools"
-RUN yum install -y sudo system-lsb-core cmake
-
-COPY install_efa.sh .
-RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
-
-ENV LIBFABRIC_VERSION="1.22.0"
-ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
-ENV MPI_ROOT=/opt/amazon/openmpi
-ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
-ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH
-ENV OPAL_PREFIX=${MPI_ROOT}
-ENV MPICC=${MPI_ROOT}/bin/mpicc
-ENV FI_EFA_FORK_SAFE=1
-ENV RDMAV_FORK_SAFE=1
-ENV FI_EFA_USE_DEVICE_RDMA=1
-ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src
-ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib
-
-RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
-    echo "name=Habana AWS Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
-    echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2" >> /etc/yum.repos.d/habanalabs.repo && \
-    echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo
-
-RUN yum makecache && \
-    yum install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".amzn2 && \
-    yum install -y habanalabs-thunk-"$VERSION"-"$REVISION".amzn2 && \
-    yum install -y habanalabs-firmware-tools-"$VERSION"-"$REVISION".amzn2 && \
-    yum install -y habanalabs-graph-"$VERSION"-"$REVISION".amzn2 && \
-    rpm -V habanalabs-rdma-core && rpm -V habanalabs-thunk && rpm -V habanalabs-firmware-tools && rpm -V habanalabs-graph && \
-    rm -f /etc/yum.repos.d/habanalabs.repo && \
-    yum clean all && rm -rf /var/cache/yum
-
-RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 && \
-    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
-
-# SSH configuration necessary to support mpi-operator v2
-RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
-    sed -i 's/[ #]\(.*ForwardAgent \).*/ \1yes/g' /etc/ssh/ssh_config && \
-    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
-    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
-    mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc
-
-# There is no need to store pip installation files inside docker image
-ENV PIP_NO_CACHE_DIR=on
-ENV PIP_DISABLE_PIP_VERSION_CHECK=1
-
-RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/ofiwg/libfabric/releases/download/v${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \
-    cd /tmp/ && tar xf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \
-    cd /tmp/libfabric-${LIBFABRIC_VERSION} && \
-    ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \
-    make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION}
-
-RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \
-    unzip /tmp/main.zip -d /tmp && \
-    cd /tmp/hccl_ofi_wrapper-main && \
-    make && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so && \
-    cd / && \
-    rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main
-
-RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
-
-RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
-
-ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
-ENV HABANA_LOGS=/var/log/habana_logs/
-ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw
-ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins
\ No newline at end of file
diff --git a/dockerfiles/base/Dockerfile.rhel8.6 b/dockerfiles/base/Dockerfile.rhel8.6
index 2e836c7..2192035 100644
--- a/dockerfiles/base/Dockerfile.rhel8.6
+++ b/dockerfiles/base/Dockerfile.rhel8.6
@@ -37,7 +37,8 @@ RUN dnf install -y \
     unzip \
     llvm \
     lsof \
-    python38-devel \
+    python3.11-devel \
+    python3.11-pip \
     bzip2 \
     bzip2-devel \
     openssh-clients \
@@ -93,6 +94,11 @@ RUN echo "[powertools]" > /etc/yum.repos.d/powertools.repo && \
     echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/powertools.repo && \
     echo "gpgcheck=1" >> /etc/yum.repos.d/powertools.repo
 
+ENV PYTHON_VERSION=3.11
+RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \
+    alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \
+    alternatives --set python3 /usr/bin/python3.11
+
 RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el8 \
     habanalabs-thunk-"$VERSION"-"$REVISION".el8 \
     habanalabs-firmware-tools-"$VERSION"-"$REVISION".el8 \
     habanalabs-graph-"$VERSION"-"$REVISION".el8 && \
@@ -127,14 +133,9 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi
     cd / && \
     rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main
 
-ENV PYTHON_VERSION=3.8
-RUN python3.8 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
-
-RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 && \
-    alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \
-    alternatives --set python3 /usr/bin/python3.8
+RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
 
-RUN python3.8 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
+RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
 
 # SSH configuration necessary to support mpi-operator v2
 RUN mkdir -p /var/run/sshd && \
@@ -147,4 +148,4 @@ RUN mkdir -p /var/run/sshd && \
 ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
 ENV HABANA_LOGS=/var/log/habana_logs/
 ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw
-ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins
+ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins
\ No newline at end of file
diff --git a/dockerfiles/base/Dockerfile.rhel9.2 b/dockerfiles/base/Dockerfile.rhel9.2
index 4900b21..273fb7f 100644
--- a/dockerfiles/base/Dockerfile.rhel9.2
+++ b/dockerfiles/base/Dockerfile.rhel9.2
@@ -68,6 +68,7 @@ RUN dnf install -y \
 ENV PYTHON_VERSION=3.10
 COPY install-python310.sh .
 RUN ./install-python310.sh rhel9.2 && rm install-python310.sh
+RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/python.conf && ldconfig
 ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 
 COPY install_efa.sh .
@@ -128,11 +129,11 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi
     cd / && \
     rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main
 
-RUN python3.10 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
+RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
 
 RUN ln -s /usr/bin/python3 /usr/bin/python
 
-RUN python3.10 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
+RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
 
 # SSH configuration necessary to support mpi-operator v2
 RUN mkdir -p /var/run/sshd && \
diff --git a/dockerfiles/base/Dockerfile.rhel9.4 b/dockerfiles/base/Dockerfile.rhel9.4
index a00aa3a..654def2 100644
--- a/dockerfiles/base/Dockerfile.rhel9.4
+++ b/dockerfiles/base/Dockerfile.rhel9.4
@@ -31,10 +31,15 @@ RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo
 
-RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
-    dnf clean all && rm -rf /var/cache/yum
-
 RUN dnf install -y \
+    python3-dnf-plugin-versionlock && \
+    dnf versionlock add redhat-release* && \
+    dnf clean all
+
+RUN dnf update -y && dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
+    dnf clean all
+
+RUN dnf update -y && dnf install -y \
     clang \
     cmake3 \
     cpp \
@@ -51,8 +56,8 @@ RUN dnf install -y \
     lsof \
     python3-devel \
     openssh-clients \
-    openssl-1:3.0.7-28.el9_4 \
-    openssl-devel-1:3.0.7-28.el9_4 \
+    openssl \
+    openssl-devel \
     libjpeg-devel \
     openssh-server \
     lsb_release \
@@ -68,12 +73,9 @@ RUN dnf install -y \
     python3.11-pip \
     python3.11-devel \
    python3.11-rpm \
-    ffmpeg-free \
-    python3-dnf-plugin-versionlock && \
-    # update pkgs (except OS version) for resolving potentials CVEs
-    dnf versionlock add redhat-release* openssl* libcurl-minimal curl-minimal ima-evm-utils python3-rpm rpm* && \
-    dnf update -y && \
-    dnf clean all && rm -rf /var/cache/yum && \
+    ffmpeg-free && \
+    dnf versionlock add python3-rpm rpm* && \
+    dnf clean all && \
     rm -f /etc/ssh/ssh_host_*_key*
 
 RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \
@@ -111,8 +113,9 @@ RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \
     habanalabs-thunk-"$VERSION"-"$REVISION".el9 \
     habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \
     habanalabs-graph-"$VERSION"-"$REVISION".el9 && \
-    rm -f /etc/yum.repos.d/habanalabs.repo && rm -f /etc/yum.repos.d/habana.repo && rm -rf /tmp/* && \
-    dnf clean all && rm -rf /var/cache/yum
+    chmod +t /var/log/habana_logs && \
+    rm -f /etc/yum.repos.d/habanalabs.repo && rm -f /etc/yum.repos.d/habana.repo && \
+    dnf clean all
 
 RUN rpm -V habanalabs-rdma-core && rpm -V habanalabs-thunk && rpm -V habanalabs-firmware-tools && rpm -V habanalabs-graph
 
@@ -141,11 +144,11 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi
     cd / && \
     rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main
 
-RUN python3.11 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
+RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
 
 RUN ln -s /usr/bin/python3 /usr/bin/python
 
-RUN python3.11 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
+RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
 
 # SSH configuration necessary to support mpi-operator v2
 RUN mkdir -p /var/run/sshd && \
diff --git a/dockerfiles/base/Dockerfile.suse15.5 b/dockerfiles/base/Dockerfile.suse15.5
index f53bd3c..8def199 100644
--- a/dockerfiles/base/Dockerfile.suse15.5
+++ b/dockerfiles/base/Dockerfile.suse15.5
@@ -44,7 +44,8 @@ RUN zypper install -y --allow-downgrade \
     Mesa-libGL1 \
     openssh-clients \
     openssh-server \
-    openssl openssl-devel \
+    openssl \
+    openssl-devel \
     python311 \
     python311-devel \
     python311-pip \
@@ -53,11 +54,14 @@ RUN zypper install -y --allow-downgrade \
     zlib-devel && \
     rm -f /etc/ssh/ssh_host_*_key*
 
+ENV PIP_NO_CACHE_DIR=on
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1
+
 RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \
     alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \
-    alternatives --set python3 /usr/bin/python3.11 && \
-    alternatives --install /usr/bin/pip3 pip3 /usr/bin/pip3.11 1 && \
-    alternatives --set pip3 /usr/bin/pip3.11
+    alternatives --set python3 /usr/bin/python3.11
+
+RUN python3 -m pip install setuptools==75.1.0 wheel==0.44.0
 
 COPY install_efa.sh .
 RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
@@ -82,10 +86,8 @@ RUN zypper --gpg-auto-import-keys install -y habanalabs-rdma-core-"$VERSION"-"$R
     habanalabs-thunk-"$VERSION"-"$REVISION" \
     habanalabs-firmware-tools-"$VERSION"-"$REVISION" \
     habanalabs-graph-"$VERSION"-"$REVISION" && \
-    rm -f /etc/yum.repos.d/habanalabs.repo
+    rm -f /etc/zypp/repos.d/habanalabs.repo
 
-ENV PIP_NO_CACHE_DIR=on
-ENV PIP_DISABLE_PIP_VERSION_CHECK=1
 ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src
 ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib
 
@@ -102,10 +104,7 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi
     cd / && \
     rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main
 
-RUN python3.11 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
-
-
-RUN python3.11 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
+RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}"
 
 # SSH configuration necessary to support mpi-operator v2
 RUN mkdir -p /var/run/sshd && \
diff --git a/dockerfiles/base/Dockerfile.tencentos3.1 b/dockerfiles/base/Dockerfile.tencentos3.1
index c5a28a7..2743b9a 100644
--- a/dockerfiles/base/Dockerfile.tencentos3.1
+++ b/dockerfiles/base/Dockerfile.tencentos3.1
@@ -47,13 +47,14 @@ RUN dnf install -y \
 
 COPY install-python310.sh .
 RUN ./install-python310.sh tencentos3.1 && rm install-python310.sh
+RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/python.conf && ldconfig
 ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 
 COPY install_efa.sh .
 COPY tencentos_efa_patch.txt /tmp/tencentos_efa_patch.txt
 RUN ./install_efa.sh && rm -f install_efa.sh /tmp/tencentos_efa_patch.txt && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
 
-ENV LIBFABRIC_VERSION="1.20.0"
+ENV LIBFABRIC_VERSION="1.22.0"
 ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
 ENV MPI_ROOT=/opt/amazon/openmpi
 ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
diff --git a/dockerfiles/base/Dockerfile.ubuntu22.04 b/dockerfiles/base/Dockerfile.ubuntu22.04
index b322cbd..59ea3d6 100644
--- a/dockerfiles/base/Dockerfile.ubuntu22.04
+++ b/dockerfiles/base/Dockerfile.ubuntu22.04
@@ -32,6 +32,7 @@ RUN apt-get update && \
     libgl1 \
     libgoogle-glog0v5 \
     libjemalloc2 \
+    libjpeg-dev \
     libpq-dev \
     lsof \
     make \
@@ -60,7 +61,7 @@ RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
 
 COPY install_efa.sh .
 RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
 
-ENV LIBFABRIC_VERSION="1.20.0"
+ENV LIBFABRIC_VERSION="1.22.0"
 ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
 ENV MPI_ROOT=/opt/amazon/openmpi
 ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
diff --git a/dockerfiles/base/Dockerfile.ubuntu24.04 b/dockerfiles/base/Dockerfile.ubuntu24.04
index 7f47c08..625eb04 100644
--- a/dockerfiles/base/Dockerfile.ubuntu24.04
+++ b/dockerfiles/base/Dockerfile.ubuntu24.04
@@ -32,6 +32,7 @@ RUN apt-get update && \
     libgl1 \
     libgoogle-glog0v6t64 \
     libjemalloc2 \
+    libjpeg-dev \
     libpq-dev \
     lsof \
     make \
@@ -60,7 +61,7 @@ RUN python3 -m pip install pip==24.0 setuptools==75.1.0 wheel==0.42.0 --break-sy
 
 COPY install_efa.sh .
 RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
 
-ENV LIBFABRIC_VERSION="1.20.0"
+ENV LIBFABRIC_VERSION="1.22.0"
 ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
 ENV MPI_ROOT=/opt/amazon/openmpi
 ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
diff --git a/dockerfiles/base/install-python310.sh b/dockerfiles/base/install-python310.sh
index 78acab4..6dbe65b 100755
--- a/dockerfiles/base/install-python310.sh
+++ b/dockerfiles/base/install-python310.sh
@@ -10,12 +10,12 @@ case "${_BASE_NAME}" in
         echo "Skip install Python3.10 from source on Ubuntu22.04"
         exit 0;
         ;;
-    *debian* | *ubuntu*)
+    *ubuntu*)
         apt update
         apt install -y libsqlite3-dev libreadline-dev
         ;;
     *rhel*)
-        yum install -y sqlite-devel readline-devel xz-devel
+        dnf install -y sqlite-devel readline-devel xz-devel
         ;;
     *tencentos3.1*)
         dnf install -y sqlite-devel readline-devel zlib-devel xz-devel bzip2-devel libffi-devel
@@ -28,21 +28,6 @@ case "${_BASE_NAME}" in
         make && make install
         ln -s /etc/pki/tls/cert.pem /usr/local/openssl-1.1.1w/ssl/cert.pem
 
-        PATH=$PATH:/usr/local/protoc/bin:/usr/local/openssl-1.1.1w/bin
-        LD_LIBRARY_PATH=/usr/local/openssl-1.1.1w/lib:$LD_LIBRARY_PATH
-        _SSL_LIB="--with-openssl=/usr/local/openssl-1.1.1w"
-        ;;
-    *amzn2*)
-        yum install -y sqlite-devel readline-devel
-        wget -nv -O /opt/openssl-1.1.1w.tar.gz https://github.com/openssl/openssl/releases/download/OpenSSL_1_1_1w/openssl-1.1.1w.tar.gz && \
-        cd /opt/ && \
-        tar xzf openssl-1.1.1w.tar.gz && \
-        rm -rf openssl-1.1.1w.tar.gz && \
-        cd openssl-1.1.1w && \
-        ./config --prefix=/usr/local/openssl-1.1.1w shared zlib && \
-        make && make install
-        ln -s /etc/pki/tls/cert.pem /usr/local/openssl-1.1.1w/ssl/cert.pem
-
         PATH=$PATH:/usr/local/protoc/bin:/usr/local/openssl-1.1.1w/bin
         LD_LIBRARY_PATH=/usr/local/openssl-1.1.1w/lib:$LD_LIBRARY_PATH
         _SSL_LIB="--with-openssl=/usr/local/openssl-1.1.1w"
@@ -74,16 +59,6 @@ case "${_BASE_NAME}" in
         alternatives --set python3 /usr/local/bin/python3.10
         export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
         ;;
-    *amzn2*)
-        update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 3 && \
-        update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 && \
-        update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
-        ;;
-    *debian*)
-        update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 3
-        update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.8 2
-        update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
-        ;;
 esac
 
 python3 -m pip install --upgrade pip setuptools
diff --git a/dockerfiles/common.mk b/dockerfiles/common.mk
index 6e29640..27f09c3 100644
--- a/dockerfiles/common.mk
+++ b/dockerfiles/common.mk
@@ -5,9 +5,9 @@ BUILD_OS ?= ubuntu22.04
 BUILD_DIR ?= $(CURDIR)/dockerbuild
 REPO_SERVER ?= vault.habana.ai
 
-PT_VERSION ?= 2.4.0
-RELEASE_VERSION ?= 1.18.0
-RELEASE_BUILD_ID ?= 524
+PT_VERSION ?= 2.5.1
+RELEASE_VERSION ?= 1.19.0
+RELEASE_BUILD_ID ?= 561
 
 BASE_IMAGE_URL ?= base-installer-$(BUILD_OS)
 IMAGE_URL = $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID)
diff --git a/dockerfiles/pytorch/Dockerfile.amzn2 b/dockerfiles/pytorch/Dockerfile.amzn2
deleted file mode 100644
index 9d7ff28..0000000
--- a/dockerfiles/pytorch/Dockerfile.amzn2
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) 2023 HabanaLabs, Ltd.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# HabanaLabs Dockerfile PyTorch installer layer for Amazon Linux 2
-ARG BASE_NAME
-ARG VERSION
-ARG REVISION
-FROM ${BASE_NAME}:${VERSION}-${REVISION}
-ARG PT_VERSION
-ARG VERSION
-ARG REVISION
-ARG BASE_NAME
-ARG ARTIFACTORY_URL
-ARG GPERFTOOLS_PREFIX="/usr/local"
-ARG GPERFTOOLS="gperftools-2.7"
-
-ENV PYTHONPATH=/root:/usr/lib/habanalabs/
-
-RUN yum install -y \
-    curl \
-    redhat-lsb-core \
-    numactl-devel \
-    cairo-devel \
-    iproute \
-    libjpeg-devel \
-    zlib-devel \
-    lapack-devel \
-    openblas-devel \
-    pdsh \
-    numactl \
-    yum-utils \
-    libmkl-dev && \
-    yum clean all
-
-RUN amazon-linux-extras install epel -y
-RUN yum install -y \
-    moreutils && \
-    yum clean all
-
-# Since there is an open bug in gperftools 2.6 installed from yum install,
-# Hence, Compile & Install gperftools 2.7. Later it might be removed and installed through yum install.
-RUN wget --no-verbose https://github.com/gperftools/gperftools/releases/download/${GPERFTOOLS}/${GPERFTOOLS}.tar.gz && \
-    tar -xvf ${GPERFTOOLS}.tar.gz && \
-    cd ${GPERFTOOLS} && \
-    ./configure --prefix="${GPERFTOOLS_PREFIX}" && \
-    make -j && \
-    make install && \
-    ln -s /usr/local/include/google /usr/include/gperftools && \
-    cd - && \
-    rm -rf ${GPERFTOOLS}* && \
-    /sbin/ldconfig
-
-# Install and configure GCC 11
-RUN /usr/bin/python2 `which yum-config-manager` --add-repo http://archive.kernel.org/centos-vault/centos/7/sclo/x86_64/rh/ && \
-    wget http://archive.kernel.org/centos-vault/centos/7/os/x86_64/Packages/libgfortran5-8.3.1-2.1.1.el7.x86_64.rpm && \
-    yum install libgfortran5-8.3.1-2.1.1.el7.x86_64.rpm -y && \
-    rm -f libgfortran5-8.3.1-2.1.1.el7.x86_64.rpm && \
-    sudo yum install -y devtoolset-11 --nogpgcheck && \
-    yum clean all && rm -rf /var/cache/yum && \
-    sed -i '/# define _GLIBCXX_USE_CXX11_ABI 0/c\# define _GLIBCXX_USE_CXX11_ABI 1' /opt/rh/devtoolset-11/root/usr/include/c++/11/x86_64-redhat-linux/bits/c++config.h
-
-ENV PATH=/opt/rh/devtoolset-11/root/usr/bin:${PATH}
-ENV MANPATH=/opt/rh/devtoolset-11/root/usr/share/man:${MANPATH}
-ENV INFOPATH=/opt/rh/devtoolset-11/root/usr/share/info:${INFOPATH}
-ENV PCP_DIR=/opt/rh/devtoolset-11/root
-ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-11/root/usr/lib64:/opt/rh/devtoolset-11/root/usr/lib:/opt/rh/devtoolset-11/root/usr/lib64/dyninst:/opt/rh/devtoolset-11/root/usr/lib/dyninst:${LD_LIBRARY_PATH}
-ENV PKG_CONFIG_PATH=/opt/rh/devtoolset-11/root/usr/lib64/pkgconfig:${PKG_CONFIG_PATH}
-
-COPY install_packages.sh .
-
-RUN ./install_packages.sh && rm -f install_packages.sh && \
-    /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc
-
-ENV LD_PRELOAD="${GPERFTOOLS_PREFIX}/lib/libtcmalloc.so"
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-ENV LDFLAGS="-Wl,--copy-dt-needed-entries"
-
-RUN rm -rf /tmp/*
\ No newline at end of file
diff --git a/dockerfiles/pytorch/Dockerfile.rhel8.6 b/dockerfiles/pytorch/Dockerfile.rhel8.6
index 7ec8dc6..2193edc 100644
--- a/dockerfiles/pytorch/Dockerfile.rhel8.6
+++ b/dockerfiles/pytorch/Dockerfile.rhel8.6
@@ -17,7 +17,7 @@ LABEL name="PyTorch Installer"
 LABEL summary="Habanalabs PyTorch installer layer for RHEL8.6"
 LABEL description="Image with pre installed Habanalabs packages for PyTorch"
 
-RUN echo "/usr/lib/habanalabs" > $(python3.8 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth
+RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth
 
 RUN dnf install -y \
     curl \
@@ -32,11 +32,7 @@ RUN dnf install -y \
     pdsh \
     gcc-toolset-11 \
     gperftools-devel && \
-    dnf clean all && rm -rf /var/cache/yum
-
-RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \
-    dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \
-    dnf clean all && rm -rf /var/cache/yum
+    dnf clean all
 
 COPY install_packages.sh .
 
@@ -52,6 +48,4 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64:/opt/rh/gcc-toolset-11
 ENV PKG_CONFIG_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64/pkgconfig:${PKG_CONFIG_PATH}
 
 ENV LD_PRELOAD=/lib64/libtcmalloc.so.4
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-
-RUN rm -rf /tmp/*
\ No newline at end of file
+ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
\ No newline at end of file
diff --git a/dockerfiles/pytorch/Dockerfile.rhel9.2 b/dockerfiles/pytorch/Dockerfile.rhel9.2
index b504f62..1d2dd4d 100644
--- a/dockerfiles/pytorch/Dockerfile.rhel9.2
+++ b/dockerfiles/pytorch/Dockerfile.rhel9.2
@@ -17,7 +17,7 @@ LABEL name="PyTorch Installer"
 LABEL summary="Habanalabs PyTorch installer layer for RHEL9.2"
 LABEL description="Image with pre installed Habanalabs packages for PyTorch"
 
-RUN echo "/usr/lib/habanalabs" > $(python3.10 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth
+RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pth
 
 RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
@@ -26,21 +26,17 @@ RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo
 
 RUN dnf install --allowerasing -y \
-    curl \
-    cairo-devel \
-    numactl-devel \
-    iproute \
-    which \
-    zlib-devel \
-    lapack-devel \
-    openblas-devel \
-    numactl \
-    gperftools-devel && \
-    dnf clean all && rm -rf /var/cache/yum
-
-RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \
-    dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \
-    dnf clean all && rm -rf /var/cache/yum
+    curl \
+    cairo-devel \
+    numactl-devel \
+    iproute \
+    which \
+    zlib-devel \
+    lapack-devel \
+    openblas-devel \
+    numactl \
+    gperftools-devel && \
+    dnf clean all
 
 COPY install_packages.sh .
@@ -50,6 +46,4 @@ RUN ./install_packages.sh && rm -f install_packages.sh && \
 # Set LD_PRELOAD after all required installations to
 # avoid warnings during docker creation
 ENV LD_PRELOAD=/lib64/libtcmalloc.so.4
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-
-RUN rm -rf /tmp/*
\ No newline at end of file
+ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
\ No newline at end of file
diff --git a/dockerfiles/pytorch/Dockerfile.rhel9.4 b/dockerfiles/pytorch/Dockerfile.rhel9.4
index d09fafe..99a2270 100644
--- a/dockerfiles/pytorch/Dockerfile.rhel9.4
+++ b/dockerfiles/pytorch/Dockerfile.rhel9.4
@@ -17,7 +17,7 @@ LABEL name="PyTorch Installer"
 LABEL summary="Habanalabs PyTorch installer layer for RHEL9.4"
 LABEL description="Image with pre installed Habanalabs packages for PyTorch"
 
-RUN echo "/usr/lib/habanalabs" > $(python3.11 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pt
+RUN echo "/usr/lib/habanalabs" > $(python3 -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pt
 
 RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "name=CentOS Linux 9 - CRB" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
@@ -25,31 +25,17 @@ RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "gpgkey=https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
     echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo
 
-RUN dnf install --allowerasing -y \
-    curl-7.76.1-29.el9_4.1 \
-    cairo-devel \
-    numactl-devel \
-    iproute \
-    which \
-    zlib-devel \
-    lapack-devel \
-    openblas-devel \
-    numactl \
-    gperftools-devel && \
-    dnf clean all && rm -rf /var/cache/yum
-
-RUN echo "[oneAPI]" >> /etc/yum.repos.d/oneAPI.repo && \
-    echo "name=Intel® oneAPI repository" >> /etc/yum.repos.d/oneAPI.repo && \
-    echo "baseurl=https://yum.repos.intel.com/oneapi" >> /etc/yum.repos.d/oneAPI.repo && \
-    echo 'enabled=1' >> /etc/yum.repos.d/oneAPI.repo && \
-    echo "gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \
-    echo "repo_gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \
-    echo "gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" >> /etc/yum.repos.d/oneAPI.repo
-
-RUN dnf install --allowerasing -y intel-oneapi-mkl-2024.2.0 && \
-    dnf clean all && rm -rf /var/cache/yum
-
-ENV LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.2/lib:${LD_LIBRARY_PATH}
+RUN dnf update -y && dnf install --nodocs --setopt=install_weak_deps=false --allowerasing -y \
+    cairo-devel \
+    numactl-devel \
+    iproute \
+    which \
+    zlib-devel \
+    lapack-devel \
+    openblas-devel \
+    numactl \
+    gperftools-devel && \
+    dnf clean all
 
 COPY install_packages.sh .
@@ -59,6 +45,4 @@ RUN ./install_packages.sh && rm -f install_packages.sh && \
 # Set LD_PRELOAD after all required installations to
 # avoid warnings during docker creation
 ENV LD_PRELOAD=/lib64/libtcmalloc.so.4
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-
-RUN rm -rf /tmp/*
\ No newline at end of file
+ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
\ No newline at end of file
diff --git a/dockerfiles/pytorch/Dockerfile.suse15.5 b/dockerfiles/pytorch/Dockerfile.suse15.5
index 8fe9f54..b9a3617 100644
--- a/dockerfiles/pytorch/Dockerfile.suse15.5
+++ b/dockerfiles/pytorch/Dockerfile.suse15.5
@@ -21,18 +21,10 @@ LABEL description="Image with pre installed Habanalabs packages for PyTorch"
 
 ENV PYTHONPATH=/root:/usr/lib/habanalabs/
 
 RUN zypper install -y --allow-downgrade \
-    cairo-devel \
-    numactl \
-    lapack-devel \
-    numactl \
-    gperftools-devel
-
-RUN zypper addrepo -f https://yum.repos.intel.com/oneapi oneAPI && \
-    echo "gpgcheck=0" >> /etc/zypp/repos.d/oneAPI.repo && \
-    echo "repo_gpgcheck=0" >> /etc/zypp/repos.d/oneAPI.repo
-
-RUN zypper install -y intel-oneapi-mkl-2021.1.1 intel-oneapi-mkl-devel-2021.1.1
-
+    cairo-devel \
+    numactl \
+    lapack-devel \
+    gperftools-devel
 
 COPY install_packages.sh .
 
@@ -42,6 +34,4 @@ RUN ./install_packages.sh && rm -f install_packages.sh && \
 # Set LD_PRELOAD after all required installations to
 # avoid warnings during docker creation
 ENV LD_PRELOAD=/usr/lib64/libtcmalloc.so.4
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-
-RUN rm -rf /tmp/*
\ No newline at end of file
+ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
\ No newline at end of file
diff --git a/dockerfiles/pytorch/Dockerfile.tencentos3.1 b/dockerfiles/pytorch/Dockerfile.tencentos3.1
index 9a98ea6..4be3703 100644
--- a/dockerfiles/pytorch/Dockerfile.tencentos3.1
+++ b/dockerfiles/pytorch/Dockerfile.tencentos3.1
@@ -20,19 +20,19 @@ LABEL description="Image with pre installed Habanalabs packages for PyTorch"
 
 ENV PYTHONPATH=/root:/usr/lib/habanalabs/
 
 RUN dnf install -y \
-    curl \
-    cairo-devel \
-    numactl-devel \
-    iproute \
-    which \
-    zlib-devel \
-    lapack-devel \
-    openblas-devel \
-    numactl \
-    pdsh \
-    gcc-toolset-11 \
-    gperftools-devel && \
-    dnf clean all && rm -rf /var/cache/yum
+    curl \
+    cairo-devel \
+    numactl-devel \
+    iproute \
+    which \
+    zlib-devel \
+    lapack-devel \
+    openblas-devel \
+    numactl \
+    pdsh \
+    gcc-toolset-11 \
+    gperftools-devel && \
+    dnf clean all
 
 # Configure GCC 11
 ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:${PATH}
@@ -42,10 +42,6 @@ ENV PCP_DIR=/opt/rh/gcc-toolset-11/root
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64:/opt/rh/gcc-toolset-11/root/usr/lib:/opt/rh/gcc-toolset-11/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-11/root/usr/lib/dyninst:${LD_LIBRARY_PATH}
 ENV PKG_CONFIG_PATH=/opt/rh/gcc-toolset-11/root/usr/lib64/pkgconfig:${PKG_CONFIG_PATH}
 
-RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \
-    dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \
-    dnf clean all && rm -rf /var/cache/yum
-
 COPY install_packages.sh .
 RUN ./install_packages.sh && rm -f install_packages.sh && \
diff --git a/dockerfiles/pytorch/Dockerfile.ubuntu b/dockerfiles/pytorch/Dockerfile.ubuntu
index 7f52a20..b3c5a3c 100644
--- a/dockerfiles/pytorch/Dockerfile.ubuntu
+++ b/dockerfiles/pytorch/Dockerfile.ubuntu
@@ -16,23 +16,18 @@ ARG ARTIFACTORY_URL
 
 ENV PYTHONPATH=/root:/usr/lib/habanalabs/
 
 RUN apt-get update && apt-get install -y \
-    curl \
-    libcurl4 \
-    moreutils \
-    iproute2 \
-    libcairo2-dev \
-    libglib2.0-dev \
-    libhdf5-dev \
-    libselinux1-dev \
-    libnuma-dev \
-    libpcre2-dev \
-    libjpeg-dev \
-    liblapack-dev \
-    libopenblas-dev \
-    numactl \
-    pdsh \
-    libmkl-dev \
-    libgoogle-perftools-dev && \
+    curl \
+    libcurl4 \
+    moreutils \
+    iproute2 \
+    libhdf5-dev \
+    libnuma-dev \
+    libjpeg-dev \
+    liblapack-dev \
+    libopenblas-dev \
+    numactl \
+    pdsh \
+    libgoogle-perftools-dev && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
 RUN bash -c "\
@@ -48,6 +43,4 @@ RUN ./install_packages.sh && rm -f install_packages.sh && \
     /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc
 
 ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4
-ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
-
-RUN rm -rf /tmp/*
\ No newline at end of file
+ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768
\ No newline at end of file
diff --git a/dockerfiles/pytorch/install_packages.sh b/dockerfiles/pytorch/install_packages.sh
index 396ab29..4c8f75d 100755
--- a/dockerfiles/pytorch/install_packages.sh
+++ b/dockerfiles/pytorch/install_packages.sh
@@ -16,9 +16,6 @@ case "${BASE_NAME}" in
     *rhel8*)
         os_string="rhel86"
         ;;
-    *amzn2*)
-        os_string="amzn2"
-        ;;
     *tencentos*)
         os_string="tencentos31"
         ;;
diff --git a/utils/check_framework_env.py b/utils/check_framework_env.py
index c12bf28..599f780 100755
--- a/utils/check_framework_env.py
+++ b/utils/check_framework_env.py
@@ -36,7 +36,8 @@ def pytorch_test(device_id=0):
         device_id (int, optional): ID of Intel Gaudi. Defaults to 0.
""" - os.environ["ID"] = str(device_id) + os.environ["HLS_MODULE_ID"] = str(device_id) + os.environ["HABANA_VISIBLE_MODULES"] = str(device_id) try: import torch @@ -52,20 +53,27 @@ def pytorch_test(device_id=0): assert y == 4, 'Sanity check failed: Wrong Add output' assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Intel Gaudi Card' except (RuntimeError, AssertionError) as e: - print(f"Card {device_id} Failure: {e}") + print(f"Card Module ID {device_id} Failure: {e}") raise + return device_id if __name__ == '__main__': args = parse_arguments() + passed_cards = set() - try: - with concurrent.futures.ProcessPoolExecutor() as executor: - for device_id, res in zip(range(args.cards), executor.map(pytorch_test, range(args.cards))): - print(f"Card {device_id} PASSED") - except Exception as e: - print(f"Failed to initialize on Intel Gaudi, error: {str(e)}") - print(f"Check FAILED") - exit(1) + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [executor.submit(pytorch_test, device_id) for device_id in range(args.cards)] + for future in concurrent.futures.as_completed(futures): + try: + dev_id = future.result() + passed_cards.add(dev_id) + print(f"Card module_id {dev_id} PASSED") + + except Exception as e: + print(f"Failed to initialize on Intel Gaudi, error: {str(e)}") + + failed_cards = set(range(args.cards)) - passed_cards - print(f"Check PASSED for {args.cards} cards") \ No newline at end of file + print(f"Failed cards Module ID: {failed_cards}") + print(f"Passed cards Module ID: {passed_cards}") \ No newline at end of file diff --git a/utils/intel_gaudi_health_screen/IGNodes.py b/utils/intel_gaudi_health_screen/IGNodes.py index 865a46a..6bebfde 100644 --- a/utils/intel_gaudi_health_screen/IGNodes.py +++ b/utils/intel_gaudi_health_screen/IGNodes.py @@ -270,6 +270,7 @@ def check_device_acquire_fail(self): self.device_acquire_fail = False os.environ["ID"] = str(self.module_id) + os.environ["HABANA_VISIBLE_MODULES"] = str(self.module_id) try: import torch diff --git a/utils/intel_gaudi_health_screen/README.md b/utils/intel_gaudi_health_screen/README.md index f0a537c..143bed4 100644 --- a/utils/intel_gaudi_health_screen/README.md +++ b/utils/intel_gaudi_health_screen/README.md @@ -149,7 +149,7 @@ system-info: tcp-interface: "10.3.124.0/24" # Image to run Intel Gaudi Health Screen -image: "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest" +image: "vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" # Node Label used to identify a Intel Gaudi Node gaudi-node-label: "habana.ai/gaudi:NoSchedule" @@ -233,7 +233,7 @@ system-info: tcp-interface: "10.3.124.0/24" # Image to run Intel Gaudi Health Screen -image: "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest" +image: "vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" # Node Label used to identify a Intel Gaudi Node gaudi-node-label: "habana.ai/gaudi:NoSchedule" diff --git a/utils/intel_gaudi_health_screen/config.yaml b/utils/intel_gaudi_health_screen/config.yaml index f3aef5b..fcac869 100644 --- a/utils/intel_gaudi_health_screen/config.yaml +++ b/utils/intel_gaudi_health_screen/config.yaml @@ -12,7 +12,7 @@ system-info: tcp-interface: "10.3.124.0/24" # Image to run Intel Gaudi Health Screen -image: "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest" +image: 
"vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest" # Node Label used to identify a Intel Gaudi Node gaudi-node-label: "habana.ai/gaudi:NoSchedule" diff --git a/utils/intel_gaudi_health_screen/system_utils.py b/utils/intel_gaudi_health_screen/system_utils.py index 76551eb..fe41b18 100644 --- a/utils/intel_gaudi_health_screen/system_utils.py +++ b/utils/intel_gaudi_health_screen/system_utils.py @@ -29,6 +29,12 @@ def __init__(self, image, log_dir, remote_path="/tmp/ighs"): self.log_dir = log_dir self.remote_path = remote_path + def clear_tmp_ighs(self): + ighs_path = "/tmp/intel_gaudi_health_screen" + + _logger.info(f"Clearing out {ighs_path}") + shutil.rmtree(ighs_path) + def clear_jobs(self): if not os.path.exists(self.job_path): os.makedirs(self.job_path) @@ -57,6 +63,7 @@ def __init__(self, image, hostfile, namespace, log_dir): self.hostfile = hostfile def initialize_system(self): + self.clear_tmp_ighs() self.clear_ighs_pods() self.clear_ighs_pods(job_type="mpijobs") self.clear_jobs() @@ -145,6 +152,11 @@ def initialize_node_jobs(self, level, return nodes_initialized def cp_ighs(self, namespace, cwd, metadata_app): + def ignore_dirs(dir, contents): + return [f for f in contents if os.path.isdir(os.path.join(dir, f)) and f in ["logs", ".git"]] + + shutil.copytree(cwd, f"/tmp/intel_gaudi_health_screen", ignore=ignore_dirs, dirs_exist_ok=True) + pods_done = dict() cmd = f"kubectl get pods -n {namespace} -l app={metadata_app} -o=custom-columns='NAME:.metadata.name,STATUS:.status.phase' --no-headers" output = run_cmd(cmd).strip() @@ -159,7 +171,7 @@ def cp_ighs(self, namespace, cwd, metadata_app): for p in pods: p_name, state = p.split() if p_name not in pods_done and state == "Running": - cmd = f"kubectl cp -n {namespace} {cwd} {p_name}:/workdir/intel_gaudi_health_screen" + cmd = f"kubectl cp -n {namespace} /tmp/intel_gaudi_health_screen {p_name}:/workdir/intel_gaudi_health_screen" output = run_cmd(cmd).strip() pods_done[p_name] = True diff --git a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml index d1f6941..a79e420 100644 --- a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml +++ b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml @@ -42,7 +42,7 @@ spec: while [ ! -d /workdir/intel_gaudi_health_screen ]; do sleep 2s; done; - sleep 2s; + sleep 10s; cd /workdir/intel_gaudi_health_screen; python /workdir/intel_gaudi_health_screen/screen.py --ighs-check node --logs-dir $LOG_DIR; @@ -72,8 +72,10 @@ spec: limits: habana.ai/gaudi: 8 hugepages-2Mi: 29000Mi + memory: 200Gi cpu: 95 requests: habana.ai/gaudi: 8 hugepages-2Mi: 29000Mi + memory: 200Gi cpu: 95 diff --git a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml index 04c50c0..7078ea0 100644 --- a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml +++ b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml @@ -56,7 +56,7 @@ spec: while [ ! 
-d /workdir/intel_gaudi_health_screen ]; do sleep 2s; done; - sleep 2s; + sleep 10s; declare -xr HOSTSFILE=$OMPI_MCA_orte_default_hostfile; @@ -131,9 +131,11 @@ spec: habana.ai/gaudi: 8 hugepages-2Mi: 29000Mi cpu: 95 + memory: 200Gi requests: habana.ai/gaudi: 8 hugepages-2Mi: 29000Mi + memory: 200Gi cpu: 95 volumeMounts: - name: mydir diff --git a/utils/intel_gaudi_health_screen/utilities.py b/utils/intel_gaudi_health_screen/utilities.py index cfcd893..fb80e63 100644 --- a/utils/intel_gaudi_health_screen/utilities.py +++ b/utils/intel_gaudi_health_screen/utilities.py @@ -74,7 +74,7 @@ def create_logger(logger_name, logger_file_name, f_path="", level=logging.INFO, return t_logger, d_path -def run_cmd(cmd, timeout_s=1_800, verbose=False): +def run_cmd(cmd, timeout_s=900, verbose=False): """ Run Command through subprocess.run() Args: