generated from kyma-project/template-repository
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
488 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Builds the gpu-driver container image through the shared kyma-project
# image-builder reusable workflow.
name: Cloud-Manager Build Image

# Runs for every push to main and for every pushed tag.
on:
  push:
    branches: [main]
    tags: ["*"]

permissions:
  contents: read # needed by actions/checkout
  id-token: write # needed to request the JWT token

jobs:
  build-image:
    uses: kyma-project/test-infra/.github/workflows/image-builder.yml@main
    with:
      name: gpu-driver
      dockerfile: Dockerfile
      # The branch or tag name that triggered the run becomes the image tag.
      tags: ${{ github.ref_name }}
      export-tags: true
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,37 @@ | ||
FROM debian:bookworm-slim | ||
# syntax=docker/dockerfile:1
ARG REGISTRY_PATH=gardenlinux/kmodbuild

# Stage "packager": installs the runtime tools and NVIDIA Fabric Manager on a
# Debian base, strips packages that are not needed, and assembles the result
# under /rootfs. Only /rootfs reaches the final image, so nothing else from this
# stage is shipped.
FROM debian:bookworm-slim AS packager

# TARGETARCH is filled in automatically by BuildKit (amd64, arm64, ...).
# TARGET_ARCH can still be passed explicitly with --build-arg; it only falls
# back to the platform being built when it is not given.
ARG TARGETARCH
ARG TARGET_ARCH=${TARGETARCH}
ARG DRIVER_VERSION

COPY resources/scripts/* /opt/nvidia-installer/

# update + install + list cleanup in one layer so a cached, stale package index
# is never reused by a later install.
RUN apt-get update \
    && apt-get install --no-install-recommends -y \
        ca-certificates \
        kmod \
        pciutils \
        wget \
        xz-utils \
    && rm -rf /var/lib/apt/lists/*

# Reads DRIVER_VERSION and TARGET_ARCH from the build args declared above.
RUN /opt/nvidia-installer/download_fabricmanager.sh

# Remove several things that are not needed, some of which raise Black Duck scan vulnerabilities.
# The library paths use a wildcard for the multiarch directory so the cleanup
# also applies to non-x86_64 builds.
RUN apt-get remove -y --autoremove --allow-remove-essential --ignore-hold \
        libgnutls30 apt openssl wget ncurses-base ncurses-bin \
    && rm -rf \
        /var/lib/apt/lists/* \
        /usr/bin/dpkg \
        /sbin/start-stop-daemon \
        /usr/lib/*-linux-gnu/libsystemd.so* \
        /var/lib/dpkg/info/libdb5.3* \
        /usr/lib/*-linux-gnu/libdb-5.3.so* \
        /usr/share/doc/libdb5.3 \
        /usr/bin/chfn \
        /usr/bin/gpasswd

# Assemble the root filesystem for the final image. Directories that do not
# exist on the current architecture (for example /lib64) are skipped instead of
# failing the whole copy.
RUN set -eu; \
    mkdir -p /rootfs; \
    for dir in bin boot etc home lib lib64 media mnt opt root run sbin srv tmp usr var; do \
        if [ -e "/${dir}" ] || [ -L "/${dir}" ]; then \
            cp -a "/${dir}" /rootfs/; \
        fi; \
    done; \
    rm -rf /rootfs/opt/actions-runner

# Final image: an empty base that receives only the assembled root filesystem.
FROM scratch

COPY --from=packager /rootfs /

# NOTE(review): no USER is set, so the container runs as root. The installer
# scripts call modprobe and mknod, which presumably require it - confirm before
# changing.
ENTRYPOINT ["/opt/nvidia-installer/entrypoint.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
#!/bin/bash
# Downloads the NVIDIA driver installer, compiles the kernel modules for one
# kernel, and packs the result into /out/nvidia/$DRIVER_VERSION.tar.gz.
#
# Required environment variables:
#   DRIVER_VERSION  NVIDIA driver version; used in the download URL and output path
#   KERNEL_NAME     kernel name handed to nvidia-installer via --kernel-name
#   TARGET_ARCH     "amd64" or "arm64"
#
# Exit status:
#   1  a required variable is empty, or download / extraction / compilation failed
#   2  TARGET_ARCH is not one of the supported values

# Abort with the standard message when the variable named by $1 is empty or unset.
require_env() {
    if [ -z "${!1}" ]; then
        echo "Error: $1 is not set."
        exit 1
    fi
}

require_env DRIVER_VERSION
require_env KERNEL_NAME

echo "Compiling NVIDIA modules for driver version $DRIVER_VERSION on kernel $KERNEL_NAME"

set -x
mkdir -p /tmp/nvidia

require_env TARGET_ARCH

# Maps the Docker-style architecture name to the one used in NVIDIA file names
# and Debian multiarch library directories.
declare -A arch_translation
arch_translation=(["amd64"]="x86_64" ["arm64"]="aarch64")

if [[ ! ${arch_translation[$TARGET_ARCH]+_} ]]; then
    echo "Error: Unsupported TARGET_ARCH value."
    exit 2
fi
ARCH_TYPE=${arch_translation[$TARGET_ARCH]}

pushd /tmp/nvidia || exit 1
DRIVER_URL="https://uk.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-$ARCH_TYPE-$DRIVER_VERSION.run"
# -f turns HTTP errors (e.g. 404 for an unknown version) into a non-zero exit
# instead of saving the error page as nvidia.run; -S keeps the error visible
# despite -s.
if ! curl -fLsS "${DRIVER_URL}" -o nvidia.run ; then
    echo "Failed to download ${DRIVER_URL}"
    exit 1
fi
chmod +x nvidia.run
if ! ./nvidia.run -x -s; then
    echo "Failed to extract ${DRIVER_URL}"
    exit 1
fi

pushd "./NVIDIA-Linux-$ARCH_TYPE-$DRIVER_VERSION" || exit 1
export IGNORE_MISSING_MODULE_SYMVERS=1
OUTDIR="/out/nvidia/$DRIVER_VERSION"
INSTALLER_LOG="$PWD/nvidia-installer.log"

# Options shared by every architecture.
installer_args=(
    --no-opengl-files
    --no-libglx-indirect
    --no-install-libglvnd
    --kernel-name="$KERNEL_NAME"
    --no-drm
    --ui=none
    --no-questions
    --no-kernel-module-source
    --no-systemd
    --skip-depmod
    --log-file-name="$INSTALLER_LOG"
    --utility-prefix="$OUTDIR"
    --utility-libdir=lib
    --kernel-install-path="$OUTDIR/lib/modules/$KERNEL_NAME"
)

# The compat32 switch is only passed on amd64; arm64 is built without it.
if [ "$TARGET_ARCH" = "amd64" ]; then
    installer_args+=(--no-install-compat32-libs)
fi

# Success requires both a zero exit status and the presence of nvidia.ko in the
# install path.
if ./nvidia-installer "${installer_args[@]}" \
    && test -e "$OUTDIR/lib/modules/$KERNEL_NAME/nvidia.ko"
then
    echo "Successfully compiled NVIDIA modules"
else
    echo "[ERROR] Failed to compile NVIDIA modules"
    cat "$INSTALLER_LOG"
    exit 1
fi

echo "Archiving assets"

# Archive library .so files
cp /usr/lib/"$ARCH_TYPE"-linux-gnu/*nvidia* /usr/lib/"$ARCH_TYPE"-linux-gnu/*cuda* "$OUTDIR"/lib

# We don't need the installer binaries, or the icons/desktop files in /share
rm -rf "$OUTDIR"/bin/*install* "$OUTDIR"/share

# The unpacked tree is removed only after the archive was written successfully.
tar czf "$OUTDIR".tar.gz --directory "$(dirname "$OUTDIR")" "$(basename "$OUTDIR")" && rm -rf "$OUTDIR"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#!/bin/bash
# Installs the NVIDIA Fabric Manager package that matches DRIVER_VERSION from
# NVIDIA's CUDA apt repository.
#
# Required environment variables:
#   DRIVER_VERSION  full driver version; its leading number selects the package
#                   name (nvidia-fabricmanager-<branch>) and the full value pins
#                   the package version
#   TARGET_ARCH     "amd64" or "arm64"
#
# Exit status:
#   1  a required variable is empty or DRIVER_VERSION has no numeric prefix
#   2  TARGET_ARCH is not one of the supported values
#   otherwise the status of the first failing download/install command

echo "Downloading NVIDIA fabric manager for driver version $DRIVER_VERSION"
set -x

if [ -z "$DRIVER_VERSION" ]; then
    echo "Error: DRIVER_VERSION is not set."
    exit 1
fi

# Leading digits of the version, e.g. "535" for "535.86.10".
DRIVER_BRANCH=$(echo "$DRIVER_VERSION" | grep -oE '^[0-9]+')
if [ -z "$DRIVER_BRANCH" ]; then
    echo "Error: DRIVER_VERSION '$DRIVER_VERSION' does not start with a number."
    exit 1
fi

if [ -z "$TARGET_ARCH" ]; then
    echo "Error: TARGET_ARCH is not set."
    exit 1
fi

declare -A arch_translation
arch_translation=(["amd64"]="x86_64" ["arm64"]="aarch64")

if [[ ! ${arch_translation[$TARGET_ARCH]+_} ]]; then
    echo "Error: Unsupported TARGET_ARCH value."
    exit 2
fi

# From here on every failing command aborts the script, so a broken download or
# repository setup is reported where it happens.
set -e

mkdir -p /tmp/nvidia
pushd /tmp/nvidia

# Register NVIDIA's CUDA repository.
# NOTE(review): the keyring URL is fixed to the x86_64 repository even though
# TARGET_ARCH accepts arm64 - confirm which repository path arm64 builds need.
KEYRING_DEB=/tmp/keyring.deb
wget -O "$KEYRING_DEB" https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb
dpkg -i "$KEYRING_DEB"
# The package file is not needed once installed; do not leave it in /tmp.
rm -f "$KEYRING_DEB"

apt-get update
# -y: the script runs without a terminal, so apt-get must not stop at a
# confirmation prompt.
apt-get install -y -V nvidia-fabricmanager-"$DRIVER_BRANCH"="$DRIVER_VERSION"-1
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#!/bin/bash
# Container entry point. Dispatches on the first argument:
#   --driver         source load_install_gpu_driver.sh
#   --fabricManager  source install_fabricmanager.sh
#   (no argument)    idle
# Any other argument is rejected with exit status 1.

# Directory holding the installer scripts; may be overridden from the environment.
: "${BIN_DIR:=/opt/nvidia-installer}"

# Announce and run the driver installation script in the current shell.
function driver {
    echo "Driver"
    source "${BIN_DIR}/load_install_gpu_driver.sh"
}

# Announce and run the Fabric Manager script in the current shell.
function fabricManager {
    echo "FabricManager"
    source "${BIN_DIR}/install_fabricmanager.sh"
}

if [[ "$1" == "--driver" ]]; then
    driver
elif [[ "$1" == "--fabricManager" ]]; then
    fabricManager
elif [[ -z "$1" ]]; then
    echo "Sleep..."
    sleep 1000d
else
    echo "Error: Unknown argument $1"
    exit 1
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#!/bin/bash

# Prints the kernel name derived from a linux-headers-* folder in /usr/src.
#
# Usage: <script> <kernel_type> <kernel_arch>
#   kernel_type  "cloud" selects folders containing "cloud"; any other value
#                selects folders that do NOT contain "cloud"
#   kernel_arch  text the folder name must contain
#
# This helper assumes the garden linux repository is configured in
# /etc/apt/sources.list.

header_kind=$1
header_arch=$2

# Arguments for the grep stage that keeps or drops the "cloud" folders.
if [ "${header_kind}" != "cloud" ]; then
    cloud_filter=(-v cloud)
else
    cloud_filter=(cloud)
fi

# Candidate folders are prefixed with their length, sorted numerically, and the
# prefix is cut off again, so the shortest matching name comes first.
# shellcheck disable=SC2010
shortest_headers_dir=$(
    ls /usr/src \
        | grep "linux-headers-" \
        | grep "${header_arch}" \
        | grep "${cloud_filter[@]}" \
        | awk '{ print length, $0 }' \
        | sort -n \
        | cut -d" " -f2- \
        | head -n 1
)

# Drop the "linux-headers-" part; what remains is the kernel name.
echo "${shortest_headers_dir//linux-headers-/}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/bin/bash
# Loads the NVIDIA kernel modules from $INSTALL_DIR/$DRIVER_NAME and creates the
# /dev/nvidia* device nodes when they do not exist yet.
#
# Environment variables read:
#   DRIVER_VERSION  only used in the log message
#   DEBUG           "true" enables command tracing and prints the environment
#   INSTALL_DIR, DRIVER_NAME  together name the module tree handed to depmod/modprobe
#   NVIDIA_BIN      directory containing nvidia-smi and nvidia-modprobe
#
# Runs under `set -e`: any unguarded failing command aborts the script.
echo "Installing NVIDIA modules for driver version $DRIVER_VERSION"
set -e

# Compare explicitly instead of executing $DEBUG: an unset DEBUG would expand to
# an empty command, which succeeds and would switch tracing and the environment
# dump on by accident.
if [ "${DEBUG:-false}" = "true" ]; then
    set -x
    env
fi

# Capture the status separately so that the output is still printed when depmod
# fails; otherwise `set -e` would exit before the message is shown.
depmod_status=0
error_out=$(depmod -b "$INSTALL_DIR/$DRIVER_NAME" 2>&1) || depmod_status=$?
# "grep -v ..." removes warnings that do not cause a problem for the gpu driver installation
echo "$error_out" | grep -v 'depmod: WARNING:' || true
if [ "$depmod_status" -ne 0 ]; then
    exit "$depmod_status"
fi

modprobe -q -d "$INSTALL_DIR/$DRIVER_NAME" nvidia
modprobe -q -d "$INSTALL_DIR/$DRIVER_NAME" nvidia-uvm

# One /dev/nvidiaN node per NVIDIA "3D controller" / "VGA compatible controller"
# reported by lspci. grep exits non-zero when nothing matches, hence `|| true`.
if [ ! -e /dev/nvidia0 ] ; then
    NVDEVS=$(lspci | grep -i NVIDIA) || true
    N3D=$(echo "$NVDEVS" | grep -c "3D controller") || true
    NVGA=$(echo "$NVDEVS" | grep -c "VGA compatible controller") || true
    # Highest device index; -1 when no device was found, which makes seq print nothing.
    N=$((N3D + NVGA - 1))
    for i in $(seq 0 $N); do mknod -m 666 /dev/nvidia"$i" c 195 "$i"; done
fi
if [ ! -e /dev/nvidiactl ] ; then
    mknod -m 666 /dev/nvidiactl c 195 255
fi
if [ ! -e /dev/nvidia-uvm ] ; then
    # Field-based match: tolerates space-padded major numbers and ignores other
    # entries that merely contain "nvidia-uvm" in their name.
    D=$(awk '$2 == "nvidia-uvm" { print $1; exit }' /proc/devices)
    if [ -z "$D" ]; then
        echo "Error: nvidia-uvm not found in /proc/devices"
        exit 1
    fi
    mknod -m 666 /dev/nvidia-uvm c "$D" 0
fi

# For A100 GPUs we install additional device files to support Fabric Manager
# NOTE(review): only "A100" is matched here - confirm whether other GPU models
# that use Fabric Manager need the same device files.
GPU_NAME=$("${NVIDIA_BIN}"/nvidia-smi -i 0 --query-gpu=name --format=csv,noheader)
if [[ "$GPU_NAME" == *"A100"* ]]; then
    "${NVIDIA_BIN}"/nvidia-modprobe --unified-memory --nvlink
    for nvswitch_minor in 0 1 2 3 4 5; do
        "${NVIDIA_BIN}"/nvidia-modprobe --nvswitch -c "$nvswitch_minor"
    done
fi

echo "NVIDIA driver installed OK"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#!/bin/bash
# Starts NVIDIA Fabric Manager when GPU 0 is one of the models that need it,
# then idles forever.
#
# Environment variables read:
#   DEBUG        "true" enables command tracing (defaults to "false")
#   BIN_DIR      directory containing set_env_vars.sh (defaults to /opt/nvidia-installer)
#   NVIDIA_ROOT  prefix under which bin/nvidia-smi is looked up; not set in this
#                script - presumably provided by set_env_vars.sh (verify)

DEBUG=${DEBUG:-false}
export DEBUG

if ${DEBUG}; then set -x; fi

BIN_DIR=${BIN_DIR:-/opt/nvidia-installer}
# shellcheck disable=SC1090
source "${BIN_DIR}/set_env_vars.sh"

GPU_NAME=$("${NVIDIA_ROOT}/bin/nvidia-smi" -i 0 --query-gpu=name --format=csv,noheader)

# Typical GPU name is something like "NVIDIA H100 80GB HBM3".
# Fabric manager is required by the newer, bigger GPUs like A100, H100, etc.,
# so those model names are matched anywhere inside the reported name.
case "$GPU_NAME" in
    *A100*|*H100*|*H200*|*B100*|*B200*)
        # Derive the runtime config from the packaged one: DAEMONIZE is switched
        # from 1 to 0 and the LOG_FILE_NAME value is emptied.
        sed -e 's/DAEMONIZE=1/DAEMONIZE=0/g' \
            -e 's/LOG_FILE_NAME=.*$/LOG_FILE_NAME=/g' \
            "/usr/share/nvidia/nvswitch/fabricmanager.cfg" > /etc/fabricmanager.cfg

        # Run Fabric Manager
        nv-fabricmanager -c /etc/fabricmanager.cfg
        ;;
esac

echo "Sleep infinity"
sleep infinity
Oops, something went wrong.