Skip to content

Commit

Permalink
dockerfile
Browse files Browse the repository at this point in the history
  • Loading branch information
tmilos77 committed Feb 14, 2025
1 parent 2fe4f8a commit 7e3bb2d
Show file tree
Hide file tree
Showing 10 changed files with 488 additions and 1 deletion.
21 changes: 21 additions & 0 deletions .github/workflows/image-build-main.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Builds the gpu-driver image by delegating to the shared kyma-project
# image-builder reusable workflow.
# NOTE(review): the workflow name mentions Cloud-Manager although the image
# built below is named gpu-driver — confirm that the name is intentional.
name: Cloud-Manager Build Image

# Triggered by pushes to the main branch and by pushes of any tag.
on:
  push:
    branches: [main]
    tags: ["*"]

permissions:
  id-token: write # required for requesting the JWT token
  contents: read # required for actions/checkout

jobs:
  build-image:
    uses: kyma-project/test-infra/.github/workflows/image-builder.yml@main
    with:
      # The pushed branch or tag name becomes the image tag.
      tags: ${{ github.ref_name }}
      name: gpu-driver
      dockerfile: Dockerfile
      export-tags: true
38 changes: 37 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1 +1,37 @@
FROM debian:bookworm-slim
# syntax=docker/dockerfile:1
#
# Two-stage build:
#   packager - Debian stage that installs the helper tools, runs the Fabric
#              Manager download script, strips packages/files that are not
#              needed, and assembles the result under /rootfs.
#   final    - scratch image that contains only the assembled /rootfs.
ARG REGISTRY_PATH=gardenlinux/kmodbuild

FROM debian:bookworm-slim AS packager

# Install the OS packages first so this layer stays cached when only the
# scripts or the driver version change. The apt lists are removed in the same
# layer that created them.
RUN apt-get update && apt-get install --no-install-recommends -y \
        ca-certificates \
        kmod \
        pciutils \
        wget \
        xz-utils \
    && rm -rf /var/lib/apt/lists/*

COPY resources/scripts/* /opt/nvidia-installer/

# Build arguments read (as environment variables) by download_fabricmanager.sh.
# TARGET_ARCH falls back to the builder-provided TARGETARCH when the builder
# sets one; an explicit --build-arg TARGET_ARCH=... still takes precedence.
ARG TARGETARCH
ARG TARGET_ARCH=${TARGETARCH}
ARG DRIVER_VERSION

RUN /opt/nvidia-installer/download_fabricmanager.sh

# Remove several things that are not needed, some of which raise Black Duck scan vulnerabilities
RUN apt-get remove -y --autoremove --allow-remove-essential --ignore-hold \
        libgnutls30 apt openssl wget ncurses-base ncurses-bin

# The multiarch library directory is matched with a glob so the same files are
# removed on every architecture, not only under x86_64-linux-gnu.
RUN rm -rf /var/lib/apt/lists/* /usr/bin/dpkg /sbin/start-stop-daemon /usr/lib/*-linux-gnu/libsystemd.so* \
        /var/lib/dpkg/info/libdb5.3* /usr/lib/*-linux-gnu/libdb-5.3.so* /usr/share/doc/libdb5.3 \
        /usr/bin/chfn /usr/bin/gpasswd

# Assemble the final root filesystem from the stripped-down stage.
RUN mkdir -p /rootfs \
    && cp -ar /bin /boot /etc /home /lib /lib64 /media /mnt /opt /root /run /sbin /srv /tmp /usr /var /rootfs \
    && rm -rf /rootfs/opt/actions-runner

FROM scratch

COPY --from=packager /rootfs /

ENTRYPOINT ["/opt/nvidia-installer/entrypoint.sh"]
115 changes: 115 additions & 0 deletions resources/scripts/compile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/bin/bash
#
# Downloads the NVIDIA driver installer for DRIVER_VERSION, builds the kernel
# modules for KERNEL_NAME and archives the result as
# /out/nvidia/$DRIVER_VERSION.tar.gz.
#
# Required environment variables:
#   DRIVER_VERSION  NVIDIA driver version to download and build
#   KERNEL_NAME     kernel name passed to nvidia-installer (--kernel-name)
#   TARGET_ARCH     amd64 or arm64
#
# Exit codes:
#   1  missing variable, download/extraction failure or build failure
#   2  unsupported TARGET_ARCH
if [ -z "$DRIVER_VERSION" ]; then
    echo "Error: DRIVER_VERSION is not set."
    exit 1
fi
if [ -z "$KERNEL_NAME" ]; then
    echo "Error: KERNEL_NAME is not set."
    exit 1
fi

echo "Compiling NVIDIA modules for driver version $DRIVER_VERSION on kernel $KERNEL_NAME"

set -x
mkdir -p /tmp/nvidia

if [ -z "$TARGET_ARCH" ]; then
    echo "Error: TARGET_ARCH is not set."
    exit 1
fi

# Maps the TARGET_ARCH naming onto the naming used in NVIDIA download URLs
# and in the multiarch library directory.
declare -A arch_translation
arch_translation=(["amd64"]="x86_64" ["arm64"]="aarch64")

if [[ ! ${arch_translation[$TARGET_ARCH]+_} ]]; then
    echo "Error: Unsupported TARGET_ARCH value."
    exit 2
fi
ARCH_TYPE=${arch_translation[$TARGET_ARCH]}

pushd /tmp/nvidia || exit 1
DRIVER_URL="https://uk.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-$ARCH_TYPE-$DRIVER_VERSION.run"
# -f makes curl fail on HTTP errors (e.g. 404 for an unknown driver version)
# instead of saving the error page as nvidia.run and reporting success.
if ! curl -fLs "${DRIVER_URL}" -o nvidia.run ; then
    echo "Failed to download ${DRIVER_URL}"
    exit 1
fi
chmod +x nvidia.run
if ! ./nvidia.run -x -s; then
    echo "Failed to extract ${DRIVER_URL}"
    exit 1
fi

pushd "./NVIDIA-Linux-$ARCH_TYPE-$DRIVER_VERSION" || exit 1
export IGNORE_MISSING_MODULE_SYMVERS=1
OUTDIR="/out/nvidia/$DRIVER_VERSION"

# Installer flags shared by all architectures.
installer_args=(
    --no-opengl-files
    --no-libglx-indirect
    --no-install-libglvnd
    --kernel-name="$KERNEL_NAME"
    --no-drm
    --ui=none --no-questions
    --no-kernel-module-source
    --no-systemd
    --skip-depmod
    --log-file-name="$PWD"/nvidia-installer.log
    --utility-prefix="$OUTDIR"
    --utility-libdir=lib
    --kernel-install-path="$OUTDIR"/lib/modules/"$KERNEL_NAME"
)
# The compat32 flag is only passed on amd64.
if [ "$TARGET_ARCH" = "amd64" ]; then
    installer_args+=(--no-install-compat32-libs)
fi

# The build counts as successful only when the installer exits cleanly AND
# the nvidia.ko module was actually produced.
if ./nvidia-installer "${installer_args[@]}" \
    && test -e "$OUTDIR"/lib/modules/"$KERNEL_NAME"/nvidia.ko
then
    echo "Successfully compiled NVIDIA modules"
else
    echo "[ERROR] Failed to compile NVIDIA modules"
    cat "$PWD"/nvidia-installer.log
    exit 1
fi

echo "Archiving assets"

# Archive library .so files
cp /usr/lib/"$ARCH_TYPE"-linux-gnu/*nvidia* /usr/lib/"$ARCH_TYPE"-linux-gnu/*cuda* "$OUTDIR"/lib

# We don't need the installer binaries, or the icons/desktop files in /share
rm -rf "$OUTDIR"/bin/*install* "$OUTDIR"/share

# Pack the output directory next to itself and drop the unpacked copy once
# the archive was written successfully.
tar czf "$OUTDIR".tar.gz --directory "$(dirname "$OUTDIR")" "$(basename "$OUTDIR")" && rm -rf "$OUTDIR"
27 changes: 27 additions & 0 deletions resources/scripts/download_fabricmanager.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
#
# Installs the NVIDIA Fabric Manager package matching DRIVER_VERSION from
# NVIDIA's CUDA apt repository for Debian 12.
#
# Required environment variables:
#   DRIVER_VERSION  full driver version; its leading number selects the
#                   nvidia-fabricmanager-<branch> package
#   TARGET_ARCH     amd64 or arm64
#
# Exit codes:
#   1  missing variable, or the repository keyring could not be installed
#   2  unsupported TARGET_ARCH
#   otherwise the exit status of the final apt-get install
if [ -z "$DRIVER_VERSION" ]; then
    echo "Error: DRIVER_VERSION is not set."
    exit 1
fi
echo "Downloading NVIDIA fabric manager for driver version $DRIVER_VERSION"
set -x
# Driver branch = leading run of digits of the version.
DRIVER_BRANCH=$(echo "$DRIVER_VERSION" | grep -oE '^[0-9]+')
if [ -z "$TARGET_ARCH" ]; then
    echo "Error: TARGET_ARCH is not set."
    exit 1
fi

declare -A arch_translation
arch_translation=(["amd64"]="x86_64" ["arm64"]="aarch64")

if [[ ! ${arch_translation[$TARGET_ARCH]+_} ]]; then
    echo "Error: Unsupported TARGET_ARCH value."
    exit 2
fi

mkdir -p /tmp/nvidia

pushd /tmp/nvidia || exit 1

# Register NVIDIA's apt repository via its keyring package.
# NOTE(review): the keyring URL is pinned to the x86_64 repository path even
# when TARGET_ARCH is arm64 — confirm which repository arm64 builds should use.
if ! { wget -O /tmp/keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb && dpkg -i /tmp/keyring.deb; }; then
    echo "Error: Failed to install the NVIDIA repository keyring."
    exit 1
fi
apt-get update
# -y: the script runs without a terminal, so apt-get must never wait for a
# confirmation prompt.
apt-get install -y -V nvidia-fabricmanager-"$DRIVER_BRANCH"="$DRIVER_VERSION"-1

30 changes: 30 additions & 0 deletions resources/scripts/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
#
# Container entrypoint. Dispatches on the first argument:
#   --driver         sources load_install_gpu_driver.sh
#   --fabricManager  sources install_fabricmanager.sh
#   (no argument)    prints a message and sleeps
#   anything else    prints an error and exits with status 1

# Directory holding the helper scripts; can be overridden from the environment.
: "${BIN_DIR:=/opt/nvidia-installer}"

# The helpers are sourced from inside functions, so they run in this shell.
driver() {
    echo "Driver"
    . "${BIN_DIR}/load_install_gpu_driver.sh"
}

fabricManager() {
    echo "FabricManager"
    . "${BIN_DIR}/install_fabricmanager.sh"
}

if [[ -z "$1" ]]; then
    echo "Sleep..."
    sleep 1000d
elif [[ "$1" == "--driver" ]]; then
    driver
elif [[ "$1" == "--fabricManager" ]]; then
    fabricManager
else
    echo "Error: Unknown argument $1"
    exit 1
fi
21 changes: 21 additions & 0 deletions resources/scripts/extract_kernel_name.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash

# Prints the kernel name derived from a linux-headers-* folder in /usr/src.
#
# Usage: extract_kernel_name.sh <kernel_type> <kernel_arch>
#   kernel_type  "cloud" keeps only folders containing "cloud";
#                any other value excludes those folders
#   kernel_arch  text the folder name must contain
#
# This helper assumes the garden linux repository is configured in
# /etc/apt/sources.list.

kernel_type=$1
kernel_arch=$2

# Arguments for the grep stage that keeps or drops the "cloud" folders.
if [ "${kernel_type}" == "cloud" ]; then
    cloud_filter=(cloud)
else
    cloud_filter=(-v cloud)
fi

# Prefix each candidate with its length, sort numerically, strip the length
# again and keep the first entry, i.e. the shortest matching folder name.
# shellcheck disable=SC2010
kernel_headers=$(
    ls /usr/src \
        | grep "linux-headers-" \
        | grep "${kernel_arch}" \
        | grep "${cloud_filter[@]}" \
        | awk '{ print length, $0 }' \
        | sort -n \
        | cut -d" " -f2- \
        | head -n 1
)

# Strip the "linux-headers-" text to obtain the kernel name.
echo "${kernel_headers//linux-headers-}"
43 changes: 43 additions & 0 deletions resources/scripts/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
#
# Loads the NVIDIA kernel modules from $INSTALL_DIR/$DRIVER_NAME and creates
# the /dev/nvidia* device nodes when they are missing.
#
# Environment variables read:
#   DRIVER_VERSION  used in the log message only
#   DEBUG           "true" enables shell tracing and dumps the environment
#   INSTALL_DIR     together with DRIVER_NAME, the base directory passed to
#   DRIVER_NAME     depmod (-b) and modprobe (-d)
#   NVIDIA_BIN      directory containing nvidia-smi and nvidia-modprobe
echo "Installing NVIDIA modules for driver version $DRIVER_VERSION"
set -e

# Compare DEBUG as a string instead of executing it as a command: an unset or
# empty DEBUG would expand to an empty command, which succeeds and would turn
# tracing and the environment dump on unintentionally.
if [ "${DEBUG:-false}" = "true" ]; then
    set -x
    env
fi

error_out=$(depmod -b "$INSTALL_DIR/$DRIVER_NAME" 2>&1)
# "grep -v ..." removes warnings that do not cause a problem for the gpu driver installation
echo "$error_out" | grep -v 'depmod: WARNING:' || true

modprobe -q -d "$INSTALL_DIR/$DRIVER_NAME" nvidia
modprobe -q -d "$INSTALL_DIR/$DRIVER_NAME" nvidia-uvm
if [ ! -e /dev/nvidia0 ] ; then
    # One device node per NVIDIA "3D controller" / "VGA compatible controller"
    # entry reported by lspci. grep -c exits non-zero on a zero count, hence
    # the "|| true" so set -e does not abort.
    NVDEVS=$(lspci | grep -i NVIDIA)
    N3D=$(echo "$NVDEVS" | grep -c "3D controller") || true
    NVGA=$(echo "$NVDEVS" | grep -c "VGA compatible controller") || true
    N=$((N3D + NVGA - 1)) || true
    for i in $(seq 0 $N); do mknod -m 666 /dev/nvidia"$i" c 195 "$i"; done
fi
if [ ! -e /dev/nvidiactl ] ; then
    mknod -m 666 /dev/nvidiactl c 195 255
fi
if [ ! -e /dev/nvidia-uvm ] ; then
    # The major number for nvidia-uvm is read from /proc/devices.
    D=$(grep nvidia-uvm /proc/devices | cut -d " " -f 1)
    mknod -m 666 /dev/nvidia-uvm c "$D" 0
fi

# For A100 GPUs we install additional device files to support Fabric Manager
# NOTE(review): install_fabricmanager.sh also treats H100/H200/B100/B200 as
# Fabric Manager GPUs — confirm whether they need these device files too.
GPU_NAME=$("${NVIDIA_BIN}"/nvidia-smi -i 0 --query-gpu=name --format=csv,noheader)
if [[ "$GPU_NAME" == *"A100"* ]]; then
    "${NVIDIA_BIN}"/nvidia-modprobe --unified-memory --nvlink
    for minor in 0 1 2 3 4 5; do
        "${NVIDIA_BIN}"/nvidia-modprobe --nvswitch -c "$minor"
    done
fi

echo "NVIDIA driver installed OK"
25 changes: 25 additions & 0 deletions resources/scripts/install_fabricmanager.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
#
# Runs NVIDIA Fabric Manager when GPU 0 is one of the models that need it,
# then sleeps forever.
#
# Environment variables read: DEBUG, BIN_DIR, NVIDIA_ROOT.

export DEBUG=${DEBUG:-false}
if ${DEBUG}; then set -x; fi

: "${BIN_DIR:=/opt/nvidia-installer}"
# shellcheck disable=SC1090
. "$BIN_DIR"/set_env_vars.sh

GPU_NAME=$("${NVIDIA_ROOT}"/bin/nvidia-smi -i 0 --query-gpu=name --format=csv,noheader)

# A GPU name typically looks like "NVIDIA H100 80GB HBM3". Fabric Manager is
# required by the newer, bigger GPUs (A100, H100, ...), so those model names
# are matched anywhere in the reported name.
case "$GPU_NAME" in
    *A100* | *H100* | *H200* | *B100* | *B200*)
        # Derive the runtime config from the packaged one: DAEMONIZE is
        # switched to 0 and LOG_FILE_NAME is emptied.
        sed -e 's/DAEMONIZE=1/DAEMONIZE=0/g' \
            -e 's/LOG_FILE_NAME=.*$/LOG_FILE_NAME=/g' \
            "/usr/share/nvidia/nvswitch/fabricmanager.cfg" > /etc/fabricmanager.cfg

        # Run Fabric Manager
        nv-fabricmanager -c /etc/fabricmanager.cfg
        ;;
esac
echo "Sleep infinity"
sleep infinity
Loading

0 comments on commit 7e3bb2d

Please sign in to comment.