Skip to content

Commit

Permalink
Update images used in nvidia and rocm pipeline testing for 2.16 (mast…
Browse files Browse the repository at this point in the history
…er) (red-hat-data-services#2086)

Update images used in nvidia and rocm pipeline testing for 2.16

Use the workbench images available in 2.16 RC2

Signed-off-by: Jorge Garcia Oncins <[email protected]>
  • Loading branch information
jgarciao authored Dec 3, 2024
1 parent 40d3509 commit 0e5a93e
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 60 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# Runtime: Pytorch with ROCm and Python 3.9 (UBI 9)
common_base_image = (
"quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10"
"quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852"
)


Expand All @@ -14,11 +14,9 @@ def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_li
kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")


@dsl.component(
base_image=common_base_image
)
@dsl.component(base_image=common_base_image)
def verify_gpu_availability(gpu_toleration: bool):
import torch
import torch # noqa: PLC0415

cuda_available = torch.cuda.is_available()
device_count = torch.cuda.device_count()
Expand All @@ -30,7 +28,7 @@ def verify_gpu_availability(gpu_toleration: bool):
if gpu_toleration:
assert torch.cuda.is_available()
assert torch.cuda.device_count() > 0
t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
t = torch.tensor([5, 5, 5], dtype=torch.int64, device="cuda")
else:
assert not torch.cuda.is_available()
assert torch.cuda.device_count() == 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,18 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852
exec-verify-gpu-availability-2:
container:
args:
Expand All @@ -80,18 +80,18 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852
resources:
accelerator:
count: '1'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
from kfp.dsl import PipelineTask

# Runtime: Pytorch with CUDA and Python 3.9 (UBI 9)
# Images for each release can be found here (in the branch for the release)
common_base_image = (
"quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a"
"quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd"
)


Expand All @@ -14,11 +15,9 @@ def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_li
kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")


@dsl.component(
base_image=common_base_image
)
@dsl.component(base_image=common_base_image)
def verify_gpu_availability(gpu_toleration: bool):
import torch
import torch # noqa: PLC0415

cuda_available = torch.cuda.is_available()
device_count = torch.cuda.device_count()
Expand All @@ -30,7 +29,7 @@ def verify_gpu_availability(gpu_toleration: bool):
if gpu_toleration:
assert torch.cuda.is_available()
assert torch.cuda.device_count() > 0
t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
t = torch.tensor([5, 5, 5], dtype=torch.int64, device="cuda")
else:
assert not torch.cuda.is_available()
assert torch.cuda.device_count() == 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,18 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd
exec-verify-gpu-availability-2:
container:
args:
Expand All @@ -80,18 +80,18 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd
resources:
accelerator:
count: '1'
Expand Down

0 comments on commit 0e5a93e

Please sign in to comment.