Skip to content
This repository has been archived by the owner on Dec 4, 2023. It is now read-only.

Commit

Permalink
Support multiple GPUs (#173)
Browse files Browse the repository at this point in the history
* support stats reporting for multiple GPUs

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* format

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
sekulicd and pre-commit-ci[bot] authored Nov 2, 2023
1 parent 2c17e61 commit 8b38400
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 32 deletions.
37 changes: 25 additions & 12 deletions app/core/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,15 +279,21 @@ def get_system_stats_all():


def get_gpu_stats_all():
    """Return aggregated stats across all available GPUs.

    The result contains the summed memory figures and the averaged
    utilisation/load produced by ``utils.total_gpu_stats``, plus the
    per-device breakdown under the ``"gpus"`` key.  When no GPU is
    available an empty dict is returned.
    """
    if not utils.is_gpu_available():
        return {}

    per_gpu = utils.get_gpu_infos()
    aggregated = utils.total_gpu_stats(per_gpu)

    # Expose the aggregate figures alongside the per-device list.
    return {
        "total_memory": aggregated["total_memory"],
        "used_memory": aggregated["used_memory"],
        "free_memory": aggregated["free_memory"],
        "average_utilised_memory": aggregated["average_utilised_memory"],
        "average_load": aggregated["average_load"],
        "gpus": per_gpu,
    }


def system_prune():
Expand All @@ -300,9 +306,16 @@ def system_prune():

def get_free_total_memory():
if utils.is_gpu_available():
gpu_values = get_gpu_stats_all()
free_memory = gpu_values["total_memory"] - gpu_values["used_memory"]
return free_memory, gpu_values["total_memory"]
total_memory = 0
free_memory = 0

if utils.is_gpu_available():
all_gpu_stats = utils.get_gpu_infos()
total_gpu = utils.total_gpu_stats(all_gpu_stats)
total_memory = total_gpu["free_memory"]
free_memory = total_gpu["total_memory"]

return total_memory, free_memory
else:
values = get_system_stats_all()
free_memory = values["memory_limit"] - values["memory_usage"]
Expand Down
49 changes: 31 additions & 18 deletions app/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import pty
import signal
import subprocess
import xml.etree.ElementTree as ET
from http import HTTPStatus

import docker
Expand Down Expand Up @@ -215,23 +214,37 @@ def format_stats(value):
return cpu_percentage, memory_usage, memory_limit, memory_percentage


def get_gpu_info():
    """Return (name, total MiB, used MiB, used %) for the first GPU.

    Legacy single-GPU helper (superseded by ``get_gpu_infos`` in this
    commit): it shells out to ``nvidia-smi -q -x`` and inspects only the
    first ``<gpu>`` element, so multi-GPU hosts are under-reported.
    """
    nvidia_smi_xml = subprocess.check_output(["nvidia-smi", "-q", "-x"]).decode()

    root = ET.fromstring(nvidia_smi_xml)

    # find("gpu") returns only the first matching element — stats for any
    # additional GPUs are silently ignored.
    gpu = root.find("gpu")

    gpu_name = gpu.find("product_name").text
    total_memory = gpu.find("fb_memory_usage/total").text
    used_memory = gpu.find("fb_memory_usage/used").text

    # Strip the 4-character unit suffix before parsing the number
    # (presumably " MiB" as emitted by nvidia-smi — TODO confirm).
    total_memory_value = int(total_memory[:-4])
    used_memory_value = int(used_memory[:-4])

    mem_percentage = (used_memory_value / total_memory_value) * 100

    return gpu_name, total_memory_value, used_memory_value, mem_percentage
def get_gpu_infos():
    """Return a list of per-device stat dicts, one per GPU GPUtil reports.

    Memory figures are divided by 1024 (GPUtil reports MiB, so values
    come out in GiB) and rounded to two decimals; ``utilised_memory`` is
    the used/total ratio and ``load`` is GPUtil's load figure, both
    rounded to two decimals.
    """
    return [
        {
            "gpu_name": device.name,
            "total_memory": round(device.memoryTotal / 1024, 2),
            "used_memory": round(device.memoryUsed / 1024, 2),
            "free_memory": round(device.memoryFree / 1024, 2),
            "utilised_memory": round(device.memoryUsed / device.memoryTotal, 2),
            "load": round(device.load, 2),
        }
        for device in GPUtil.getGPUs()
    ]


def total_gpu_stats(gpu_infos):
    """Aggregate per-GPU stat dicts (from ``get_gpu_infos``) into totals.

    Memory fields are summed across devices; ``utilised_memory`` and
    ``load`` are averaged and rounded to two decimals.

    Args:
        gpu_infos: list of dicts with keys ``total_memory``,
            ``used_memory``, ``free_memory``, ``utilised_memory``, ``load``.

    Returns:
        Dict with ``total_memory``, ``used_memory``, ``free_memory``,
        ``average_utilised_memory`` and ``average_load``.  For an empty
        list all values are 0 (previously this raised ZeroDivisionError
        when GPUtil reported no devices).
    """
    if not gpu_infos:
        # Guard the averages' division by len(gpu_infos).
        return {
            "total_memory": 0,
            "used_memory": 0,
            "free_memory": 0,
            "average_utilised_memory": 0,
            "average_load": 0,
        }

    gpu_count = len(gpu_infos)
    return {
        "total_memory": sum(info["total_memory"] for info in gpu_infos),
        "used_memory": sum(info["used_memory"] for info in gpu_infos),
        "free_memory": sum(info["free_memory"] for info in gpu_infos),
        "average_utilised_memory": round(
            sum(info["utilised_memory"] for info in gpu_infos) / gpu_count, 2
        ),
        "average_load": round(
            sum(info["load"] for info in gpu_infos) / gpu_count, 2
        ),
    }


cached_domain = None
Expand Down
15 changes: 13 additions & 2 deletions app/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,19 @@ class OSStatsResponse(BaseModel):
storage_percentage: float


class GPUStatsResponse(BaseModel):
class SingleGPUStats(BaseModel):
    """Stats for one GPU device, matching the dicts built by ``utils.get_gpu_infos``."""

    # Every field defaults to None so the model still validates when a
    # stat is unavailable (pydantic treats an annotated field with a
    # None default as optional — NOTE(review): `float | None` would be
    # the explicit spelling; confirm target Python/pydantic versions).
    gpu_name: str = None  # device product name
    total_memory: float = None  # GiB (rounded to 2 decimals)
    used_memory: float = None  # GiB (rounded to 2 decimals)
    memory_percentage: float = None  # legacy field; not populated by get_gpu_infos — verify callers
    free_memory: float = None  # GiB (rounded to 2 decimals)
    utilised_memory: float = None  # used/total ratio
    load: float = None  # GPU load reported by GPUtil


class GPUStatsResponse(BaseModel):
    """Aggregate GPU stats across all devices, as returned by ``get_gpu_stats_all``."""

    # Fields mirror utils.total_gpu_stats output; all optional so an
    # empty response (no GPU available) still validates.
    total_memory: float = None  # summed across devices
    used_memory: float = None  # summed across devices
    free_memory: float = None  # summed across devices
    average_utilised_memory: float = None  # mean used/total ratio
    average_load: float = None  # mean load across devices
    gpus: list[SingleGPUStats] = None  # per-device breakdown

0 comments on commit 8b38400

Please sign in to comment.