diff --git a/app/core/services.py b/app/core/services.py
index 52fe516..ab14ffa 100644
--- a/app/core/services.py
+++ b/app/core/services.py
@@ -279,15 +279,18 @@ def get_system_stats_all():
 
 
 def get_gpu_stats_all():
-    if utils.is_gpu_available():
-        gpu_name, total_memory, used_memory, memory_percentage = utils.get_gpu_info()
-        return {
-            "gpu_name": gpu_name,
-            "total_memory": round(total_memory / 1024, 2),
-            "used_memory": round(used_memory / 1024, 2),
-            "memory_percentage": memory_percentage,
-        }
-    return {}
+    all_stats = utils.total_gpu_stats()
+    all_gpus = utils.get_gpu_infos()
+
+    stats = {
+        "total_memory": all_stats["total_memory"],
+        "used_memory": all_stats["used_memory"],
+        "free_memory": all_stats["free_memory"],
+        "average_utilised_memory": all_stats["average_utilised_memory"],
+        "average_load": all_stats["average_load"],
+        "gpus": all_gpus
+    }
+    return stats
 
 
 def system_prune():
@@ -300,9 +303,8 @@ def system_prune():
 
 def get_free_total_memory():
     if utils.is_gpu_available():
-        gpu_values = get_gpu_stats_all()
-        free_memory = gpu_values["total_memory"] - gpu_values["used_memory"]
-        return free_memory, gpu_values["total_memory"]
+        total_gpu = utils.total_gpu_stats()
+        return total_gpu["free_memory"], total_gpu["total_memory"]
     else:
         values = get_system_stats_all()
         free_memory = values["memory_limit"] - values["memory_usage"]
diff --git a/app/core/utils.py b/app/core/utils.py
index eaf8e2c..96d1ae0 100644
--- a/app/core/utils.py
+++ b/app/core/utils.py
@@ -213,24 +213,37 @@ def format_stats(value):
     return cpu_percentage, memory_usage, memory_limit, memory_percentage
 
 
-def get_gpu_info():
-    nvidia_smi_xml = subprocess.check_output(["nvidia-smi", "-q", "-x"]).decode()
-
-    root = ET.fromstring(nvidia_smi_xml)
-
-    gpu = root.find("gpu")
-
-    gpu_name = gpu.find("product_name").text
-    total_memory = gpu.find("fb_memory_usage/total").text
-    used_memory = gpu.find("fb_memory_usage/used").text
-
-    total_memory_value = int(total_memory[:-4])
-    used_memory_value = int(used_memory[:-4])
-
-    mem_percentage = (used_memory_value / total_memory_value) * 100
-
-    return gpu_name, total_memory_value, used_memory_value, mem_percentage
-
+def get_gpu_infos():
+    if is_gpu_available():
+        gpus = GPUtil.getGPUs()
+        gpu_infos = []
+        for gpu in gpus:
+            info = {
+                "gpu_name": gpu.name,
+                "total_memory": round(gpu.memoryTotal / 1024, 2),
+                "used_memory": round(gpu.memoryUsed / 1024, 2),
+                "free_memory": round(gpu.memoryFree / 1024, 2),
+                "utilised_memory": round(gpu.memoryUsed / gpu.memoryTotal, 2),
+                "load": round(gpu.load, 2)
+            }
+            gpu_infos.append(info)
+
+        return gpu_infos
+    return {}
+
+
+def total_gpu_stats():
+    infos = get_gpu_infos()
+
+    total_stats = {
+        "total_memory": sum(info["total_memory"] for info in infos),
+        "used_memory": sum(info["used_memory"] for info in infos),
+        "free_memory": sum(info["free_memory"] for info in infos),
+        "average_utilised_memory": round(sum(info["utilised_memory"] for info in infos) / len(infos), 2),
+        "average_load": round(sum(info["load"] for info in infos) / len(infos), 2)
+    }
+
+    return total_stats
 
 
 cached_domain = None
diff --git a/app/schemas.py b/app/schemas.py
index f0eec93..f00584a 100644
--- a/app/schemas.py
+++ b/app/schemas.py
@@ -1,3 +1,4 @@
+from typing import List
 from pydantic import BaseModel
 
 
@@ -101,8 +102,18 @@ class OSStatsResponse(BaseModel):
     storage_percentage: float
 
 
-class GPUStatsResponse(BaseModel):
+class SingleGPUStats(BaseModel):
     gpu_name: str = None
     total_memory: float = None
     used_memory: float = None
-    memory_percentage: float = None
+    free_memory: float = None
+    utilised_memory: float = None
+    load: float = None
+
+class GPUStatsResponse(BaseModel):
+    total_memory: float = None
+    used_memory: float = None
+    free_memory: float = None
+    average_utilised_memory: float = None
+    average_load: float = None
+    gpus: List[SingleGPUStats]
\ No newline at end of file
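Below the diff, a minimal standalone sketch (not part of the change set) of how the reshaped payload from `get_gpu_stats_all()` lines up with the new `GPUStatsResponse` / `SingleGPUStats` schemas. The models are restated inline so the snippet runs without the app package, and the sample GPU name and numbers are illustrative assumptions, not captured output.

```python
from typing import List

from pydantic import BaseModel


class SingleGPUStats(BaseModel):
    gpu_name: str = None
    total_memory: float = None
    used_memory: float = None
    free_memory: float = None
    utilised_memory: float = None
    load: float = None


class GPUStatsResponse(BaseModel):
    total_memory: float = None
    used_memory: float = None
    free_memory: float = None
    average_utilised_memory: float = None
    average_load: float = None
    gpus: List[SingleGPUStats]


# Illustrative single-GPU payload in the shape get_gpu_stats_all() now returns:
# per-GPU entries from utils.get_gpu_infos() under "gpus", totals and averages
# from utils.total_gpu_stats() at the top level (memory values in GiB).
sample = {
    "total_memory": 24.0,
    "used_memory": 6.0,
    "free_memory": 18.0,
    "average_utilised_memory": 0.25,
    "average_load": 0.12,
    "gpus": [
        {
            "gpu_name": "NVIDIA GeForce RTX 3090",  # assumed example device
            "total_memory": 24.0,
            "used_memory": 6.0,
            "free_memory": 18.0,
            "utilised_memory": 0.25,
            "load": 0.12,
        }
    ],
}

# Validate the dict against the new response model; the nested dicts are
# coerced into SingleGPUStats instances by pydantic.
print(GPUStatsResponse(**sample))
```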