Skip to content

Commit

Permalink
support more info (#668)
Browse files Browse the repository at this point in the history
* support more info

---------

Co-authored-by: ZeYi Lin <[email protected]>
  • Loading branch information
Puiching-Memory and Zeyi-Lin authored Aug 11, 2024
1 parent 822a7bb commit fc04910
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 4 deletions.
27 changes: 23 additions & 4 deletions swanlab/data/run/system/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __replace_second_colon(input_string, replacement):
if first_colon_index != -1:
second_colon_index = input_string.find(":", first_colon_index + 1)
if second_colon_index != -1:
return input_string[:second_colon_index] + replacement + input_string[second_colon_index + 1:]
return input_string[:second_colon_index] + replacement + input_string[second_colon_index + 1 :]
return input_string


Expand Down Expand Up @@ -90,13 +90,15 @@ def __get_git_branch_and_commit():

def __get_nvidia_gpu_info():
"""获取 GPU 信息"""
info = {"cores": None, "type": [], "memory": []}
info = {"driver": None, "cores": None, "type": [], "memory": []}
try:
pynvml.nvmlInit()
except:
return None

try:
# 获取 NVIDIA 驱动版本信息
info["driver"] = pynvml.nvmlSystemGetDriverVersion()
# 获取 NVIDIA GPU 数量
info["cores"] = pynvml.nvmlDeviceGetCount()
# 遍历每个 GPU,获取 GPU 信息
Expand All @@ -108,7 +110,7 @@ def __get_nvidia_gpu_info():
gpu_name = gpu_name.decode("utf-8")
info["type"].append(gpu_name)
# 获取 GPU 的总显存, 单位为GB
info["memory"].append(round(pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024 ** 3)))
info["memory"].append(round(pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**3)))

except pynvml.NVMLError as e:
swanlog.debug(f"An error occurred when getting GPU info: {e}")
Expand Down Expand Up @@ -190,7 +192,7 @@ def __get_memory_size():
try:
# 获取系统总内存大小
mem = psutil.virtual_memory()
total_memory = round(mem.total / (1024 ** 3)) # 单位为GB
total_memory = round(mem.total / (1024**3)) # 单位为GB
return total_memory
except Exception as e:
swanlog.debug(f"An error occurred when getting memory size: {e}")
Expand Down Expand Up @@ -226,6 +228,23 @@ def get_requirements() -> str:
return None


def get_conda_env() -> str:
    """Return the package listing of the current conda environment.

    Runs ``conda list`` and returns its standard output as text.

    Returns:
        The text printed by ``conda list`` when the command exits with
        status 0. ``None`` when the command exits with a non-zero status or
        when launching it raises (for example, ``conda`` is not on PATH);
        in both cases the reason is logged at debug level.
    """
    try:
        # Run `conda list` to enumerate the packages of the active environment.
        # stderr must be captured explicitly; otherwise `result.stderr` is
        # None and the failure log below carries no diagnostic text.
        result = subprocess.run(
            ["conda", "list"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

        # Only trust the output when the command succeeded.
        if result.returncode == 0:
            return result.stdout
        else:
            swanlog.debug(f"An error occurred when getting conda env:{result.stderr}")
            return None
    except Exception as e:
        # Best-effort collection: a missing or broken conda must not crash the caller.
        swanlog.debug(f"An error occurred when getting conda env: {e}")
        return None


def get_system_info(version: str, logdir: str):
"""获取系统信息
:param version: swanlab版本号
Expand Down
39 changes: 39 additions & 0 deletions swanlab/data/run/system/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ class SwanSystemMonitor:
memory: 内存数据,包含内存剩余量,使用量和使用率
system_cpu_usage: 系统CPU使用率
process_cpu_usage: 当前进程CPU使用率
system_temperature: 当前系统所有传感器的温度
system_cpu_freq: 当前系统CPU的频率
timestamp: 当前时间
"""

Expand Down Expand Up @@ -182,6 +184,41 @@ def get_process_cpu_usage(self):

return process_cpu_usage


def get_system_cpu_freq(self):
    """
    Get the system CPU frequency at the current moment.
    ---
    Info:
        Unit: MHz
        Precision: unspecified
        Availability: Linux, macOS, Windows, FreeBSD, OpenBSD
    WARNING: on Linux the "current" value is a real-time reading; on other
    platforms it is a fixed value.
    Return:
        The value of psutil.cpu_freq(percpu=False), passed through unchanged.
    """
    # percpu=False asks psutil for a single system-wide reading.
    return psutil.cpu_freq(percpu=False)

def get_system_temperature(self):
    """
    Get the temperature of every system sensor at the current moment.
    ---
    Info:
        Unit: degrees Celsius
        Precision: two decimal places according to the original note
            (not enforced by this method -- TODO confirm)
        Availability: Linux, FreeBSD
    Return:
        temperatures: {"device1": [], "device2": [], ...}
        A plain message string is returned instead when psutil has no
        sensors_temperatures API or when the reading comes back empty.
    TODO: possible sensor names, see https://www.kernel.org/doc/html/latest/subsystem-apis.html
        Known: k10temp (AMD CPU 10th~16th Opteron~zen3)
    """
    # psutil only exposes sensors_temperatures on platforms that support it.
    if hasattr(psutil, "sensors_temperatures"):
        readings = psutil.sensors_temperatures()
        # An empty result means no sensor could be read.
        return readings or "can't read any temperature"
    return "platform not supported"

def get_all(self):
"""获取全部硬件数据"""
timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
Expand All @@ -194,6 +231,8 @@ def get_all(self):
"memory": self.get_memory_usage(),
"system_cpu_usage": self.get_system_cpu_usage(),
"process_cpu_usage": self.get_process_cpu_usage(),
"system_temperature":self.get_system_temperature(),
'system_cpu_freq':self.get_system_cpu_freq(),
"timestamp": timestamp,
}

Expand Down

0 comments on commit fc04910

Please sign in to comment.