from subprocess import check_output
import re
import docker
+from py3nvml.py3nvml import *

class NVDockerClient:

+    nvml_initialized = False
+
    def __init__(self):
        self.docker_client = docker.from_env(version="auto")
+        NVDockerClient.__check_nvml_init()
+
+    """
+    Private method to check if NVML is loaded (and load the library if it isn't loaded)
+    """
+    def __check_nvml_init():
+        if not NVDockerClient.nvml_initialized:
+            nvmlInit()
+            print("NVIDIA Driver Version:", nvmlSystemGetDriverVersion())
+            NVDockerClient.nvml_initialized = True

    #TODO: Testing on MultiGPU
    def create_container(self, image, **kwargs):
@@ -151,28 +164,37 @@ def exec_run(self, cid, cmd):
        return c.exec_run(cmd)

    @staticmethod
-    def list_gpus():
-        output = check_output(["nvidia-smi", "-L"]).decode("utf-8")
-        regex = re.compile(r"GPU (?P<id>\d+):")
-        gpus = []
-        for line in output.strip().split("\n"):
-            m = regex.match(line)
-            assert m, "unable to parse " + line
-            gpus.append(int(m.group("id")))
+    def gpu_info():
+        NVDockerClient.__check_nvml_init()
+        gpus = {}
+        num_gpus = nvmlDeviceGetCount()
+        for i in range(num_gpus):
+            gpu_handle = nvmlDeviceGetHandleByIndex(i)
+            gpu_name = nvmlDeviceGetName(gpu_handle)
+            gpus[i] = {"gpu_handle": gpu_handle, "gpu_name": gpu_name}
        return gpus

    @staticmethod
-    def gpu_memory_usage():
-        output = check_output(["nvidia-smi"]).decode("utf-8")
-        smi_output = output[output.find("GPU Memory"):]
-        rows = smi_output.split("\n")
-        regex = re.compile(r"[|]\s+?(?P<id>\d+)\D+?(?P<pid>\d+).+[ ](?P<usage>\d+)MiB")
-        usage = {gpu_id: 0 for gpu_id in NVDockerClient.list_gpus()}
-        for row in smi_output.split("\n"):
-            gpu = regex.search(row)
-            if not gpu:
-                continue
-            id = int(gpu.group("id"))
-            memory = int(gpu.group("usage"))
-            usage[id] += memory
-        return usage
+    def gpu_memory_usage(id):
+        gpus = NVDockerClient.gpu_info()
+        if id not in gpus.keys():
+            return None
+        gpu_handle = gpus[id]["gpu_handle"]
+        gpu_memory_data = nvmlDeviceGetMemoryInfo(gpu_handle)
+        rv = {}
+        # returns values in megabytes
+        rv["used_mb"] = gpu_memory_data.used / 1e6
+        rv["free_mb"] = gpu_memory_data.free / 1e6
+        return rv
+
+    @staticmethod
+    def least_used_gpu():
+        gpus = NVDockerClient.gpu_info()
+        lowest_key = None
+        lowest_used_memory = 1e9
+        for id in gpus.keys():
+            memory = NVDockerClient.gpu_memory_usage(id)["used_mb"]
+            if lowest_key is None or memory < lowest_used_memory:
+                lowest_key = id
+                lowest_used_memory = memory
+        return lowest_key
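
The diff above swaps the nvidia-smi text parsing for py3nvml calls. As a quick illustration (not part of the commit), a minimal usage sketch of the new static methods might look like the following, assuming the class is importable as nvdocker.NVDockerClient and that py3nvml and an NVIDIA driver are installed:

# hypothetical usage sketch; the import path is assumed, not taken from the commit
from nvdocker import NVDockerClient

gpus = NVDockerClient.gpu_info()                  # {index: {"gpu_handle": ..., "gpu_name": ...}}
for idx, info in gpus.items():
    usage = NVDockerClient.gpu_memory_usage(idx)  # {"used_mb": ..., "free_mb": ...}
    print(idx, info["gpu_name"], usage["used_mb"], "MB used")

# index of the GPU with the least used memory, e.g. to place the next container
print("least used GPU:", NVDockerClient.least_used_gpu())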