Skip to content

Commit

Permalink
Merge pull request #51 from psafont/nolog
Browse files Browse the repository at this point in the history
  • Loading branch information
psafont authored May 8, 2024
2 parents fb3dd57 + e282ce5 commit e27eba5
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 23 deletions.
30 changes: 18 additions & 12 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,39 +7,45 @@ on:
jobs:
ocaml-test:
name: Ocaml tests
runs-on: ubuntu-20.04
env:
package: "gpumon"
runs-on: ubuntu-22.04

steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v4

- name: Pull configuration from xs-opam
run: |
curl --fail --silent https://raw.githubusercontent.com/xapi-project/xs-opam/master/tools/xs-opam-ci.env | cut -f2 -d " " > .env
- name: Load environment file
id: dotenv
uses: falti/[email protected]
uses: falti/dotenv-action@v1

- name: Update Ubuntu repositories
shell: bash
run: sudo apt-get update

- name: Use ocaml
uses: avsm/setup-ocaml@v1
uses: ocaml/setup-ocaml@v2
with:
ocaml-version: ${{ steps.dotenv.outputs.ocaml_version_full }}
opam-repository: ${{ steps.dotenv.outputs.repository }}
ocaml-compiler: ${{ steps.dotenv.outputs.ocaml_version_full }}
opam-repositories: |
xs-opam: ${{ steps.dotenv.outputs.repository }}
dune-cache: true
env:
DUNE_CACHE_STORAGE_MODE: copy

- name: Install dependencies
run: |
opam pin add . --no-action
opam depext -u ${{ env.package }}
opam install ${{ env.package }} --deps-only --with-test -v
shell: bash
run: opam install . --deps-only --with-test -v

- name: Build
shell: bash
run: |
opam exec -- cp mocks/mock.ml lib/nvml.ml
opam exec -- cp mocks/mock.c stubs/nvml_stubs.c
opam exec -- make
- name: Run tests
shell: bash
run: opam exec -- make test
21 changes: 12 additions & 9 deletions gpumon/gpumon.ml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ let vgpu_config_dir = "/usr/share/nvidia/vgpu"
(* acquire NVML interface but give up after timeout. Note that this does
not attach the NVML library but it waits for someone else to attach it
if necessary. *)
let get_nvml_or_wait ?(timeout = 30.0) () =
let get_nvml_or_wait ~log ?(timeout = 30.0) () =
let rec loop delay waited =
match Nvml.NVML.get () with
| Some _ as interface ->
Expand All @@ -28,25 +28,28 @@ let get_nvml_or_wait ?(timeout = 30.0) () =
| true ->
loop 2.0 0.0
| false ->
Process.D.info "%s not found - assuming NVML library unavailable"
vgpu_config_dir ;
if log then
Process.D.info "%s not found - assuming NVML library unavailable"
vgpu_config_dir ;
None

(* Like get_nvml_or_wait but keep looping. Only call this in a thread
that does not block the main interaction like the RPC server. Again,
this is not attaching the library but waits for it to be attached by
some other thread. *)
let get_nvml_or_wait_forever () =
let rec try_again_after delay =
match get_nvml_or_wait () with
let rec try_again_after ~log delay =
match get_nvml_or_wait ~log () with
| Some interface ->
interface
| None ->
Process.D.info "Failed to open NVML interface - retrying in %4.0f" delay ;
if log then
Process.D.info "Failed to open NVML interface - retrying in %4.0f"
delay ;
Thread.delay delay ;
try_again_after (min (delay *. 1.5) (20.0 *. 60.0))
try_again_after ~log:false (min (delay *. 1.5) (20.0 *. 60.0))
in
try_again_after 60.0
try_again_after ~log:true 60.0

let default_config : (int32 * Gpumon_config.config) list =
let open Gpumon_config in
Expand Down Expand Up @@ -359,7 +362,7 @@ let () =
exit 0
in
let module Gpumon_server = Gpumon_server.Make (struct
let interface () = get_nvml_or_wait ()
let interface () = get_nvml_or_wait ~log:true ()
end) in
(* create daemon module to bind server call declarations to implementations *)
let module Daemon = Make (Gpumon_server) in
Expand Down
4 changes: 2 additions & 2 deletions lib/gpumon_config.ml
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ let device_type_of_v2_format = function
( if List.mem_assoc "subsystem_device_id" dict then
List.assoc "subsystem_device_id" dict |> unbox_string >>= id_of_string
>>= fun id -> Ok (Match id)
else
Ok Any
else
Ok Any
)
(* Try to read the list of metrics. *)
>>= fun subsystem_device_id ->
Expand Down

0 comments on commit e27eba5

Please sign in to comment.