From 56536fc03658331462e1308fdeb4704d03a337c1 Mon Sep 17 00:00:00 2001
From: phoenixdong
Date: Fri, 7 Jun 2024 15:07:01 +0800
Subject: [PATCH] add gpu memory check

---
Review notes (text in this section is ignored by git am):
 * scripts/gpu_check.sh now exits 1 instead of passing when nvidia-smi is
   missing, fails, prints non-numeric values, or reports no GPU.
 * memory.total and memory.used are read with a single nvidia-smi query.
 * The free-memory threshold, the poll interval and an optional timeout are
   configurable through GPU_CHECK_* environment variables; the defaults
   (30000, 300 s, no timeout) keep the previous behaviour.
 * The new-file diff carries no "index" line; regenerate the patch with
   git format-patch if blob ids are required.

 .github/workflows/test.yml | 13 ++++++++
 scripts/gpu_check.sh       | 65 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 scripts/gpu_check.sh

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f36938eee..46f3a4c21 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -24,6 +24,11 @@ jobs:
     steps:
       - name: Checkout Code
         uses: actions/checkout@v2
+
+      - name: Check GPU is Free
+        run: |
+          chmod +x ./scripts/gpu_check.sh
+          ./scripts/gpu_check.sh
 
       - name: Megatron Unit Test
         run: |
@@ -42,6 +47,10 @@ jobs:
         run: |
           echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-megatron/index.html"
 
+      - name: Check GPU is Free
+        run: |
+          ./scripts/gpu_check.sh
+
       - name: Flagscale Unit Test
         run: |
           torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-flagscale --cov=flagscale -q -x tests/unit_tests/launcher
@@ -50,6 +59,10 @@ jobs:
         run: |
           echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-flagscale/index.html"
 
+      - name: Check GPU is Free
+        run: |
+          ./scripts/gpu_check.sh
+
       - name: Flagscale Functional Test
         run: |
           python run.py --config-path tests/functional_tests/aquila/conf --config-name config action=test
diff --git a/scripts/gpu_check.sh b/scripts/gpu_check.sh
new file mode 100644
--- /dev/null
+++ b/scripts/gpu_check.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+#
+# Wait until every GPU reported by nvidia-smi has enough unused memory.
+#
+# Unused memory is computed per GPU as memory.total - memory.used, in the
+# MiB units nvidia-smi prints with --format=...,nounits.
+#
+# Optional environment variables (non-negative integers):
+#   GPU_CHECK_MIN_FREE_MB   unused memory required on each GPU (default 30000)
+#   GPU_CHECK_INTERVAL_SEC  pause between polls (default 300, i.e. 5 minutes)
+#   GPU_CHECK_TIMEOUT_SEC   give up after this long; 0 waits forever (default 0)
+#
+# Exit status: 0 once all GPUs qualify; 1 on bad settings, on nvidia-smi
+# failure or unusable output, when no GPU is reported, or on timeout.
+
+set -u
+
+readonly MIN_FREE_MB="${GPU_CHECK_MIN_FREE_MB:-30000}"
+readonly INTERVAL_SEC="${GPU_CHECK_INTERVAL_SEC:-300}"
+readonly TIMEOUT_SEC="${GPU_CHECK_TIMEOUT_SEC:-0}"
+
+# Print a message to stderr and abort the script.
+die() {
+  printf 'gpu_check: %s\n' "$*" >&2
+  exit 1
+}
+
+# Return 0 only if every GPU has at least MIN_FREE_MB unused; return 1 if any
+# GPU is busy. Aborts via die when nvidia-smi cannot be queried or parsed.
+all_gpus_free() {
+  local report total used seen=0
+
+  # A single query keeps total and used from the same snapshot.
+  report=$(nvidia-smi --query-gpu=memory.total,memory.used \
+    --format=csv,noheader,nounits) || die "nvidia-smi query failed"
+
+  while IFS=', ' read -r total used; do
+    [[ -n "$total" ]] || continue
+    [[ "$total" =~ ^[0-9]+$ && "$used" =~ ^[0-9]+$ ]] ||
+      die "unexpected nvidia-smi output: '${total}, ${used}'"
+    seen=1
+    if (( total - used < MIN_FREE_MB )); then
+      return 1
+    fi
+  done <<< "$report"
+
+  # Without this, a host with no visible GPU would be reported as free.
+  (( seen )) || die "nvidia-smi reported no GPUs"
+  return 0
+}
+
+for setting in "$MIN_FREE_MB" "$INTERVAL_SEC" "$TIMEOUT_SEC"; do
+  [[ "$setting" =~ ^[0-9]+$ ]] ||
+    die "settings must be non-negative integers, got '${setting}'"
+done
+
+waited=0
+until all_gpus_free; do
+  if (( TIMEOUT_SEC > 0 && waited >= TIMEOUT_SEC )); then
+    die "GPUs still busy after ${waited}s"
+  fi
+  echo "wait for gpu free"
+  sleep "$INTERVAL_SEC"
+  waited=$(( waited + INTERVAL_SEC ))
+done