Skip to content

Commit

Permalink
add gpu memory check
Browse files Browse the repository at this point in the history
  • Loading branch information
phoenixdong committed Jun 7, 2024
1 parent 75c3779 commit 56536fc
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 0 deletions.
13 changes: 13 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v2

- name: Check GPU is Free
run: |
chmod +x ./scripts/gpu_check.sh
./scripts/gpu_check.sh
- name: Megatron Unit Test
run: |
Expand All @@ -42,6 +47,10 @@ jobs:
run: |
echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-megatron/index.html"
- name: Check GPU is Free
run: |
./scripts/gpu_check.sh
- name: Flagscale Unit Test
run: |
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-flagscale --cov=flagscale -q -x tests/unit_tests/launcher
Expand All @@ -50,6 +59,10 @@ jobs:
run: |
echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-flagscale/index.html"
- name: Check GPU is Free
run: |
./scripts/gpu_check.sh
- name: Flagscale Functional Test
run: |
python run.py --config-path tests/functional_tests/aquila/conf --config-name config action=test
Expand Down
38 changes: 38 additions & 0 deletions scripts/gpu_check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash
#
# Block until every GPU reported by nvidia-smi has enough free memory.
#
# Usage: ./scripts/gpu_check.sh
#
# Optional environment overrides (the defaults reproduce the historical
# hard-coded behaviour of this script):
#   GPU_CHECK_MIN_FREE_MB  free memory required on each GPU, in the unit
#                          nvidia-smi reports with "nounits" (MiB).
#                          Default: 30000
#   GPU_CHECK_INTERVAL     argument handed to sleep(1) between polls.
#                          Default: 5m
#   GPU_CHECK_MAX_POLLS    give up after this many unsuccessful polls;
#                          0 means wait forever. Default: 0
#
# Exit status:
#   0  every GPU has at least GPU_CHECK_MIN_FREE_MB free
#   1  nvidia-smi is missing, failed, reported no GPU or unparsable data,
#      a setting is invalid, or GPU_CHECK_MAX_POLLS was reached

set -u

readonly MIN_FREE_MB="${GPU_CHECK_MIN_FREE_MB:-30000}"
readonly POLL_INTERVAL="${GPU_CHECK_INTERVAL:-5m}"
readonly MAX_POLLS="${GPU_CHECK_MAX_POLLS:-0}"

# Print an error message to stderr and abort the script.
die() {
  printf 'gpu_check: %s\n' "$*" >&2
  exit 1
}

#######################################
# Take one snapshot of GPU memory and compare it with the threshold.
# Globals:   MIN_FREE_MB (read)
# Arguments: none
# Outputs:   one line on stderr for every GPU below the threshold
# Returns:   0 if all GPUs have enough free memory, 1 if at least one
#            does not. Aborts the script (via die) when the query fails
#            or its output cannot be parsed, so that a broken driver is
#            never mistaken for "GPUs are free".
#######################################
all_gpus_free() {
  local report row total used free
  local index=0 status=0
  local -a rows=()

  # A single query keeps total/used of each GPU consistent with each other
  # and makes a separate "count the GPUs" call unnecessary.
  report=$(nvidia-smi --query-gpu=memory.total,memory.used \
    --format=csv,noheader,nounits) || die "nvidia-smi query failed"
  [[ -n "$report" ]] || die "nvidia-smi reported no GPUs"

  mapfile -t rows <<< "$report"
  for row in "${rows[@]}"; do
    # Each row looks like "81920, 1234".
    IFS=', ' read -r total used <<< "$row"
    if [[ ! "$total" =~ ^[0-9]+$ || ! "$used" =~ ^[0-9]+$ ]]; then
      die "cannot parse memory figures for GPU $index: '$row'"
    fi

    # 10# forces base 10 so a leading zero is never read as octal.
    free=$((10#$total - 10#$used))
    if ((free < 10#$MIN_FREE_MB)); then
      printf 'gpu_check: GPU %d has %d MiB free, need %d MiB\n' \
        "$index" "$free" "$((10#$MIN_FREE_MB))" >&2
      status=1
    fi
    index=$((index + 1))
  done

  return "$status"
}

# Validate the configuration, then poll until the GPUs are free.
main() {
  local polls=0

  command -v nvidia-smi > /dev/null 2>&1 \
    || die "nvidia-smi not found in PATH"
  [[ "$MIN_FREE_MB" =~ ^[0-9]+$ ]] \
    || die "GPU_CHECK_MIN_FREE_MB must be a non-negative integer: '$MIN_FREE_MB'"
  [[ "$MAX_POLLS" =~ ^[0-9]+$ ]] \
    || die "GPU_CHECK_MAX_POLLS must be a non-negative integer: '$MAX_POLLS'"

  until all_gpus_free; do
    polls=$((polls + 1))
    if ((10#$MAX_POLLS > 0 && polls >= 10#$MAX_POLLS)); then
      die "GPUs still busy after $polls checks, giving up"
    fi

    echo "wait for gpu free"
    sleep "$POLL_INTERVAL" || die "sleep failed for interval '$POLL_INTERVAL'"
  done
}

main "$@"

0 comments on commit 56536fc

Please sign in to comment.