From 8f7d06a70d995a5f7223d4bd432fcf38c2312166 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Fri, 24 Nov 2023 12:39:28 +0800 Subject: [PATCH 1/7] Ignore Shell files --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2c949384..03b064f5 100644 --- a/.gitignore +++ b/.gitignore @@ -165,4 +165,5 @@ cython_debug/ rsc/**/*.tif **/*/lightning_logs -*.zip \ No newline at end of file +*.zip +*.sh \ No newline at end of file From 6cdee0c29e5d4042dc0b98fce8f0a61f9b46a130 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Fri, 24 Nov 2023 12:43:42 +0800 Subject: [PATCH 2/7] Add check for GPU availability --- .github/workflows/model.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index 902848da..df2bfa28 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -12,6 +12,7 @@ jobs: volumes: - /home/runner/work/frdc-ml/_github_home:/root env: + # This is where setup-python will install and cache the venv AGENT_TOOLSDIRECTORY: "/root/venv" steps: @@ -34,7 +35,11 @@ jobs: pip3 install -r requirements.txt pip3 install torch torchvision torchaudio - - name: Set up gcloud + - name: Check torch.cuda.is_available + run: | + python3 -c 'import torch; torch.cuda.is_available()' && exit 0 || exit 1 + + - name: Auth gcloud id: 'auth' uses: 'google-github-actions/auth@v1' with: @@ -47,10 +52,12 @@ jobs: run: | echo "WANDB_API_KEY=${{ secrets.WANDB_API_KEY }}" >> $GITHUB_ENV + # Our project has src as a source path, explicitly add that in. - name: Add src as PYTHONPATH run: | echo "PYTHONPATH=src" >> $GITHUB_ENV + # Do not do cd as it'll break PYTHONPATH. - name: Run Model Training run: | python3 -m tests.model_tests.chestnut_dec_may.main From 7e368686ccfbae438566168606e00435cf8400f8 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Fri, 24 Nov 2023 12:55:44 +0800 Subject: [PATCH 3/7] Re-cache venv/ and force lightning to use gpu --- .github/workflows/model.yml | 1 + tests/model_tests/chestnut_dec_may/main.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index df2bfa28..1805699c 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -60,6 +60,7 @@ jobs: # Do not do cd as it'll break PYTHONPATH. - name: Run Model Training run: | + git config --global --add safe.directory /__w/FRDC-ML/FRDC-ML' python3 -m tests.model_tests.chestnut_dec_may.main - name: Comment results via CML diff --git a/tests/model_tests/chestnut_dec_may/main.py b/tests/model_tests/chestnut_dec_may/main.py index 3e2fda79..f33e408f 100644 --- a/tests/model_tests/chestnut_dec_may/main.py +++ b/tests/model_tests/chestnut_dec_may/main.py @@ -84,7 +84,7 @@ def train_val_test_split( # TODO: Though this is set, the results are still not reproducible. deterministic=True, # fast_dev_run=True, - accelerator="cpu", + accelerator="gpu", log_every_n_steps=4, callbacks=[ # Stop training if the validation loss doesn't improve for 4 epochs From a8fb690b475a5a7657785cef3099097d7bbf6793 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Fri, 24 Nov 2023 13:03:24 +0800 Subject: [PATCH 4/7] Fix bad shell cmd --- .github/workflows/model.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index 1805699c..df2bfa28 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -60,7 +60,6 @@ jobs: # Do not do cd as it'll break PYTHONPATH. - name: Run Model Training run: | - git config --global --add safe.directory /__w/FRDC-ML/FRDC-ML' python3 -m tests.model_tests.chestnut_dec_may.main - name: Comment results via CML From 36836e8a512a5e1ebc0c2d9265cfb7d6c9e615ea Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Fri, 24 Nov 2023 13:05:51 +0800 Subject: [PATCH 5/7] Add sleep for debugging --- .github/workflows/model.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index df2bfa28..53bba981 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -60,6 +60,7 @@ jobs: # Do not do cd as it'll break PYTHONPATH. - name: Run Model Training run: | + sleep 1000 python3 -m tests.model_tests.chestnut_dec_may.main - name: Comment results via CML From 64c987720e01683b02feb557f0e4171e074ea78c Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Fri, 24 Nov 2023 13:17:28 +0800 Subject: [PATCH 6/7] Debug session --- .github/workflows/model.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index 53bba981..fe43ad1a 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -57,6 +57,9 @@ jobs: run: | echo "PYTHONPATH=src" >> $GITHUB_ENV + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 + # Do not do cd as it'll break PYTHONPATH. - name: Run Model Training run: | From 9aa4f78fe192c4164badbb0a736b148c3039d2d0 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Fri, 24 Nov 2023 13:24:37 +0800 Subject: [PATCH 7/7] Fix GPU not mounted --- .github/workflows/model.yml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index fe43ad1a..95d8b953 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -14,6 +14,7 @@ jobs: env: # This is where setup-python will install and cache the venv AGENT_TOOLSDIRECTORY: "/root/venv" + options: --gpus all steps: - uses: actions/checkout@v3 @@ -35,9 +36,8 @@ jobs: pip3 install -r requirements.txt pip3 install torch torchvision torchaudio - - name: Check torch.cuda.is_available - run: | - python3 -c 'import torch; torch.cuda.is_available()' && exit 0 || exit 1 + - name: Check CUDA is available + run: nvidia-smi - name: Auth gcloud id: 'auth' @@ -57,13 +57,9 @@ jobs: run: | echo "PYTHONPATH=src" >> $GITHUB_ENV - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - # Do not do cd as it'll break PYTHONPATH. - name: Run Model Training run: | - sleep 1000 python3 -m tests.model_tests.chestnut_dec_may.main - name: Comment results via CML