From 625897c3f55135342dbe53c32b7650a8ecb86c75 Mon Sep 17 00:00:00 2001 From: Gabriel Moreira Date: Mon, 6 Nov 2023 14:10:12 -0300 Subject: [PATCH] Fix transformer and error on example when CI uses single-GPU (#757) * fix ongoing errors in transformers unit tests * added range to package versions * add linting changes required by linter * update pre-commit hook action * update file to pass lint action * Fixed bug in end-to-end-session-based that was failing test on CI when only a single GPU was available (which prevented multigpu training) --------- Co-authored-by: Julio Co-authored-by: Julio Perez <37191411+jperez999@users.noreply.github.com> --- .github/workflows/lint.yml | 2 +- ...-based-Yoochoose-multigpu-training-PyT.ipynb | 17 +++++++++++++++-- .../t4r_paper_repro/transf_exp_main.py | 2 +- requirements/base_external.txt | 2 +- transformers4rec/torch/experimental.py | 2 +- 5 files changed, 19 insertions(+), 6 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 016fc4a5b7..01a528855d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -18,4 +18,4 @@ jobs: with: cache: 'pip' cache-dependency-path: '**/**.txt' - - uses: pre-commit/action@v2.0.3 + - uses: pre-commit/action@v3.0.0 diff --git a/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb b/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb index 3bd729412e..3f502234f8 100644 --- a/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb +++ b/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb @@ -286,6 +286,19 @@ "- per device batch size for evaluation: see above" ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c9e83d47-380c-4118-bc29-8bc108163fa0", + "metadata": {}, + "outputs": [], + "source": [ + "# If only 1 GPU are available, starts a single process to use that GPU\n", + "from 
torch.cuda import device_count\n", + "num_gpus = device_count()\n", + "NUM_PROCESSES = min(num_gpus, 2)" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -502,7 +515,7 @@ "LR = float(os.environ.get(\"LEARNING_RATE\", \"0.0005\"))\n", "BATCH_SIZE_TRAIN = int(os.environ.get(\"BATCH_SIZE_TRAIN\", \"256\"))\n", "BATCH_SIZE_VALID = int(os.environ.get(\"BATCH_SIZE_VALID\", \"128\"))\n", - "!python -m torch.distributed.run --nproc_per_node 2 {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}" + "!python -m torch.distributed.run --nproc_per_node {NUM_PROCESSES} {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}" ] }, { @@ -554,7 +567,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/examples/t4rec_paper_experiments/t4r_paper_repro/transf_exp_main.py b/examples/t4rec_paper_experiments/t4r_paper_repro/transf_exp_main.py index 5da463b16c..3cd0efedfa 100644 --- a/examples/t4rec_paper_experiments/t4r_paper_repro/transf_exp_main.py +++ b/examples/t4rec_paper_experiments/t4r_paper_repro/transf_exp_main.py @@ -224,7 +224,7 @@ def mask_last_interaction(x): logger.info(f"Recall@10 of manually masked test data = {str(recall_10)}") output_file = os.path.join(training_args.output_dir, "eval_results_over_time.txt") with open(output_file, "a") as writer: - writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n") + writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n") # Verify that the recall@10 from train.evaluate() matches the recall@10 calculated manually if not isinstance(input_module.masking, t4r.masking.PermutationLanguageModeling): # TODO fix inference discrepancy for permutation language modeling diff --git 
a/requirements/base_external.txt b/requirements/base_external.txt index b036a28265..b3282044f2 100644 --- a/requirements/base_external.txt +++ b/requirements/base_external.txt @@ -1,4 +1,4 @@ -transformers[torch]>=4.12,<5 +transformers[torch]>=4.12,<4.31.0 tqdm>=4.27 pyarrow>=1.0 torchmetrics>=0.10.0 diff --git a/transformers4rec/torch/experimental.py b/transformers4rec/torch/experimental.py index 38850b6c30..4631c60b9c 100644 --- a/transformers4rec/torch/experimental.py +++ b/transformers4rec/torch/experimental.py @@ -97,7 +97,7 @@ def forward(self, inputs, training=False, testing=False, **kwargs): output = seq_rep + context_rep else: raise ValueError( - f"The aggregation {self.fusion_aggregation} is not supported," + f"The aggregation {self.fusion_aggregation} is not supported, " f"please select one of the following aggregations " f"['concat', 'elementwise-mul', 'elementwise-sum']" )