From f20697d45d621c9a2363bab459a3994384571132 Mon Sep 17 00:00:00 2001
From: Roman Bredehoft
Date: Tue, 6 Aug 2024 19:18:42 +0200
Subject: [PATCH] chore: fix remote embedding and lm_head

---
 .../lora_finetune/gpt2_finetune_hybrid.ipynb | 119 +++++------------
 .../lora_finetune/lora_module.py             |   3 +
 2 files changed, 33 insertions(+), 89 deletions(-)

diff --git a/use_case_examples/lora_finetune/gpt2_finetune_hybrid.ipynb b/use_case_examples/lora_finetune/gpt2_finetune_hybrid.ipynb
index 1a23107677..3ee688f37c 100644
--- a/use_case_examples/lora_finetune/gpt2_finetune_hybrid.ipynb
+++ b/use_case_examples/lora_finetune/gpt2_finetune_hybrid.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 100,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 101,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -54,7 +54,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -83,7 +83,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 103,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
    {
@@ -103,7 +103,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -120,7 +120,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 105,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -148,7 +148,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 106,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -159,7 +159,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 107,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -175,13 +175,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 108,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
     "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n",
     "\n",
-    "EPOCHS = 2\n",
+    "EPOCHS = 100\n",
     "PER_DEVICE_TRAIN_BATCH_SIZE = 4\n",
     "\n",
     "training_args = TrainingArguments(\n",
@@ -204,7 +204,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -229,7 +229,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -241,7 +241,9 @@
     "    # hybrid models). We however still need to include the associated module's forward pass in\n",
     "    # the hybrid model\n",
     "    # Also include the embedding and language model head as they represent a lot of the model's\n",
-    "    # parameters\n",
+    "    # parameters. Side note: \"lm_head\" does not appear in model.parameters() because the weights\n",
+    "    # are directly tied to the embedding ones, but we still need to remove both modules in\n",
+    "    # order to get rid of the weights\n",
     "    if isinstance(module, (Conv1D, Embedding)) or \"lm_head\" in name:\n",
     "        remote_names.append(name)\n",
@@ -257,7 +259,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 113,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -266,7 +268,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -282,24 +284,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Training Progress:   0%|          | 0/100 [3:52:58"
-     ]
-    },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Avoid the following error from HuggingFace when training :\n",
     "# \"The current process just got forked, after parallelism has already been used. Disabling\n",
@@ -458,7 +399,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -471,7 +412,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
    {
@@ -490,7 +431,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
    {
@@ -513,7 +454,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -537,7 +478,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
    {
@@ -555,7 +496,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -570,7 +511,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
    {
@@ -588,7 +529,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
    {
diff --git a/use_case_examples/lora_finetune/lora_module.py b/use_case_examples/lora_finetune/lora_module.py
index 6fde7d0471..41211adba5 100644
--- a/use_case_examples/lora_finetune/lora_module.py
+++ b/use_case_examples/lora_finetune/lora_module.py
@@ -35,6 +35,9 @@ def forward(self, inputs):
         loss = loss / self.gradient_accumulation_steps
 
         # Update gradients
+        # We need to set requires_grad on the loss manually because the inference model's last
+        # step is the "lm_head" layer, which is detached from the graph by the hybrid model
+        loss.requires_grad_(True)
         loss.backward()
 
         grad_norm = None
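
Note on the tied-weights comment added in the notebook hunk: the claim is easy to
check in isolation. A minimal sketch, assuming torch and transformers are
installed; the "gpt2" checkpoint below is only an illustration of the model used
in the notebook:

    from transformers import GPT2LMHeadModel

    model = GPT2LMHeadModel.from_pretrained("gpt2")

    # "lm_head" owns no parameters of its own: its weight is tied to the input
    # embedding, so named_parameters() only lists "transformer.wte.weight"
    print(any("lm_head" in name for name, _ in model.named_parameters()))  # False

    # The module still exists and shares the exact same tensor as the embedding
    print(model.lm_head.weight is model.transformer.wte.weight)  # True

Because the two modules share one tensor, marking only the embedding as remote
would still leave the weights reachable through "lm_head", which is why the patch
removes both modules from the client side.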
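
The lora_module.py fix relies on standard PyTorch autograd behavior: calling
backward() on a loss that has no grad_fn and does not require grad raises a
RuntimeError, while re-enabling requires_grad on the loss lets the call go
through. A minimal sketch of that situation, using plain torch; the shapes are
illustrative, and gradient flow through the remote modules is assumed to be
handled by the hybrid model machinery itself:

    import torch

    # Stand-in for the hybrid model's output: the last step ("lm_head") runs
    # remotely, so the returned tensor comes back detached from the graph
    logits = torch.randn(4, 50257)
    loss = logits.mean()
    print(loss.requires_grad)  # False

    # Calling loss.backward() here would raise:
    # RuntimeError: element 0 of tensors does not require grad and does not
    # have a grad_fn

    loss.requires_grad_(True)
    loss.backward()  # runs without error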