From aea88b5671c8e70c3e3bda2f3e5c95734ac86f1a Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <IvanYashchuk@users.noreply.github.com>
Date: Thu, 14 Nov 2024 12:59:14 +0200
Subject: [PATCH 1/6] Use AutoConfig

---
 thunder/tests/test_networks.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index 36ba5c3cd3..78e0f58dca 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -403,20 +403,17 @@ def test_thunderfx_mistral_nemo_small():
 @thunder.tests.framework.requiresCUDA
 def test_hf_qwen2():
     from thunder.dynamo import ThunderCompiler
-    from transformers import Qwen2Config, Qwen2ForCausalLM
+    from transformers import AutoConfig, Qwen2ForCausalLM
 
     # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
-    configuration = Qwen2Config(
-        # Qwen2.5-7B-Instruct uses Grouped-Query Attention, while the default
-        # config uses Multi-Head Attention
-        num_attention_heads=28,
-        num_key_value_heads=4,
+    configuration = AutoConfig.from_pretrained(
+        "Qwen/Qwen2.5-7B-Instruct",
         # Scaled down for testing
         hidden_size=56,
         vocab_size=16,
         max_position_embeddings=32,
+        num_hidden_layers=1,
     )
-    configuration.num_hidden_layers = 1
     with torch.device("cuda"):
         model = Qwen2ForCausalLM(configuration).to(torch.bfloat16)
 

From 3cb7b17e30333346850385c9169519fca7e04278 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <IvanYashchuk@users.noreply.github.com>
Date: Thu, 14 Nov 2024 13:03:19 +0200
Subject: [PATCH 2/6] Use AutoModelForCausalLM.from_config

---
 thunder/tests/test_networks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index 78e0f58dca..2bfc6d44ad 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -403,7 +403,7 @@ def test_thunderfx_mistral_nemo_small():
 @thunder.tests.framework.requiresCUDA
 def test_hf_qwen2():
     from thunder.dynamo import ThunderCompiler
-    from transformers import AutoConfig, Qwen2ForCausalLM
+    from transformers import AutoConfig, AutoModelForCausalLM
 
     # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
     configuration = AutoConfig.from_pretrained(
@@ -415,7 +415,7 @@ def test_hf_qwen2():
         num_hidden_layers=1,
     )
     with torch.device("cuda"):
-        model = Qwen2ForCausalLM(configuration).to(torch.bfloat16)
+        model = AutoModelForCausalLM.from_config(configuration).to(torch.bfloat16)
 
     # thunder.jit doesn't work with Qwen2, so we use torch.compile
     # https://github.com/Lightning-AI/lightning-thunder/issues/1405

From ac1e4bc6e91d6d9454f930bc78581dbcb1327ed0 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <IvanYashchuk@users.noreply.github.com>
Date: Thu, 14 Nov 2024 13:09:16 +0200
Subject: [PATCH 3/6] pad_token_id should be within vocab_size; make
 hidden_size the same as num_attention_heads

---
 thunder/tests/test_networks.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index 2bfc6d44ad..c8084c8fc2 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -409,11 +409,12 @@ def test_hf_qwen2():
     configuration = AutoConfig.from_pretrained(
         "Qwen/Qwen2.5-7B-Instruct",
         # Scaled down for testing
-        hidden_size=56,
         vocab_size=16,
+        pad_token_id=15,
         max_position_embeddings=32,
         num_hidden_layers=1,
     )
+    configuration.hidden_size = configuration.num_attention_heads
     with torch.device("cuda"):
         model = AutoModelForCausalLM.from_config(configuration).to(torch.bfloat16)
 

From 27eaa2c0c702a644b628899ffad035961500dcaf Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <IvanYashchuk@users.noreply.github.com>
Date: Thu, 14 Nov 2024 13:43:45 +0200
Subject: [PATCH 4/6] Parametrize the test over model_id

---
 thunder/tests/test_networks.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index c8084c8fc2..4ecb06b9a6 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -401,13 +401,14 @@ def test_thunderfx_mistral_nemo_small():
 
 
 @thunder.tests.framework.requiresCUDA
-def test_hf_qwen2():
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2.5-7B-Instruct", "microsoft/Phi-3-mini-128k-instruct"])
+def test_hf_for_nemo(model_id):
     from thunder.dynamo import ThunderCompiler
     from transformers import AutoConfig, AutoModelForCausalLM
 
     # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
     configuration = AutoConfig.from_pretrained(
-        "Qwen/Qwen2.5-7B-Instruct",
+        model_id,
         # Scaled down for testing
         vocab_size=16,
         pad_token_id=15,
@@ -420,8 +421,12 @@ def test_hf_qwen2():
 
     # thunder.jit doesn't work with Qwen2, so we use torch.compile
     # https://github.com/Lightning-AI/lightning-thunder/issues/1405
+
+    # fullgraph=True used to work with transformers 4.45.2, but it doesn't work
+    # with 4.46.2 because of re.findall usage in the loss function
+    fullgraph = False
     backend = ThunderCompiler()
-    compiled_model = torch.compile(model, backend=backend, fullgraph=True)
+    compiled_model = torch.compile(model, backend=backend, fullgraph=fullgraph)
 
     input_ids = torch.randint(0, configuration.vocab_size, (1, configuration.max_position_embeddings), device="cuda")
     ref_output = model(input_ids=input_ids, labels=input_ids)
@@ -435,7 +440,8 @@ def test_hf_qwen2():
     # https://github.com/Lightning-AI/lightning-thunder/issues/1407
     torch.testing.assert_close(compiled_loss, ref_loss, rtol=1e-4, atol=1e-4)
 
-    assert len(backend.subgraph_infos) == 1, "Should have exactly 1 subgraph because of fullgraph=True"
+    if fullgraph:
+        assert len(backend.subgraph_infos) == 1, "Should have exactly 1 subgraph because of fullgraph=True"
     loss_grad = torch.randn_like(compiled_loss)
 
     grads_ref = torch.autograd.grad(ref_loss, model.parameters(), grad_outputs=loss_grad)

From 70aa30bc7646064809e4633a1f1d7e9a47db58c2 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <IvanYashchuk@users.noreply.github.com>
Date: Thu, 14 Nov 2024 13:50:49 +0200
Subject: [PATCH 5/6] Remove link to qwen2 json

---
 thunder/tests/test_networks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index 4ecb06b9a6..a1a3f0ca3f 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -406,7 +406,6 @@ def test_hf_for_nemo(model_id):
     from thunder.dynamo import ThunderCompiler
     from transformers import AutoConfig, AutoModelForCausalLM
 
-    # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
     configuration = AutoConfig.from_pretrained(
         model_id,
         # Scaled down for testing

From 687a4486de8c68fd7b166dee2ffe2af11e46903e Mon Sep 17 00:00:00 2001
From: riccardofelluga <11768013+riccardofelluga@users.noreply.github.com>
Date: Thu, 14 Nov 2024 15:26:01 +0200
Subject: [PATCH 6/6] bump transformers version

---
 requirements/test.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/test.txt b/requirements/test.txt
index c9ccf66f84..9ea8bc8ab6 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -18,7 +18,7 @@ pandas # thunder/benchmarks/test_benchmark_litgpt.py
 xlsxwriter # thunder/benchmarks/test_benchmark_litgpt.py
 jsonargparse # thunder/benchmarks/benchmark_litgpt.py
 bitsandbytes==0.42.0  # fixed version!
-transformers==4.43.3 # for test_networks.py
+transformers==4.46.2 # for test_networks.py
 
 # Installs JAX on Linux and MacOS
 jaxlib; sys_platform == 'linux' or sys_platform == 'darwin'  # required for jax, see https://github.com/google/jax#installation