diff --git a/.github/workflows/merge_queue.yml b/.github/workflows/merge_queue.yml
index be3423f..515692c 100644
--- a/.github/workflows/merge_queue.yml
+++ b/.github/workflows/merge_queue.yml
@@ -13,7 +13,7 @@ on:
 jobs:
   test:
     name: Run Tests
-    if: github.event_name == 'pull_request' || github.event_name == 'push' || github.event_name == 'merge_group'
+    if: github.event_name == 'pull_request' || github.event_name == 'push'
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
@@ -28,4 +28,26 @@ jobs:
         pip install pytest
     - name: Run tests
       run: |
-        pytest --ignore=tests/test_vllm.py --ignore=tests/test_bedrock.py
\ No newline at end of file
+        pytest --ignore=tests/test_api.py
+        
+  merge-queue-only-test:  # runs tests/test_api.py, and only for merge_group events (see `if` below)
+    name: Merge Queue Only Tests
+    if: github.event_name == 'merge_group'
+    runs-on: ubuntu-latest
+    environment: test
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}  # job-level env: visible to every step below
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install pytest
+      - name: Run merge queue specific tests
+        run: |
+          pytest tests/test_api.py
\ No newline at end of file
diff --git a/tests/test_api.py b/tests/test_api.py
new file mode 100644
index 0000000..a890a0d
--- /dev/null
+++ b/tests/test_api.py
@@ -0,0 +1,205 @@
+import pytest
+import logging
+import random
+import textgrad as tg
+# We'll use the utilities below to run a Python function.
+from IPython.core.interactiveshell import InteractiveShell
+
+logging.disable(logging.CRITICAL)
+
+SYSTEM_PROMPT = "You are a smart language model that evaluates code snippets. You do not solve problems or propose new code snippets, only evaluate existing solutions critically and give very concise feedback."  # wrapped in a tg.Variable and passed as system_prompt of the loss LLM call
+INSTRUCTION = """Think about the problem and the code snippet. Does the code solve the problem? What is the runtime complexity?"""  # substituted into the loss format string ahead of the problem and code
+
+PROBLEM_TEXT = """Longest Increasing Subsequence (LIS)
+    
+    Problem Statement:
+    Given a sequence of integers, find the length of the longest subsequence that is strictly increasing. A subsequence is a sequence that can be derived from another sequence by deleting some or no elements without changing the order of the remaining elements.
+    
+    Input:
+    The input consists of a list of integers representing the sequence.
+    
+    Output:
+    The output should be an integer representing the length of the longest increasing subsequence."""  # problem statement supplied as the "problem" input of the loss call
+
+INITIAL_SOLUTION = """
+    def longest_increasing_subsequence(nums):
+        n = len(nums)
+        dp = [1] * n
+    
+        for i in range(1, n):
+            for j in range(i):
+                if nums[i] > nums[j]:
+                    dp[i] = max(dp[i], dp[j] + 1)
+    
+        max_length = max(dp)
+        lis = []
+    
+        for i in range(n - 1, -1, -1):
+            if dp[i] == max_length:
+                lis.append(nums[i])
+                max_length -= 1
+    
+        return len(lis[::-1]) 
+        """  # baseline snippet: test_api expects it to pass eval_function_with_asserts before any optimization
+
+BUGGED_SOLUTION = """
+def longest_increasing_subsequence(nums):
+        n = len(nums)
+        dp = [1] * n
+    
+        for i in range(1, n):
+            for j in range(i):
+                if nums[i] > nums[j]:
+                    dp[i] = max(dp[i], dp[j] + 1)
+    
+        max_length = max(dp)
+        lis = []
+    
+        for i in range(n - 1, -1, -1):
+            if dp[i] == max_length:
+                lis.append(nums[i])
+                max_length -= 1
+    
+        return len(lis[::-1])+1
+        """  # same algorithm as INITIAL_SOLUTION, but the final "+1" makes every returned length one too large
+
+
+def generate_random_test_case(size, min_value, max_value):
+    """Build a list of ``size`` integers, each drawn with random.randint(min_value, max_value)."""
+    return list(map(lambda _: random.randint(min_value, max_value), range(size)))
+
+def run_function_in_interpreter(func_code):
+    """Run ``func_code`` in the shared IPython shell; return the first function it defines."""
+    interpreter = InteractiveShell.instance()
+    result = interpreter.run_cell(func_code, store_history=False, silent=True)
+    # run_cell reports failures on the result object instead of raising. Re-raise them so a
+    # broken snippet fails here and a same-named function from an earlier call is not reused.
+    result.raise_error()
+    func_name = func_code.split("def ")[1].split("(")[0].strip()
+    return interpreter.user_ns[func_name]
+
+
+def eval_function_with_asserts(fn):
+    """Check ``fn`` against a fixed table of longest-increasing-subsequence cases.
+
+    Raises AssertionError at the first mismatch; prints a message when all cases hold.
+    """
+    expected_by_input = (
+        # mixed rises and dips
+        ([10, 22, 9, 33, 21, 50, 41, 60], 5),
+        # unsorted sequence
+        ([7, 2, 1, 3, 8, 4, 9, 6, 5], 4),
+        # strictly decreasing
+        ([5, 4, 3, 2, 1], 1),
+        # strictly increasing (already sorted)
+        ([1, 2, 3, 4, 5], 5),
+        # contains repeated values
+        ([3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5], 4),
+        # decreasing prefix before the increasing run
+        ([10, 9, 2, 5, 3, 7, 101, 18], 4),
+        # longer interleaved sequence (16 elements)
+        ([0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15], 6),
+        # all elements equal: strictness keeps the answer at 1
+        ([7, 7, 7, 7, 7, 7, 7], 1),
+        # nearly sorted with a single dip (47 -> 35)
+        ([20, 25, 47, 35, 56, 68, 98, 101, 212, 301, 415, 500], 11),
+        # strictly decreasing, ending at 0
+        ([9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 1),
+    )
+
+    for sequence, expected_length in expected_by_input:
+        assert fn(sequence) == expected_length
+
+    print("All test cases passed!")
+
+def test_api():
+    """Run one TextGrad optimization step on a working solution and check it still passes."""
+    longest_increasing_subsequence = run_function_in_interpreter(INITIAL_SOLUTION)
+    eval_function_with_asserts(longest_increasing_subsequence)
+
+    llm_engine = tg.get_engine("experimental:gpt-4o-mini")
+    # override=True, as in test_bugged: a backward engine left by another test must not break this one.
+    tg.set_backward_engine(llm_engine, override=True)
+
+    code = tg.Variable(value=INITIAL_SOLUTION,
+                       requires_grad=True,
+                       role_description="code instance to optimize")
+
+    problem = tg.Variable(PROBLEM_TEXT,
+                          requires_grad=False,
+                          role_description="the coding problem")
+
+    optimizer = tg.TGD(parameters=[code])
+
+    loss_system_prompt = SYSTEM_PROMPT
+    loss_system_prompt = tg.Variable(loss_system_prompt, requires_grad=False,
+                                     role_description="system prompt to the loss function")
+
+    instruction = INSTRUCTION
+    # Bake the instruction in now; the doubled braces leave {problem} and {code} as placeholders.
+    format_string = "{instruction}\nProblem: {{problem}}\nCurrent Code: {{code}}"
+    format_string = format_string.format(instruction=instruction)
+
+    fields = {"problem": None, "code": None}
+    formatted_llm_call = tg.autograd.FormattedLLMCall(engine=llm_engine,
+                                                      format_string=format_string,
+                                                      fields=fields,
+                                                      system_prompt=loss_system_prompt)
+    def loss_fn(problem: tg.Variable, code: tg.Variable) -> tg.Variable:
+        inputs = {"problem": problem, "code": code}
+
+        return formatted_llm_call(inputs=inputs,
+                                  response_role_description=f"evaluation of the {code.get_role_description()}")
+    # One forward/backward/step cycle, then re-run whatever code the optimizer produced.
+    loss = loss_fn(problem, code)
+    loss.backward()
+    optimizer.step()
+    longest_increasing_subsequence = run_function_in_interpreter(code.value)
+    eval_function_with_asserts(longest_increasing_subsequence)
+
+
+def test_bugged():
+    """Check the off-by-one solution fails the asserts, then expect it to pass after one optimization step."""
+    # Loading the snippet must succeed; only the reference asserts are expected to fail.
+    longest_increasing_subsequence = run_function_in_interpreter(BUGGED_SOLUTION)
+    with pytest.raises(AssertionError):
+        eval_function_with_asserts(longest_increasing_subsequence)
+
+    llm_engine = tg.get_engine("experimental:gpt-4o-mini")
+    tg.set_backward_engine(llm_engine, override=True)
+
+    code = tg.Variable(value=BUGGED_SOLUTION,
+                       requires_grad=True,
+                       role_description="code instance to optimize")
+
+    problem = tg.Variable(PROBLEM_TEXT,
+                          requires_grad=False,
+                          role_description="the coding problem")
+
+    optimizer = tg.TGD(parameters=[code])
+
+    loss_system_prompt = SYSTEM_PROMPT
+    loss_system_prompt = tg.Variable(loss_system_prompt, requires_grad=False,
+                                     role_description="system prompt to the loss function")
+
+    instruction = INSTRUCTION
+    # Bake the instruction in now; the doubled braces leave {problem} and {code} as placeholders.
+    format_string = "{instruction}\nProblem: {{problem}}\nCurrent Code: {{code}}"
+    format_string = format_string.format(instruction=instruction)
+
+    fields = {"problem": None, "code": None}
+    formatted_llm_call = tg.autograd.FormattedLLMCall(engine=llm_engine,
+                                                      format_string=format_string,
+                                                      fields=fields,
+                                                      system_prompt=loss_system_prompt)
+    def loss_fn(problem: tg.Variable, code: tg.Variable) -> tg.Variable:
+        inputs = {"problem": problem, "code": code}
+
+        return formatted_llm_call(inputs=inputs,
+                                  response_role_description=f"evaluation of the {code.get_role_description()}")
+    # One forward/backward/step cycle, then re-run whatever code the optimizer produced.
+    loss = loss_fn(problem, code)
+    loss.backward()
+    optimizer.step()
+    longest_increasing_subsequence = run_function_in_interpreter(code.value)
+    eval_function_with_asserts(longest_increasing_subsequence)
\ No newline at end of file
diff --git a/tests/test_bedrock.py b/tests/test_bedrock.py
deleted file mode 100644
index ce8d70c..0000000
--- a/tests/test_bedrock.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from unittest.mock import Mock
-
-import pytest
-from botocore.client import BaseClient
-from langchain_aws import ChatBedrock
-
-from textgrad.engine.bedrock import ChatBedrockEngine
-
-
-@pytest.fixture
-def mock_bedrock_client():
-    return Mock(spec=BaseClient)
-
-
-def test_chat_bedrock_engine_init_custom_values(mock_bedrock_client):
-    custom_model_kwargs = {"temperature": 0.7, "max_tokens": 1000}
-    custom_model_string = "anthropic.claude-3-haiku-20240307-v1:0"
-    custom_system_prompt = "You are the best AI assistant ever."
-
-    engine = ChatBedrockEngine(
-        bedrock_client=mock_bedrock_client,
-        model_string=custom_model_string,
-        system_prompt=custom_system_prompt,
-        is_multimodal=True,
-        **custom_model_kwargs
-    )
-
-    assert isinstance(engine.client, ChatBedrock)
-    assert engine.model_string == custom_model_string
-    assert engine.system_prompt == custom_system_prompt
-    assert engine.is_multimodal is True
-    assert engine.kwargs == custom_model_kwargs
-
-
-def test_chat_bedrock_engine_init_default_values(mock_bedrock_client):
-    engine = ChatBedrockEngine(bedrock_client=mock_bedrock_client)
-
-    assert isinstance(engine.client, ChatBedrock)
-    assert engine.model_string == "anthropic.claude-3-sonnet-20240229-v1:0"
-    assert engine.system_prompt == ChatBedrockEngine.SYSTEM_PROMPT
-    assert engine.is_multimodal is False
-    assert engine.kwargs == {}
-
-
-def test_chat_bedrock_engine_invalid_system_prompt(mock_bedrock_client):
-    with pytest.raises(AssertionError):
-        ChatBedrockEngine(bedrock_client=mock_bedrock_client, system_prompt=123)
-
-
-def test_chat_bedrock_engine_call(mock_bedrock_client):
-    model_kwargs = {"temperature": 0.7, "max_tokens": 1000}
-    additional_kwargs = {"temperature": 0.8}
-
-    engine = ChatBedrockEngine(bedrock_client=mock_bedrock_client, **model_kwargs)
-
-    engine.generate = Mock(return_value="Mocked response")
-
-    prompt = "Hello, how are you?"
-    response = engine(prompt, **additional_kwargs)
-    assert response == "Mocked response"
-    engine.generate.assert_called_with(prompt, max_tokens=1000, temperature=0.8)
-
-    response = engine(prompt)
-    assert response == "Mocked response"
-    engine.generate.assert_called_with(prompt, max_tokens=1000, temperature=0.7)
-
-
-def test_generate_with_string_input(mock_bedrock_client):
-    engine = ChatBedrockEngine(bedrock_client=mock_bedrock_client)
-    engine._generate_from_single_prompt = Mock(return_value="Mocked response")
-
-    response = engine.generate("Hello, how are you?")
-
-    assert response == "Mocked response"
-    engine._generate_from_single_prompt.assert_called_once_with("Hello, how are you?", system_prompt=None)
\ No newline at end of file
diff --git a/tests/test_vllm.py b/tests/test_vllm.py
deleted file mode 100644
index d3de221..0000000
--- a/tests/test_vllm.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-import logging
-import pytest
-from typing import Union, List
-
-from textgrad import Variable, BlackboxLLM, TextLoss
-from textgrad.optimizer import TextualGradientDescent
-from textgrad.engine.vllm import ChatVLLM
-
-logging.disable(logging.CRITICAL)
-vllm_engine = ChatVLLM(model_string="meta-llama/Meta-Llama-3-8B-Instruct")
-
-def test_import_vllm():
-    assert ChatVLLM
-
-def test_simple_forward_pass_engine():
-    text = Variable("Hello", role_description="A variable")
-    engine = BlackboxLLM(engine=vllm_engine)
-    response = engine(text)
-
-    assert response
-
-def test_primitives():
-    """
-    Test the basic functionality of the Variable class.
-    """
-    x = Variable("A sntence with a typo", role_description="The input sentence", requires_grad=True)
-    system_prompt = Variable("Evaluate the correctness of this sentence", role_description="The system prompt")
-    loss = TextLoss(system_prompt, engine=vllm_engine)
-    optimizer = TextualGradientDescent(parameters=[x], engine=vllm_engine)
-
-    l = loss(x)
-    l.backward(vllm_engine)
-    optimizer.step()
-
-    assert x.value == "A sentence with a typo"