diff --git a/.github/workflows/merge_queue.yml b/.github/workflows/merge_queue.yml
index be3423f..515692c 100644
--- a/.github/workflows/merge_queue.yml
+++ b/.github/workflows/merge_queue.yml
@@ -13,7 +13,7 @@ on:
 jobs:
   test:
     name: Run Tests
-    if: github.event_name == 'pull_request' || github.event_name == 'push' || github.event_name == 'merge_group'
+    if: github.event_name == 'pull_request' || github.event_name == 'push'
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
@@ -28,4 +28,26 @@ jobs:
         pip install pytest
     - name: Run tests
       run: |
-        pytest --ignore=tests/test_vllm.py --ignore=tests/test_bedrock.py
\ No newline at end of file
+        pytest --ignore=tests/test_api.py
+
+  merge-queue-only-test:
+    name: Merge Queue Only Tests
+    if: github.event_name == 'merge_group'
+    runs-on: ubuntu-latest
+    environment: test
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install pytest
+    - name: Run merge queue specific tests
+      run: |
+        pytest tests/test_api.py
diff --git a/tests/test_api.py b/tests/test_api.py
new file mode 100644
index 0000000..a890a0d
--- /dev/null
+++ b/tests/test_api.py
@@ -0,0 +1,158 @@
+import pytest
+import logging
+import random
+import textgrad as tg
+# We'll use below utilities to run a python function.
+from IPython.core.interactiveshell import InteractiveShell
+
+logging.disable(logging.CRITICAL)
+
+SYSTEM_PROMPT = "You are a smart language model that evaluates code snippets. You do not solve problems or propose new code snippets, only evaluate existing solutions critically and give very concise feedback."
+INSTRUCTION = """Think about the problem and the code snippet. Does the code solve the problem? What is the runtime complexity?"""
+
+PROBLEM_TEXT = """Longest Increasing Subsequence (LIS)
+
+    Problem Statement:
+    Given a sequence of integers, find the length of the longest subsequence that is strictly increasing. A subsequence is a sequence that can be derived from another sequence by deleting some or no elements without changing the order of the remaining elements.
+
+    Input:
+    The input consists of a list of integers representing the sequence.
+
+    Output:
+    The output should be an integer representing the length of the longest increasing subsequence."""
+
+INITIAL_SOLUTION = """
+    def longest_increasing_subsequence(nums):
+        n = len(nums)
+        dp = [1] * n
+
+        for i in range(1, n):
+            for j in range(i):
+                if nums[i] > nums[j]:
+                    dp[i] = max(dp[i], dp[j] + 1)
+
+        max_length = max(dp)
+        lis = []
+
+        for i in range(n - 1, -1, -1):
+            if dp[i] == max_length:
+                lis.append(nums[i])
+                max_length -= 1
+
+        return len(lis[::-1])
+    """
+
+BUGGED_SOLUTION = """
+def longest_increasing_subsequence(nums):
+    n = len(nums)
+    dp = [1] * n
+
+    for i in range(1, n):
+        for j in range(i):
+            if nums[i] > nums[j]:
+                dp[i] = max(dp[i], dp[j] + 1)
+
+    max_length = max(dp)
+    lis = []
+
+    for i in range(n - 1, -1, -1):
+        if dp[i] == max_length:
+            lis.append(nums[i])
+            max_length -= 1
+
+    return len(lis[::-1])+1
+    """
+
+
+def generate_random_test_case(size, min_value, max_value):
+    """Return a list of ``size`` random integers drawn from [min_value, max_value]."""
+    return [random.randint(min_value, max_value) for _ in range(size)]
+
+
+def run_function_in_interpreter(func_code):
+    """Execute ``func_code`` in the IPython shell and return the function it defines.
+
+    The function name is parsed from the first ``def`` statement in ``func_code``.
+    Raises ``KeyError`` when running the cell did not define that name.
+    """
+    func_name = func_code.split("def ")[1].split("(")[0].strip()
+    interpreter = InteractiveShell.instance()
+
+    # The shell instance is reused between calls, so drop any earlier definition:
+    # otherwise a cell that fails to run would silently hand back the stale function.
+    interpreter.user_ns.pop(func_name, None)
+    interpreter.run_cell(func_code, store_history=False, silent=True)
+
+    return interpreter.user_ns[func_name]
+
+
+def eval_function_with_asserts(fn):
+    """Assert that ``fn`` returns the expected LIS length for a fixed set of inputs."""
+    test_cases = [
+        ([10, 22, 9, 33, 21, 50, 41, 60], 5),
+        ([7, 2, 1, 3, 8, 4, 9, 6, 5], 4),
+        ([5, 4, 3, 2, 1], 1),
+        ([1, 2, 3, 4, 5], 5),
+        ([3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5], 4),
+        ([10, 9, 2, 5, 3, 7, 101, 18], 4),
+        ([0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15], 6),
+        ([7, 7, 7, 7, 7, 7, 7], 1),
+        ([20, 25, 47, 35, 56, 68, 98, 101, 212, 301, 415, 500], 11),
+        ([9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 1),
+    ]
+    for nums, expected in test_cases:
+        assert fn(nums) == expected, f"expected {expected} for {nums}"
+
+    print("All test cases passed!")
+
+
+def _optimize_solution_once(solution_code):
+    """Run one TextGrad optimization step on ``solution_code`` and return the updated code."""
+    llm_engine = tg.get_engine("experimental:gpt-4o-mini")
+    # override=True in both tests, so neither depends on which one set the engine first.
+    tg.set_backward_engine(llm_engine, override=True)
+
+    code = tg.Variable(value=solution_code,
+                       requires_grad=True,
+                       role_description="code instance to optimize")
+    problem = tg.Variable(PROBLEM_TEXT,
+                          requires_grad=False,
+                          role_description="the coding problem")
+    loss_system_prompt = tg.Variable(SYSTEM_PROMPT,
+                                     requires_grad=False,
+                                     role_description="system prompt to the loss function")
+    optimizer = tg.TGD(parameters=[code])
+
+    # Only the instruction is substituted here; the doubled braces leave
+    # {problem} and {code} as placeholders in the resulting format string.
+    format_string = "{instruction}\nProblem: {{problem}}\nCurrent Code: {{code}}"
+    format_string = format_string.format(instruction=INSTRUCTION)
+    formatted_llm_call = tg.autograd.FormattedLLMCall(engine=llm_engine,
+                                                      format_string=format_string,
+                                                      fields={"problem": None, "code": None},
+                                                      system_prompt=loss_system_prompt)
+
+    loss = formatted_llm_call(inputs={"problem": problem, "code": code},
+                              response_role_description=f"evaluation of the {code.get_role_description()}")
+    loss.backward()
+    optimizer.step()
+    return code.value
+
+
+def test_api():
+    """A working solution must still pass every check after one optimization step."""
+    eval_function_with_asserts(run_function_in_interpreter(INITIAL_SOLUTION))
+
+    optimized_code = _optimize_solution_once(INITIAL_SOLUTION)
+    eval_function_with_asserts(run_function_in_interpreter(optimized_code))
+
+
+def test_bugged():
+    """The bugged solution must fail the checks first and pass them after one optimization step."""
+    longest_increasing_subsequence = run_function_in_interpreter(BUGGED_SOLUTION)
+    # The off-by-one return value makes the first check fail.
+    with pytest.raises(AssertionError):
+        eval_function_with_asserts(longest_increasing_subsequence)
+
+    optimized_code = _optimize_solution_once(BUGGED_SOLUTION)
+    eval_function_with_asserts(run_function_in_interpreter(optimized_code))
diff --git a/tests/test_bedrock.py b/tests/test_bedrock.py
deleted file mode 100644
index ce8d70c..0000000
--- a/tests/test_bedrock.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from unittest.mock import Mock
-
-import pytest
-from botocore.client import BaseClient
-from langchain_aws import ChatBedrock
-
-from textgrad.engine.bedrock import ChatBedrockEngine
-
-
-@pytest.fixture
-def mock_bedrock_client():
-    return Mock(spec=BaseClient)
-
-
-def test_chat_bedrock_engine_init_custom_values(mock_bedrock_client):
-    custom_model_kwargs = {"temperature": 0.7, "max_tokens": 1000}
-    custom_model_string = "anthropic.claude-3-haiku-20240307-v1:0"
-    custom_system_prompt = "You are the best AI assistant ever."
-
-    engine = ChatBedrockEngine(
-        bedrock_client=mock_bedrock_client,
-        model_string=custom_model_string,
-        system_prompt=custom_system_prompt,
-        is_multimodal=True,
-        **custom_model_kwargs
-    )
-
-    assert isinstance(engine.client, ChatBedrock)
-    assert engine.model_string == custom_model_string
-    assert engine.system_prompt == custom_system_prompt
-    assert engine.is_multimodal is True
-    assert engine.kwargs == custom_model_kwargs
-
-
-def test_chat_bedrock_engine_init_default_values(mock_bedrock_client):
-    engine = ChatBedrockEngine(bedrock_client=mock_bedrock_client)
-
-    assert isinstance(engine.client, ChatBedrock)
-    assert engine.model_string == "anthropic.claude-3-sonnet-20240229-v1:0"
-    assert engine.system_prompt == ChatBedrockEngine.SYSTEM_PROMPT
-    assert engine.is_multimodal is False
-    assert engine.kwargs == {}
-
-
-def test_chat_bedrock_engine_invalid_system_prompt(mock_bedrock_client):
-    with pytest.raises(AssertionError):
-        ChatBedrockEngine(bedrock_client=mock_bedrock_client, system_prompt=123)
-
-
-def test_chat_bedrock_engine_call(mock_bedrock_client):
-    model_kwargs = {"temperature": 0.7, "max_tokens": 1000}
-    additional_kwargs = {"temperature": 0.8}
-
-    engine = ChatBedrockEngine(bedrock_client=mock_bedrock_client, **model_kwargs)
-
-    engine.generate = Mock(return_value="Mocked response")
-
-    prompt = "Hello, how are you?"
-    response = engine(prompt, **additional_kwargs)
-    assert response == "Mocked response"
-    engine.generate.assert_called_with(prompt, max_tokens=1000, temperature=0.8)
-
-    response = engine(prompt)
-    assert response == "Mocked response"
-    engine.generate.assert_called_with(prompt, max_tokens=1000, temperature=0.7)
-
-
-def test_generate_with_string_input(mock_bedrock_client):
-    engine = ChatBedrockEngine(bedrock_client=mock_bedrock_client)
-    engine._generate_from_single_prompt = Mock(return_value="Mocked response")
-
-    response = engine.generate("Hello, how are you?")
-
-    assert response == "Mocked response"
-    engine._generate_from_single_prompt.assert_called_once_with("Hello, how are you?", system_prompt=None)
\ No newline at end of file
diff --git a/tests/test_vllm.py b/tests/test_vllm.py
deleted file mode 100644
index d3de221..0000000
--- a/tests/test_vllm.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-import logging
-import pytest
-from typing import Union, List
-
-from textgrad import Variable, BlackboxLLM, TextLoss
-from textgrad.optimizer import TextualGradientDescent
-from textgrad.engine.vllm import ChatVLLM
-
-logging.disable(logging.CRITICAL)
-vllm_engine = ChatVLLM(model_string="meta-llama/Meta-Llama-3-8B-Instruct")
-
-def test_import_vllm():
-    assert ChatVLLM
-
-def test_simple_forward_pass_engine():
-    text = Variable("Hello", role_description="A variable")
-    engine = BlackboxLLM(engine=vllm_engine)
-    response = engine(text)
-
-    assert response
-
-def test_primitives():
-    """
-    Test the basic functionality of the Variable class.
-    """
-    x = Variable("A sntence with a typo", role_description="The input sentence", requires_grad=True)
-    system_prompt = Variable("Evaluate the correctness of this sentence", role_description="The system prompt")
-    loss = TextLoss(system_prompt, engine=vllm_engine)
-    optimizer = TextualGradientDescent(parameters=[x], engine=vllm_engine)
-
-    l = loss(x)
-    l.backward(vllm_engine)
-    optimizer.step()
-
-    assert x.value == "A sentence with a typo"