From 178519379a67baa885123d4276216c1c314332af Mon Sep 17 00:00:00 2001
From: Anton Dubovik
Date: Wed, 30 Oct 2024 11:40:51 +0000
Subject: [PATCH] fix: fixed integration tests (#147)

---
 .env.example                                   |   3 +-
 README.md                                      |   9 +-
 aidial_adapter_bedrock/aws_client_config.py    |   4 +-
 aidial_adapter_bedrock/llm/truncate_prompt.py  |   5 +-
 aidial_adapter_bedrock/utils/env.py            |   2 +
 noxfile.py                                     |   4 +-
 pyproject.toml                                 |   3 +-
 tests/conftest.py                              |  50 +++++-
 .../integration_tests/test_chat_completion.py | 170 ++++++++++++------
 tests/integration_tests/test_embeddings.py    |  12 +-
 tests/integration_tests/test_models.py        |  30 +---
 tests/unit_tests/conftest.py                  |   8 -
 tests/unit_tests/test_app.py                  |  10 +-
 .../test_aws_client_config_factory.py         |   6 +-
 tests/unit_tests/test_endpoints.py            |  44 +++--
 tests/unit_tests/test_truncate_prompt.py      |  11 ++
 tests/utils/openai.py                         |  12 --
 tests/utils/server.py                         |  57 ------
 18 files changed, 229 insertions(+), 211 deletions(-)
 delete mode 100644 tests/unit_tests/conftest.py
 delete mode 100644 tests/utils/server.py

diff --git a/.env.example b/.env.example
index 2929d963..5315c476 100644
--- a/.env.example
+++ b/.env.example
@@ -7,5 +7,4 @@ DIAL_URL=
 
 # Misc env vars for the server
 LOG_LEVEL=INFO # Default in prod is INFO. Use DEBUG for dev.
-WEB_CONCURRENCY=1 # Number of uvicorn workers
-TEST_SERVER_URL=http://0.0.0.0:5001
\ No newline at end of file
+WEB_CONCURRENCY=1 # Number of uvicorn workers
\ No newline at end of file
diff --git a/README.md b/README.md
index edebf176..eb76c2e3 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,10 @@ Note that a model supports `/truncate_prompt` endpoint if and only if it support
 
 |Vendor|Model|Deployment name|Modality|`/tokenize`|`/truncate_prompt`, `max_prompt_tokens`|tools/functions|
 |---|---|---|---|---|---|---|
-|Anthropic|Claude 3.5 Sonnet|anthropic.claude-3-5-sonnet-20240620-v1:0|text-to-text, image-to-text|🟡|🟡|✅|
-|Anthropic|Claude 3 Sonnet|anthropic.claude-3-sonnet-20240229-v1:0|text-to-text, image-to-text|🟡|🟡|✅|
-|Anthropic|Claude 3 Haiku|anthropic.claude-3-haiku-20240307-v1:0|text-to-text, image-to-text|🟡|🟡|✅|
-|Anthropic|Claude 3 Opus|anthropic.claude-3-opus-20240229-v1:0|text-to-text, image-to-text|🟡|🟡|✅|
+|Anthropic|Claude 3.5 Sonnet|[us.\|eu.]anthropic.claude-3-5-sonnet-20240620-v1:0|text-to-text, image-to-text|🟡|🟡|✅|
+|Anthropic|Claude 3 Sonnet|[us.\|eu.]anthropic.claude-3-sonnet-20240229-v1:0|text-to-text, image-to-text|🟡|🟡|✅|
+|Anthropic|Claude 3 Haiku|[us.\|eu.]anthropic.claude-3-haiku-20240307-v1:0|text-to-text, image-to-text|🟡|🟡|✅|
+|Anthropic|Claude 3 Opus|[us.]anthropic.claude-3-opus-20240229-v1:0|text-to-text, image-to-text|🟡|🟡|✅|
 |Anthropic|Claude 2.1|anthropic.claude-v2:1|text-to-text|✅|✅|✅|
 |Anthropic|Claude 2|anthropic.claude-v2|text-to-text|✅|✅|❌|
 |Anthropic|Claude Instant 1.2|anthropic.claude-instant-v1|text-to-text|🟡|🟡|❌|
@@ -105,7 +105,6 @@ Copy `.env.example` to `.env` and customize it for your environment:
 |AIDIAL_LOG_LEVEL|WARNING|AI DIAL SDK log level|
 |DIAL_URL||URL of the core DIAL server. If defined, images generated by Stability are uploaded to the DIAL file storage and attachments are returned with URLs pointing to the images. Otherwise, the images are returned as base64 encoded strings.|
 |WEB_CONCURRENCY|1|Number of workers for the server|
-|TEST_SERVER_URL|http://0.0.0.0:5001|Server URL used in the integration tests|
 
 ## Load balancing
 
diff --git a/aidial_adapter_bedrock/aws_client_config.py b/aidial_adapter_bedrock/aws_client_config.py
index 9f995472..6b9123a0 100644
--- a/aidial_adapter_bedrock/aws_client_config.py
+++ b/aidial_adapter_bedrock/aws_client_config.py
@@ -2,7 +2,7 @@
 
 import boto3
 from aidial_sdk.embeddings import Request
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from aidial_adapter_bedrock.utils.concurrency import make_async
 from aidial_adapter_bedrock.utils.env import get_aws_default_region
@@ -43,7 +43,7 @@ def get_anthropic_bedrock_client_kwargs(self) -> dict:
 
 
 class UpstreamConfig(BaseModel):
-    region: str = get_aws_default_region()
+    region: str = Field(default_factory=get_aws_default_region)
     aws_access_key_id: str | None = None
     aws_secret_access_key: str | None = None
     aws_assume_role_arn: str | None = os.environ.get("AWS_ASSUME_ROLE_ARN")
diff --git a/aidial_adapter_bedrock/llm/truncate_prompt.py b/aidial_adapter_bedrock/llm/truncate_prompt.py
index f7735c5c..7c4de10a 100644
--- a/aidial_adapter_bedrock/llm/truncate_prompt.py
+++ b/aidial_adapter_bedrock/llm/truncate_prompt.py
@@ -51,9 +51,8 @@ def to_dial_exception(self) -> DialException:
 
 
 def _partition_indexer(chunks: List[int]) -> Callable[[int], List[int]]:
-    """Returns a function that maps an index to indices of its partition.
-    >>> [_partition_indexer([2, 3])(i) for i in range(5)]
-    [[0, 1], [0, 1], [2, 3, 4], [2, 3, 4], [2, 3, 4]]
+    """
+    Returns a function that maps an index to indices of its partition.
     """
     mapping: dict[int, List[int]] = {}
     offset = 0
diff --git a/aidial_adapter_bedrock/utils/env.py b/aidial_adapter_bedrock/utils/env.py
index cae7bcb4..05cd3a2d 100644
--- a/aidial_adapter_bedrock/utils/env.py
+++ b/aidial_adapter_bedrock/utils/env.py
@@ -1,4 +1,5 @@
 import os
+from functools import cache
 from typing import Optional
 
 from aidial_adapter_bedrock.utils.log_config import app_logger as log
@@ -13,6 +14,7 @@ def get_env(name: str, err_msg: Optional[str] = None) -> str:
     raise Exception(err_msg or f"{name} env variable is not set")
 
 
+@cache
 def get_aws_default_region() -> str:
     region = os.getenv("DEFAULT_REGION")
     if region is not None:
diff --git a/noxfile.py b/noxfile.py
index c6a27330..850f3ad4 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -35,7 +35,7 @@ def format(session: nox.Session):
 
 def run_tests(session: nox.Session, *args):
     session.run("poetry", "install", external=True)
-    session.run("pytest", "aidial_adapter_bedrock", *args)
+    session.run("pytest", *args)
 
 
 @nox.session
@@ -45,4 +45,4 @@ def test(session: nox.Session):
 
 @nox.session
 def integration_tests(session: nox.Session):
-    run_tests(session, "-n=auto", "tests/integration_tests/")
+    run_tests(session, "tests/integration_tests/")
diff --git a/pyproject.toml b/pyproject.toml
index 8e216382..7034abba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,8 @@ flake8 = "6.0.0"
 nox = "^2023.4.22"
 [tool.pytest.ini_options]
-addopts = "--doctest-modules"
+addopts="-n=auto"
+env_override_existing_values = 1
 # muting warnings coming from opentelemetry package
 filterwarnings = [
     "ignore::DeprecationWarning:opentelemetry.instrumentation.dependencies"
 ]
diff --git a/tests/conftest.py b/tests/conftest.py
index e96ed59a..c343116d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,15 +1,47 @@
-import os
+from typing import Mapping
 
+import httpx
 import pytest
+import pytest_asyncio
+from httpx import ASGITransport
+from openai import AsyncAzureOpenAI
 
-from tests.utils.server import server_generator
 
-DEFAULT_API_VERSION = "2023-03-15-preview"
-TEST_SERVER_URL = os.getenv("TEST_SERVER_URL", "http://0.0.0.0:5001")
+@pytest.fixture(autouse=True)
+def configure_unit_tests(monkeypatch, request):
+    """
+    Set up fake environment variables for unit tests.
+    """
+    if "tests/unit_tests" in request.node.nodeid:
+        monkeypatch.setenv("AWS_DEFAULT_REGION", "test-region")
 
 
-@pytest.fixture(scope="module")
-def server():
-    yield from server_generator(
-        "aidial_adapter_bedrock.app:app", TEST_SERVER_URL
-    )
+@pytest_asyncio.fixture
+async def test_http_client():
+    from aidial_adapter_bedrock.app import app
+
+    async with httpx.AsyncClient(
+        transport=ASGITransport(app),  # type: ignore
+        base_url="http://test-app.com",
+    ) as client:
+        yield client
+
+
+@pytest.fixture
+def get_openai_client(test_http_client: httpx.AsyncClient):
+    def _get_client(
+        deployment_id: str | None = None,
+        extra_headers: Mapping[str, str] | None = None,
+    ) -> AsyncAzureOpenAI:
+        return AsyncAzureOpenAI(
+            azure_endpoint=str(test_http_client.base_url),
+            azure_deployment=deployment_id,
+            api_version="",
+            api_key="dummy_key",
+            max_retries=2,
+            timeout=30,
+            http_client=test_http_client,
+            default_headers=extra_headers,
+        )
+
+    yield _get_client
diff --git a/tests/integration_tests/test_chat_completion.py b/tests/integration_tests/test_chat_completion.py
index b5815f4f..80f8da57 100644
--- a/tests/integration_tests/test_chat_completion.py
+++ b/tests/integration_tests/test_chat_completion.py
@@ -1,6 +1,6 @@
 import re
 from dataclasses import dataclass
-from typing import Callable, List
+from typing import Callable, List, Mapping
 
 import pytest
 from openai import APIError, BadRequestError, UnprocessableEntityError
@@ -11,9 +11,12 @@
 from openai.types.chat.completion_create_params import Function
 from pydantic import BaseModel
 
+from aidial_adapter_bedrock.aws_client_config import (
+    AWSClientConfigFactory,
+    UpstreamConfig,
+)
 from aidial_adapter_bedrock.deployments import ChatCompletionDeployment
 from aidial_adapter_bedrock.utils.resource import Resource
-from tests.conftest import TEST_SERVER_URL
 from tests.utils.openai import (
     GET_WEATHER_FUNCTION,
     ChatCompletionResult,
@@ -24,7 +27,6 @@
     function_request,
     function_response,
     function_to_tool,
-    get_client,
     is_valid_function_call,
     is_valid_tool_call,
     sanitize_test_name,
@@ -53,6 +55,7 @@ class TestCase:
 
     __test__ = False
 
     name: str
+    region: str
     deployment: ChatCompletionDeployment
     streaming: bool
@@ -78,28 +81,32 @@ def get_id(self):
         )
 
 
-chat_deployments = [
-    ChatCompletionDeployment.AMAZON_TITAN_TG1_LARGE,
-    ChatCompletionDeployment.AI21_J2_GRANDE_INSTRUCT,
-    ChatCompletionDeployment.AI21_J2_JUMBO_INSTRUCT,
-    ChatCompletionDeployment.AI21_J2_MID_V1,
-    ChatCompletionDeployment.AI21_J2_ULTRA_V1,
-    ChatCompletionDeployment.ANTHROPIC_CLAUDE_INSTANT_V1,
-    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V2,
-    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V2_1,
-    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_SONNET,
-    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_SONNET_US,
-    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_5_SONNET,
-    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_5_SONNET_US,
-    ChatCompletionDeployment.META_LLAMA2_13B_CHAT_V1,
-    ChatCompletionDeployment.META_LLAMA2_70B_CHAT_V1,
-    ChatCompletionDeployment.META_LLAMA3_8B_INSTRUCT_V1,
-    ChatCompletionDeployment.META_LLAMA3_70B_INSTRUCT_V1,
-    ChatCompletionDeployment.META_LLAMA3_1_405B_INSTRUCT_V1,
-    ChatCompletionDeployment.META_LLAMA3_1_70B_INSTRUCT_V1,
-    ChatCompletionDeployment.META_LLAMA3_1_8B_INSTRUCT_V1,
-    ChatCompletionDeployment.COHERE_COMMAND_TEXT_V14,
-]
+_EAST = "us-east-1"
+_WEST = "us-west-2"
+
+chat_deployments: Mapping[ChatCompletionDeployment, str] = {
+    ChatCompletionDeployment.AMAZON_TITAN_TG1_LARGE: _WEST,
+    ChatCompletionDeployment.AI21_J2_GRANDE_INSTRUCT: _EAST,
+    ChatCompletionDeployment.AI21_J2_JUMBO_INSTRUCT: _EAST,
+    ChatCompletionDeployment.AI21_J2_MID_V1: _EAST,
+    ChatCompletionDeployment.AI21_J2_ULTRA_V1: _EAST,
+    ChatCompletionDeployment.ANTHROPIC_CLAUDE_INSTANT_V1: _WEST,
+    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V2: _WEST,
+    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V2_1: _WEST,
+    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_SONNET: _WEST,
+    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_SONNET_US: _WEST,
+    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_5_SONNET: _WEST,
+    ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_5_SONNET_US: _WEST,
+    ChatCompletionDeployment.META_LLAMA2_13B_CHAT_V1: _WEST,
+    ChatCompletionDeployment.META_LLAMA2_70B_CHAT_V1: _WEST,
+    ChatCompletionDeployment.META_LLAMA3_8B_INSTRUCT_V1: _WEST,
+    ChatCompletionDeployment.META_LLAMA3_70B_INSTRUCT_V1: _WEST,
+    ChatCompletionDeployment.META_LLAMA3_1_405B_INSTRUCT_V1: _WEST,
+    ChatCompletionDeployment.META_LLAMA3_1_70B_INSTRUCT_V1: _WEST,
+    ChatCompletionDeployment.META_LLAMA3_1_8B_INSTRUCT_V1: _WEST,
+    ChatCompletionDeployment.COHERE_COMMAND_TEXT_V14: _WEST,
+    ChatCompletionDeployment.COHERE_COMMAND_LIGHT_TEXT_V14: _WEST,
+}
 
 
 def supports_tools(deployment: ChatCompletionDeployment) -> bool:
@@ -126,11 +133,21 @@ def is_llama3(deployment: ChatCompletionDeployment) -> bool:
     ]
 
 
+def is_cohere(deployment: ChatCompletionDeployment) -> bool:
+    return deployment in [
+        ChatCompletionDeployment.COHERE_COMMAND_LIGHT_TEXT_V14,
+        ChatCompletionDeployment.COHERE_COMMAND_TEXT_V14,
+    ]
+
+
 def is_claude3(deployment: ChatCompletionDeployment) -> bool:
     return deployment in [
         ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_SONNET,
         ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_SONNET_US,
         ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_SONNET_EU,
+        ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_5_SONNET,
+        ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_5_SONNET_US,
+        ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_5_SONNET_EU,
         ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_HAIKU,
         ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_HAIKU_US,
         ChatCompletionDeployment.ANTHROPIC_CLAUDE_V3_HAIKU_EU,
@@ -139,6 +156,20 @@ def is_claude3(deployment: ChatCompletionDeployment) -> bool:
     ]
 
 
+def is_ai21(deployment: ChatCompletionDeployment) -> bool:
+    return deployment in [
+        ChatCompletionDeployment.AI21_J2_GRANDE_INSTRUCT,
+        ChatCompletionDeployment.AI21_J2_JUMBO_INSTRUCT,
+    ]
+
+
+cohere_invalid_request_error = ExpectedException(
+    type=BadRequestError,
+    message="Invalid parameter combination",
+    status_code=400,
+)
+
+
 def is_vision_model(deployment: ChatCompletionDeployment) -> bool:
     return is_claude3(deployment)
 
@@ -156,7 +187,7 @@ def are_tools_emulated(deployment: ChatCompletionDeployment) -> bool:
 
 
 def get_test_cases(
-    deployment: ChatCompletionDeployment, streaming: bool
+    deployment: ChatCompletionDeployment, region: str, streaming: bool
 ) -> List[TestCase]:
     test_cases: List[TestCase] = []
 
@@ -184,6 +215,7 @@ def test_case(
         test_cases.append(
             TestCase(
                 name,
+                region,
                 deployment,
                 streaming,
                 messages,
@@ -196,6 +228,17 @@
             )
         )
 
+    def dial_recall_expected(r: ChatCompletionResult):
+        content = r.content.lower()
+        success = "anton" in content
+        # Amazon Titan and Cohere performances have degraded recently
+        if deployment in [
+            ChatCompletionDeployment.AMAZON_TITAN_TG1_LARGE,
+            ChatCompletionDeployment.COHERE_COMMAND_TEXT_V14,
+        ]:
+            return not success
+        return success
+
     test_case(
         name="dialog recall",
         messages=[
@@ -203,7 +246,8 @@
             ai("nice to meet you"),
             user("what's my name?"),
         ],
-        expected=lambda s: "anton" in s.content.lower(),
+        max_tokens=32,
+        expected=dial_recall_expected,
     )
 
     test_case(
@@ -240,38 +284,46 @@
         ),
     )
 
+    expected_empty_message_error = expected_success
+    if is_claude3(deployment):
+        expected_empty_message_error = streaming_error(
+            ExpectedException(
+                type=BadRequestError,
+                message="messages: text content blocks must be non-empty",
+                status_code=400,
+            )
+        )
+    elif is_cohere(deployment):
+        expected_empty_message_error = streaming_error(
+            cohere_invalid_request_error
+        )
+
     test_case(
         name="empty user message",
         max_tokens=1,
         messages=[user("")],
-        expected=(
-            streaming_error(
-                ExpectedException(
-                    type=BadRequestError,
-                    message="messages: text content blocks must be non-empty",
-                    status_code=400,
-                )
-            )
-            if is_claude3(deployment)
-            else expected_success
-        ),
+        expected=expected_empty_message_error,
     )
 
+    expected_whitespace_message = expected_success
+    if is_claude3(deployment):
+        expected_whitespace_message = streaming_error(
+            ExpectedException(
+                type=BadRequestError,
+                message="messages: text content blocks must contain non-whitespace text",
+                status_code=400,
+            )
+        )
+    elif is_cohere(deployment):
+        expected_whitespace_message = streaming_error(
+            cohere_invalid_request_error
+        )
+
     test_case(
         name="single space user message",
        max_tokens=1,
         messages=[user(" ")],
-        expected=(
-            streaming_error(
-                ExpectedException(
-                    type=BadRequestError,
-                    message="messages: text content blocks must contain non-whitespace text",
-                    status_code=400,
-                )
-            )
-            if is_claude3(deployment)
-            else expected_success
-        ),
+        expected=expected_whitespace_message,
     )
 
     if is_vision_model(deployment):
@@ -298,7 +350,7 @@
         )
 
     # ai21 models do not support more than one stop word
-    if "ai21" in deployment.model_id:
+    if is_ai21(deployment):
         stop = ["John"]
     else:
         stop = ["John", "john"]
@@ -463,19 +515,29 @@ def _check(id: str) -> bool:
     return test_cases
 
 
+def get_extra_headers(region: str) -> Mapping[str, str]:
+    return {
+        AWSClientConfigFactory.UPSTREAM_CONFIG_HEADER_NAME: UpstreamConfig(
+            region=region
+        ).json()
+    }
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "test",
     [
         test
-        for deployment in chat_deployments
+        for deployment, region in chat_deployments.items()
         for streaming in [False, True]
-        for test in get_test_cases(deployment, streaming)
+        for test in get_test_cases(deployment, region, streaming)
     ],
     ids=lambda test: test.get_id(),
 )
-async def test_chat_completion_openai(server, test: TestCase):
-    client = get_client(TEST_SERVER_URL, test.deployment.value)
+async def test_chat_completion_openai(get_openai_client, test: TestCase):
+    client = get_openai_client(
+        test.deployment.value, get_extra_headers(test.region)
+    )
 
     async def run_chat_completion() -> ChatCompletionResult:
         return await chat_completion(
diff --git a/tests/integration_tests/test_embeddings.py b/tests/integration_tests/test_embeddings.py
index fa87d3cd..59d2c60f 100644
--- a/tests/integration_tests/test_embeddings.py
+++ b/tests/integration_tests/test_embeddings.py
@@ -3,13 +3,11 @@
 from typing import Any, Callable, List
 
 import pytest
-from openai import AsyncAzureOpenAI
 from openai.types import CreateEmbeddingResponse
 
 from aidial_adapter_bedrock.deployments import EmbeddingsDeployment
 from aidial_adapter_bedrock.llm.consumer import Attachment
 from aidial_adapter_bedrock.utils.json import remove_nones
-from tests.conftest import DEFAULT_API_VERSION, TEST_SERVER_URL
 from tests.utils.openai import sanitize_test_name
 
 
@@ -210,15 +208,9 @@ def get_image_test_cases(
     ],
     ids=lambda test: test.get_id(),
 )
-async def test_embeddings(server, test: TestCase):
+async def test_embeddings(get_openai_client, test: TestCase):
     model_id = test.deployment.value
-
-    client = AsyncAzureOpenAI(
-        azure_endpoint=TEST_SERVER_URL,
-        azure_deployment=model_id,
-        api_version=DEFAULT_API_VERSION,
-        api_key="dummy_key",
-    )
+    client = get_openai_client(model_id)
 
     async def run() -> CreateEmbeddingResponse:
         return await client.embeddings.create(
diff --git a/tests/integration_tests/test_models.py b/tests/integration_tests/test_models.py
index 996fe6b0..cadf8152 100644
--- a/tests/integration_tests/test_models.py
+++ b/tests/integration_tests/test_models.py
@@ -1,26 +1,13 @@
 from typing import List
 
-import requests
-from openai import AzureOpenAI
+import pytest
+from openai import AsyncAzureOpenAI
 
 from aidial_adapter_bedrock.deployments import ChatCompletionDeployment
-from tests.conftest import DEFAULT_API_VERSION, TEST_SERVER_URL
 
 
-def models_request_http() -> List[str]:
-    response = requests.get(f"{TEST_SERVER_URL}/openai/models")
-    assert response.status_code == 200
-    data = response.json()["data"]
-    return [model["id"] for model in data]
-
-
-def models_request_openai() -> List[str]:
-    client = AzureOpenAI(
-        azure_endpoint=TEST_SERVER_URL,
-        api_version=DEFAULT_API_VERSION,
-        api_key="dummy_key",
-    )
-    data = client.models.list().data
+async def models_request_openai(client: AsyncAzureOpenAI) -> List[str]:
+    data = (await client.models.list()).data
     return [model.id for model in data]
 
 
@@ -32,9 +19,6 @@ def assert_models_subset(actual_models: List[str]):
     ), f"Expected models: {expected_models}, Actual models: {actual_models}"
 
 
-def test_model_list_http(server):
-    assert_models_subset(models_request_http())
-
-
-def test_model_list_openai(server):
-    assert_models_subset(models_request_openai())
+@pytest.mark.asyncio
+async def test_model_list_openai(get_openai_client):
+    assert_models_subset(await models_request_openai(get_openai_client()))
diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py
deleted file mode 100644
index 691d2528..00000000
--- a/tests/unit_tests/conftest.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import os
-
-
-def pytest_configure(config):
-    """
-    Setting up fake environment variables for unit tests.
-    """
-    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
diff --git a/tests/unit_tests/test_app.py b/tests/unit_tests/test_app.py
index 855a6d78..9d737072 100644
--- a/tests/unit_tests/test_app.py
+++ b/tests/unit_tests/test_app.py
@@ -1,10 +1,10 @@
 from typing import Any
 
-import requests
+import httpx
+import pytest
 
-from tests.conftest import TEST_SERVER_URL
-
 
-def test_availability(server) -> Any:
-    response = requests.get(f"{TEST_SERVER_URL}/health")
+@pytest.mark.asyncio
+async def test_availability(test_http_client: httpx.AsyncClient) -> Any:
+    response = await test_http_client.get("health")
     assert response.status_code == 200
diff --git a/tests/unit_tests/test_aws_client_config_factory.py b/tests/unit_tests/test_aws_client_config_factory.py
index 23219144..702de80e 100644
--- a/tests/unit_tests/test_aws_client_config_factory.py
+++ b/tests/unit_tests/test_aws_client_config_factory.py
@@ -28,7 +28,7 @@ async def test__get_client_config__default_region_in_config(self):
             request=request
         ).get_client_config()
 
-        assert client_config.region == "us-east-1"
+        assert client_config.region == "test-region"
         assert client_config.credentials is None
 
     async def test__get_client_config__region_provided__region_in_config(self):
@@ -52,7 +52,7 @@ async def test__get_client_config__key_in_config(self):
             request=request,
         ).get_client_config()
 
-        assert client_config.region == "us-east-1"
+        assert client_config.region == "test-region"
         assert client_config.credentials is not None
         assert client_config.credentials.aws_access_key_id == "key_id"
         assert client_config.credentials.aws_secret_access_key == "key"
@@ -76,7 +76,7 @@ async def test__get_client_config__role_arn__tmp_credentials_in_config(
             request=request,
         ).get_client_config()
 
-        assert client_config.region == "us-east-1"
+        assert client_config.region == "test-region"
         assert client_config.credentials is not None
         assert client_config.credentials.aws_access_key_id == "key_id"
         assert client_config.credentials.aws_secret_access_key == "key"
diff --git a/tests/unit_tests/test_endpoints.py b/tests/unit_tests/test_endpoints.py
index ea0165c2..8871aa1c 100644
--- a/tests/unit_tests/test_endpoints.py
+++ b/tests/unit_tests/test_endpoints.py
@@ -1,10 +1,9 @@
 from typing import List, Tuple
 
+import httpx
 import pytest
-import requests
 
 from aidial_adapter_bedrock.deployments import ChatCompletionDeployment
-from tests.conftest import TEST_SERVER_URL
 
 test_cases: List[Tuple[ChatCompletionDeployment, bool, bool]] = [
     (ChatCompletionDeployment.AMAZON_TITAN_TG1_LARGE, True, True),
@@ -40,22 +39,25 @@
 ]
 
 
-def feature_test_helper(
-    url: str, is_supported: bool, headers: dict, payload: dict
+async def assert_feature(
+    http_client: httpx.AsyncClient,
+    endpoint: str,
+    is_supported: bool,
+    headers: dict,
+    payload: dict,
 ) -> None:
-    response = requests.post(url, json=payload, headers=headers)
+    response = await http_client.post(endpoint, json=payload, headers=headers)
     assert (
         response.status_code != 404
-    ) == is_supported, (
-        f"is_supported={is_supported}, code={response.status_code}, url={url}"
-    )
+    ) == is_supported, f"is_supported={is_supported}, code={response.status_code}, url={endpoint}"
 
 
+@pytest.mark.asyncio
 @pytest.mark.parametrize(
     "deployment, tokenize_supported, truncate_supported", test_cases
 )
-def test_model_features(
-    server,
+async def test_model_features(
+    test_http_client: httpx.AsyncClient,
     deployment: ChatCompletionDeployment,
     tokenize_supported: bool,
     truncate_supported: bool,
@@ -63,10 +65,22 @@
     payload = {"inputs": []}
     headers = {"Content-Type": "application/json", "Api-Key": "dummy"}
 
-    BASE_URL = f"{TEST_SERVER_URL}/openai/deployments/{deployment.value}"
+    base = f"openai/deployments/{deployment.value}"
 
-    tokenize_url = f"{BASE_URL}/tokenize"
-    feature_test_helper(tokenize_url, tokenize_supported, headers, payload)
+    tokenize_endpoint = f"{base}/tokenize"
+    await assert_feature(
+        test_http_client,
+        tokenize_endpoint,
+        tokenize_supported,
+        headers,
+        payload,
+    )
 
-    truncate_url = f"{BASE_URL}/truncate_prompt"
-    feature_test_helper(truncate_url, truncate_supported, headers, payload)
+    truncate_endpoint = f"{base}/truncate_prompt"
+    await assert_feature(
+        test_http_client,
+        truncate_endpoint,
+        truncate_supported,
+        headers,
+        payload,
+    )
diff --git a/tests/unit_tests/test_truncate_prompt.py b/tests/unit_tests/test_truncate_prompt.py
index 28221f94..437e1765 100644
--- a/tests/unit_tests/test_truncate_prompt.py
+++ b/tests/unit_tests/test_truncate_prompt.py
@@ -10,6 +10,7 @@
 from aidial_adapter_bedrock.llm.truncate_prompt import (
     DiscardedMessages,
     TruncatePromptError,
+    _partition_indexer,
     compute_discarded_messages,
 )
 from tests.utils.messages import ai, sys, user
@@ -33,6 +34,16 @@ async def _tokenize_by_words(messages: List[BaseMessage]) -> int:
     )
 
 
+def test_partition_indexer():
+    assert [_partition_indexer([2, 3])(i) for i in range(5)] == [
+        [0, 1],
+        [0, 1],
+        [2, 3, 4],
+        [2, 3, 4],
+        [2, 3, 4],
+    ]
+
+
 @pytest.mark.asyncio
 async def test_no_truncation():
     messages = [
diff --git a/tests/utils/openai.py b/tests/utils/openai.py
index 2f3780fd..0f268841 100644
--- a/tests/utils/openai.py
+++ b/tests/utils/openai.py
@@ -33,7 +33,6 @@
 from pydantic import BaseModel
 
 from aidial_adapter_bedrock.utils.resource import Resource
-from tests.conftest import DEFAULT_API_VERSION
 from tests.utils.json import match_objects
 
 
@@ -220,17 +219,6 @@ async def generator() -> AsyncGenerator[dict, None]:
     return ChatCompletionResult(response=response)
 
 
-def get_client(base_url: str, model_id: str) -> AsyncAzureOpenAI:
-    return AsyncAzureOpenAI(
-        azure_endpoint=base_url,
-        azure_deployment=model_id,
-        api_version=DEFAULT_API_VERSION,
-        api_key="dummy_key",
-        max_retries=0,
-        timeout=30,
-    )
-
-
 GET_WEATHER_FUNCTION: Function = {
     "name": "get_current_weather",
     "description": "Get the current weather",
diff --git a/tests/utils/server.py b/tests/utils/server.py
deleted file mode 100644
index c503c529..00000000
--- a/tests/utils/server.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import time
-from multiprocessing import Process
-from urllib.parse import urlparse
-
-import requests
-import uvicorn
-
-
-def ping_server(url: str) -> bool:
-    try:
-        requests.get(url, timeout=1)
-        return True
-    except requests.ConnectionError:
-        return False
-
-
-def wait_for_server(url: str, timeout=10) -> None:
-    start_time = time.time()
-
-    while True:
-        if ping_server(url):
-            return
-
-        if time.time() - start_time > timeout:
-            raise Exception("The test server didn't start in time!")
-
-        time.sleep(0.1)
-
-
-def terminate_process(process: Process):
-    process.terminate()
-    process.join()
-
-
-def server_generator(module: str, url: str):
-    already_exists = ping_server(url)
-
-    server_process: Process | None = None
-    if not already_exists:
-        parsed_url = urlparse(url)
-        server_process = Process(
-            target=uvicorn.run,
-            args=(module,),
-            kwargs={"host": parsed_url.hostname, "port": parsed_url.port},
-        )
-        server_process.start()
-
-    try:
-        wait_for_server(url)
-    except Exception as e:
-        terminate_process(server_process)
-        raise Exception("Can't start the test server") from e
-
-    yield
-
-    if server_process is not None:
-        terminate_process(server_process)