From df9aa25c1a407565bf1381f6e2916c7cfbd353cd Mon Sep 17 00:00:00 2001 From: Pierce Kelaita Date: Tue, 17 Dec 2024 13:06:19 -0800 Subject: [PATCH] Add o1 models --- CHANGELOG.md | 4 ++ README.md | 16 ++++-- l2m2/client/base_llm_client.py | 7 ++- l2m2/model_info.py | 51 +++++++++++++++++++ tests/l2m2/client/test_base_llm_client.py | 62 +++++++++++++---------- 5 files changed, 108 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f0ead7..262bcd7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,10 @@ _Current version: 0.0.39_ #### Added - Support for [Llama 3.3 70b](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_3/) via [Groq](https://console.groq.com/docs/models) and [Cerebras](https://inference-docs.cerebras.ai/introduction). +- Support for OpenAI's [o1 series](https://openai.com/o1/): `o1`, `o1-preview`, and `o1-mini`. + +> [!NOTE] +> At the time of this release, you must be on OpenAI's [usage tier](https://platform.openai.com/docs/guides/rate-limits) 5 to use `o1` and tier 1+ to use `o1-preview` and `o1-mini`. #### Removed diff --git a/README.md b/README.md index a64abd9..5be63b1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # L2M2: A Simple Python LLM Manager 💬👍 -[![Tests](https://github.com/pkelaita/l2m2/actions/workflows/tests.yml/badge.svg?timestamp=1734052060)](https://github.com/pkelaita/l2m2/actions/workflows/tests.yml) [![codecov](https://codecov.io/github/pkelaita/l2m2/graph/badge.svg?token=UWIB0L9PR8)](https://codecov.io/github/pkelaita/l2m2) [![PyPI version](https://badge.fury.io/py/l2m2.svg?timestamp=1734052060)](https://badge.fury.io/py/l2m2) +[![Tests](https://github.com/pkelaita/l2m2/actions/workflows/tests.yml/badge.svg?timestamp=1734470191)](https://github.com/pkelaita/l2m2/actions/workflows/tests.yml) [![codecov](https://codecov.io/github/pkelaita/l2m2/graph/badge.svg?token=UWIB0L9PR8)](https://codecov.io/github/pkelaita/l2m2) [![PyPI version](https://badge.fury.io/py/l2m2.svg?timestamp=1734470191)](https://badge.fury.io/py/l2m2) **L2M2** ("LLM Manager" → "LLMM" → "L2M2") is a tiny and very simple LLM manager for Python that exposes lots of models through a unified API. This is useful for evaluation, demos, production applications etc. that need to easily be model-agnostic. @@ -8,7 +8,7 @@ ### Features -- 29 supported models (see below) – regularly updated and with more on the way. +- 31 supported models (see below) – regularly updated and with more on the way. - Session chat memory – even across multiple models or with concurrent memory streams. 
- JSON mode - Prompt loading tools @@ -29,6 +29,9 @@ L2M2 currently supports the following models: | --------------------- | ------------------------------------------------------------------ | --------------------------------------------------- | | `gpt-4o` | [OpenAI](https://openai.com/product) | `gpt-4o-2024-11-20` | | `gpt-4o-mini` | [OpenAI](https://openai.com/product) | `gpt-4o-mini-2024-07-18` | +| `o1` | [OpenAI](https://openai.com/product) | `o1` | +| `o1-preview` | [OpenAI](https://openai.com/product) | `o1-preview` | +| `o1-mini` | [OpenAI](https://openai.com/product) | `o1-mini` | | `gpt-4-turbo` | [OpenAI](https://openai.com/product) | `gpt-4-turbo-2024-04-09` | | `gpt-3.5-turbo` | [OpenAI](https://openai.com/product) | `gpt-3.5-turbo-0125` | | `gemini-2.0-flash` | [Google](https://ai.google.dev/) | `gemini-2.0-flash-exp` | @@ -47,15 +50,17 @@ L2M2 currently supports the following models: | `ministral-8b` | [Mistral](https://mistral.ai/) | `ministral-8b-latest` | | `mistral-small` | [Mistral](https://mistral.ai/) | `mistral-small-latest` | | `mixtral-8x7b` | [Groq](https://wow.groq.com/) | `mixtral-8x7b-32768` | -| `gemma-7b` | [Groq](https://wow.groq.com/) | `gemma-7b-it` | | `gemma-2-9b` | [Groq](https://wow.groq.com/) | `gemma2-9b-it` | | `llama-3-8b` | [Groq](https://wow.groq.com/), [Replicate](https://replicate.com/) | `llama3-8b-8192`, `meta/meta-llama-3-8b-instruct` | | `llama-3-70b` | [Groq](https://wow.groq.com/), [Replicate](https://replicate.com/) | `llama3-70b-8192`, `meta/meta-llama-3-70b-instruct` | | `llama-3.1-8b` | [Groq](https://wow.groq.com/), [Cerebras](https://cerebras.ai/) | `llama-3.1-8b-instant`, `llama3.1-8b` | -| `llama-3.1-70b` | [Groq](https://wow.groq.com/), [Cerebras](https://cerebras.ai/) | `llama-3.1-70b-versatile`, `llama3.1-70b` | | `llama-3.1-405b` | [Replicate](https://replicate.com/) | `meta/meta-llama-3.1-405b-instruct` | | `llama-3.2-1b` | [Groq](https://wow.groq.com/) | `llama-3.2-1b-preview` | | `llama-3.2-3b` | [Groq](https://wow.groq.com/) | `llama-3.2-3b-preview` | +| `llama-3.3-70b` | [Groq](https://wow.groq.com/), [Cerebras](https://cerebras.ai/) | `llama-3.3-70b-versatile`, `llama3.3-70b` | + +> [!NOTE] +> Currently, you must be on OpenAI's [usage tier](https://platform.openai.com/docs/guides/rate-limits) 5 to use `o1` and tier 1+ to use `o1-preview`, `o1-mini`, and `gpt-4o`. @@ -651,8 +656,9 @@ Your name is Pierce and you are a software engineer. ## Planned Features +- Support for structured outputs where available (Just OpenAI as far as I know) +- Support for OSS and self-hosted (Hugging Face, Ollama, Gpt4all, etc.) - Support for batch APIs where available (OpenAI, Anthropic, etc.) -- Support for OSS and self-hosted (Hugging Face, Gpt4all, etc.) - Basic (i.e., customizable & non-opinionated) agent & multi-agent system features - Tools for common application workflows: RAG, prompt management, search, etc. 
- Support for streaming responses diff --git a/l2m2/client/base_llm_client.py b/l2m2/client/base_llm_client.py index ef85993..f411ac1 100644 --- a/l2m2/client/base_llm_client.py +++ b/l2m2/client/base_llm_client.py @@ -687,9 +687,14 @@ async def _generic_openai_spec_call( """Generic call method for providers who follow the OpenAI API spec.""" supports_native_json_mode = "json_mode_arg" in extras + # For o1 and newer, use "developer" messages instead of "system" + system_key = "system" + if provider == "openai" and model_id in ["o1", "o1-preview", "o1-mini"]: + system_key = "developer" + messages = [] if system_prompt is not None: - messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": system_key, "content": system_prompt}) if isinstance(memory, ChatMemory): messages.extend(memory.unpack("role", "content", "user", "assistant")) messages.append({"role": "user", "content": prompt}) diff --git a/l2m2/model_info.py b/l2m2/model_info.py index 0340850..32ac284 100644 --- a/l2m2/model_info.py +++ b/l2m2/model_info.py @@ -157,6 +157,57 @@ class ModelEntry(TypedDict): "extras": {"json_mode_arg": {"response_format": {"type": "json_object"}}}, }, }, + "o1": { + "openai": { + "model_id": "o1", + "params": { + "temperature": { + "default": PROVIDER_DEFAULT, + "max": 1.0, + }, + "max_tokens": { + "custom_key": "max_completion_tokens", + "default": PROVIDER_DEFAULT, + "max": 4096, + }, + }, + "extras": {}, + }, + }, + "o1-preview": { + "openai": { + "model_id": "o1-preview", + "params": { + "temperature": { + "default": PROVIDER_DEFAULT, + "max": 1.0, + }, + "max_tokens": { + "custom_key": "max_completion_tokens", + "default": PROVIDER_DEFAULT, + "max": 4096, + }, + }, + "extras": {}, + }, + }, + "o1-mini": { + "openai": { + "model_id": "o1-mini", + "params": { + "temperature": { + "default": PROVIDER_DEFAULT, + "max": 1.0, + }, + "max_tokens": { + "custom_key": "max_completion_tokens", + "default": PROVIDER_DEFAULT, + "max": 4096, + }, + }, + "extras": {}, + }, + }, "gpt-4-turbo": { "openai": { "model_id": "gpt-4-turbo-2024-04-09", diff --git a/tests/l2m2/client/test_base_llm_client.py b/tests/l2m2/client/test_base_llm_client.py index 46a18a5..5bee6c6 100644 --- a/tests/l2m2/client/test_base_llm_client.py +++ b/tests/l2m2/client/test_base_llm_client.py @@ -73,7 +73,7 @@ async def test_init_with_providers(): "cohere": "test-key-cohere", } assert llm_client.active_providers == {"openai", "cohere"} - assert "gpt-4-turbo" in llm_client.active_models + assert "gpt-4o" in llm_client.active_models assert "command-r" in llm_client.active_models assert "claude-3-opus" not in llm_client.active_models @@ -89,7 +89,7 @@ async def test_init_with_env_providers(): "cohere": "test-key-cohere", } assert llm_client.active_providers == {"openai", "cohere"} - assert "gpt-4-turbo" in llm_client.active_models + assert "gpt-4o" in llm_client.active_models assert "command-r" in llm_client.active_models assert "claude-3-opus" not in llm_client.active_models @@ -111,7 +111,7 @@ async def test_init_with_env_providers_override(): "anthropic": "new-key-anthropic", } assert llm_client.active_providers == {"openai", "cohere", "anthropic"} - assert "gpt-4-turbo" in llm_client.active_models + assert "gpt-4o" in llm_client.active_models assert "command-r" in llm_client.active_models assert "claude-3-opus" in llm_client.active_models @@ -127,7 +127,7 @@ def test_getters(llm_client): assert llm_client.get_active_providers() == {"openai", "cohere"} active_models = llm_client.get_active_models() - assert 
"gpt-4-turbo" in active_models + assert "gpt-4o" in active_models assert "command-r" in active_models assert "claude-3-opus" not in active_models @@ -143,7 +143,7 @@ def test_getters(llm_client): def test_add_provider(llm_client): llm_client.add_provider("openai", "test-key-openai") assert "openai" in llm_client.active_providers - assert "gpt-4-turbo" in llm_client.active_models + assert "gpt-4o" in llm_client.active_models def test_add_provider_invalid(llm_client): @@ -165,7 +165,7 @@ def test_remove_provider(llm_client): assert "openai" not in llm_client.active_providers assert "anthropic" in llm_client.active_providers - assert "gpt-4-turbo" not in llm_client.active_models + assert "gpt-4o" not in llm_client.active_models assert "claude-3-opus" in llm_client.active_models @@ -253,18 +253,30 @@ async def test_call_openai(mock_get_extra_message, mock_llm_post, llm_client): mock_get_extra_message.return_value = "extra message" mock_return_value = {"choices": [{"message": {"content": "response"}}]} mock_llm_post.return_value = mock_return_value - await _generic_test_call(llm_client, "openai", "gpt-4-turbo") + await _generic_test_call(llm_client, "openai", "gpt-4o") -# Need to test gemini 1.0 and 1.5 separately because of different system prompt handling +# Need to test this separately because of the different system prompt handling @pytest.mark.asyncio @patch(LLM_POST_PATH) @patch(GET_EXTRA_MESSAGE_PATH) -async def test_call_google_1_5(mock_get_extra_message, mock_llm_post, llm_client): +async def test_call_openai_o1_or_newer( + mock_get_extra_message, mock_llm_post, llm_client +): + mock_get_extra_message.return_value = "extra message" + mock_return_value = {"choices": [{"message": {"content": "response"}}]} + mock_llm_post.return_value = mock_return_value + await _generic_test_call(llm_client, "openai", "o1") + + +@pytest.mark.asyncio +@patch(LLM_POST_PATH) +@patch(GET_EXTRA_MESSAGE_PATH) +async def test_call_google(mock_get_extra_message, mock_llm_post, llm_client): mock_get_extra_message.return_value = "extra message" mock_return_value = {"candidates": [{"content": {"parts": [{"text": "response"}]}}]} mock_llm_post.return_value = mock_return_value - await _generic_test_call(llm_client, "google", "gemini-1.5-pro") + await _generic_test_call(llm_client, "google", "gemini-2.0-flash") @pytest.mark.asyncio @@ -340,7 +352,7 @@ async def test_call_google_gemini_fails(mock_llm_post, llm_client): @pytest.mark.asyncio async def test_call_valid_model_not_active(llm_client): with pytest.raises(ValueError): - await llm_client.call(prompt="Hello", model="gpt-4-turbo") + await llm_client.call(prompt="Hello", model="gpt-4o") @pytest.mark.asyncio @@ -353,16 +365,14 @@ async def test_call_invalid_model(llm_client): async def test_call_tokens_too_large(llm_client): llm_client.add_provider("openai", "fake-api-key") with pytest.raises(ValueError): - await llm_client.call( - prompt="Hello", model="gpt-4-turbo", max_tokens=float("inf") - ) + await llm_client.call(prompt="Hello", model="gpt-4o", max_tokens=float("inf")) @pytest.mark.asyncio async def test_call_temperature_too_high(llm_client): llm_client.add_provider("openai", "fake-api-key") with pytest.raises(ValueError): - await llm_client.call(prompt="Hello", model="gpt-4-turbo", temperature=3.0) + await llm_client.call(prompt="Hello", model="gpt-4o", temperature=3.0) # -- Tests for call_custom -- # @@ -499,7 +509,7 @@ async def test_chat_memory(mock_call_openai, llm_client_mem_chat): memory.add_user_message("A") memory.add_agent_message("B") - response = 
await llm_client_mem_chat.call(prompt="C", model="gpt-4-turbo") + response = await llm_client_mem_chat.call(prompt="C", model="gpt-4o") assert response == "response" assert memory.unpack("role", "content", "user", "assistant") == [ {"role": "user", "content": "A"}, @@ -542,14 +552,14 @@ async def test_external_memory_system_prompt(mock_call_openai, llm_client_mem_ex memory.set_contents("stuff") - await llm_client_mem_ext_sys.call(prompt="Hello", model="gpt-4-turbo") + await llm_client_mem_ext_sys.call(prompt="Hello", model="gpt-4o") assert mock_call_openai.call_args.kwargs["data"]["messages"] == [ {"role": "system", "content": "stuff"}, {"role": "user", "content": "Hello"}, ] await llm_client_mem_ext_sys.call( - system_prompt="system-123", prompt="Hello", model="gpt-4-turbo" + system_prompt="system-123", prompt="Hello", model="gpt-4o" ) assert mock_call_openai.call_args.kwargs["data"]["messages"] == [ {"role": "system", "content": "system-123\nstuff"}, @@ -568,13 +578,13 @@ async def test_external_memory_user_prompt(mock_call_openai, llm_client_mem_ext_ memory.set_contents("stuff") - await llm_client_mem_ext_usr.call(prompt="Hello", model="gpt-4-turbo") + await llm_client_mem_ext_usr.call(prompt="Hello", model="gpt-4o") assert mock_call_openai.call_args.kwargs["data"]["messages"] == [ {"role": "user", "content": "Hello\nstuff"}, ] await llm_client_mem_ext_usr.call( - system_prompt="system-123", prompt="Hello", model="gpt-4-turbo" + system_prompt="system-123", prompt="Hello", model="gpt-4o" ) assert mock_call_openai.call_args.kwargs["data"]["messages"] == [ {"role": "system", "content": "system-123"}, @@ -627,12 +637,12 @@ async def test_alt_memory(mock_call_openai, llm_client): m2 = ChatMemory() llm_client.load_memory(ChatMemory()) - await llm_client.call(prompt="A", model="gpt-4-turbo", alt_memory=m1) - await llm_client.call(prompt="X", model="gpt-4-turbo", alt_memory=m2) - await llm_client.call(prompt="B", model="gpt-4-turbo", alt_memory=m1) - await llm_client.call(prompt="Y", model="gpt-4-turbo", alt_memory=m2) - await llm_client.call(prompt="C", model="gpt-4-turbo", alt_memory=m1) - await llm_client.call(prompt="Z", model="gpt-4-turbo", alt_memory=m2) + await llm_client.call(prompt="A", model="gpt-4o", alt_memory=m1) + await llm_client.call(prompt="X", model="gpt-4o", alt_memory=m2) + await llm_client.call(prompt="B", model="gpt-4o", alt_memory=m1) + await llm_client.call(prompt="Y", model="gpt-4o", alt_memory=m2) + await llm_client.call(prompt="C", model="gpt-4o", alt_memory=m1) + await llm_client.call(prompt="Z", model="gpt-4o", alt_memory=m2) assert m1.unpack("role", "content", "user", "assistant") == [ {"role": "user", "content": "A"},
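
For readers skimming the README changes above, here is a minimal usage sketch of the newly added o1-series models. It assumes the synchronous `LLMClient` interface from the library's README (not shown in this patch) and a placeholder environment variable for the API key; treat it as an illustration rather than documented usage.

```python
import os

from l2m2.client import LLMClient  # assumed import path, per the README

client = LLMClient()
client.add_provider("openai", os.environ["OPENAI_API_KEY"])

# The new o1-series models are called like any other supported model.
# Per this patch, a system_prompt is forwarded to o1 models as a
# "developer" message rather than a "system" message.
response = client.call(
    model="o1",
    prompt="Summarize the difference between BFS and DFS in two sentences.",
    system_prompt="You are a concise technical assistant.",
)
print(response)
```

Note the usage-tier requirements called out in the README: at the time of this release, `o1` requires OpenAI usage tier 5, while `o1-preview` and `o1-mini` require tier 1+.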
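
The change to `_generic_openai_spec_call` and the `max_completion_tokens` `custom_key` entries in `model_info.py` together alter the request body sent to OpenAI for the o1 series. The standalone sketch below is not library code and its helper names are made up, but the branching mirrors the logic added in this patch.

```python
from typing import Any, Dict, List, Optional

# Mirrors the model list hard-coded in base_llm_client.py for the "developer" role.
O1_MODELS = ["o1", "o1-preview", "o1-mini"]


def build_openai_messages(
    model_id: str, prompt: str, system_prompt: Optional[str] = None
) -> List[Dict[str, str]]:
    # o1-series models take a "developer" message where older models take "system".
    system_key = "developer" if model_id in O1_MODELS else "system"
    messages: List[Dict[str, str]] = []
    if system_prompt is not None:
        messages.append({"role": system_key, "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    return messages


def build_token_param(model_id: str, max_tokens: int) -> Dict[str, Any]:
    # model_info.py maps "max_tokens" onto the newer "max_completion_tokens"
    # request key for the o1 entries via "custom_key".
    key = "max_completion_tokens" if model_id in O1_MODELS else "max_tokens"
    return {key: max_tokens}


# Example payload fragments for o1 vs. gpt-4o:
print(build_openai_messages("o1", "Hello", "Be terse"))
# [{'role': 'developer', 'content': 'Be terse'}, {'role': 'user', 'content': 'Hello'}]
print(build_token_param("o1", 1024))      # {'max_completion_tokens': 1024}
print(build_token_param("gpt-4o", 1024))  # {'max_tokens': 1024}
```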