From df9aa25c1a407565bf1381f6e2916c7cfbd353cd Mon Sep 17 00:00:00 2001 From: Pierce Kelaita Date: Tue, 17 Dec 2024 13:06:19 -0800 Subject: [PATCH] Add o1 models --- CHANGELOG.md | 4 ++ README.md | 16 ++++-- l2m2/client/base_llm_client.py | 7 ++- l2m2/model_info.py | 51 +++++++++++++++++++ tests/l2m2/client/test_base_llm_client.py | 62 +++++++++++++---------- 5 files changed, 108 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f0ead7..262bcd7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,10 @@ _Current version: 0.0.39_ #### Added - Support for [Llama 3.3 70b](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_3/) via [Groq](https://console.groq.com/docs/models) and [Cerebras](https://inference-docs.cerebras.ai/introduction). +- Support for OpenAI's [o1 series](https://openai.com/o1/): `o1`, `o1-preview`, and `o1-mini`. + +> [!NOTE] +> At the time of this release, you must be on OpenAI's [usage tier](https://platform.openai.com/docs/guides/rate-limits) 5 to use `o1` and tier 1+ to use `o1-preview` and `o1-mini`. #### Removed diff --git a/README.md b/README.md index a64abd9..5be63b1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # L2M2: A Simple Python LLM Manager 💬👍 -[![Tests](https://github.com/pkelaita/l2m2/actions/workflows/tests.yml/badge.svg?timestamp=1734052060)](https://github.com/pkelaita/l2m2/actions/workflows/tests.yml) [![codecov](https://codecov.io/github/pkelaita/l2m2/graph/badge.svg?token=UWIB0L9PR8)](https://codecov.io/github/pkelaita/l2m2) [![PyPI version](https://badge.fury.io/py/l2m2.svg?timestamp=1734052060)](https://badge.fury.io/py/l2m2) +[![Tests](https://github.com/pkelaita/l2m2/actions/workflows/tests.yml/badge.svg?timestamp=1734470191)](https://github.com/pkelaita/l2m2/actions/workflows/tests.yml) [![codecov](https://codecov.io/github/pkelaita/l2m2/graph/badge.svg?token=UWIB0L9PR8)](https://codecov.io/github/pkelaita/l2m2) [![PyPI version](https://badge.fury.io/py/l2m2.svg?timestamp=1734470191)](https://badge.fury.io/py/l2m2) **L2M2** ("LLM Manager" → "LLMM" → "L2M2") is a tiny and very simple LLM manager for Python that exposes lots of models through a unified API. This is useful for evaluation, demos, production applications etc. that need to easily be model-agnostic. @@ -8,7 +8,7 @@ ### Features -- 29 supported models (see below) – regularly updated and with more on the way. +- 31 supported models (see below) – regularly updated and with more on the way. - Session chat memory – even across multiple models or with concurrent memory streams. 
- JSON mode - Prompt loading tools @@ -29,6 +29,9 @@ L2M2 currently supports the following models: | --------------------- | ------------------------------------------------------------------ | --------------------------------------------------- | | `gpt-4o` | [OpenAI](https://openai.com/product) | `gpt-4o-2024-11-20` | | `gpt-4o-mini` | [OpenAI](https://openai.com/product) | `gpt-4o-mini-2024-07-18` | +| `o1` | [OpenAI](https://openai.com/product) | `o1` | +| `o1-preview` | [OpenAI](https://openai.com/product) | `o1-preview` | +| `o1-mini` | [OpenAI](https://openai.com/product) | `o1-mini` | | `gpt-4-turbo` | [OpenAI](https://openai.com/product) | `gpt-4-turbo-2024-04-09` | | `gpt-3.5-turbo` | [OpenAI](https://openai.com/product) | `gpt-3.5-turbo-0125` | | `gemini-2.0-flash` | [Google](https://ai.google.dev/) | `gemini-2.0-flash-exp` | @@ -47,15 +50,17 @@ L2M2 currently supports the following models: | `ministral-8b` | [Mistral](https://mistral.ai/) | `ministral-8b-latest` | | `mistral-small` | [Mistral](https://mistral.ai/) | `mistral-small-latest` | | `mixtral-8x7b` | [Groq](https://wow.groq.com/) | `mixtral-8x7b-32768` | -| `gemma-7b` | [Groq](https://wow.groq.com/) | `gemma-7b-it` | | `gemma-2-9b` | [Groq](https://wow.groq.com/) | `gemma2-9b-it` | | `llama-3-8b` | [Groq](https://wow.groq.com/), [Replicate](https://replicate.com/) | `llama3-8b-8192`, `meta/meta-llama-3-8b-instruct` | | `llama-3-70b` | [Groq](https://wow.groq.com/), [Replicate](https://replicate.com/) | `llama3-70b-8192`, `meta/meta-llama-3-70b-instruct` | | `llama-3.1-8b` | [Groq](https://wow.groq.com/), [Cerebras](https://cerebras.ai/) | `llama-3.1-8b-instant`, `llama3.1-8b` | -| `llama-3.1-70b` | [Groq](https://wow.groq.com/), [Cerebras](https://cerebras.ai/) | `llama-3.1-70b-versatile`, `llama3.1-70b` | | `llama-3.1-405b` | [Replicate](https://replicate.com/) | `meta/meta-llama-3.1-405b-instruct` | | `llama-3.2-1b` | [Groq](https://wow.groq.com/) | `llama-3.2-1b-preview` | | `llama-3.2-3b` | [Groq](https://wow.groq.com/) | `llama-3.2-3b-preview` | +| `llama-3.3-70b` | [Groq](https://wow.groq.com/), [Cerebras](https://cerebras.ai/) | `llama-3.3-70b-versatile`, `llama3.3-70b` | + +> [!NOTE] +> Currently, you must be on OpenAI's [usage tier](https://platform.openai.com/docs/guides/rate-limits) 5 to use `o1` and tier 1+ to use `o1-preview`, `o1-mini`, and `gpt-4o`. @@ -651,8 +656,9 @@ Your name is Pierce and you are a software engineer. ## Planned Features +- Support for structured outputs where available (Just OpenAI as far as I know) +- Support for OSS and self-hosted (Hugging Face, Ollama, Gpt4all, etc.) - Support for batch APIs where available (OpenAI, Anthropic, etc.) -- Support for OSS and self-hosted (Hugging Face, Gpt4all, etc.) - Basic (i.e., customizable & non-opinionated) agent & multi-agent system features - Tools for common application workflows: RAG, prompt management, search, etc. 
- Support for streaming responses diff --git a/l2m2/client/base_llm_client.py b/l2m2/client/base_llm_client.py index ef85993..f411ac1 100644 --- a/l2m2/client/base_llm_client.py +++ b/l2m2/client/base_llm_client.py @@ -687,9 +687,14 @@ async def _generic_openai_spec_call( """Generic call method for providers who follow the OpenAI API spec.""" supports_native_json_mode = "json_mode_arg" in extras + # For o1 and newer, use "developer" messages instead of "system" + system_key = "system" + if provider == "openai" and model_id in ["o1", "o1-preview", "o1-mini"]: + system_key = "developer" + messages = [] if system_prompt is not None: - messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": system_key, "content": system_prompt}) if isinstance(memory, ChatMemory): messages.extend(memory.unpack("role", "content", "user", "assistant")) messages.append({"role": "user", "content": prompt}) diff --git a/l2m2/model_info.py b/l2m2/model_info.py index 0340850..32ac284 100644 --- a/l2m2/model_info.py +++ b/l2m2/model_info.py @@ -157,6 +157,57 @@ class ModelEntry(TypedDict): "extras": {"json_mode_arg": {"response_format": {"type": "json_object"}}}, }, }, + "o1": { + "openai": { + "model_id": "o1", + "params": { + "temperature": { + "default": PROVIDER_DEFAULT, + "max": 1.0, + }, + "max_tokens": { + "custom_key": "max_completion_tokens", + "default": PROVIDER_DEFAULT, + "max": 4096, + }, + }, + "extras": {}, + }, + }, + "o1-preview": { + "openai": { + "model_id": "o1-preview", + "params": { + "temperature": { + "default": PROVIDER_DEFAULT, + "max": 1.0, + }, + "max_tokens": { + "custom_key": "max_completion_tokens", + "default": PROVIDER_DEFAULT, + "max": 4096, + }, + }, + "extras": {}, + }, + }, + "o1-mini": { + "openai": { + "model_id": "o1-mini", + "params": { + "temperature": { + "default": PROVIDER_DEFAULT, + "max": 1.0, + }, + "max_tokens": { + "custom_key": "max_completion_tokens", + "default": PROVIDER_DEFAULT, + "max": 4096, + }, + }, + "extras": {}, + }, + }, "gpt-4-turbo": { "openai": { "model_id": "gpt-4-turbo-2024-04-09", diff --git a/tests/l2m2/client/test_base_llm_client.py b/tests/l2m2/client/test_base_llm_client.py index 46a18a5..5bee6c6 100644 --- a/tests/l2m2/client/test_base_llm_client.py +++ b/tests/l2m2/client/test_base_llm_client.py @@ -73,7 +73,7 @@ async def test_init_with_providers(): "cohere": "test-key-cohere", } assert llm_client.active_providers == {"openai", "cohere"} - assert "gpt-4-turbo" in llm_client.active_models + assert "gpt-4o" in llm_client.active_models assert "command-r" in llm_client.active_models assert "claude-3-opus" not in llm_client.active_models @@ -89,7 +89,7 @@ async def test_init_with_env_providers(): "cohere": "test-key-cohere", } assert llm_client.active_providers == {"openai", "cohere"} - assert "gpt-4-turbo" in llm_client.active_models + assert "gpt-4o" in llm_client.active_models assert "command-r" in llm_client.active_models assert "claude-3-opus" not in llm_client.active_models @@ -111,7 +111,7 @@ async def test_init_with_env_providers_override(): "anthropic": "new-key-anthropic", } assert llm_client.active_providers == {"openai", "cohere", "anthropic"} - assert "gpt-4-turbo" in llm_client.active_models + assert "gpt-4o" in llm_client.active_models assert "command-r" in llm_client.active_models assert "claude-3-opus" in llm_client.active_models @@ -127,7 +127,7 @@ def test_getters(llm_client): assert llm_client.get_active_providers() == {"openai", "cohere"} active_models = llm_client.get_active_models() - assert 
"gpt-4-turbo" in active_models + assert "gpt-4o" in active_models assert "command-r" in active_models assert "claude-3-opus" not in active_models @@ -143,7 +143,7 @@ def test_getters(llm_client): def test_add_provider(llm_client): llm_client.add_provider("openai", "test-key-openai") assert "openai" in llm_client.active_providers - assert "gpt-4-turbo" in llm_client.active_models + assert "gpt-4o" in llm_client.active_models def test_add_provider_invalid(llm_client): @@ -165,7 +165,7 @@ def test_remove_provider(llm_client): assert "openai" not in llm_client.active_providers assert "anthropic" in llm_client.active_providers - assert "gpt-4-turbo" not in llm_client.active_models + assert "gpt-4o" not in llm_client.active_models assert "claude-3-opus" in llm_client.active_models @@ -253,18 +253,30 @@ async def test_call_openai(mock_get_extra_message, mock_llm_post, llm_client): mock_get_extra_message.return_value = "extra message" mock_return_value = {"choices": [{"message": {"content": "response"}}]} mock_llm_post.return_value = mock_return_value - await _generic_test_call(llm_client, "openai", "gpt-4-turbo") + await _generic_test_call(llm_client, "openai", "gpt-4o") -# Need to test gemini 1.0 and 1.5 separately because of different system prompt handling +# Need to test this separately because of the different system prompt handling @pytest.mark.asyncio @patch(LLM_POST_PATH) @patch(GET_EXTRA_MESSAGE_PATH) -async def test_call_google_1_5(mock_get_extra_message, mock_llm_post, llm_client): +async def test_call_openai_o1_or_newer( + mock_get_extra_message, mock_llm_post, llm_client +): + mock_get_extra_message.return_value = "extra message" + mock_return_value = {"choices": [{"message": {"content": "response"}}]} + mock_llm_post.return_value = mock_return_value + await _generic_test_call(llm_client, "openai", "o1") + + +@pytest.mark.asyncio +@patch(LLM_POST_PATH) +@patch(GET_EXTRA_MESSAGE_PATH) +async def test_call_google(mock_get_extra_message, mock_llm_post, llm_client): mock_get_extra_message.return_value = "extra message" mock_return_value = {"candidates": [{"content": {"parts": [{"text": "response"}]}}]} mock_llm_post.return_value = mock_return_value - await _generic_test_call(llm_client, "google", "gemini-1.5-pro") + await _generic_test_call(llm_client, "google", "gemini-2.0-flash") @pytest.mark.asyncio @@ -340,7 +352,7 @@ async def test_call_google_gemini_fails(mock_llm_post, llm_client): @pytest.mark.asyncio async def test_call_valid_model_not_active(llm_client): with pytest.raises(ValueError): - await llm_client.call(prompt="Hello", model="gpt-4-turbo") + await llm_client.call(prompt="Hello", model="gpt-4o") @pytest.mark.asyncio @@ -353,16 +365,14 @@ async def test_call_invalid_model(llm_client): async def test_call_tokens_too_large(llm_client): llm_client.add_provider("openai", "fake-api-key") with pytest.raises(ValueError): - await llm_client.call( - prompt="Hello", model="gpt-4-turbo", max_tokens=float("inf") - ) + await llm_client.call(prompt="Hello", model="gpt-4o", max_tokens=float("inf")) @pytest.mark.asyncio async def test_call_temperature_too_high(llm_client): llm_client.add_provider("openai", "fake-api-key") with pytest.raises(ValueError): - await llm_client.call(prompt="Hello", model="gpt-4-turbo", temperature=3.0) + await llm_client.call(prompt="Hello", model="gpt-4o", temperature=3.0) # -- Tests for call_custom -- # @@ -499,7 +509,7 @@ async def test_chat_memory(mock_call_openai, llm_client_mem_chat): memory.add_user_message("A") memory.add_agent_message("B") - response = 
await llm_client_mem_chat.call(prompt="C", model="gpt-4-turbo") + response = await llm_client_mem_chat.call(prompt="C", model="gpt-4o") assert response == "response" assert memory.unpack("role", "content", "user", "assistant") == [ {"role": "user", "content": "A"}, @@ -542,14 +552,14 @@ async def test_external_memory_system_prompt(mock_call_openai, llm_client_mem_ex memory.set_contents("stuff") - await llm_client_mem_ext_sys.call(prompt="Hello", model="gpt-4-turbo") + await llm_client_mem_ext_sys.call(prompt="Hello", model="gpt-4o") assert mock_call_openai.call_args.kwargs["data"]["messages"] == [ {"role": "system", "content": "stuff"}, {"role": "user", "content": "Hello"}, ] await llm_client_mem_ext_sys.call( - system_prompt="system-123", prompt="Hello", model="gpt-4-turbo" + system_prompt="system-123", prompt="Hello", model="gpt-4o" ) assert mock_call_openai.call_args.kwargs["data"]["messages"] == [ {"role": "system", "content": "system-123\nstuff"}, @@ -568,13 +578,13 @@ async def test_external_memory_user_prompt(mock_call_openai, llm_client_mem_ext_ memory.set_contents("stuff") - await llm_client_mem_ext_usr.call(prompt="Hello", model="gpt-4-turbo") + await llm_client_mem_ext_usr.call(prompt="Hello", model="gpt-4o") assert mock_call_openai.call_args.kwargs["data"]["messages"] == [ {"role": "user", "content": "Hello\nstuff"}, ] await llm_client_mem_ext_usr.call( - system_prompt="system-123", prompt="Hello", model="gpt-4-turbo" + system_prompt="system-123", prompt="Hello", model="gpt-4o" ) assert mock_call_openai.call_args.kwargs["data"]["messages"] == [ {"role": "system", "content": "system-123"}, @@ -627,12 +637,12 @@ async def test_alt_memory(mock_call_openai, llm_client): m2 = ChatMemory() llm_client.load_memory(ChatMemory()) - await llm_client.call(prompt="A", model="gpt-4-turbo", alt_memory=m1) - await llm_client.call(prompt="X", model="gpt-4-turbo", alt_memory=m2) - await llm_client.call(prompt="B", model="gpt-4-turbo", alt_memory=m1) - await llm_client.call(prompt="Y", model="gpt-4-turbo", alt_memory=m2) - await llm_client.call(prompt="C", model="gpt-4-turbo", alt_memory=m1) - await llm_client.call(prompt="Z", model="gpt-4-turbo", alt_memory=m2) + await llm_client.call(prompt="A", model="gpt-4o", alt_memory=m1) + await llm_client.call(prompt="X", model="gpt-4o", alt_memory=m2) + await llm_client.call(prompt="B", model="gpt-4o", alt_memory=m1) + await llm_client.call(prompt="Y", model="gpt-4o", alt_memory=m2) + await llm_client.call(prompt="C", model="gpt-4o", alt_memory=m1) + await llm_client.call(prompt="Z", model="gpt-4o", alt_memory=m2) assert m1.unpack("role", "content", "user", "assistant") == [ {"role": "user", "content": "A"},
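
For readers skimming the README changes above, here is a minimal usage sketch of the newly added o1-series models. It assumes the synchronous `LLMClient` interface from the library's README (not shown in this patch) and a placeholder environment variable for the API key; treat it as an illustration rather than documented usage.

```python
import os

from l2m2.client import LLMClient  # assumed import path, per the README

client = LLMClient()
client.add_provider("openai", os.environ["OPENAI_API_KEY"])

# The new o1-series models are called like any other supported model.
# Per this patch, a system_prompt is forwarded to o1 models as a
# "developer" message rather than a "system" message.
response = client.call(
    model="o1",
    prompt="Summarize the difference between BFS and DFS in two sentences.",
    system_prompt="You are a concise technical assistant.",
)
print(response)
```

Note the usage-tier requirements called out in the README: at the time of this release, `o1` requires OpenAI usage tier 5, while `o1-preview` and `o1-mini` require tier 1+.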
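
The change to `_generic_openai_spec_call` and the `max_completion_tokens` `custom_key` entries in `model_info.py` together alter the request body sent to OpenAI for the o1 series. The standalone sketch below is not library code and its helper names are made up, but the branching mirrors the logic added in this patch.

```python
from typing import Any, Dict, List, Optional

# Mirrors the model list hard-coded in base_llm_client.py for the "developer" role.
O1_MODELS = ["o1", "o1-preview", "o1-mini"]


def build_openai_messages(
    model_id: str, prompt: str, system_prompt: Optional[str] = None
) -> List[Dict[str, str]]:
    # o1-series models take a "developer" message where older models take "system".
    system_key = "developer" if model_id in O1_MODELS else "system"
    messages: List[Dict[str, str]] = []
    if system_prompt is not None:
        messages.append({"role": system_key, "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    return messages


def build_token_param(model_id: str, max_tokens: int) -> Dict[str, Any]:
    # model_info.py maps "max_tokens" onto the newer "max_completion_tokens"
    # request key for the o1 entries via "custom_key".
    key = "max_completion_tokens" if model_id in O1_MODELS else "max_tokens"
    return {key: max_tokens}


# Example payload fragments for o1 vs. gpt-4o:
print(build_openai_messages("o1", "Hello", "Be terse"))
# [{'role': 'developer', 'content': 'Be terse'}, {'role': 'user', 'content': 'Hello'}]
print(build_token_param("o1", 1024))      # {'max_completion_tokens': 1024}
print(build_token_param("gpt-4o", 1024))  # {'max_tokens': 1024}
```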