migrate to litellm

stanford-oval · Apr 8, 2024 · f4cff53
1 parent a741184
commit f4cff53
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 67 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -10,4 +10,5 @@ spacy==3.7.4
 tiktoken==0.4.0
 psycopg2-binary==2.9.7 # you can also install from source if it works
 pglast==5.3
-FlagEmbedding==1.2.5
+FlagEmbedding==1.2.5
+litellm==1.34.34
diff --git a/src/suql/prompt_continuation.py b/src/suql/prompt_continuation.py
@@ -7,9 +7,6 @@
 from functools import partial
 from typing import List
 
-import openai
-from openai import OpenAI
-
 import os
 import time
 import traceback
@@ -19,6 +16,8 @@
 from jinja2 import Environment, FileSystemLoader, select_autoescape
 
 from suql.utils import num_tokens_from_string
+from litellm import completion, completion_cost
+
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -41,55 +40,17 @@
     mongo_client = pymongo.MongoClient("localhost", 27017)
     prompt_cache_db = mongo_client["open_ai_prompts"]["caches"]
 
-# inference_cost_per_1000_tokens = {'ada': 0.0004, 'babbage': 0.0005, 'curie': 0.002, 'davinci': 0.02, 'turbo': 0.003, 'gpt-4': 0.03} # for Azure
-inference_input_cost_per_1000_tokens = {
-    "gpt-4": 0.03,
-    "gpt-3.5-turbo-0613": 0.0010,
-    "gpt-3.5-turbo-1106": 0.0010,
-    "gpt-4-1106-preview": 0.01,
-}  # for OpenAI
-inference_output_cost_per_1000_tokens = {
-    "gpt-4": 0.06,
-    "gpt-3.5-turbo-0613": 0.0010,
-    "gpt-3.5-turbo-1106": 0.0020,
-    "gpt-4-1106-preview": 0.03,
-}  # for OpenAI
-total_cost = 0  # in USD
-
 
+total_cost = 0  # in USD
 def get_total_cost():
     global total_cost
     return total_cost
 
 
-def _model_name_to_cost(model_name: str) -> float:
-    if (
-        model_name in inference_input_cost_per_1000_tokens
-        and model_name in inference_output_cost_per_1000_tokens
-    ):
-        return (
-            inference_input_cost_per_1000_tokens[model_name],
-            inference_output_cost_per_1000_tokens[model_name],
-        )
-    raise ValueError("Did not recognize GPT model name %s" % model_name)
-
-
-def openai_chat_completion_with_backoff(**kwargs):
-    client = OpenAI()
-    # # uncomment if using Azure OpenAI
-    openai.api_type == "open_ai"
-    # openai.api_type = "azure"
-    # openai.api_base = "https://ovalopenairesource.openai.azure.com/"
-    # openai.api_version = "2023-05-15"
+def chat_completion_with_backoff(**kwargs):
     global total_cost
-    ret = client.chat.completions.create(**kwargs)
-    num_prompt_tokens = ret.usage.prompt_tokens
-    num_completion_tokens = ret.usage.completion_tokens
-    prompt_cost, completion_cost = _model_name_to_cost(kwargs["model"])
-    total_cost += (
-        num_prompt_tokens / 1000 * prompt_cost
-        + num_completion_tokens / 1000 * completion_cost
-    )  # TODO: update this
+    ret = completion(**kwargs)
+    total_cost += completion_cost(ret)
     return ret.choices[0].message.content
 
 
@@ -136,28 +97,19 @@ def _generate(
             "presence_penalty": presence_penalty,
             "stop": stop_tokens,
         }
-        if openai.api_type == "azure":
-            kwargs.update({"engine": engine})
-        else:
-            engine_model_map = {
-                "gpt-4": "gpt-4",
-                "gpt-35-turbo": "gpt-3.5-turbo-1106",
-                "gpt-3.5-turbo": "gpt-3.5-turbo-1106",
-                "gpt-4-turbo": "gpt-4-1106-preview",
+        engine_model_map = {
+            "gpt-4": "gpt-4",
+            "gpt-35-turbo": "gpt-3.5-turbo-1106",
+            "gpt-3.5-turbo": "gpt-3.5-turbo-1106",
+            "gpt-4-turbo": "gpt-4-1106-preview",
+        }
+        kwargs.update(
+            {
+                "model": engine_model_map[engine] if engine in engine_model_map else engine
             }
-            # https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
-            # https://platform.openai.com/docs/models/model-endpoint-compatibility
-            kwargs.update(
-                {
-                    "model": (
-                        engine_model_map[engine]
-                        if engine in engine_model_map
-                        else engine
-                    )
-                }
-            )
-
-        generation_output = openai_chat_completion_with_backoff(**kwargs)
+        )
+
+        generation_output = chat_completion_with_backoff(**kwargs)
         generation_output = no_line_break_start + generation_output
         logger.info("LLM output = %s", generation_output)