diff --git a/requirements.txt b/requirements.txt index e99cb69..dcf4333 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ spacy==3.7.4 tiktoken==0.4.0 psycopg2-binary==2.9.7 # you can also install from source if it works pglast==5.3 -FlagEmbedding==1.2.5 \ No newline at end of file +FlagEmbedding==1.2.5 +litellm==1.34.34 \ No newline at end of file diff --git a/src/suql/prompt_continuation.py b/src/suql/prompt_continuation.py index 6e204e1..2fa5031 100644 --- a/src/suql/prompt_continuation.py +++ b/src/suql/prompt_continuation.py @@ -7,9 +7,6 @@ from functools import partial from typing import List -import openai -from openai import OpenAI - import os import time import traceback @@ -19,6 +16,8 @@ from jinja2 import Environment, FileSystemLoader, select_autoescape from suql.utils import num_tokens_from_string +from litellm import completion, completion_cost + logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -41,55 +40,17 @@ mongo_client = pymongo.MongoClient("localhost", 27017) prompt_cache_db = mongo_client["open_ai_prompts"]["caches"] -# inference_cost_per_1000_tokens = {'ada': 0.0004, 'babbage': 0.0005, 'curie': 0.002, 'davinci': 0.02, 'turbo': 0.003, 'gpt-4': 0.03} # for Azure -inference_input_cost_per_1000_tokens = { - "gpt-4": 0.03, - "gpt-3.5-turbo-0613": 0.0010, - "gpt-3.5-turbo-1106": 0.0010, - "gpt-4-1106-preview": 0.01, -} # for OpenAI -inference_output_cost_per_1000_tokens = { - "gpt-4": 0.06, - "gpt-3.5-turbo-0613": 0.0010, - "gpt-3.5-turbo-1106": 0.0020, - "gpt-4-1106-preview": 0.03, -} # for OpenAI -total_cost = 0 # in USD - +total_cost = 0 # in USD def get_total_cost(): global total_cost return total_cost -def _model_name_to_cost(model_name: str) -> float: - if ( - model_name in inference_input_cost_per_1000_tokens - and model_name in inference_output_cost_per_1000_tokens - ): - return ( - inference_input_cost_per_1000_tokens[model_name], - inference_output_cost_per_1000_tokens[model_name], - ) - raise ValueError("Did not recognize GPT model name %s" % model_name) - - -def openai_chat_completion_with_backoff(**kwargs): - client = OpenAI() - # # uncomment if using Azure OpenAI - openai.api_type == "open_ai" - # openai.api_type = "azure" - # openai.api_base = "https://ovalopenairesource.openai.azure.com/" - # openai.api_version = "2023-05-15" +def chat_completion_with_backoff(**kwargs): global total_cost - ret = client.chat.completions.create(**kwargs) - num_prompt_tokens = ret.usage.prompt_tokens - num_completion_tokens = ret.usage.completion_tokens - prompt_cost, completion_cost = _model_name_to_cost(kwargs["model"]) - total_cost += ( - num_prompt_tokens / 1000 * prompt_cost - + num_completion_tokens / 1000 * completion_cost - ) # TODO: update this + ret = completion(**kwargs) + total_cost += completion_cost(ret) return ret.choices[0].message.content @@ -136,28 +97,19 @@ def _generate( "presence_penalty": presence_penalty, "stop": stop_tokens, } - if openai.api_type == "azure": - kwargs.update({"engine": engine}) - else: - engine_model_map = { - "gpt-4": "gpt-4", - "gpt-35-turbo": "gpt-3.5-turbo-1106", - "gpt-3.5-turbo": "gpt-3.5-turbo-1106", - "gpt-4-turbo": "gpt-4-1106-preview", + engine_model_map = { + "gpt-4": "gpt-4", + "gpt-35-turbo": "gpt-3.5-turbo-1106", + "gpt-3.5-turbo": "gpt-3.5-turbo-1106", + "gpt-4-turbo": "gpt-4-1106-preview", + } + kwargs.update( + { + "model": engine_model_map[engine] if engine in engine_model_map else engine } - # https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models - # https://platform.openai.com/docs/models/model-endpoint-compatibility - kwargs.update( - { - "model": ( - engine_model_map[engine] - if engine in engine_model_map - else engine - ) - } - ) - - generation_output = openai_chat_completion_with_backoff(**kwargs) + ) + + generation_output = chat_completion_with_backoff(**kwargs) generation_output = no_line_break_start + generation_output logger.info("LLM output = %s", generation_output)