From ca812cb3dab420e2fa377c30869b869e95b8f210 Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Fri, 29 Dec 2023 11:07:25 +0100
Subject: [PATCH] fix: Replace deprecated `text-davinci-003` model with
 `gpt-3.5-turbo-instruct` model (#6660)

* replace `text-davinci-003` with the `gpt-3.5-turbo-instruct` model

* update expected max token limit (4097 to 4096) and tokenizer
  (`p50k_base` to `cl100k_base`) in tests

* add `gpt-3.5-turbo-instruct` to the models supported by the Azure OpenAI
  invocation layer and update the expected RRF scores in the e2e tests
---
 e2e/pipelines/test_pipeline_topologies.py               | 8 +-------
 e2e/pipelines/test_standard_pipelines.py                | 2 +-
 examples/web_lfqa.py                                    | 2 +-
 examples/web_qa.py                                      | 2 +-
 haystack/nodes/prompt/invocation_layer/azure_open_ai.py | 4 ++--
 haystack/nodes/prompt/invocation_layer/open_ai.py       | 2 +-
 haystack/nodes/sampler/top_p_sampler.py                 | 2 +-
 test/pipelines/test_standard_pipelines.py               | 2 +-
 test/prompt/conftest.py                                 | 4 ++--
 test/prompt/invocation_layer/test_openai.py             | 4 ++--
 test/prompt/test_prompt_node.py                         | 8 ++++----
 test/utils/test_openai_utils.py                         | 6 +++---
 12 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/e2e/pipelines/test_pipeline_topologies.py b/e2e/pipelines/test_pipeline_topologies.py
index 752191d34a..f2d1a1001e 100644
--- a/e2e/pipelines/test_pipeline_topologies.py
+++ b/e2e/pipelines/test_pipeline_topologies.py
@@ -178,13 +178,7 @@ def test_join_with_rrf(docs):
     results = p.run(query=query)
 
     # list of precalculated expected results
-    expected_scores = [
-        0.03278688524590164,
-        0.03200204813108039,
-        0.03200204813108039,
-        0.031009615384615385,
-        0.031009615384615385,
-    ]
+    expected_scores = [1.0, 0.9684979838709676, 0.9684979838709676, 0.9533577533577533, 0.9533577533577533]
     assert all(
         doc.score == pytest.approx(expected_scores[idx], abs=1e-3) for idx, doc in enumerate(results["documents"])
     )
diff --git a/e2e/pipelines/test_standard_pipelines.py b/e2e/pipelines/test_standard_pipelines.py
index f25ddcd13b..3ebc5a4702 100644
--- a/e2e/pipelines/test_standard_pipelines.py
+++ b/e2e/pipelines/test_standard_pipelines.py
@@ -207,7 +207,7 @@ def test_webqa_pipeline():
     search_key = os.environ.get("SERPERDEV_API_KEY")
     openai_key = os.environ.get("OPENAI_API_KEY")
     pn = PromptNode(
-        "text-davinci-003",
+        "gpt-3.5-turbo-instruct",
         api_key=openai_key,
         max_length=256,
         default_prompt_template="question-answering-with-document-scores",
diff --git a/examples/web_lfqa.py b/examples/web_lfqa.py
index ff5dbe15e0..cfdf81c602 100644
--- a/examples/web_lfqa.py
+++ b/examples/web_lfqa.py
@@ -21,7 +21,7 @@
 """
 
 prompt_node = PromptNode(
-    "text-davinci-003", default_prompt_template=PromptTemplate(prompt_text), api_key=openai_key, max_length=256
+    "gpt-3.5-turbo-instruct", default_prompt_template=PromptTemplate(prompt_text), api_key=openai_key, max_length=256
 )
 
 web_retriever = WebRetriever(api_key=search_key, top_search_results=5, mode="preprocessed_documents", top_k=30)
diff --git a/examples/web_qa.py b/examples/web_qa.py
index 352d2d226d..adc7f19dba 100644
--- a/examples/web_qa.py
+++ b/examples/web_qa.py
@@ -12,7 +12,7 @@
     raise ValueError("Please set the OPENAI_API_KEY environment variable")
 
 prompt_node = PromptNode(
-    "text-davinci-003",
+    "gpt-3.5-turbo-instruct",
     api_key=openai_key,
     max_length=256,
     default_prompt_template="question-answering-with-document-scores",
diff --git a/haystack/nodes/prompt/invocation_layer/azure_open_ai.py b/haystack/nodes/prompt/invocation_layer/azure_open_ai.py
index d10dc65463..001d6da8ba 100644
--- a/haystack/nodes/prompt/invocation_layer/azure_open_ai.py
+++ b/haystack/nodes/prompt/invocation_layer/azure_open_ai.py
@@ -19,7 +19,7 @@ def __init__(
         azure_deployment_name: str,
         api_key: str,
         api_version: str = "2022-12-01",
-        model_name_or_path: str = "text-davinci-003",
+        model_name_or_path: str = "gpt-3.5-turbo-instruct",
         max_length: Optional[int] = 100,
         **kwargs,
     ):
@@ -42,7 +42,7 @@ def supports(cls, model_name_or_path: str, **kwargs) -> bool:
         Ensures Azure OpenAI Invocation Layer is selected when `azure_base_url` and `azure_deployment_name` are provided in
         addition to a list of supported models.
         """
-        valid_model = model_name_or_path in ["ada", "babbage", "davinci", "curie"] or any(
+        valid_model = model_name_or_path in ["ada", "babbage", "davinci", "curie", "gpt-3.5-turbo-instruct"] or any(
             m in model_name_or_path for m in ["-ada-", "-babbage-", "-davinci-", "-curie-"]
         )
         return valid_model and has_azure_parameters(**kwargs)
diff --git a/haystack/nodes/prompt/invocation_layer/open_ai.py b/haystack/nodes/prompt/invocation_layer/open_ai.py
index 825da26234..0e26d709f8 100644
--- a/haystack/nodes/prompt/invocation_layer/open_ai.py
+++ b/haystack/nodes/prompt/invocation_layer/open_ai.py
@@ -33,7 +33,7 @@ class OpenAIInvocationLayer(PromptModelInvocationLayer):
     def __init__(
         self,
         api_key: str,
-        model_name_or_path: str = "text-davinci-003",
+        model_name_or_path: str = "gpt-3.5-turbo-instruct",
         max_length: Optional[int] = 100,
         api_base: str = "https://api.openai.com/v1",
         openai_organization: Optional[str] = None,
diff --git a/haystack/nodes/sampler/top_p_sampler.py b/haystack/nodes/sampler/top_p_sampler.py
index b77e448760..60d09d83a9 100644
--- a/haystack/nodes/sampler/top_p_sampler.py
+++ b/haystack/nodes/sampler/top_p_sampler.py
@@ -35,7 +35,7 @@ class TopPSampler(BaseSampler):
 
     ```python
     prompt_node = PromptNode(
-        "text-davinci-003",
+        "gpt-3.5-turbo-instruct",
         api_key=openai_key,
         max_length=256,
         default_prompt_template="question-answering-with-document-scores",
diff --git a/test/pipelines/test_standard_pipelines.py b/test/pipelines/test_standard_pipelines.py
index 2d6523d7a6..d8512c2fbd 100644
--- a/test/pipelines/test_standard_pipelines.py
+++ b/test/pipelines/test_standard_pipelines.py
@@ -79,7 +79,7 @@ def test_webqa_pipeline():
     search_key = os.environ.get("SERPERDEV_API_KEY")
     openai_key = os.environ.get("OPENAI_API_KEY")
     pn = PromptNode(
-        "text-davinci-003",
+        "gpt-3.5-turbo-instruct",
         api_key=openai_key,
         max_length=256,
         default_prompt_template="question-answering-with-document-scores",
diff --git a/test/prompt/conftest.py b/test/prompt/conftest.py
index 9d38e6d0dd..12b850207f 100644
--- a/test/prompt/conftest.py
+++ b/test/prompt/conftest.py
@@ -23,12 +23,12 @@ def prompt_model(request, haystack_azure_conf):
         api_key = os.environ.get("OPENAI_API_KEY", "KEY_NOT_FOUND")
         if api_key is None or api_key == "":
             api_key = "KEY_NOT_FOUND"
-        return PromptModel("text-davinci-003", api_key=api_key)
+        return PromptModel("gpt-3.5-turbo-instruct", api_key=api_key)
     elif request.param == "azure":
         api_key = os.environ.get("AZURE_OPENAI_API_KEY", "KEY_NOT_FOUND")
         if api_key is None or api_key == "":
             api_key = "KEY_NOT_FOUND"
-        return PromptModel("text-davinci-003", api_key=api_key, model_kwargs=haystack_azure_conf)
+        return PromptModel("gpt-3.5-turbo-instruct", api_key=api_key, model_kwargs=haystack_azure_conf)
     else:
         return PromptModel("google/flan-t5-base", devices=["cpu"])
 
diff --git a/test/prompt/invocation_layer/test_openai.py b/test/prompt/invocation_layer/test_openai.py
index 5ae3458788..63a47b31ad 100644
--- a/test/prompt/invocation_layer/test_openai.py
+++ b/test/prompt/invocation_layer/test_openai.py
@@ -53,7 +53,7 @@ def test_openai_token_limit_warning(mock_openai_tokenizer, caplog):
 @pytest.mark.parametrize(
     "model_name,max_tokens_limit",
     [
-        ("text-davinci-003", 4097),
+        ("gpt-3.5-turbo-instruct", 4096),
         ("gpt-3.5-turbo", 4096),
         ("gpt-3.5-turbo-16k", 16384),
         ("gpt-4-32k", 32768),
@@ -76,7 +76,7 @@ def test_openai_token_limit_warning_not_triggered(caplog, mock_openai_tokenizer,
 @pytest.mark.parametrize(
     "model_name,max_tokens_limit",
     [
-        ("text-davinci-003", 4097),
+        ("gpt-3.5-turbo-instruct", 4096),
         ("gpt-3.5-turbo", 4096),
         ("gpt-3.5-turbo-16k", 16384),
         ("gpt-4-32k", 32768),
diff --git a/test/prompt/test_prompt_node.py b/test/prompt/test_prompt_node.py
index 972a04be18..8a5d3459d5 100644
--- a/test/prompt/test_prompt_node.py
+++ b/test/prompt/test_prompt_node.py
@@ -216,7 +216,7 @@ def test_azure_vs_open_ai_invocation_layer_selection():
     node = PromptNode("gpt-4", api_key="some_key", model_kwargs=azure_model_kwargs)
     assert isinstance(node.prompt_model.model_invocation_layer, AzureChatGPTInvocationLayer)
 
-    node = PromptNode("text-davinci-003", api_key="some_key", model_kwargs=azure_model_kwargs)
+    node = PromptNode("gpt-3.5-turbo-instruct", api_key="some_key", model_kwargs=azure_model_kwargs)
     assert isinstance(node.prompt_model.model_invocation_layer, AzureOpenAIInvocationLayer)
 
     node = PromptNode("gpt-4", api_key="some_key")
@@ -224,7 +224,7 @@ def test_azure_vs_open_ai_invocation_layer_selection():
         node.prompt_model.model_invocation_layer, AzureChatGPTInvocationLayer
     )
 
-    node = PromptNode("text-davinci-003", api_key="some_key")
+    node = PromptNode("gpt-3.5-turbo-instruct", api_key="some_key")
     assert isinstance(node.prompt_model.model_invocation_layer, OpenAIInvocationLayer) and not isinstance(
         node.prompt_model.model_invocation_layer, AzureChatGPTInvocationLayer
     )
@@ -850,7 +850,7 @@ def test_complex_pipeline_with_all_features(tmp_path, haystack_openai_config):
             - name: pmodel_openai
               type: PromptModel
               params:
-                model_name_or_path: text-davinci-003
+                model_name_or_path: gpt-3.5-turbo-instruct
                 model_kwargs:
                   temperature: 0.9
                   max_tokens: 64
@@ -1052,7 +1052,7 @@ def test_content_moderation_gpt_3():
     OpenAIInvocationLayer.
     """
     prompt_node = PromptNode(
-        model_name_or_path="text-davinci-003", api_key="key", model_kwargs={"moderate_content": True}
+        model_name_or_path="gpt-3.5-turbo-instruct", api_key="key", model_kwargs={"moderate_content": True}
     )
     with patch("haystack.nodes.prompt.invocation_layer.open_ai.check_openai_policy_violation") as mock_check, patch(
         "haystack.nodes.prompt.invocation_layer.open_ai.openai_request"
diff --git a/test/utils/test_openai_utils.py b/test/utils/test_openai_utils.py
index 7126542f0c..92add5f219 100644
--- a/test/utils/test_openai_utils.py
+++ b/test/utils/test_openai_utils.py
@@ -22,9 +22,9 @@ def test_openai_text_completion_tokenization_details_gpt_default():
 
 @pytest.mark.unit
 def test_openai_text_completion_tokenization_details_gpt_davinci():
-    tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="text-davinci-003")
-    assert tokenizer_name == "p50k_base"
-    assert max_tokens_limit == 4097
+    tokenizer_name, max_tokens_limit = _openai_text_completion_tokenization_details(model_name="gpt-3.5-turbo-instruct")
+    assert tokenizer_name == "cl100k_base"
+    assert max_tokens_limit == 4096
 
 
 @pytest.mark.unit