Skip to content

Commit

Permalink
DH5526/fixing_the_malformed_llm_output (#420)
Browse files Browse the repository at this point in the history
  • Loading branch information
MohammadrezaPourreza authored Mar 5, 2024
1 parent faf07de commit 4190b4d
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 42 deletions.
3 changes: 2 additions & 1 deletion dataherald/sql_generator/dataherald_finetuning_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from dataherald.sql_generator import EngineTimeOutORItemLimitError, SQLGenerator
from dataherald.types import FineTuningStatus, Prompt, SQLGeneration
from dataherald.utils.agent_prompts import (
ERROR_PARSING_MESSAGE,
FINETUNING_AGENT_PREFIX,
FINETUNING_AGENT_PREFIX_FINETUNING_ONLY,
FINETUNING_AGENT_SUFFIX,
Expand Down Expand Up @@ -546,7 +547,7 @@ def generate_response(
max_execution_time=int(os.environ.get("DH_ENGINE_TIMEOUT", 150)),
)
agent_executor.return_intermediate_steps = True
agent_executor.handle_parsing_errors = True
agent_executor.handle_parsing_errors = ERROR_PARSING_MESSAGE
with get_openai_callback() as cb:
try:
result = agent_executor.invoke({"input": user_prompt.text})
Expand Down
3 changes: 2 additions & 1 deletion dataherald/sql_generator/dataherald_sqlagent.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from dataherald.types import Prompt, SQLGeneration
from dataherald.utils.agent_prompts import (
AGENT_PREFIX,
ERROR_PARSING_MESSAGE,
FORMAT_INSTRUCTIONS,
PLAN_BASE,
PLAN_WITH_FEWSHOT_EXAMPLES,
Expand Down Expand Up @@ -692,7 +693,7 @@ def generate_response(
max_execution_time=int(os.environ.get("DH_ENGINE_TIMEOUT", 150)),
)
agent_executor.return_intermediate_steps = True
agent_executor.handle_parsing_errors = True
agent_executor.handle_parsing_errors = ERROR_PARSING_MESSAGE
with get_openai_callback() as cb:
try:
result = agent_executor.invoke({"input": user_prompt.text})
Expand Down
91 changes: 56 additions & 35 deletions dataherald/utils/agent_prompts.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,81 @@
AGENT_PREFIX = """You are an agent designed to interact with a SQL database to find a correct SQL query for the given question.
Given an input question, generate a syntactically correct {dialect} query, execute the query to make sure it is correct, and return the SQL query in ```sql and ``` format.
You have access to tools for interacting with the database.
Given an input question, generate a syntactically correct {dialect} query, execute the query to make sure it is correct, and return the SQL query between ```sql and ``` tags.
You have access to tools for interacting with the database. You can use tools using Action: <tool_name> and Action Input: <tool_input> format.
Only use the below tools. Only use the information returned by the below tools to construct your final answer.
#
Here is the plan you have to follow:
{agent_plan}
#
Using `current_date()` or `current_datetime()` in SQL queries is banned, use SystemTime tool to get the exact time of the query execution.
If the question does not seem related to the database, return an empty string.
If the there is a very similar question among the fewshot examples, modify the SQL query to fit the given question and return the SQL query.
If there is a very similar question among the fewshot examples, directly use the SQL query from the example and modify it to fit the given question and execute the query to make sure it is correct.
The SQL query MUST have in-line comments to explain what each clause does.
""" # noqa: E501

PLAN_WITH_FEWSHOT_EXAMPLES_AND_INSTRUCTIONS = """1) Use the FewshotExamplesRetriever tool to retrieve a set of possibly relevant tables and columns and the SQL syntax to use.
PLAN_WITH_FEWSHOT_EXAMPLES_AND_INSTRUCTIONS = """1) Use the FewshotExamplesRetriever tool to retrieve samples of Question/SQL pairs that are similar to the given question, if there is a similar question among the examples, use the SQL query from the example and modify it to fit the given question.
2) Use the GetAdminInstructions tool to retrieve the DB admin instructions before calling other tools, to make sure you follow the instructions when writing the SQL query.
3) Use the DbTablesWithRelevanceScores tool to find other possibly relevant tables.
3) Use the DbTablesWithRelevanceScores tool to find relevant tables.
4) Use the DbRelevantTablesSchema tool to obtain the schema of possibly relevant tables to identify the possibly relevant columns.
5) Use the DbRelevantColumnsInfo tool to gather more information about the possibly relevant columns, filtering them to find the relevant ones.
6) [Optional based on the question] Use the SystemTime tool if the question has any mentions of time or dates.
7) [Optional based on the question] Always use the DbColumnEntityChecker tool to make sure that relevant columns have the cell-values.
8) Write a {dialect} query and use SqlDbQuery tool the Execute the SQL query on the database to check if the results are correct.
7) For string columns, always use the DbColumnEntityChecker tool to make sure the entity values are present in the relevant columns.
8) Write a {dialect} query and always use the SqlDbQuery tool to execute the SQL query on the database to check if the results are correct.
#
Some tips to always keep in mind:
tip1) For complex questions request for more examples of Question/SQL pairs.
tip2) The maximum number of Question/SQL pairs you can request is {max_examples}.
tip3) If the SQL query resulted in errors or not correct results, rewrite the SQL query and try again.
tip4) Always call the get_admin_instructions tool before generating the SQL query, it will give you rules to follow when writing the SQL query.
tip5) The Question/SQL pairs are labelled as correct pairs, so you can use them to learn how to construct the SQL query.
tip6) If SQL results has None or NULL values, handle them by adding a WHERE clause to filter them out.
tip1) The maximum number of Question/SQL pairs you can request is {max_examples}.
tip2) After executing the query, if the SQL query resulted in errors or not correct results, rewrite the SQL query and try again.
tip3) Always call the GetAdminInstructions tool before generating the SQL query, it will give you rules to follow when writing the SQL query.
tip4) The Question/SQL pairs are labelled as correct pairs, so you can use them to answer the question and execute the query to make sure it is correct.
tip5) If SQL results has None or NULL values, handle them by adding a WHERE clause to filter them out.
tip6) The existence of the string values in the columns should always be checked using the DbColumnEntityChecker tool.
tip7) You should always execute the SQL query by calling the SqlDbQuery tool to make sure the results are correct.
""" # noqa: E501

PLAN_WITH_INSTRUCTIONS = """1) Use the DbTablesWithRelevanceScores tool to find the a set of possibly relevant tables.
PLAN_WITH_INSTRUCTIONS = """1) Use the DbTablesWithRelevanceScores tool to find relevant tables.
2) Use the GetAdminInstructions tool to retrieve the DB admin instructions before calling other tools, to make sure you follow the instructions when writing the SQL query.
3) Use the DbRelevantTablesSchema tool to obtain the schema of possibly relevant tables to identify the possibly relevant columns.
4) Use the DbRelevantColumnsInfo tool to gather more information about the possibly relevant columns, filtering them to find the relevant ones.
5) [Optional based on the question] Use the SystemTime tool if the question has any mentions of time or dates.
6) [Optional based on the question] Always use the DbColumnEntityChecker tool to make sure that relevant columns have the cell-values.
7) Write a {dialect} query and use SqlDbQuery tool the Execute the SQL query on the database to check if the results are correct.
6) For string columns, always use the DbColumnEntityChecker tool to make sure the entity values are present in the relevant columns.
7) Write a {dialect} query and always use the SqlDbQuery tool to execute the SQL query on the database to check if the results are correct.
#
Some tips to always keep in mind:
tip1) If the SQL query resulted in errors or not correct results, rewrite the SQL query and try again.
tip2) Always call the get_admin_instructions tool before generating the SQL query, it will give you rules to follow when writing the SQL query.
tip1) After executing the query, if the SQL query resulted in errors or not correct results, rewrite the SQL query and try again.
tip2) Always call the GetAdminInstructions tool before generating the SQL query, it will give you rules to follow when writing the SQL query.
tip3) If SQL results has None or NULL values, handle them by adding a WHERE clause to filter them out.
tip4) The existence of the string values in the columns should always be checked using the DbColumnEntityChecker tool.
tip5) You should always execute the SQL query by calling the SqlDbQuery tool to make sure the results are correct.
""" # noqa: E501

PLAN_WITH_FEWSHOT_EXAMPLES = """1) Use the FewshotExamplesRetriever tool to retrieve a set of possibly relevant tables and columns and the SQL syntax to use.
2) Use the DbTablesWithRelevanceScores tool to find other possibly relevant tables.
PLAN_WITH_FEWSHOT_EXAMPLES = """1) Use the FewshotExamplesRetriever tool to retrieve samples of Question/SQL pairs that are similar to the given question, if there is a similar question among the examples, use the SQL query from the example and modify it to fit the given question.
2) Use the DbTablesWithRelevanceScores tool to find relevant tables.
3) Use the DbRelevantTablesSchema tool to obtain the schema of possibly relevant tables to identify the possibly relevant columns.
4) Use the DbRelevantColumnsInfo tool to gather more information about the possibly relevant columns, filtering them to find the relevant ones.
5) [Optional based on the question] Use the SystemTime tool if the question has any mentions of time or dates.
6) [Optional based on the question] Always use the DbColumnEntityChecker tool to make sure that relevant columns have the cell-values.
7) Write a {dialect} query and use SqlDbQuery tool the Execute the SQL query on the database to check if the results are correct.
6) For string columns, always use the DbColumnEntityChecker tool to make sure the entity values are present in the relevant columns.
7) Write a {dialect} query and always use the SqlDbQuery tool to execute the SQL query on the database to check if the results are correct.
#
Some tips to always keep in mind:
tip1) For complex questions request for more examples of Question/SQL pairs.
tip2) The maximum number of Question/SQL pairs you can request is {max_examples}.
tip3) If the SQL query resulted in errors or not correct results, rewrite the SQL query and try again.
tip4) The Question/SQL pairs are labelled as correct pairs, so you can use them to learn how to construct the SQL query.
tip5) The Question/SQL pairs are labelled as correct pairs, so you can use them to learn how to construct the SQL query.
tip6) If SQL results has None or NULL values, handle them by adding a WHERE clause to filter them out.
tip1) The maximum number of Question/SQL pairs you can request is {max_examples}.
tip2) After executing the query, if the SQL query resulted in errors or not correct results, rewrite the SQL query and try again.
tip3) The Question/SQL pairs are labelled as correct pairs, so you can use them to answer the question and execute the query to make sure it is correct.
tip4) If SQL results has None or NULL values, handle them by adding a WHERE clause to filter them out.
tip5) The existence of the string values in the columns should always be checked using the DbColumnEntityChecker tool.
tip6) You should always execute the SQL query by calling the SqlDbQuery tool to make sure the results are correct.
""" # noqa: E501

PLAN_BASE = """1) Use the DbTablesWithRelevanceScores tool to find the a set of possibly relevant tables.
PLAN_BASE = """1) Use the DbTablesWithRelevanceScores tool to find relevant tables.
2) Use the DbRelevantTablesSchema tool to obtain the schema of possibly relevant tables to identify the possibly relevant columns.
3) Use the DbRelevantColumnsInfo tool to gather more information about the possibly relevant columns, filtering them to find the relevant ones.
4) [Optional based on the question] Use the SystemTime tool if the question has any mentions of time or dates.
5) [Optional based on the question] Always use the DbColumnEntityChecker tool to make sure that relevant columns have the cell-values.
6) Write a {dialect} query and use SqlDbQuery tool the Execute the SQL query on the database to check if the results are correct.
5) For string columns, always use the DbColumnEntityChecker tool to make sure the entity values are present in the relevant columns.
6) Write a {dialect} query and always use the SqlDbQuery tool to execute the SQL query on the database to check if the results are correct.
#
Some tips to always keep in mind:
tip1) If the SQL query resulted in errors or not correct results, rewrite the SQL query and try again.
tip2) If SQL results has None or NULL values, handle them by adding a WHERE clause to filter them out.
tip3) The existence of the string values in the columns should always be checked using the DbColumnEntityChecker tool.
tip4) You should always execute the SQL query by calling the SqlDbQuery tool to make sure the results are correct.
""" # noqa: E501

FORMAT_INSTRUCTIONS = """Use the following format:
Expand All @@ -87,13 +92,13 @@
SUFFIX_WITH_FEW_SHOT_SAMPLES = """Begin!
Question: {input}
Thought: I should Collect examples of Question/SQL pairs to identify possibly relevant tables, columns, and SQL query styles. If there is a similar question among the examples, I can use the SQL query from the example and modify it to fit the given question.
Thought: I should Collect examples of Question/SQL pairs to check if there is a similar question among the examples.
{agent_scratchpad}""" # noqa: E501

SUFFIX_WITHOUT_FEW_SHOT_SAMPLES = """Begin!
Question: {input}
Thought: I should find the a set of possibly relevant tables to the given question.
Thought: I should find the relevant tables.
{agent_scratchpad}"""

FINETUNING_SYSTEM_INFORMATION = """
Expand All @@ -116,7 +121,7 @@
If SQL results has None or NULL values, handle them by adding a WHERE clause to filter them out.
If SQL query doesn't follow the instructions or return incorrect results modify the SQL query to fit the instructions and fix the errors.
Only make minor modifications to the SQL query, do not change the SQL query completely.
You MUST use the execute_query tool to make sure the SQL query is correct before returning it.
You MUST always use the ExecuteQuery tool to make sure the SQL query is correct before returning it.
### Instructions from the database administrator:
{admin_instructions}
Expand All @@ -129,10 +134,26 @@
#
Here is the plan you have to follow:
1) Use the `GenerateSql` tool to generate a SQL query for the given question.
2) Use the `ExecuteQuery` tool to execute the SQL query on the database to check if the results are correct.
2) Always Use the `ExecuteQuery` tool to execute the SQL query on the database to check if the results are correct.
#
### Instructions from the database administrator:
{admin_instructions}
""" # noqa: E501

# Message used when the LLM's output cannot be parsed. Both SQL agents
# (dataherald_sqlagent.py and dataherald_finetuning_agent.py) assign this
# constant to `agent_executor.handle_parsing_errors` in place of the previous
# `True`. It restates the Thought/Action/Action Input/Observation format and
# the `Final Answer:` exit.
# NOTE(review): presumably LangChain feeds this string back to the model as the
# observation after a parsing failure — confirm against the AgentExecutor docs.
ERROR_PARSING_MESSAGE = """
ERROR: Parsing error, you should only use tools or return the final answer. You are a ReAct agent, you should not return any other format.
Use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, one of the tools
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
If you know the final answer and do not need to use any tools, you can directly return the Final Answer: <your final answer>.
"""
4 changes: 3 additions & 1 deletion dataherald/vector_store/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ def add_records(self, golden_sqls: List[GoldenSQL], collection: str):
collection,
[
{
"tables_used": Parser(golden_sql.sql).tables[0],
# Store every table referenced by the golden SQL as a comma-separated string.
# `Parser(...)` itself is never a list, so the earlier `isinstance(..., list)`
# guard always fell through to "" and no table names were ever recorded.
# Joining `.tables` also yields "" when the query references no tables,
# which avoids the IndexError the old `.tables[0]` lookup could raise.
"tables_used": ", ".join(Parser(golden_sql.sql).tables),
"db_connection_id": str(golden_sql.db_connection_id),
}
],
Expand Down
8 changes: 4 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
dnspython==2.3.0
fastapi==0.98.0
httpx==0.24.1
langchain==0.1.5
langchain-community==0.0.18
langchain-openai==0.0.5
langchain==0.1.11
langchain-community==0.0.25
langchain-openai==0.0.8
load-dotenv==0.1.0
mypy-extensions==1.0.0
openai==1.11.1
Expand Down Expand Up @@ -36,7 +36,7 @@ sphinx-book-theme==1.0.1
boto3==1.28.38
botocore==1.31.38
PyAthena==3.0.6
tiktoken==0.5.2
tiktoken==0.6.0
duckdb-engine==0.9.1
duckdb==0.9.1
PyMySQL==1.1.0
Expand Down

0 comments on commit 4190b4d

Please sign in to comment.